Warp All-Reduce (max / sum) 코드 분석 - warp 내 lane들의 동시 동작, __shfl_xor_sync
// ---------- warp reductions ----------

/**
 * Warp-wide all-reduce (maximum).
 *
 * Each calling lane contributes `v`; the return value on every lane is the
 * maximum (as defined by fmaxf) over the values contributed by all lanes of
 * the warp. Unlike a one-directional reduction, no separate broadcast step is
 * needed: the XOR exchange pattern leaves the final result in every lane.
 *
 * Preconditions:
 *   - The participation mask is hard-coded to 0xffffffff, so all 32 lanes of
 *     the warp must execute this call; calling it from a branch where some
 *     lanes are inactive is not supported by this implementation.
 *   - The exchange offsets (16, 8, 4, 2, 1) assume a warp width of 32.
 *
 * @param v  This lane's input value.
 * @return   The maximum of `v` across the warp, identical on all lanes.
 */
__inline__ __device__ float warp_allreduce_max(float v) {
    // Full-warp participation mask for the *_sync shuffle intrinsic.
    const unsigned mask = 0xffffffffu;

    // XOR butterfly: at each step a lane exchanges its running maximum with
    // the lane whose index differs in exactly one bit (16, then 8, 4, 2, 1).
    // After the five steps every lane has combined all 32 contributions.
#pragma unroll
    for (int lane_xor = 16; lane_xor > 0; lane_xor >>= 1) {
        v = fmaxf(v, __shfl_xor_sync(mask, v, lane_xor));
    }
    return v;
}

// NOTE(review): the source excerpt is cut off at this point. The next
// definition begins with "__inline__ __device__ fl.." and is not visible, so
// it is intentionally not reconstructed here.