실험적 test 코드 작성 - Thread / Block / Grid 인덱싱 감각 잡기

threadIdx.x, blockIdx.x, blockDim.x 가 실제로 어떤 값을 가지는지 확인하고,

global_id = blockIdx.x * blockDim.x + threadIdx.x 패턴이 메모리 인덱스와 어떻게 연결되는지 살펴본다.

#include <cuda_runtime.h>
#include <algorithm>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <vector>

// CUDA_CHECK(call): evaluates `call` exactly once and compares the resulting
// cudaError_t against cudaSuccess. On failure it prints the CUDA error string,
// the text of the failing expression, and the file/line of the macro use to
// std::cerr, then terminates the process with EXIT_FAILURE.
// The do { ... } while (0) wrapper makes the macro behave as a single
// statement, so it is safe inside an unbraced if/else.
#define CUDA_CHECK(call)                                      \
    do {                                                      \
        const cudaError_t err__ = (call);                     \
        if (err__ != cudaSuccess) {                           \
            std::cerr << "CUDA error: "                       \
                      << cudaGetErrorString(err__)            \
                      << " (" << #call << ")"                 \
                      << " at " << __FILE__ << ":" << __LINE__\
                      << std::endl;                           \
            std::exit(EXIT_FAILURE);                          \
        }                                                     \
    } while (0)

/**
 * Records the index values seen by each thread, keyed by its global thread id.
 *
 * Only the x components of threadIdx / blockIdx / blockDim are read, so the
 * global id is computed as blockIdx.x * blockDim.x + threadIdx.x.
 * Threads whose global id is >= N write nothing.
 *
 * @param out_global  out_global[gid] receives gid itself.
 * @param out_block   out_block[gid] receives blockIdx.x.
 * @param out_thread  out_thread[gid] receives threadIdx.x.
 * @param out_warp    out_warp[gid] receives threadIdx.x / warpSize, i.e. the
 *                    index of the thread's warp within its block.
 * @param N           Number of elements to fill; every output array must hold
 *                    at least N ints and be writable from device code.
 */
__global__ void fill_indices(
    int* out_global,   // global thread id
    int* out_block,    // blockIdx.x
    int* out_thread,   // threadIdx.x
    int* out_warp,     // warp id within block
    int   N)
{
    const int tid_in_block = threadIdx.x;
    const int bid          = blockIdx.x;
    const int bdim         = blockDim.x;
    const int gid          = bid * bdim + tid_in_block;  // global thread id

    // The grid is rounded up to whole blocks, so trailing threads may fall
    // outside [0, N) and must not touch the output arrays.
    if (gid >= N) return;

    // Use the built-in warpSize instead of a hard-coded 32 so the result
    // follows the device's actual warp width.
    const int warp_id_in_block = tid_in_block / warpSize;

    out_global[gid] = gid;
    out_block[gid]  = bid;
    out_thread[gid] = tid_in_block;
    out_warp[gid]   = warp_id_in_block;
}

// Launches fill_indices over N elements, copies the four recorded index
// arrays back to the host, and prints the first rows as a table.
int main()
{
    // ----- experiment parameters -----
    constexpr int N          = 128;  // total number of elements ("threads" we care about)
    constexpr int block_size = 32;   // try 32, 64, 128, ... to see the effect
    const int grid_size      = (N + block_size - 1) / block_size;  // ceil(N / block_size)
    const auto bytes         = N * sizeof(int);

    std::cout << "N = " << N
              << ", block_size = " << block_size
              << ", grid_size = " << grid_size << std::endl;

    // One buffer per recorded field; the enum order matches the kernel's
    // parameter order.
    enum Field { kGlobal = 0, kBlock, kThread, kWarp, kFieldCount };

    // ----- host buffers -----
    std::vector<std::vector<int>> host(kFieldCount, std::vector<int>(N));

    // ----- device buffers -----
    int* dev[kFieldCount] = {nullptr, nullptr, nullptr, nullptr};
    for (int f = 0; f < kFieldCount; ++f) {
        CUDA_CHECK(cudaMalloc(&dev[f], bytes));
    }

    // ----- kernel launch -----
    fill_indices<<<grid_size, block_size>>>(
        dev[kGlobal], dev[kBlock], dev[kThread], dev[kWarp], N);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // ----- copy results back -----
    for (int f = 0; f < kFieldCount; ++f) {
        CUDA_CHECK(cudaMemcpy(host[f].data(), dev[f], bytes, cudaMemcpyDeviceToHost));
    }

    // ----- print only the leading rows -----
    std::cout << "idx | global | block | thread | warp_in_block\n";
    std::cout << "----+--------+-------+--------+--------------\n";

    const int print_N = (N < 64) ? N : 64;  // show at most the first 64 rows
    for (int row = 0; row < print_N; ++row) {
        std::cout << std::setw(3)  << row                << " | "
                  << std::setw(6)  << host[kGlobal][row] << " | "
                  << std::setw(5)  << host[kBlock][row]  << " | "
                  << std::setw(6)  << host[kThread][row] << " | "
                  << std::setw(12) << host[kWarp][row]   << "\n";
    }

    // ----- cleanup -----
    for (int f = 0; f < kFieldCount; ++f) {
        CUDA_CHECK(cudaFree(dev[f]));
    }

    return 0;
}




//////////////////////////////////////////////////////////////////////////////////////////////////
N = 128, block_size = 32, grid_size = 4
idx | global | block | thread | warp_in_block
----+--------+-------+--------+--------------
  0 |      0 |     0 |      0 |            0
  1 |      1 |     0 |      1 |            0
  2 |      2 |     0 |      2 |            0
  3 |      3 |     0 |      3 |            0
  4 |      4 |     0 |      4 |            0
  5 |      5 |     0 |      5 |            0
  6 |      6 |     0 |      6 |            0
  7 |      7 |     0 |      7 |            0
  8 |      8 |     0 |      8 |            0
  9 |      9 |     0 |      9 |            0
 10 |     10 |     0 |     10 |            0
 11 |     11 |     0 |     11 |            0
 12 |     12 |     0 |     12 |            0
 13 |     13 |     0 |     13 |            0
 14 |     14 |     0 |     14 |            0
 15 |     15 |     0 |     15 |            0
 16 |     16 |     0 |     16 |            0
 17 |     17 |     0 |     17 |            0
 18 |     18 |     0 |     18 |            0
 19 |     19 |     0 |     19 |            0
 20 |     20 |     0 |     20 |            0
 21 |     21 |     0 |     21 |            0
 22 |     22 |     0 |     22 |            0
 23 |     23 |     0 |     23 |            0
 24 |     24 |     0 |     24 |            0
 25 |     25 |     0 |     25 |            0
 26 |     26 |     0 |     26 |            0
 27 |     27 |     0 |     27 |            0
 28 |     28 |     0 |     28 |            0
 29 |     29 |     0 |     29 |            0
 30 |     30 |     0 |     30 |            0
 31 |     31 |     0 |     31 |            0
 32 |     32 |     1 |      0 |            0
 33 |     33 |     1 |      1 |            0
 34 |     34 |     1 |      2 |            0
 35 |     35 |     1 |      3 |            0
 36 |     36 |     1 |      4 |            0
 37 |     37 |     1 |      5 |            0
 38 |     38 |     1 |      6 |            0
 39 |     39 |     1 |      7 |            0
 40 |     40 |     1 |      8 |            0
 41 |     41 |     1 |      9 |            0
 42 |     42 |     1 |     10 |            0
 43 |     43 |     1 |     11 |            0
 44 |     44 |     1 |     12 |            0
 45 |     45 |     1 |     13 |            0
 46 |     46 |     1 |     14 |            0
 47 |     47 |     1 |     15 |            0
 48 |     48 |     1 |     16 |            0
 49 |     49 |     1 |     17 |            0
 50 |     50 |     1 |     18 |            0
 51 |     51 |     1 |     19 |            0
 52 |     52 |     1 |     20 |            0
 53 |     53 |     1 |     21 |            0
 54 |     54 |     1 |     22 |            0
 55 |     55 |     1 |     23 |            0
 56 |     56 |     1 |     24 |            0
 57 |     57 |     1 |     25 |            0
 58 |     58 |     1 |     26 |            0
 59 |     59 |     1 |     27 |            0
 60 |     60 |     1 |     28 |            0
 61 |     61 |     1 |     29 |            0
 62 |     62 |     1 |     30 |            0
 63 |     63 |     1 |     31 |            0

각 블록 안에 스레드들이 존재한다는 개념을 확인했다 (block 0 은 global 0~31, block 1 은 global 32~63 을 담당하고, thread 값은 블록마다 0 부터 다시 시작한다).

warp 는 블록 안의 스레드를 32개 단위로 묶은 것임을 확인했다. 위 출력은 block_size = 32 라서 블록마다 warp 가 하나뿐이므로 warp_in_block 이 모두 0 이다.

block_size 를 크게 하면 블록 안의 warp 개수가 늘어난다 (예: block_size = 64 이면 블록당 warp 가 2개가 되고, thread 32~63 의 warp_in_block 은 1 이 된다).

'dev_AI_framework' 카테고리의 다른 글

현재 구현된 gemm 의 fwd 부분 커널 코드 확인 (0)	2025.11.24
Shared Memory - Bank Conflict (0)	2025.11.23
GPU 관련 개념 꽉 잡기 (0)	2025.11.16
1024, 1024 gemm 벤치 코드 테스트 (0)	2025.11.16
ncu 실제 분석 내용 ( 최적화 항목 확인 ) - 다음 단계 내용 포함 (0)	2025.11.16

뜻 지, 가르칠 훈

실험적 test 코드 작성 - Thread / Block / Grid 인덱싱 감각 잡기

'dev_AI_framework' 카테고리의 다른 글

티스토리툴바

실험적 test 코드 작성 - Thread / Block / Grid 인덱싱 감각 잡기

'dev_AI_framework' 카테고리의 다른 글

'dev_AI_framework' Related Articles

티스토리툴바