본문 바로가기

dev_AI_framework

CUDA 기반 loss function 계산

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

#include <cuda_runtime.h>

#include <cmath>
#include <stdexcept>
#include <string>

namespace py = pybind11;

// Accumulates the sum of squared errors between y_true and y_pred into *loss.
// Launch layout: 1-D grid; blockDim.x must be a power of two and at most 256
// (the size of the shared-memory buffer). *loss must be zero-initialized by
// the caller; the division by n (mean) is done on the host.
__global__ void mseLossKernel(const float* y_true, const float* y_pred, float* loss, int n) {
    __shared__ float partial_sum[256];

    const int tid = threadIdx.x;
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Out-of-range threads contribute 0 so the tree reduction below stays valid.
    float sq_err = 0.0f;
    if (gid < n) {
        const float d = y_pred[gid] - y_true[gid];
        sq_err = d * d;
    }
    partial_sum[tid] = sq_err;
    __syncthreads();

    // Block-local tree reduction; halves the active range each step.
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride) {
            partial_sum[tid] += partial_sum[tid + stride];
        }
        __syncthreads();  // barrier outside the branch: all threads reach it
    }

    // One atomic per block folds the block sum into the global accumulator.
    if (tid == 0) {
        atomicAdd(loss, partial_sum[0]);
    }
}

// Accumulates the summed binary cross-entropy of y_pred against y_true into
// *loss. Launch layout: 1-D grid; blockDim.x must be a power of two and at
// most 256 (shared buffer size). *loss must be zero-initialized by the
// caller; the mean is taken on the host.
__global__ void bceLossKernel(const float* y_true, const float* y_pred, float* loss, int n) {
    __shared__ float partial_sum[256];

    const int tid = threadIdx.x;
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    float term = 0.0f;
    if (gid < n) {
        const float y = y_true[gid];
        // Clamp predictions away from {0, 1} so logf never receives 0.
        const float p = fminf(fmaxf(y_pred[gid], 1e-7f), 1.0f - 1e-7f);
        term = -(y * logf(p) + (1.0f - y) * logf(1.0f - p));
    }
    partial_sum[tid] = term;
    __syncthreads();

    // Block-local tree reduction into partial_sum[0].
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride) {
            partial_sum[tid] += partial_sum[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        atomicAdd(loss, partial_sum[0]);
    }
}

// Runs the selected loss kernel on device copies of the inputs and returns
// the mean loss.
//
// Parameters:
//   h_y_true  - host pointer to n ground-truth values
//   h_y_pred  - host pointer to n predicted values
//   n         - number of elements; must be positive
//   loss_type - "mse" or "bce"
//
// Throws std::invalid_argument for an unsupported loss_type or n <= 0, and
// std::runtime_error on any CUDA API or kernel failure.
float computeLoss(const float* h_y_true, const float* h_y_pred, int n, const std::string& loss_type) {
    // Validate BEFORE allocating: n == 0 would divide by zero at the end,
    // and the original code threw on an unknown loss_type after cudaMalloc,
    // leaking all three device buffers.
    if (n <= 0) {
        throw std::invalid_argument("n must be a positive element count.");
    }
    const bool use_mse = (loss_type == "mse");
    if (!use_mse && loss_type != "bce") {
        throw std::invalid_argument("지원하지 않는 손실 함수입니다. 'mse', 'bce' 중 선택하세요.");
    }

    float* d_y_true = nullptr;
    float* d_y_pred = nullptr;
    float* d_loss = nullptr;
    float h_loss = 0.0f;

    // Frees whatever has been allocated so far and throws on CUDA failure.
    // cudaFree(nullptr) is a no-op, so this is safe at any point.
    auto check = [&](cudaError_t err) {
        if (err != cudaSuccess) {
            cudaFree(d_y_true);
            cudaFree(d_y_pred);
            cudaFree(d_loss);
            throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(err));
        }
    };

    check(cudaMalloc(reinterpret_cast<void**>(&d_y_true), n * sizeof(float)));
    check(cudaMalloc(reinterpret_cast<void**>(&d_y_pred), n * sizeof(float)));
    check(cudaMalloc(reinterpret_cast<void**>(&d_loss), sizeof(float)));
    check(cudaMemcpy(d_y_true, h_y_true, n * sizeof(float), cudaMemcpyHostToDevice));
    check(cudaMemcpy(d_y_pred, h_y_pred, n * sizeof(float), cudaMemcpyHostToDevice));
    check(cudaMemcpy(d_loss, &h_loss, sizeof(float), cudaMemcpyHostToDevice));  // zero the accumulator

    const int blockSize = 256;  // must match the kernels' shared-memory size
    const int gridSize = (n + blockSize - 1) / blockSize;  // ceil-div covers the tail

    if (use_mse) {
        mseLossKernel<<<gridSize, blockSize>>>(d_y_true, d_y_pred, d_loss, n);
    } else {
        bceLossKernel<<<gridSize, blockSize>>>(d_y_true, d_y_pred, d_loss, n);
    }
    check(cudaGetLastError());  // surface launch-configuration errors

    // Blocking D2H copy also synchronizes, surfacing kernel execution errors.
    check(cudaMemcpy(&h_loss, d_loss, sizeof(float), cudaMemcpyDeviceToHost));

    cudaFree(d_y_true);
    cudaFree(d_y_pred);
    cudaFree(d_loss);

    return h_loss / static_cast<float>(n);
}

// Pybind 래퍼
float compute_loss(py::array_t<float> y_true, py::array_t<float> y_pred, std::string loss_type) {
    py::buffer_info y_true_buf = y_true.request();
    py::buffer_info y_pred_buf = y_pred.request();

    if (y_true_buf.size != y_pred_buf.size) {
        throw std::invalid_argument("y_true와 y_pred의 크기가 일치하지 않습니다.");
    }

    float* h_y_true = static_cast<float*>(y_true_buf.ptr);
    float* h_y_pred = static_cast<float*>(y_pred_buf.ptr);
    int n = y_true_buf.size;

    return computeLoss(h_y_true, h_y_pred, n, loss_type);
}

// Pybind module definition: exposes compute_loss(y_true, y_pred, loss_type)
// to Python as module "losses_cuda". loss_type selects the kernel
// ("mse" or "bce"); the returned value is the mean loss as a Python float.
PYBIND11_MODULE(losses_cuda, m) {
    m.def("compute_loss", &compute_loss, "CUDA 기반 손실 함수 계산",
          py::arg("y_true"), py::arg("y_pred"), py::arg("loss_type"));
}

전체 구조 개요

CUDA 커널 ( __global__ ) : GPU 에서 병렬로 손실 값을 계산하는 함수들

computeLoss 함수 : CUDA 커널을 실행하고 결과를 반환하는 메인 함수

compute_loss 함수 : pybind11 로 Python 과 연결하는 래퍼 함수

PYBIND11_MODULE : Python 에서 import 할 수 있도록 모듈 생성

 

python setup.py build_ext --name losses_cuda --sources losses_cuda.cu

 

test_setup 을 통한 losses_cuda_test 코드 작성

import numpy as np
import os
import sys

# Make the shared test helper (test_setup.py) importable.
# NOTE(review): absolute Windows path — this only works on this dev machine;
# consider deriving it from __file__ instead.
sys.path.insert(0, os.path.abspath("C:/Users/owner/Desktop/AI_framework-dev/dev/tests"))

from test_setup import import_cuda_module

# Load the compiled losses_cuda extension (.pyd) from its build directory.
losses_cuda = import_cuda_module(
    module_name="losses_cuda",
    build_dir="C:/Users/owner/Desktop/AI_framework-dev/dev/backend/backend_ops/losses/build/lib.win-amd64-cpython-312"
)

def test_mse_loss():
    """CUDA MSE loss must match a NumPy reference computed in float32."""
    y_true = np.array([1.0, 0.0, 1.0, 0.0], dtype=np.float32)
    y_pred = np.array([0.9, 0.1, 0.8, 0.2], dtype=np.float32)

    loss = losses_cuda.compute_loss(y_true, y_pred, "mse")
    # Keep the reference in float32 so its rounding matches the GPU's
    # single-precision arithmetic.
    expected = float(np.mean((y_true - y_pred) ** 2, dtype=np.float32))

    print(f"[MSE Loss] CUDA: {loss:.6f}, NumPy: {expected:.6f}")
    # 1e-6 absolute tolerance is too tight for float32 accumulation in
    # general (it only passed here because n is tiny); 1e-5 is appropriate.
    assert abs(loss - expected) < 1e-5

def test_bce_loss():
    """CUDA BCE loss must match a NumPy reference computed in float32."""
    y_true = np.array([1.0, 0.0, 1.0, 0.0], dtype=np.float32)
    y_pred = np.array([0.9, 0.1, 0.8, 0.2], dtype=np.float32)

    # Mirror the kernel's clamping so the reference uses the same epsilon.
    eps = np.float32(1e-7)
    y_pred_clipped = np.clip(y_pred, eps, np.float32(1) - eps)
    expected = float(
        -np.mean(y_true * np.log(y_pred_clipped) + (1 - y_true) * np.log(1 - y_pred_clipped))
    )

    loss = losses_cuda.compute_loss(y_true, y_pred, "bce")

    print(f"[BCE Loss] CUDA: {loss:.6f}, NumPy: {expected:.6f}")
    # float32 GPU logf/accumulation vs. NumPy differ by more than 1e-6 in
    # general; use a float32-appropriate tolerance.
    assert abs(loss - expected) < 1e-5

if __name__ == "__main__":
    # Run every loss test; any failure raises before the success banner.
    for run_test in (test_mse_loss, test_bce_loss):
        run_test()
    print("✅ 모든 손실 함수 테스트 통과")
PS C:\Users\owner\Desktop\AI_framework-dev> & C:/Users/owner/AppData/Local/Programs/Python/Python312/python.exe c:/Users/owner/Desktop/AI_framework-dev/dev/backend/backend_ops/losses/tests/losses_cuda_test.py
✅ .pyd 경로 등록됨: C:/Users/owner/Desktop/AI_framework-dev/dev/backend/backend_ops/losses/build/lib.win-amd64-cpython-312
✅ CUDA 모듈 'losses_cuda' import 성공!
[MSE Loss] CUDA: 0.025000, NumPy: 0.025000
[BCE Loss] CUDA: 0.164252, NumPy: 0.164252
✅ 모든 손실 함수 테스트 통과