본문 바로가기

dev_AI_framework

비용 함수 값 연산 및 계산 그래프 생성 부분

#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>

#include <cuda_runtime.h>

#include <cmath>
#include <stdexcept>
#include <string>

namespace py = pybind11;

// Sums the squared error (y_pred[i] - y_true[i])^2 over all n elements into
// *loss, using a block-level shared-memory tree reduction followed by one
// atomicAdd per block.
//
// Launch contract (see launchLossKernel): blockDim.x must be exactly 256 to
// match partial_sum, and a power of two so the halving loop covers every
// slot; *loss must be zero-initialized before launch.  The result is the SUM
// of squared errors; the host side divides by n to obtain the mean.
__global__ void mseLossKernel(const float* y_true, const float* y_pred, float* loss, int n) {
    __shared__ float partial_sum[256];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global element index
    int tid = threadIdx.x;                            // index within this block

    float diff = 0.0f;
    if (idx < n) {
        diff = y_pred[idx] - y_true[idx];
        partial_sum[tid] = diff * diff;
    } else {
        // Threads past the end contribute 0 so the reduction stays valid.
        partial_sum[tid] = 0.0f;
    }

    __syncthreads();

    // Tree reduction: each round folds the upper half of the live slots into
    // the lower half.  Requires blockDim.x to be a power of two.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            partial_sum[tid] += partial_sum[tid + stride];
        }
        __syncthreads();
    }

    // Exactly one atomic add per block merges the partial sum globally.
    if (tid == 0) {
        atomicAdd(loss, partial_sum[0]);
    }
}

// Sums the per-element binary cross-entropy
//   -(y*log(p) + (1-y)*log(1-p))
// over all n elements into *loss, using the same shared-memory tree
// reduction + per-block atomicAdd scheme as mseLossKernel.
//
// Launch contract (see launchLossKernel): blockDim.x must be exactly 256 to
// match partial_sum and a power of two for the halving loop; *loss must be
// zero-initialized before launch.  The host divides the sum by n for the mean.
__global__ void bceLossKernel(const float* y_true, const float* y_pred, float* loss, int n) {
    __shared__ float partial_sum[256];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // global element index
    int tid = threadIdx.x;                            // index within this block

    float val = 0.0f;
    if (idx < n) {
        float y = y_true[idx];
        // Clamp predictions into [1e-7, 1 - 1e-7] so logf never sees 0 or 1
        // (which would produce -inf / NaN).
        float p = fminf(fmaxf(y_pred[idx], 1e-7f), 1.0f - 1e-7f);
        val = - (y * logf(p) + (1.0f - y) * logf(1.0f - p));
        partial_sum[tid] = val;
    } else {
        // Threads past the end contribute 0 so the reduction stays valid.
        partial_sum[tid] = 0.0f;
    }

    __syncthreads();

    // Tree reduction over the 256 shared slots; power-of-two blockDim only.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            partial_sum[tid] += partial_sum[tid + stride];
        }
        __syncthreads();
    }

    // Exactly one atomic add per block merges the partial sum globally.
    if (tid == 0) {
        atomicAdd(loss, partial_sum[0]);
    }
}

// Shared CUDA driver: uploads both host arrays, launches `kernel` (one of the
// __global__ reduction kernels above, which atomically accumulates the summed
// per-element loss into its third argument), downloads the scalar sum, and
// returns the mean loss (sum / n).
//
// Returns 0.0f when n <= 0 (nothing to average; also avoids dividing by zero).
// Throws std::runtime_error when any CUDA API call or the launch fails; all
// device allocations made so far are released first (the original leaked them
// and ignored every error code).
float launchLossKernel(const float* h_y_true, const float* h_y_pred, int n, void(*kernel)(const float*, const float*, float*, int)) {
    if (n <= 0) {
        return 0.0f;
    }

    float* d_y_true = nullptr;
    float* d_y_pred = nullptr;
    float* d_loss = nullptr;
    float h_loss = 0.0f;

    // Free whatever was allocated, then surface the CUDA error to the caller
    // (pybind11 translates std::runtime_error into a Python RuntimeError).
    // cudaFree(nullptr) is a safe no-op, so partially-initialized state is fine.
    auto fail = [&](const char* where, cudaError_t err) {
        cudaFree(d_y_true);
        cudaFree(d_y_pred);
        cudaFree(d_loss);
        throw std::runtime_error(std::string(where) + ": " + cudaGetErrorString(err));
    };

    cudaError_t err;
    if ((err = cudaMalloc(&d_y_true, n * sizeof(float))) != cudaSuccess) fail("cudaMalloc(d_y_true)", err);
    if ((err = cudaMalloc(&d_y_pred, n * sizeof(float))) != cudaSuccess) fail("cudaMalloc(d_y_pred)", err);
    if ((err = cudaMalloc(&d_loss, sizeof(float))) != cudaSuccess) fail("cudaMalloc(d_loss)", err);

    if ((err = cudaMemcpy(d_y_true, h_y_true, n * sizeof(float), cudaMemcpyHostToDevice)) != cudaSuccess) fail("cudaMemcpy(y_true)", err);
    if ((err = cudaMemcpy(d_y_pred, h_y_pred, n * sizeof(float), cudaMemcpyHostToDevice)) != cudaSuccess) fail("cudaMemcpy(y_pred)", err);
    // The kernels accumulate with atomicAdd, so the accumulator must start at 0.
    if ((err = cudaMemcpy(d_loss, &h_loss, sizeof(float), cudaMemcpyHostToDevice)) != cudaSuccess) fail("cudaMemcpy(loss init)", err);

    // One thread per element; 256 matches the kernels' shared-memory arrays.
    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;

    kernel<<<gridSize, blockSize>>>(d_y_true, d_y_pred, d_loss, n);
    if ((err = cudaGetLastError()) != cudaSuccess) fail("kernel launch", err);

    // This device-to-host copy also synchronizes with the kernel.
    if ((err = cudaMemcpy(&h_loss, d_loss, sizeof(float), cudaMemcpyDeviceToHost)) != cudaSuccess) fail("cudaMemcpy(loss result)", err);

    cudaFree(d_y_true);
    cudaFree(d_y_pred);
    cudaFree(d_loss);

    return h_loss / n;
}

// ==========================
//      Pybind 함수 정의
// ==========================

float mse_loss(py::array_t<float> y_true, py::array_t<float> y_pred) {
    auto y_true_buf = y_true.request();
    auto y_pred_buf = y_pred.request();

    if (y_true_buf.size != y_pred_buf.size) {
        throw std::invalid_argument("y_true와 y_pred의 크기가 일치하지 않습니다.");
    }

    return launchLossKernel(
        static_cast<float*>(y_true_buf.ptr),
        static_cast<float*>(y_pred_buf.ptr),
        y_true_buf.size,
        mseLossKernel
    );
}

float binary_crossentropy(py::array_t<float> y_true, py::array_t<float> y_pred) {
    auto y_true_buf = y_true.request();
    auto y_pred_buf = y_pred.request();

    if (y_true_buf.size != y_pred_buf.size) {
        throw std::invalid_argument("y_true와 y_pred의 크기가 일치하지 않습니다.");
    }

    return launchLossKernel(
        static_cast<float*>(y_true_buf.ptr),
        static_cast<float*>(y_pred_buf.ptr),
        y_true_buf.size,
        bceLossKernel
    );
}

// Placeholder for Categorical Crossentropy (implementation planned).
// Kept as a registered entry point so Python callers get a clear
// RuntimeError instead of an AttributeError; the signature mirrors
// mse_loss / binary_crossentropy so it can be filled in later without
// touching the bindings.
float categorical_crossentropy(py::array_t<float> y_true, py::array_t<float> y_pred) {
    throw std::runtime_error("Categorical Crossentropy는 아직 구현되지 않았습니다.");
}

// ==========================
//        모듈 등록
// ==========================
// Registers the three loss entry points under the extension module name
// "losses_cuda" (this name must match the extension name used by the build
// in setup.py, or the import on the Python side fails).
PYBIND11_MODULE(losses_cuda, m) {
    m.def("mse_loss", &mse_loss, "MSE 손실 계산");
    m.def("binary_crossentropy", &binary_crossentropy, "Binary Crossentropy 계산");
    m.def("categorical_crossentropy", &categorical_crossentropy, "Categorical Crossentropy 계산");
}

losses_cuda 코드 작성 및 빌드

 

python setup.py build_ext --name losses_cuda --sources losses_cuda.cu

 

setup 코드에 맞게 빌드,

# dev/losses/__init__.py

import numpy as np
import sys
import os

# Walk upward from this file until the project root directory
# ("AI_framework-dev") is found, and make sure it is on sys.path so absolute
# `dev.*` imports resolve regardless of where the interpreter was started.
cur = os.path.abspath(__file__)
while True:
    cur = os.path.dirname(cur)
    if os.path.basename(cur) == "AI_framework-dev":
        if cur not in sys.path:
            sys.path.insert(0, cur)
        break
    # os.path.dirname is a fixed point at the filesystem root -> not found.
    if cur == os.path.dirname(cur):
        raise RuntimeError("프로젝트 루트(AI_framework-dev)를 찾을 수 없습니다.")

# Shared path configuration used by the test suite.
from dev.tests.test_setup import setup_paths
setup_paths()

# Import the compiled CUDA loss-function extension.
# NOTE(review): sys.exit(1) at import time terminates the whole interpreter
# when the extension is missing — presumably a deliberate fail-fast, but it
# makes this package impossible to import in CPU-only environments; confirm.
try:
    import losses_cuda
    print("✅ losses_cuda 모듈 로드 성공")
except ImportError as e:
    print("❌ losses_cuda import 실패:", e)
    sys.exit(1)

# CUDA-backed loss-function wrappers
def mse(y_true, y_pred):
    """Mean squared error computed by the losses_cuda extension.

    Inputs are converted to dense C-contiguous float32 arrays first: the
    C++ side reads the raw buffer pointer and ignores strides, so a
    non-contiguous view (e.g. a transposed slice) passed through plain
    ``astype`` (which keeps the source memory order) would silently be
    read at the wrong offsets.  Also accepts plain Python sequences.
    """
    y_true = np.ascontiguousarray(y_true, dtype=np.float32)
    y_pred = np.ascontiguousarray(y_pred, dtype=np.float32)
    return losses_cuda.mse_loss(y_true, y_pred)

def binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy computed by the losses_cuda extension.

    Inputs are normalized to dense C-contiguous float32 arrays (the C++
    side ignores strides — see :func:`mse`).  Also accepts plain sequences.
    """
    y_true = np.ascontiguousarray(y_true, dtype=np.float32)
    y_pred = np.ascontiguousarray(y_pred, dtype=np.float32)
    return losses_cuda.binary_crossentropy(y_true, y_pred)

def categorical_crossentropy(y_true, y_pred):
    """Categorical cross-entropy via the losses_cuda extension.

    The extension currently raises RuntimeError (not implemented yet);
    inputs are normalized the same way as the other wrappers so this
    works unchanged once the kernel lands.
    """
    y_true = np.ascontiguousarray(y_true, dtype=np.float32)
    y_pred = np.ascontiguousarray(y_pred, dtype=np.float32)
    return losses_cuda.categorical_crossentropy(y_true, y_pred)

# Name -> loss-function mapping used by get() for string lookup.
ALL_LOSSES_DICT = {
    "mse": mse,
    "binary_crossentropy": binary_crossentropy,
    "categorical_crossentropy": categorical_crossentropy,
}

def get(identifier):
    """Resolve a loss identifier to a callable.

    Accepts either a loss-function name (case-insensitive; see
    ALL_LOSSES_DICT) or an already-callable loss function, which is
    returned unchanged.

    Raises:
        ValueError: if the identifier is neither a known name nor callable.

    Note: the original source ended with an invisible U+200B zero-width
    space after the closing parenthesis, which is a SyntaxError in Python
    source; it has been removed here.
    """
    if isinstance(identifier, str):
        loss_fn = ALL_LOSSES_DICT.get(identifier.lower())
        if callable(loss_fn):
            return loss_fn

    if callable(identifier):
        return identifier

    raise ValueError(
        f"Invalid loss function identifier: '{identifier}'. "
        f"Available options: {', '.join(ALL_LOSSES_DICT.keys())}."
    )

 

손실 함수 매핑 __init__ 코드

 

    def compute_loss_and_metrics(self, y_pred, y_true):
        """Evaluate the compiled loss and metric for one batch.

        Calls self.loss(y_true, y_pred) and self.metric(y_pred, y_true)
        (note the differing argument orders), caches both results on the
        instance, resets loss_node_list, and returns the loss value.
        """
        loss = self.loss(y_true, y_pred)
        self.loss_value = loss
        self.loss_node_list = []
        self.metric_value = self.metric(y_pred, y_true)
        return loss

compile 을 통해 모델의 비용 함수를 정의 했고,

self.loss_value 전달을 통해 비용 함수 값의 계산을 수행

그래프 최적화 + Lazy Execution 이거 알아보고 추가해야겠네