cuda, pybind 파일의 build, 버전 호환 및 경로 지정 문제 해결...

dev_AI_framework

cuda, pybind 파일의 build, 버전 호환 및 경로 지정 문제 해결...

명징직조지훈 2024. 12. 19. 09:51

Pybind11 을 사용하여 C++ 코드와 Python 간의 인터페이스를 연결하고,

CUDA 코드를 Pybind11 을 통해 Python 에서 호출하도록 한다.

1. 프로젝트 구조

project/
│-- pybind_cuda/
│   │-- cuda_add.cu       # CUDA 및 Pybind11 C++ 코드
│   │-- setup.py          # Python 빌드 스크립트
│   └-- __init__.py       # 패키지 초기화
└-- main.py               # Python 코드에서 호출

2. CUDA 코드 및 Pybind11 인터페이스

#include <cuda_runtime.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

#include <limits>
#include <stdexcept>
#include <string>

// Short alias for the pybind11 namespace used throughout this file.
namespace py = pybind11;

// CUDA kernel: each thread handles one element and stores a[i] + b[i] in c[i].
// Threads whose global index is at or beyond `size` return without touching memory.
__global__ void addKernel(const float* a, const float* b, float* c, int size) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= size) {
        return;
    }
    c[i] = a[i] + b[i];
}

// Function exposed to Python through pybind11.
//
// Adds input1 and input2 element-wise on the GPU and writes the sums into
// output. All three arrays must hold the same number of float elements.
//
// NOTE(review): the buffers are copied as flat, densely packed float data,
// which assumes C-contiguous arrays -- confirm callers never pass strided views.
//
// Throws std::invalid_argument when the element counts differ or do not fit
// in an int, and std::runtime_error when a CUDA runtime call fails.
void vector_add(py::array_t<float> input1, py::array_t<float> input2, py::array_t<float> output) {
    // Access the underlying array buffers. Requesting the output buffer as
    // writable makes the request fail for a read-only array.
    auto buf1 = input1.request();
    auto buf2 = input2.request();
    auto buf_out = output.request(true);

    // Every copy below moves buf1.size elements, so the other two buffers
    // must be exactly as large or we would read/write past their end.
    if (buf2.size != buf1.size || buf_out.size != buf1.size) {
        throw std::invalid_argument(
            "vector_add: input1, input2 and output must have the same number of elements");
    }
    // The kernel indexes with int, so reject sizes that would be truncated.
    if (buf1.size > std::numeric_limits<int>::max()) {
        throw std::invalid_argument("vector_add: arrays are too large for a single kernel launch");
    }
    // Nothing to compute; a launch with zero blocks would be an invalid configuration.
    if (buf1.size == 0) {
        return;
    }

    const float* ptr1 = static_cast<const float*>(buf1.ptr);
    const float* ptr2 = static_cast<const float*>(buf2.ptr);
    float* ptr_out = static_cast<float*>(buf_out.ptr);

    const int size = static_cast<int>(buf1.size);
    const size_t bytes = static_cast<size_t>(size) * sizeof(float);

    // Device buffers start as nullptr so cleanup is safe at any point
    // (cudaFree on a null pointer performs no operation).
    float* d_a = nullptr;
    float* d_b = nullptr;
    float* d_c = nullptr;

    auto release = [&]() {
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);
    };

    // Turns a failed CUDA call into an exception after freeing device memory.
    auto check = [&](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            release();
            throw std::runtime_error(std::string("vector_add: ") + what + " failed: " +
                                     cudaGetErrorString(status));
        }
    };

    // Allocate GPU memory
    check(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
    check(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
    check(cudaMalloc((void**)&d_c, bytes), "cudaMalloc(d_c)");

    // Copy the inputs: CPU -> GPU (d_b must come from the second input)
    check(cudaMemcpy(d_a, ptr1, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(input1)");
    check(cudaMemcpy(d_b, ptr2, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(input2)");

    // Launch the kernel with enough 256-thread blocks to cover every element
    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    addKernel<<<numBlocks, blockSize>>>(d_a, d_b, d_c, size);
    check(cudaGetLastError(), "addKernel launch");

    // Copy the result: GPU -> CPU
    check(cudaMemcpy(ptr_out, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(output)");

    // Free GPU memory
    release();
}

// pybind11 module definition: exposes vector_add as cuda_add.vector_add.
PYBIND11_MODULE(cuda_add, m) {
    // `output` is marked noconvert: if pybind11 were allowed to convert it
    // (e.g. from a float64 array), the result would be written into a
    // temporary copy and never reach the caller's array.
    m.def("vector_add", &vector_add, "Add two vectors using CUDA",
          py::arg("input1"), py::arg("input2"), py::arg("output").noconvert());
}

3. Python 빌드 스크립트

Python 에서 CUDA 코드를 빌드하기 위한 setup.py 스크립트

from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext
import subprocess
import os

class BuildExtWithNvcc(build_ext):
    """Custom ``build_ext`` command that compiles CUDA extensions with nvcc.

    Extensions that are ``CUDAExtension`` instances are built by
    ``build_cuda_extension``; all other extensions are built by the stock
    ``build_ext`` implementation.
    """

    def build_extensions(self):
        """Build CUDA extensions with nvcc and the remaining ones normally."""
        cuda_exts = [ext for ext in self.extensions if isinstance(ext, CUDAExtension)]
        other_exts = [ext for ext in self.extensions if not isinstance(ext, CUDAExtension)]

        for ext in cuda_exts:
            self.build_cuda_extension(ext)

        if other_exts:
            # The parent implementation builds everything in self.extensions,
            # so narrow the list to the non-CUDA extensions for that one call;
            # otherwise the .cu sources would also go to the default compiler.
            all_exts = self.extensions
            self.extensions = other_exts
            try:
                super().build_extensions()
            finally:
                self.extensions = all_exts

    def build_cuda_extension(self, ext):
        """Compile and link one CUDA extension into a Python module with nvcc.

        Args:
            ext: Extension whose ``sources`` are passed to nvcc and whose
                ``name`` determines the output file path.

        Raises:
            subprocess.CalledProcessError: If nvcc exits with a non-zero status.
        """
        sources = ext.sources
        output_file = self.get_ext_fullpath(ext.name)

        # Make sure the target directory exists before nvcc writes into it
        # (it may be missing on a fresh, non-inplace build).
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Pybind11 and Python paths
        pybind_include = r"C:\Users\owner\AppData\Local\Programs\Python\Python312\Lib\site-packages\pybind11\include"
        python_include = r"C:\Users\owner\AppData\Local\Programs\Python\Python312\include"
        python_lib = r"C:\Users\owner\AppData\Local\Programs\Python\Python312\libs"

        # CUDA paths (the double quotes are part of the value because the
        # path contains spaces and the command is run through the shell)
        cuda_lib_path = r'"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\lib\x64"'
        cuda_bin_path = r'"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin"'

        # NVCC command line
        nvcc_cmd = [
            "nvcc",
            "-shared",                      # produce a shared library
            "-O2",                          # optimization level
            "-x", "cu",                     # treat the sources as CUDA code
            f'-I"{pybind_include}"',        # Pybind11 header directory
            f'-I"{python_include}"',        # Python header directory
            f'-L"{python_lib}"',            # Python library directory
            f'-L{cuda_lib_path}',           # CUDA library directory
            "--compiler-options", "/MD",    # option forwarded to the MSVC compiler
            "-lcudart",                     # CUDA runtime library
            "-lpython312",                  # Python library
            "-o", f'"{output_file}"'        # output file
        ] + sources

        print("Running NVCC:", " ".join(nvcc_cmd))

        # Append the CUDA bin directory to PATH (without the embedded quotes)
        os.environ["PATH"] += os.pathsep + cuda_bin_path.strip('"')

        # Run NVCC; the arguments carry their own quoting, so they are joined
        # into one shell command string.
        subprocess.check_call(" ".join(nvcc_cmd), shell=True)

class CUDAExtension(Extension):
    """``Extension`` subclass used to tag modules built from CUDA sources."""

    def __init__(self, name, sources):
        """Initialise the base ``Extension`` and store ``sources`` on the instance."""
        Extension.__init__(self, name, sources)
        self.sources = sources

# The single extension module, compiled from the CUDA source file.
_cuda_modules = [CUDAExtension("cuda_add", ["cuda_add.cu"])]

# Register the custom build_ext command so the CUDA module is built with nvcc.
_commands = {"build_ext": BuildExtWithNvcc}

setup(
    name="cuda_add_example",
    version="0.1",
    ext_modules=_cuda_modules,
    cmdclass=_commands,
    zip_safe=False,
)

BuildExtWithNvcc 클래스

build_ext 를 상속받아 커스텀 빌드 클래스 생성

build_extensions 메서드

def build_extensions(self):
    """Build CUDA extensions with nvcc and the remaining ones normally."""
    cuda_exts = [ext for ext in self.extensions if isinstance(ext, CUDAExtension)]
    other_exts = [ext for ext in self.extensions if not isinstance(ext, CUDAExtension)]

    for ext in cuda_exts:
        self.build_cuda_extension(ext)

    if other_exts:
        # The parent implementation builds everything in self.extensions,
        # so narrow the list to the non-CUDA extensions for that one call.
        all_exts = self.extensions
        self.extensions = other_exts
        try:
            super().build_extensions()
        finally:
            self.extensions = all_exts

각 확장 모듈이 CUDAExtension 인스턴스인지 확인하고, 맞으면 build_cuda_extension 으로 빌드한다

build_cuda_extension 메서드

def build_cuda_extension(self, ext):
    # Source files of the extension and the path of the module file to produce
    sources = ext.sources
    output_file = self.get_ext_fullpath(ext.name)

    # Pybind11 and Python paths
    pybind_include = r"C:\Users\owner\AppData\Local\Programs\Python\Python312\Lib\site-packages\pybind11\include"
    python_include = r"C:\Users\owner\AppData\Local\Programs\Python\Python312\include"
    python_lib = r"C:\Users\owner\AppData\Local\Programs\Python\Python312\libs"

    # CUDA paths (the double quotes are part of the string value)
    cuda_lib_path = r'"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\lib\x64"'
    cuda_bin_path = r'"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin"'

각 경로의 지정

nvcc 명령 생성

    # Argument list for nvcc; the extension's source files are appended at the end
    nvcc_cmd = [
        "nvcc",
        "-shared",                      # produce a shared library
        "-O2",                          # optimization level
        "-x", "cu",                     # treat the sources as CUDA code
        f'-I"{pybind_include}"',        # Pybind11 header directory
        f'-I"{python_include}"',        # Python header directory
        f'-L"{python_lib}"',            # Python library directory
        f'-L{cuda_lib_path}',           # CUDA library directory
        "--compiler-options", "/MD",    # option forwarded to the MSVC compiler
        "-lcudart",                     # CUDA runtime library
        "-lpython312",                  # Python library
        "-o", f'"{output_file}"'        # output file
    ] + sources

-shared : 동적 라이브러리 생성 (Windows 에서는 .dll 형식이며, Python 확장 모듈은 .pyd 확장자를 사용)

-O2 최적화 레벨 설정

-x cu : CUDA 소스 코드 지정

-I, -L 헤더와 라이브러리 경로 추가

환경 변수 설정 및 NVCC 실행

    # Append the CUDA bin directory to PATH, dropping the embedded double quotes
    os.environ["PATH"] += os.pathsep + cuda_bin_path.strip('"')

    # Join the arguments into one command string and run it through the shell;
    # check_call raises CalledProcessError on a non-zero exit status
    subprocess.check_call(" ".join(nvcc_cmd), shell=True)

PATH 설정과 생성한 NVCC 명령 실행

빌드 명령 실행

python setup.py build_ext --inplace

.pyd 생성의 확인

test 코드 확인

모듈 import 시 DLL 로드 오류가 발생하여, CUDA bin 경로를 DLL 검색 경로에 명시적으로 추가

import os
import sys
import numpy as np

# Register the CUDA DLL directory explicitly (os.add_dll_directory exists on Python 3.8+)
cuda_path = r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin"
if hasattr(os, "add_dll_directory"):
    os.add_dll_directory(cuda_path)
else:
    os.environ["PATH"] = cuda_path + os.pathsep + os.environ["PATH"]

sys.path.append("build/lib.win-amd64-cpython-312")

# Import the CUDA extension module
try:
    import cuda_add
except ImportError as e:
    print("ImportError:", e)
    # Nothing below can run without the module; stop here instead of
    # failing later with a NameError.
    sys.exit(1)
print("cuda_add module imported successfully.")

# Create the test data
size = 1024
a = np.ones(size, dtype=np.float32)
b = np.ones(size, dtype=np.float32)
c = np.zeros(size, dtype=np.float32)

# Call the CUDA function
cuda_add.vector_add(a, b, c)

# Show and verify the result
print("Result:", c[:10])
np.testing.assert_allclose(c, a + b)

# Second check with two different operands: identical inputs cannot reveal
# a mix-up between the first and second argument.
x = np.arange(size, dtype=np.float32)
y = np.full(size, 0.5, dtype=np.float32)
z = np.zeros(size, dtype=np.float32)
cuda_add.vector_add(x, y, z)
np.testing.assert_allclose(z, x + y)


>>>
cuda_add module imported successfully.
Result: [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]

빌드에 있어서 정말 많은 오류가 있었음... 겨우 해결했네...