현재 GEMM 구현 내용
#pragma once#include #include #include "../ge/cuda_check.cuh"// Row-major 매핑 래퍼들// 단일 배치 GEMM (TF32)inline void gemm_rm_tf32( cublasHandle_t h, bool transA, bool transB, int M, int N, int K, const float* A, int lda, const float* B, int ldb, float* C, int ldc, float alpha=1.f, float beta=0.f){ cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t..