epilogue 실행 방식의 결정 : launcher - device - inline

이전에 구현한 방식의 Epilogue<AK, BM, HasC, SaveZ>::apply() 가 한 함수에서 모든 일을 다 해주는 것이 아닌, 단계를 더 잘게 쪼개서 기능 모듈로 합성하는 구조로 바꿔 길이 감소, 확장/재사용/테스트가 쉽도록

Launcher : 런타임 옵션을 컴파일타임 정책으로 내려보내는 곳
global 커널 : 타일 메인루프 + (m, n) 인덱싱까지
device 인라인스테이지들 : alpha scale - beta, C- add - Bias - SaveZ - Act - Store 를 작은 장치 함수로 분리
Epilogue 파이프라인 : 위 스테이지들을 컴파일타임으로 합성해 한 줄로 실행

1. 스테이지 단위 분리

namespace regemm {

// 0) 공통 헬퍼
template<typename T>
__device__ __forceinline__ int eff_ld(int ld, int fallback) { return ld ? ld : fallback; }

// 1) α 스케일
struct StAlpha {
  template<class P>
  __device__ __forceinline__
  static float run(float acc, const P& p) {
    return (p.alpha == 1.f) ? acc : p.alpha * acc;
  }
};

// 2) β*C 더하기 (컴파일타임 토글)
template<bool HasC>
struct StAddC;
template<> struct StAddC<true> {
  template<class P>
  __device__ __forceinline__
  static float run(float pre, int m, int n, const float* __restrict__ C, int ldc, const P& p) {
    return fmaf(p.beta, C[m * ldc + n], pre);
  }
};
template<> struct StAddC<false> {
  template<class P>
  __device__ __forceinline__
  static float run(float pre, int, int, const float*, int, const P&) { return pre; }
};

// 3) Bias (PerN/PerM/Scalar/None)
template<BiasMode BM>
struct StBias;

template<> struct StBias<BiasMode::PerN> {
  template<class P>
  __device__ __forceinline__
  static float run(float pre, int, int, const P&, float bias_j, float) { return pre + bias_j; }
};
template<> struct StBias<BiasMode::PerM> {
  template<class P>
  __device__ __forceinline__
  static float run(float pre, int, int, const P&, float, float bias_m) { return pre + bias_m; }
};
template<> struct StBias<BiasMode::Full> {
  template<class P>
  __device__ __forceinline__
  static float run(float pre, int m, int n, const P& p, float, float) { return pre + load_bias(p, m, n); }
};
template<> struct StBias<BiasMode::None> {
  template<class P>
  __device__ __forceinline__
  static float run(float pre, int, int, const P&, float, float) { return pre; }
};

// 4) Z 저장 (Save pre-activation)
template<bool SaveZ>
struct StSaveZ;
template<> struct StSaveZ<true> {
  template<class P>
  __device__ __forceinline__
  static void store(float pre, int m, int n, float* __restrict__ Z, int ldZ, int ldd, const P&) {
    if (Z) Z[m * eff_ld(ldZ, ldd) + n] = pre;
  }
};
template<> struct StSaveZ<false> {
  template<class P>
  __device__ __forceinline__
  static void store(float, int, int, float*, int, int, const P&) {}
};

// 5) 활성화
template<ActKind AK>
struct StAct {
  template<class P>
  __device__ __forceinline__
  static float run(float pre, const P& p) { return act_apply<AK>(pre, p.leaky_slope); }
};

// 6) 최종 저장(벡터화/정렬 확장은 여기서)
struct StStoreD {
  __device__ __forceinline__
  static void store(float y, int m, int n, float* __restrict__ D, int ldd) {
    D[m * ldd + n] = y;
  }
};

} // namespace regemm

2. 파이프라인 조립

namespace regemm {

template<ActKind AK, BiasMode BM, bool HasC, bool SaveZ>
struct Epilogue {
  template<class P>
  __device__ __forceinline__
  static void apply(float* __restrict__ D, int ldd,
                    const float* __restrict__ C, int ldc,
                    float* __restrict__ Z, int ldZ,
                    const P& p, int m, int n,
                    float acc, float bias_j, float bias_m)
  {
    float pre = StAlpha::run(acc, p);
    pre = StAddC<HasC>::run(pre, m, n, C, ldc, p);
    pre = StBias<BM>::run(pre, m, n, p, bias_j, bias_m);
    StSaveZ<SaveZ>::store(pre, m, n, Z, ldZ, ldd, p);
    float y = StAct<AK>::run(pre, p);
    StStoreD::store(y, m, n, D, ldd);
  }
};

} // namespace regemm

각 기능별 파일 분리 가능 : stages/alpha.h, addc.h, bias.h, savez.h, act.h, store.h
신규 기능은 동일 패턴으로 새 스테이지 추가

3. 호출 계층 정리 ( launcher - global - device stages )

Launcher
- 런타임 파라미터를 컴파일타임 정책으로 내려 선택
얇은 global 커널
device 인라인 스테이지

'dev_AI_framework' 카테고리의 다른 글

정책화된 커널 내부 에필로그 호출 체계 policy-based epilogue invocation (0)	2025.10.19
RDC + Device LTO(-dlto) 란?? (0)	2025.10.19
정책화된 커널 내부 에필로그 호출 방식 (0)	2025.10.19
템플릿화된 최적화!!! (0)	2025.10.16
사용자 정의 조합, 연산들에 대해 Epilogue 확장 가이드 ( graph_capture-safe, epilogue condition 조건 만족) (0)	2025.10.16

뜻 지, 가르칠 훈

epilogue 실행 방식의 결정 : launcher - device - inline - 각 기능

1. 스테이지 단위 분리

2. 파이프라인 조립

3. 호출 계층 정리 ( launcher - global - device stages )

'dev_AI_framework' 카테고리의 다른 글

티스토리툴바

epilogue 실행 방식의 결정 : launcher - device - inline - 각 기능

1. 스테이지 단위 분리

2. 파이프라인 조립

3. 호출 계층 정리 ( launcher - global - device stages )

'dev_AI_framework' 카테고리의 다른 글

'dev_AI_framework' Related Articles

티스토리툴바