[0] type=5, input=input, output=flatten_1653668585376_out
[1] type=0, input=flatten_1653668585376_out, output=dense_1653689541376_linear
[2] type=1, input=dense_1653689541376_linear, output=dense_1653689541376_out
[ADD] input=dense_1653689541376_linear + param=dense_1653689541376_b -> output=dense_1653689541376_out
[3] type=3, input=dense_1653689541376_out, output=activation_1653690315120_out
[4] type=0, input=activation_1653690315120_out, output=dense_1653690315312_linear
[5] type=1, input=dense_1653690315312_linear, output=dense_1653690315312_out
[ADD] input=dense_1653690315312_linear + param=dense_1653690315312_b -> output=dense_1653690315312_out
[6] type=3, input=dense_1653690315312_out, output=activation_1653689397952_out
[7] type=7, input=activation_1653689397952_out, output=loss
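(For reference, the op-type codes can be read straight off this trace: 0 at the matmul nodes, 1 at the bias adds, 3 at the activations, 5 at the flatten, 7 at the loss. A hypothetical mapping; the enum names below are assumptions, only the numeric codes come from the log:)

```cpp
// Hypothetical op-type mapping inferred from the trace above; the
// framework's real enum names are not shown in the log.
enum OpType {
    OP_MATMUL     = 0,  // dense linear part
    OP_ADD        = 1,  // bias add
    OP_ACTIVATION = 3,  // sigmoid here
    OP_FLATTEN    = 5,  // graph input flatten
    OP_LOSS       = 7   // BCE loss
};
```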
[SHAPE][LOSS] input_id=activation_1653689397952_out, shape=(1,1), sz=1
[SHAPE][LOSS] y_true ptr=0000000706000400, y_pred ptr=0000000706032400
[SHAPE][OP] op_type=7, output_id=loss, input_id=activation_1653689397952_out
input shape=(1,1), size=1
output shape=(1,1), size=1
grad_out ptr=0000000706032A00
[DEBUG] grad_out values (first 1): 1.74268
[SHAPE][OP] op_type=3, output_id=activation_1653689397952_out, input_id=dense_1653690315312_out
input shape=(1,1), size=1
output shape=(1,1), size=1
grad_out ptr=0000000706032A00
[DEBUG] grad_out values (first 1): 1.74268
[DEBUG] grad_input initial values (first 1): 0.00000
[DEBUG][POST] grad_input values (first 1): 0.42617
[SHAPE][OP] op_type=1, output_id=dense_1653690315312_out, input_id=dense_1653690315312_linear
input shape=(1,1), size=1
output shape=(1,1), size=1
grad_out ptr=0000000706032C00
[DEBUG] grad_out values (first 1): 0.42617
[DEBUG] grad_input initial values (first 1): 0.00000
[add_backward_input] d_input[0] = 0.426173 (from d_out[0] = 0.426173)
[add_backward_bias] d_bias[0] = 0.426173
[GRADIENT] dense_1653690315312_b grad: min=0.426173, max=0.426173, mean=0.426173
[0] = 0.426173
[SHAPE][OP] op_type=0, output_id=dense_1653690315312_linear, input_id=activation_1653690315120_out
input shape=(1,4), size=4
output shape=(1,1), size=1
grad_out ptr=0000000706032E00
[DEBUG] grad_out values (first 1): 0.42617
[DEBUG] grad_input initial values (first 4): 0.00000 0.00000 0.00000 0.00000
[transpose] input[0] = -0.156815 -> output[0] = -0.156815
[run_graph_backward] launching matmul_backward_input_simple | M=1 N=1 K=4
[matmul_bw_input_simple] M=1, N=1, K=4
[matmul_bw_input_simple] M=1, N=1, K=4
[matmul_bw_input_simple] M=1, N=1, K=4
[matmul_bw_input_simple] M=1, N=1, K=4
[DEBUG] d_input[0] = -0.066830
[DEBUG] d_input[0] = -0.268121
[DEBUG] d_input[0] = 0.323190
[DEBUG] d_input[0] = -0.242140
[DEBUG] W_T[0~3] = -0.156815 -0.629138 0.758355 -0.568175
[DEBUG] W_T[0~3] = -0.156815 -0.629138 0.758355 -0.568175
[DEBUG] W_T[0~3] = -0.156815 -0.629138 0.758355 -0.568175
[DEBUG] W_T[0~3] = -0.156815 -0.629138 0.758355 -0.568175
[DEBUG] d_out[0~3] = 0.426173 0.000000 0.000000 0.000000
[DEBUG] d_out[0~3] = 0.426173 0.000000 0.000000 0.000000
[DEBUG] d_out[0~3] = 0.426173 0.000000 0.000000 0.000000
[DEBUG] d_out[0~3] = 0.426173 0.000000 0.000000 0.000000
[transpose] input[0] = 0.500154 -> output[0] = 0.500154
[transpose] input[1] = 0.500019 -> output[1] = 0.500019
[transpose] input[2] = 0.499939 -> output[2] = 0.499939
[transpose] input[3] = 0.500049 -> output[3] = 0.500049
[matmul_bw_weight] d_weight[0] = 0.213152, input_T[0] = 0.500154, d_out[0] = 0.426173
[GRADIENT] dense_1653690315312_W grad: min=0.21306, max=0.213152, mean=0.213103
[0] = 0.213152
[1] = 0.213094
[2] = 0.21306
[3] = 0.213107
[SHAPE][OP] op_type=3, output_id=activation_1653690315120_out, input_id=dense_1653689541376_out
input shape=(1,4), size=4
output shape=(1,4), size=4
grad_out ptr=0000000706033200
[DEBUG] grad_out values (first 4): -0.06683 -0.26812 0.32319 -0.24214
[DEBUG] grad_input initial values (first 4): 0.50015 0.50002 0.49994 0.50005
[DEBUG][POST] grad_input values (first 4): -0.01671 -0.06703 0.08080 -0.06054
[SHAPE][OP] op_type=1, output_id=dense_1653689541376_out, input_id=dense_1653689541376_linear
input shape=(1,4), size=4
output shape=(1,4), size=4
grad_out ptr=0000000706033400
[DEBUG] grad_out values (first 4): -0.01671 -0.06703 0.08080 -0.06054
[DEBUG] grad_input initial values (first 4): 0.00000 0.00000 0.00000 0.00000
[add_backward_input] d_input[0] = -0.016708 (from d_out[0] = -0.016708)
[add_backward_bias] d_bias[0] = -0.016708
[GRADIENT] dense_1653689541376_b grad: min=-0.0670303, max=0.0807976, mean=-0.0158688
[0] = -0.0167075
[1] = -0.0670303
[2] = 0.0807976
[3] = -0.0605351
[SHAPE][OP] op_type=0, output_id=dense_1653689541376_linear, input_id=flatten_1653668585376_out
input shape=(1,2), size=2
output shape=(1,4), size=4
grad_out ptr=0000000706033800
[DEBUG] grad_out values (first 4): -0.01671 -0.06703 0.08080 -0.06054
[DEBUG] grad_input initial values (first 2): 0.00000 0.00000
[transpose] input[0] = -0.031772 -> output[0] = -0.031772
[transpose] input[1] = 0.558842 -> output[2] = 0.558842
[transpose] input[2] = -0.246018 -> output[4] = -0.246018
[transpose] input[3] = 0.365366 -> output[6] = 0.365366
[run_graph_backward] launching matmul_backward_input_simple | M=1 N=4 K=2
[matmul_bw_input_simple] M=1, N=4, K=2
[matmul_bw_input_simple] M=1, N=4, K=2
[DEBUG] d_input[0] = -0.078924
[DEBUG] d_input[0] = -0.145432
[DEBUG] W_T[0~3] = -0.031772 -0.496071 0.000000 0.000000
[DEBUG] W_T[0~3] = -0.031772 -0.496071 0.000000 0.000000
[DEBUG] d_out[0~3] = -0.016708 -0.067030 0.080798 -0.060535
[DEBUG] d_out[0~3] = -0.016708 -0.067030 0.080798 -0.060535
[transpose] input[0] = 0.000000 -> output[0] = 0.000000
[transpose] input[1] = 0.000000 -> output[1] = 0.000000
[matmul_bw_weight] d_weight[0] = 0.000000, input_T[0] = 0.000000, d_out[0] = -0.016708
[GRADIENT] dense_1653689541376_W grad: min=0, max=0, mean=0
[0] = 0
[1] = 0
[2] = 0
[3] = 0
[4] = 0
[5] = 0
[6] = 0
[7] = 0
[SHAPE][OP] op_type=5, output_id=flatten_1653668585376_out, input_id=input
input shape=(1,2), size=2
output shape=(1,2), size=2
grad_out ptr=0000000706033C00
[DEBUG] grad_out values (first 2): -0.07892 -0.14543
📊 Final evaluation metric (BCE): 0.255483
🔍 XOR prediction results:
====================================
Input          | Target | Predicted
---------------|--------|----------
[0.0, 0.0] | 0.0 | 0.4260
[0.0, 1.0] | 1.0 | 0.3497
[1.0, 0.0] | 1.0 | 0.3821
[1.0, 1.0] | 0.0 | 0.3164
====================================
1) The Option A chain checks out
- At the LOSS, dL/da = 1.74268 → after the SIGMOID backward, grad_input = 0.42617.
- Taking a (the output) ≈ 0.426 and y = 0:
  dL/da = (a - y)/(a(1-a)) ≈ 0.426/(0.426·0.574) ≈ 1.742 ✓
  and dL/dz = dL/da · a(1-a) ≈ 1.742 · 0.2445 ≈ 0.426 ✓
So the BCE (dL/da) → sigmoid' (a(1-a)) → dL/dz chain lines up exactly.
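A minimal numeric check of this chain, plugging in a = 0.42617 and y = 0 from the log (a standalone sketch, not the framework's code):

```cpp
#include <cstdio>

int main() {
    double a = 0.42617;  // sigmoid output (y_pred) from the log
    double y = 0.0;      // y_true for this sample

    // BCE backward: dL/da = (a - y) / (a * (1 - a))
    double dL_da = (a - y) / (a * (1.0 - a));

    // Sigmoid backward: dL/dz = dL/da * a * (1 - a), which collapses to a - y
    double dL_dz = dL_da * a * (1.0 - a);

    printf("dL/da = %.5f (log: 1.74268)\n", dL_da);
    printf("dL/dz = %.5f (log: 0.42617)\n", dL_dz);
    return 0;
}
```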
2) Per-layer propagation also looks correct
- In activation → add → matmul order,
  grad_out (= dL/doutput) comes in and is correctly turned into **grad_input (= dL/dinput)**,
- and wherever there are parameters, dW = X^T·dY and **db = reduce(dY)** show up as expected (sketched below).
- The last Dense layer's W gradients all landing around 0.213 is also natural,
  since the inputs are nearly uniform (≈ 0.5) and dY is a scalar.
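A small sketch of those two rules with the logged numbers from the last Dense layer (X ≈ 0.5 elementwise, dY = 0.426173); the loop layout here is an assumption, not the framework's actual kernel:

```cpp
#include <cstdio>

int main() {
    const int K = 4;  // input features of the last Dense layer
    const int N = 1;  // output features
    double X[K]  = {0.500154, 0.500019, 0.499939, 0.500049};  // from the log
    double dY[N] = {0.426173};                                // grad_out

    // dW = X^T * dY: outer product of the layer input and the output gradient
    for (int k = 0; k < K; ++k)
        for (int n = 0; n < N; ++n)
            printf("dW[%d][%d] = %.6f\n", k, n, X[k] * dY[n]);  // ~0.2131 each

    // db = reduce(dY): with batch size 1 this is just dY itself
    printf("db[0] = %.6f\n", dY[0]);
    return 0;
}
```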
3) Why the first Dense layer's W gradient is zero
- The log shows input_T as all zeros. If the current sample is **[0, 0]**, the first XOR case,
- then in dW = X^T·dY, X = 0 → **dW = 0** is exactly right. Not a bug.
- Once the batch also contains other samples (e.g. [0,1], [1,0], [1,1]), dW becomes nonzero in proportion.
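The same rule makes the zero gradient easy to reproduce: only the [0,0] sample zeroes out dW, and every other XOR sample contributes. The dY below is a made-up placeholder, not a logged value:

```cpp
#include <cstdio>

int main() {
    double samples[4][2] = {{0,0}, {0,1}, {1,0}, {1,1}};  // the XOR inputs
    double dY = 0.1;  // hypothetical upstream gradient, single output unit

    for (int s = 0; s < 4; ++s) {
        // dW = X^T * dY per sample: a zero input row gives a zero gradient row
        printf("sample [%g, %g] -> dW = (%.2f, %.2f)\n",
               samples[s][0], samples[s][1],
               samples[s][0] * dY, samples[s][1] * dY);
    }
    return 0;
}
```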
4) Scale/reduction check
- Right now bce_loss_backward applies mean reduction by dividing by size (= rows*cols).
  If you want a per-batch mean in real training, dividing by batch_size (or the number of valid samples) instead of size is more consistent. (With the current 1×1 output the two are identical.)
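A hedged sketch of that reduction choice; the real bce_loss_backward signature isn't visible in the log, so the interface below is invented purely to show dividing by batch_size (rows) instead of size (rows*cols):

```cpp
#include <cstdio>

// Illustrative only: the actual bce_loss_backward interface is not
// shown in the log above.
void bce_loss_backward_sketch(const float* y_true, const float* y_pred,
                              float* d_pred, int rows, int cols) {
    const float eps   = 1e-7f;        // guard against a*(1-a) == 0
    const float scale = 1.0f / rows;  // batch mean: divide by batch_size,
                                      // not by size = rows * cols
    for (int i = 0; i < rows * cols; ++i) {
        float a = y_pred[i], y = y_true[i];
        d_pred[i] = scale * (a - y) / (a * (1.0f - a) + eps);
    }
}

int main() {
    float y_true = 0.0f, y_pred = 0.42617f, d_pred = 0.0f;
    bce_loss_backward_sketch(&y_true, &y_pred, &d_pred, 1, 1);  // 1x1: identical either way
    printf("d_pred = %.5f (log: 1.74268)\n", d_pred);
    return 0;
}
```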
5) Verdict from these logs
- Values, signs, and scales are all consistent: the Option A implementation works as intended.
- Propagation (grad_input) is also correct: gradients reach the previous layer as expected.
- The zero-gradient case comes from the all-zero input sample: expected behavior.