Skip to content

Commit

Permalink
Merge pull request karpathy#495 from ChrisDryden/shared_memory
Browse files Browse the repository at this point in the history
Removed unnecessary shared memory due to blockreduce using statically defined shared memory
  • Loading branch information
karpathy authored Jun 1, 2024
2 parents 9cf8c2f + 5450632 commit a26041a
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion dev/cuda/classifier_fused.cu
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ void fused_classifier5(float* dlogits, float* losses,
int B, int T, int V, int P, int block_size) {
const int N = B * T;
const int grid_size = N;
fused_classifier_kernel5<true,false><<<grid_size, block_size, 512>>>((floatX*)dlogits, (floatX*)losses, NULL, (floatX*)logits, (floatX*)dlosses, targets, B, T, V, P);
fused_classifier_kernel5<true,false><<<grid_size, block_size>>>((floatX*)dlogits, (floatX*)losses, NULL, (floatX*)logits, (floatX*)dlosses, targets, B, T, V, P);
cudaCheck(cudaGetLastError());
}

Expand Down
2 changes: 1 addition & 1 deletion train_gpt2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1874,7 +1874,7 @@ void fused_classifier(Type* logits, Type* losses,
const int block_size = 1024;
const int N = B * T;
const int grid_size = N;
fused_classifier_kernel5<<<grid_size, block_size, 512>>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P);
fused_classifier_kernel5<<<grid_size, block_size>>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P);
cudaCheck(cudaGetLastError());
}

Expand Down

0 comments on commit a26041a

Please sign in to comment.