CUDA Atomics

Explanation Video Link on Youtube

B站中文解说视频链接

%%cu
#include <stdio.h>
#include <chrono>
#define N (1000000)
#define M (256)

__global__ void hist_kernel(unsigned int *buffer, unsigned int *hist) {
  int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
  __shared__ unsigned int temp[M];
  temp[threadIdx.x] = 0;
  __syncthreads();
  int total_num_threads = blockDim.x * gridDim.x;

  while (thread_id < N) {
    atomicAdd(&temp[buffer[thread_id]], 1);
    thread_id += total_num_threads;
  }

  __syncthreads();
  atomicAdd(&(hist[threadIdx.x]), temp[threadIdx.x]); 
}

int main(void) {
  std::chrono::steady_clock::time_point cpu_start = std::chrono::steady_clock::now();
  unsigned int buffer[N];
  unsigned int cpu_hist[M] = {0};
  for (int i = 0; i < N; i++) {
      buffer[i] = rand() % M;
      cpu_hist[buffer[i]]++;
  }
  
  std::chrono::steady_clock::time_point cpu_end = std::chrono::steady_clock::now();
  double cpu_elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(cpu_end - cpu_start).count();
  printf("CPU Time taken: %3.1f ms\n", cpu_elapsed_time);

  cudaEvent_t start, stop;
  float elapsedTime;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);

  unsigned int *dev_buffer;
  unsigned int *dev_hist;
  cudaMalloc((void**)&dev_buffer, N * sizeof(int));
  cudaMemcpy(dev_buffer, buffer, N * sizeof(int), cudaMemcpyHostToDevice);
  cudaMalloc((void**)&dev_hist, M * sizeof(int));
  cudaMemset(dev_hist, 0, M * sizeof(int));

  hist_kernel<<<100, M>>>(dev_buffer, dev_hist);

  unsigned int gpu_hist[M];
  cudaMemcpy(gpu_hist, dev_hist, M * sizeof(int), cudaMemcpyDeviceToHost);
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&elapsedTime, start, stop);
  printf("GPU Time taken: %3.1f ms\n", elapsedTime);

  for (int i = 0; i < M; i++) {
      if (gpu_hist[i] != cpu_hist[i]) {
          printf("Mistmatch index: %d CPU: %d GPU: %d \n", i, cpu_hist[i], gpu_hist[i]);
      }
  }

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaFree(dev_hist);
  cudaFree(dev_buffer);
  return 0;
}

Last updated

Was this helpful?