From freelancer, 3 Weeks ago, written in C++.
Embed
  1. #include "cuda_helper.cuh"
  2.  
  3. __global__
  4. void VectorAddKernel(float *A, float *B, float *C, int N) {
  5.         int i = blockIdx.x * blockDim.x + threadIdx.x;
  6.         if (i < N) {
  7.                 C[i] = A[i] + B[i];
  8.         }
  9. }
  10.  
  11. /*
  12. void handleCudaError(cudaError error) {
  13.         if (error != cudaSuccess) {
  14.                 fprintf(stderr, "CUDA: %s!\n", cudaGetErrorString(error));
  15.                 exit(EXIT_FAILURE);
  16.         }
  17. }
  18. */
  19.  
  20. void cudaVectorAdd(float *a, float *b, float *c, int numElements, int repetitions, bool warmup) {
  21.         float hostToDevice = 0;
  22.         float compute = 0;
  23.         float deviceToHost = 0;
  24.         float total = 0;
  25.  
  26.         size_t size = numElements * sizeof(float);
  27.         float *d_a, *d_b, *d_c;
  28.  
  29.         clock_t start = clock();
  30.  
  31.         handleCudaError(cudaMalloc(&d_a, size));
  32.         handleCudaError(cudaMalloc(&d_b, size));
  33.         handleCudaError(cudaMalloc(&d_c, size));
  34.         handleCudaError(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
  35.         handleCudaError(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));
  36.  
  37.         hostToDevice = float(clock() - start) / (CLOCKS_PER_SEC * repetitions);
  38.  
  39.         int blockSize = 1024;
  40.         int gridSize = (numElements + blockSize - 1) / blockSize;
  41.  
  42.         for (int i = 0; i < repetitions; i++)
  43.         {
  44.                 // TODO: Implement a parallel vector addition on CUDA
  45.                 VectorAddKernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, numElements);
  46.                 handleCudaError(cudaGetLastError());
  47.         }
  48.         compute = float(clock() - hostToDevice) / (CLOCKS_PER_SEC * repetitions);
  49.  
  50.         handleCudaError(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));
  51.         handleCudaError(cudaFree(d_a));
  52.         handleCudaError(cudaFree(d_b));
  53.         handleCudaError(cudaFree(d_c));
  54.  
  55.         deviceToHost = float(clock() - compute) / (CLOCKS_PER_SEC * repetitions);
  56.         total = float(clock() - start) / (CLOCKS_PER_SEC * repetitions);
  57.  
  58.         if (!warmup)
  59.         {
  60.                 printf("CUDA: %.4lf milliseconds\n", total);
  61.                 printf("CUDA: Copy input to device: %.4lf milliseconds\n", hostToDevice / (1000 * repetitions));
  62.                 printf("CUDA: Compute time: %.4lf milliseconds\n", compute / repetitions);
  63.                 printf("CUDA: Copy output to host: %.4lf milliseconds\n", deviceToHost / repetitions);
  64.         }
  65. }
  66.  
  67. void fillRandomArray(float *a, int numElements) {
  68.         for (int i = 0; i < numElements; i++) {
  69.                 a[i] = rand() / (float)RAND_MAX;
  70.         }
  71. }
  72.  
  73. void verifyResults(float *a, float *b, float *c, int numElements) {
  74.         for (int i = 0; i < numElements; i++) {
  75.         if (fabs(a[i] + b[i] - c[i]) > 1e-5) {
  76.             fprintf(stderr, "Result verification failed at element %d!\n", i);
  77.             exit(EXIT_FAILURE);
  78.         }
  79.     }
  80. }
  81.  
  82. void sequentialVectorAdd(float *a, float *b, float *c, int numElements) {
  83.         clock_t start = clock();
  84.  
  85.         for (int i = 0; i < numElements; i++) {
  86.                 c[i] = a[i] + b[i];
  87.         }
  88.  
  89.         float diff = float(clock() - start) / CLOCKS_PER_SEC;
  90.         printf("Sequential: %.3lf seconds\n", diff);
  91. }
  92.  
  93. int main() {
  94.         int N = 10000000;
  95.         size_t size = N * sizeof(float);
  96.  
  97.         float *h_a = (float *)malloc(size);
  98.         handleAllocationError(h_a);
  99.         fillRandomArray(h_a, N);
  100.        
  101.         float *h_b = (float *)malloc(size);
  102.         handleAllocationError(h_b);
  103.         fillRandomArray(h_b, N);
  104.        
  105.         float *h_c = (float *)malloc(size);
  106.         handleAllocationError(h_c);
  107.  
  108.         cudaVectorAdd(h_a, h_b, h_c, N, 100, true);
  109.         verifyResults(h_a, h_b, h_c, N);
  110.         cudaVectorAdd(h_a, h_b, h_c, N, 1000, false);
  111.  
  112.         sequentialVectorAdd(h_a, h_b, h_c, N);
  113.  
  114.         free(h_a);
  115.         free(h_b);
  116.         free(h_c);
  117.  
  118.         return 0;
  119. }
  120.