# 04 - Run Your First GPU Benchmark
In this tutorial, we will walk you through setting up a GPU benchmark workflow using CUDA.
## Step 1: Create a GitHub Repository
Start by creating a new GitHub repository. For this guide, we will name the repository `matmul-benchmark-gpu`.

- Go to GitHub and create a repository named `matmul-benchmark-gpu`.
- Clone the repository locally:

```bash
git clone https://github.com/<your-username>/matmul-benchmark-gpu.git
cd matmul-benchmark-gpu
```
## Step 2: Add a CUDA Benchmark
Create a file named `matmul.cu` at the root of your repository with the following matrix multiplication implemented in CUDA:
```cuda
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define TILE_SIZE 16

// Error-checking helper: prints the CUDA error string and exits on failure.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Tiled matrix multiplication: each block computes one TILE_SIZE x TILE_SIZE
// tile of C, staging tiles of A and B through shared memory.
__global__ void matrixMulKernel(float *A, float *B, float *C, int N) {
    __shared__ float sharedA[TILE_SIZE][TILE_SIZE];
    __shared__ float sharedB[TILE_SIZE][TILE_SIZE];

    int tx = threadIdx.x, ty = threadIdx.y;
    int row = blockIdx.y * TILE_SIZE + ty;
    int col = blockIdx.x * TILE_SIZE + tx;
    float sum = 0.0f;

    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
        // Load one tile of A and B into shared memory, zero-padding at the edges.
        if (row < N && t * TILE_SIZE + tx < N)
            sharedA[ty][tx] = A[row * N + t * TILE_SIZE + tx];
        else
            sharedA[ty][tx] = 0.0f;

        if (col < N && t * TILE_SIZE + ty < N)
            sharedB[ty][tx] = B[(t * TILE_SIZE + ty) * N + col];
        else
            sharedB[ty][tx] = 0.0f;

        __syncthreads();

        // Accumulate the partial dot product over the current tile.
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += sharedA[ty][k] * sharedB[k][tx];
        }

        __syncthreads();
    }

    if (row < N && col < N)
        C[row * N + col] = sum;
}

void matrixMultiply(float *h_A, float *h_B, float *h_C, int N) {
    float *d_A, *d_B, *d_C;
    size_t size = N * N * sizeof(float);

    // Allocate device buffers and copy the inputs to the GPU.
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    dim3 blockSize(TILE_SIZE, TILE_SIZE);
    dim3 gridSize((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);

    matrixMulKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);

    cudaDeviceSynchronize();
    gpuAssert(cudaGetLastError(), __FILE__, __LINE__);

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}

int main() {
    int N = 64;  // Matrix size
    size_t size = N * N * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Fill the inputs with small random values.
    for (int i = 0; i < N * N; i++) {
        h_A[i] = rand() % 10;
        h_B[i] = rand() % 10;
    }

    matrixMultiply(h_A, h_B, h_C, N);

    printf("Result matrix:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%0.1f ", h_C[i * N + j]);
        }
        printf("\n");
    }

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}
```
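Since this is a benchmark, it can help to check the runtime locally before relying on CI numbers. You can build and run it with the same command the workflow uses below: `nvcc -O1 -g -o matmul.out matmul.cu`, then `./matmul.out`. The following is a minimal sketch of kernel timing with CUDA events; it is illustrative, not part of the required tutorial code, and assumes you splice it around the kernel launch inside `matrixMultiply`, where `d_A`, `d_B`, `d_C`, `gridSize`, and `blockSize` are already defined:

```cuda
// Sketch: wrap the kernel launch in matrixMultiply() with CUDA events to
// measure kernel runtime locally. cudaEventElapsedTime reports milliseconds.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
matrixMulKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);  // wait until the kernel has finished

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("Kernel time: %.3f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);
```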
## Step 3: Define a GPU Workflow
Create a directory called `.daisy/` at the root of your repository:

```bash
mkdir .daisy
```
Inside `.daisy/`, create a workflow file named `benchmark-gpu.yml` with the following configuration:
```yaml
on:
  push:
    branches:
      - main
  pull_request:
    types: [opened, reopened, synchronize, ready_for_review]

parameters:
  timeout: 10
  partitions:
    - tansy

steps:
  build: |
    nvcc -O1 -g -o matmul.out matmul.cu
  run:
    matmul_cu:
      command: matmul.out
      measurements: 1
      profiler: nsys
```
This workflow:

- Triggers automatically on every push to the main branch and on pull request events.
- Runs your benchmark on the tansy partition of our cluster.
- Compiles the benchmark using `nvcc`.
- Profiles the program using NVIDIA's `nsys` profiler.
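If you want to inspect a profile before the workflow runs, and assuming you have NVIDIA Nsight Systems installed locally, `nsys profile --stats=true ./matmul.out` produces a comparable report on your own machine.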
## Step 4: Commit and Push the Changes
Commit the files and push them to GitHub:
```bash
git add matmul.cu .daisy/benchmark-gpu.yml
git commit -m "Add matmul benchmark and workflow"
git push origin main
```
## Extension: GPU Utilization
To obtain GPU-specific performance metrics, enable the `kernels` flag for your benchmark:

```yaml
run:
  matmul_cu:
    command: matmul.out
    measurements: 1
    profiler: nsys
    kernels: true
```
You can find the measured performance metrics under the Kernels tab.
Can you improve the code's efficiency for the given hardware? Open new pull requests with your changes and monitor the runtime difference reported by the Daisytuner bot. Happy tuning! 🎉
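One possible starting point, offered as a sketch rather than a guaranteed win on the tansy partition's hardware: declare the input matrices read-only with `__restrict__` and unroll the inner product loop. This is a drop-in replacement for the kernel in `matmul.cu`; measure before and after, since the effect depends on the GPU:

```cuda
// Sketch of one tuning direction: __restrict__ promises the compiler that the
// matrices do not alias, and #pragma unroll removes inner-loop overhead.
__global__ void matrixMulKernel(const float * __restrict__ A,
                                const float * __restrict__ B,
                                float * __restrict__ C, int N) {
    __shared__ float sharedA[TILE_SIZE][TILE_SIZE];
    __shared__ float sharedB[TILE_SIZE][TILE_SIZE];

    int tx = threadIdx.x, ty = threadIdx.y;
    int row = blockIdx.y * TILE_SIZE + ty;
    int col = blockIdx.x * TILE_SIZE + tx;
    float sum = 0.0f;

    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
        sharedA[ty][tx] = (row < N && t * TILE_SIZE + tx < N)
                              ? A[row * N + t * TILE_SIZE + tx] : 0.0f;
        sharedB[ty][tx] = (col < N && t * TILE_SIZE + ty < N)
                              ? B[(t * TILE_SIZE + ty) * N + col] : 0.0f;
        __syncthreads();

        #pragma unroll
        for (int k = 0; k < TILE_SIZE; k++)
            sum += sharedA[ty][k] * sharedB[k][tx];

        __syncthreads();
    }

    if (row < N && col < N)
        C[row * N + col] = sum;
}
```

Other directions worth experimenting with: a larger problem size (N = 64 is small enough that launch overhead tends to dominate) and different optimization flags in the workflow's build step.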