Concepts Added:
- 2D grid/block configuration
- Matrix multiplication basics
- Each thread computes one output element
Key Pattern:
// Each thread computes C[row][col]
for (int k = 0; k < K; k++) {
sum += A[row][k] * B[k][col];
}
.=== Thread Mapping Visualization ===
Each thread computes one C[i][j]:
Block(0,0) Block(1,0)
┌─────────┐ ┌─────────┐
│T00 T01..│ │T00 T01..│
│T10 T11..│ │T10 T11..│
│... ... ..│ │... ... ..│
└─────────┘ └─────────┘
↓ ↓
C[0:16,0:16] C[0:16,16:32]
Each thread's work:
for k in 0..K:
sum += A[row][k] * B[k][col]
C[row][col] = sum
=== Step 2: Simple GEMM ===
Matrix multiply: (64x64) * (64x64) = (64x64)
Launching with grid(4,4), block(16,16)
Result: CORRECT
Time: 0.4232 ms
Performance: 1.23887 GFLOPS
=== Step 2: Simple GEMM ===
Matrix multiply: (128x128) * (128x128) = (128x128)
Launching with grid(8,8), block(16,16)
Result: CORRECT
Time: 0.03824 ms
Performance: 109.684 GFLOPS
Key Concepts Added:
1. 2D grid/block configuration
2. Each thread computes one output element
3. Row-major vs column-major layouts
4. Performance measurement (GFLOPS)
..
code
.
// Step 2: Simple GEMM (Matrix Multiplication)
// Building on Step 1, now each thread computes one output element
#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
// ============================================
// PART 1: Kernel Arguments
// ============================================
struct SimpleGemmKernelArgs {
const float* a_ptr; // M x K matrix
const float* b_ptr; // K x N matrix
float* c_ptr; // M x N matrix
int M;
int N;
int K;
SimpleGemmKernelArgs(const float* a, const float* b, float* c,
int m, int n, int k)
: a_ptr(a), b_ptr(b), c_ptr(c), M(m), N(n), K(k) {}
};
// ============================================
// PART 2: The Kernel (One thread per output)
// ============================================
struct SimpleGemmKernel {
static dim3 GridSize(const SimpleGemmKernelArgs& args) {
// 16x16 threads per block
int grid_m = (args.M + 15) / 16;
int grid_n = (args.N + 15) / 16;
return dim3(grid_n, grid_m, 1); // Note: x=N, y=M
}
static dim3 BlockSize() {
return dim3(16, 16, 1); // 16x16 = 256 threads
}
__device__ void operator()(const SimpleGemmKernelArgs& args) const {
// Each thread computes one element of C
int col = blockIdx.x * blockDim.x + threadIdx.x; // N dimension
int row = blockIdx.y * blockDim.y + threadIdx.y; // M dimension
// Bounds check
if (row >= args.M || col >= args.N) return;
// Compute dot product for C[row][col]
float sum = 0.0f;
for (int k = 0; k < args.K; k++) {
// A is row-major: A[row][k] = A[row * K + k]
// B is column-major: B[k][col] = B[k + col * K]
float a_val = args.a_ptr[row * args.K + k];
float b_val = args.b_ptr[k + col * args.K];
sum += a_val * b_val;
}
// Store result (C is row-major)
args.c_ptr[row * args.N + col] = sum;
}
};
// ============================================
// PART 3: Host Code
// ============================================
__global__ void simple_gemm_kernel(SimpleGemmKernelArgs args) {
SimpleGemmKernel kernel;
kernel(args);
}
void run_simple_gemm(int M, int N, int K) {
std::cout << "\n=== Step 2: Simple GEMM ===\n";
std::cout << "Matrix multiply: (" << M << "x" << K << ") * ("
<< K << "x" << N << ") = (" << M << "x" << N << ")\n";
// Allocate host memory
std::vector<float> h_a(M * K);
std::vector<float> h_b(K * N);
std::vector<float> h_c(M * N, 0.0f);
// Initialize with simple values
for (int i = 0; i < M * K; i++) h_a[i] = 1.0f;
for (int i = 0; i < K * N; i++) h_b[i] = 2.0f;
// Allocate device memory
float *d_a, *d_b, *d_c;
hipMalloc(&d_a, M * K * sizeof(float));
hipMalloc(&d_b, K * N * sizeof(float));
hipMalloc(&d_c, M * N * sizeof(float));
// Copy to device
hipMemcpy(d_a, h_a.data(), M * K * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(d_b, h_b.data(), K * N * sizeof(float), hipMemcpyHostToDevice);
// Create kernel arguments
SimpleGemmKernelArgs args(d_a, d_b, d_c, M, N, K);
// Get launch configuration
dim3 grid = SimpleGemmKernel::GridSize(args);
dim3 block = SimpleGemmKernel::BlockSize();
std::cout << "Launching with grid(" << grid.x << "," << grid.y
<< "), block(" << block.x << "," << block.y << ")\n";
// Launch kernel
hipEvent_t start, stop;
hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start);
simple_gemm_kernel<<<grid, block>>>(args);
hipEventRecord(stop);
hipEventSynchronize(stop);
float milliseconds = 0;
hipEventElapsedTime(&milliseconds, start, stop);
// Copy result back
hipMemcpy(h_c.data(), d_c, M * N * sizeof(float), hipMemcpyDeviceToHost);
// Verify (each element should be K * 1.0 * 2.0 = 2K)
float expected = 2.0f * K;
bool correct = true;
for (int i = 0; i < std::min(10, M*N); i++) {
if (h_c[i] != expected) {
correct = false;
break;
}
}
std::cout << "Result: " << (correct ? "CORRECT" : "WRONG") << "\n";
std::cout << "Time: " << milliseconds << " ms\n";
// Calculate FLOPS
double flops = 2.0 * M * N * K; // 2 ops per multiply-add
double gflops = (flops / milliseconds) / 1e6;
std::cout << "Performance: " << gflops << " GFLOPS\n";
// Cleanup
hipFree(d_a);
hipFree(d_b);
hipFree(d_c);
hipEventDestroy(start);
hipEventDestroy(stop);
}
// ============================================
// VISUALIZATION: How threads map to output
// ============================================
void visualize_thread_mapping() {
std::cout << "\n=== Thread Mapping Visualization ===\n";
std::cout << "Each thread computes one C[i][j]:\n\n";
std::cout << " Block(0,0) Block(1,0)\n";
std::cout << " ┌─────────┐ ┌─────────┐\n";
std::cout << " │T00 T01..│ │T00 T01..│\n";
std::cout << " │T10 T11..│ │T10 T11..│\n";
std::cout << " │... ... ..│ │... ... ..│\n";
std::cout << " └─────────┘ └─────────┘\n";
std::cout << " ↓ ↓\n";
std::cout << " C[0:16,0:16] C[0:16,16:32]\n\n";
std::cout << "Each thread's work:\n";
std::cout << " for k in 0..K:\n";
std::cout << " sum += A[row][k] * B[k][col]\n";
std::cout << " C[row][col] = sum\n";
}
// ============================================
// PART 4: Main
// ============================================
int main() {
std::cout << "MareArts CK Tile Tutorial - Step 2: Simple GEMM\n";
std::cout << "======================================\n";
visualize_thread_mapping();
// Run with different sizes
run_simple_gemm(64, 64, 64);
run_simple_gemm(128, 128, 128);
std::cout << "\nKey Concepts Added:\n";
std::cout << "1. 2D grid/block configuration\n";
std::cout << "2. Each thread computes one output element\n";
std::cout << "3. Row-major vs column-major layouts\n";
std::cout << "4. Performance measurement (GFLOPS)\n";
std::cout << "\nProblem: Each thread reads K elements from A and B\n";
std::cout << " → Poor memory reuse!\n";
std::cout << "Next: Add tiling and shared memory for efficiency\n";
return 0;
}
..
ππ»♂️
MareArts
No comments:
Post a Comment