.
Concepts:
- Basic kernel structure: Args → Kernel → operator()
- Grid/Block configuration
- One thread per element processing
Key Code:
struct VectorAddKernel {
__device__ void operator()(args) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
c[idx] = a[idx] + b[idx];
}
};
..
code
..
// Step 1: Simplest CK Tile Kernel - Vector Addition
// This demonstrates the absolute basics of CK Tile
#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
// ============================================
// PART 1: Kernel Arguments (Host → Device)
// ============================================
struct VectorAddKernelArgs {
const float* a_ptr;
const float* b_ptr;
float* c_ptr;
int n;
// Constructor from host arguments
VectorAddKernelArgs(const float* a, const float* b, float* c, int size)
: a_ptr(a), b_ptr(b), c_ptr(c), n(size) {}
};
// ============================================
// PART 2: The Kernel
// ============================================
struct VectorAddKernel {
// Static method to get grid size (how many blocks)
static dim3 GridSize(const VectorAddKernelArgs& args) {
// 256 threads per block, divide work
int blocks = (args.n + 255) / 256;
return dim3(blocks, 1, 1);
}
// Static method to get block size (threads per block)
static dim3 BlockSize() {
return dim3(256, 1, 1);
}
// The actual kernel function - called by each thread
__device__ void operator()(const VectorAddKernelArgs& args) const {
// Calculate global thread index
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Check bounds
if (idx < args.n) {
// Each thread does one element
args.c_ptr[idx] = args.a_ptr[idx] + args.b_ptr[idx];
}
}
};
// ============================================
// PART 3: Host Launch Function
// ============================================
__global__ void vector_add_kernel(VectorAddKernelArgs args) {
VectorAddKernel kernel;
kernel(args);
}
void run_vector_add(int n) {
std::cout << "\n=== Step 1: Vector Addition ===\n";
std::cout << "Adding two vectors of size " << n << "\n";
// Allocate host memory
std::vector<float> h_a(n, 1.0f);
std::vector<float> h_b(n, 2.0f);
std::vector<float> h_c(n, 0.0f);
// Allocate device memory
float *d_a, *d_b, *d_c;
hipMalloc(&d_a, n * sizeof(float));
hipMalloc(&d_b, n * sizeof(float));
hipMalloc(&d_c, n * sizeof(float));
// Copy to device
hipMemcpy(d_a, h_a.data(), n * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(d_b, h_b.data(), n * sizeof(float), hipMemcpyHostToDevice);
// Create kernel arguments
VectorAddKernelArgs args(d_a, d_b, d_c, n);
// Get launch configuration
dim3 grid = VectorAddKernel::GridSize(args);
dim3 block = VectorAddKernel::BlockSize();
std::cout << "Launching with grid(" << grid.x << "), block(" << block.x << ")\n";
// Launch kernel
vector_add_kernel<<<grid, block>>>(args);
// Copy result back
hipMemcpy(h_c.data(), d_c, n * sizeof(float), hipMemcpyDeviceToHost);
// Verify
bool correct = true;
for (int i = 0; i < std::min(10, n); i++) {
if (h_c[i] != 3.0f) {
correct = false;
break;
}
}
std::cout << "Result: " << (correct ? "CORRECT" : "WRONG") << "\n";
std::cout << "First 5 elements: ";
for (int i = 0; i < std::min(5, n); i++) {
std::cout << h_c[i] << " ";
}
std::cout << "\n";
// Cleanup
hipFree(d_a);
hipFree(d_b);
hipFree(d_c);
}
// ============================================
// PART 4: Main
// ============================================
int main() {
std::cout << "MareArts CK Tile Tutorial - Step 1: Vector Addition\n";
std::cout << "==========================================\n";
// Run with different sizes
run_vector_add(1024);
run_vector_add(10000);
std::cout << "\nKey Concepts Demonstrated:\n";
std::cout << "1. Kernel structure: Args → Kernel → operator()\n";
std::cout << "2. Grid/Block configuration\n";
std::cout << "3. Each thread processes one element\n";
std::cout << "4. Bounds checking for safety\n";
return 0;
}
...
Result
CK Tile Tutorial - Step 1: Vector Addition
==========================================
=== Step 1: Vector Addition ===
Adding two vectors of size 1024
Launching with grid(4), block(256)
Result: CORRECT
First 5 elements: 3 3 3 3 3
=== Step 1: Vector Addition ===
Adding two vectors of size 10000
Launching with grid(40), block(256)
Result: CORRECT
First 5 elements: 3 3 3 3 3
Key Concepts Demonstrated:
1. Kernel structure: Args → Kernel → operator()
2. Grid/Block configuration
3. Each thread processes one element
4. Bounds checking for safety