Showing posts with label Hip programming. Show all posts
Showing posts with label Hip programming. Show all posts

8/24/2025

CK Tile Tutorial Day 1 (AMD hip programming) - Vector add.

.

Concepts:

  • Basic kernel structure: Args → Kernel → operator()
  • Grid/Block configuration
  • One thread per element processing

Key Code:

struct VectorAddKernel {
    __device__ void operator()(args) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        c[idx] = a[idx] + b[idx];
    }
};


..

code

..

// Step 1: Simplest CK Tile Kernel - Vector Addition
// This demonstrates the absolute basics of CK Tile

#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>

// ============================================
// PART 1: Kernel Arguments (Host → Device)
// ============================================
struct VectorAddKernelArgs {
const float* a_ptr;
const float* b_ptr;
float* c_ptr;
int n;
// Constructor from host arguments
VectorAddKernelArgs(const float* a, const float* b, float* c, int size)
: a_ptr(a), b_ptr(b), c_ptr(c), n(size) {}
};

// ============================================
// PART 2: The Kernel
// ============================================
struct VectorAddKernel {
// Static method to get grid size (how many blocks)
static dim3 GridSize(const VectorAddKernelArgs& args) {
// 256 threads per block, divide work
int blocks = (args.n + 255) / 256;
return dim3(blocks, 1, 1);
}
// Static method to get block size (threads per block)
static dim3 BlockSize() {
return dim3(256, 1, 1);
}
// The actual kernel function - called by each thread
__device__ void operator()(const VectorAddKernelArgs& args) const {
// Calculate global thread index
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Check bounds
if (idx < args.n) {
// Each thread does one element
args.c_ptr[idx] = args.a_ptr[idx] + args.b_ptr[idx];
}
}
};

// ============================================
// PART 3: Host Launch Function
// ============================================
__global__ void vector_add_kernel(VectorAddKernelArgs args) {
VectorAddKernel kernel;
kernel(args);
}

void run_vector_add(int n) {
std::cout << "\n=== Step 1: Vector Addition ===\n";
std::cout << "Adding two vectors of size " << n << "\n";
// Allocate host memory
std::vector<float> h_a(n, 1.0f);
std::vector<float> h_b(n, 2.0f);
std::vector<float> h_c(n, 0.0f);
// Allocate device memory
float *d_a, *d_b, *d_c;
hipMalloc(&d_a, n * sizeof(float));
hipMalloc(&d_b, n * sizeof(float));
hipMalloc(&d_c, n * sizeof(float));
// Copy to device
hipMemcpy(d_a, h_a.data(), n * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(d_b, h_b.data(), n * sizeof(float), hipMemcpyHostToDevice);
// Create kernel arguments
VectorAddKernelArgs args(d_a, d_b, d_c, n);
// Get launch configuration
dim3 grid = VectorAddKernel::GridSize(args);
dim3 block = VectorAddKernel::BlockSize();
std::cout << "Launching with grid(" << grid.x << "), block(" << block.x << ")\n";
// Launch kernel
vector_add_kernel<<<grid, block>>>(args);
// Copy result back
hipMemcpy(h_c.data(), d_c, n * sizeof(float), hipMemcpyDeviceToHost);
// Verify
bool correct = true;
for (int i = 0; i < std::min(10, n); i++) {
if (h_c[i] != 3.0f) {
correct = false;
break;
}
}
std::cout << "Result: " << (correct ? "CORRECT" : "WRONG") << "\n";
std::cout << "First 5 elements: ";
for (int i = 0; i < std::min(5, n); i++) {
std::cout << h_c[i] << " ";
}
std::cout << "\n";
// Cleanup
hipFree(d_a);
hipFree(d_b);
hipFree(d_c);
}

// ============================================
// PART 4: Main
// ============================================
int main() {
std::cout << "MareArts CK Tile Tutorial - Step 1: Vector Addition\n";
std::cout << "==========================================\n";
// Run with different sizes
run_vector_add(1024);
run_vector_add(10000);
std::cout << "\nKey Concepts Demonstrated:\n";
std::cout << "1. Kernel structure: Args → Kernel → operator()\n";
std::cout << "2. Grid/Block configuration\n";
std::cout << "3. Each thread processes one element\n";
std::cout << "4. Bounds checking for safety\n";
return 0;
}

...


Result

CK Tile Tutorial - Step 1: Vector Addition

==========================================


=== Step 1: Vector Addition ===

Adding two vectors of size 1024

Launching with grid(4), block(256)

Result: CORRECT

First 5 elements: 3 3 3 3 3 


=== Step 1: Vector Addition ===

Adding two vectors of size 10000

Launching with grid(40), block(256)

Result: CORRECT

First 5 elements: 3 3 3 3 3 


Key Concepts Demonstrated:

1. Kernel structure: Args → Kernel → operator()

2. Grid/Block configuration

3. Each thread processes one element

4. Bounds checking for safety