MareArts Computer Vision Study.: Hip programming

Showing posts with label Hip programming. Show all posts

8/24/2025

CK Tile Tutorial Day 1 (AMD HIP programming) - Vector add.

Concepts:

Basic kernel structure: Args → Kernel → operator()
Grid/Block configuration
One thread per element processing

Key Code:

// Simplified excerpt of the kernel functor: `args` stands in for the real
// parameter list, and the bounds check used in the full listing is omitted.
struct VectorAddKernel {
    __device__ void operator()(args) {
        // Flat global thread index for a 1D launch.
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        // One element per thread: c = a + b at that index.
        c[idx] = a[idx] + b[idx];
    }
};

Full code:

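// Step 1: Simplest CK Tile-style Kernel - Vector Addition
// This demonstrates the Args → Kernel → operator() structure using plain HIP
// (only <hip/hip_runtime.h> is required; no CK Tile headers are included).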

#include <hip/hip_runtime.h>
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <vector>

// ============================================
// PART 1: Kernel Arguments (Host → Device)
// ============================================
// Plain bundle of the pointers and element count consumed by the vector-add
// kernel. It holds no ownership: the pointers are stored exactly as given.
struct VectorAddKernelArgs {
    const float* a_ptr;  // first input vector (read-only)
    const float* b_ptr;  // second input vector (read-only)
    float* c_ptr;        // output vector
    int n;               // number of elements to process

    // Capture the three pointers and the element count without modification.
    VectorAddKernelArgs(const float* a, const float* b, float* c, int size)
        : a_ptr{a}, b_ptr{b}, c_ptr{c}, n{size} {}
};

// ============================================
// PART 2: The Kernel
// ============================================
// Functor-style vector-add kernel: each thread adds one pair of elements.
//
// Expected launch layout: a 1D grid of GridSize(args) blocks with BlockSize()
// threads each. No shared memory is used.
struct VectorAddKernel {

    // Threads per block. Single source of truth shared by GridSize() and
    // BlockSize() so the two can never drift apart.
    static constexpr int kThreadsPerBlock = 256;

    // Number of blocks needed to cover args.n elements (ceil division).
    // Returns at least one block: a grid dimension of 0 is not a valid launch
    // configuration, and the bounds check in operator() turns the extra block
    // into a no-op when there is nothing to process.
    static dim3 GridSize(const VectorAddKernelArgs& args) {
        if (args.n <= 0) {
            return dim3(1, 1, 1);
        }
        // Quotient + remainder form avoids the signed overflow that
        // (n + kThreadsPerBlock - 1) would hit for n close to INT_MAX.
        const int full_blocks = args.n / kThreadsPerBlock;
        const int tail_block = (args.n % kThreadsPerBlock != 0) ? 1 : 0;
        return dim3(full_blocks + tail_block, 1, 1);
    }

    // Threads per block used for the launch.
    static dim3 BlockSize() {
        return dim3(kThreadsPerBlock, 1, 1);
    }

    // Per-thread body: computes c[idx] = a[idx] + b[idx] for this thread's
    // global index, skipping threads that fall past the end of the data.
    __device__ void operator()(const VectorAddKernelArgs& args) const {
        // Compute the index in 64 bits: narrowing the unsigned product to int
        // could wrap negative in the last block for very large n and slip
        // past the bounds check below.
        const long long idx =
            static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

        // The grid rarely divides n evenly, so the tail block has idle threads.
        if (idx < args.n) {
            args.c_ptr[idx] = args.a_ptr[idx] + args.b_ptr[idx];
        }
    }
};

// ============================================
// PART 3: Kernel Entry Point and Host Launch Function
// ============================================
// Device entry point: every launched thread invokes the VectorAddKernel
// functor on the argument bundle, which is received by value.
__global__ void vector_add_kernel(VectorAddKernelArgs args) {
    VectorAddKernel{}(args);
}

void run_vector_add(int n) {
    std::cout << "\n=== Step 1: Vector Addition ===\n";
    std::cout << "Adding two vectors of size " << n << "\n";
    
    // Allocate host memory
    std::vector<float> h_a(n, 1.0f);
    std::vector<float> h_b(n, 2.0f);
    std::vector<float> h_c(n, 0.0f);
    
    // Allocate device memory
    float *d_a, *d_b, *d_c;
    hipMalloc(&d_a, n * sizeof(float));
    hipMalloc(&d_b, n * sizeof(float));
    hipMalloc(&d_c, n * sizeof(float));
    
    // Copy to device
    hipMemcpy(d_a, h_a.data(), n * sizeof(float), hipMemcpyHostToDevice);
    hipMemcpy(d_b, h_b.data(), n * sizeof(float), hipMemcpyHostToDevice);
    
    // Create kernel arguments
    VectorAddKernelArgs args(d_a, d_b, d_c, n);
    
    // Get launch configuration
    dim3 grid = VectorAddKernel::GridSize(args);
    dim3 block = VectorAddKernel::BlockSize();
    
    std::cout << "Launching with grid(" << grid.x << "), block(" << block.x << ")\n";
    
    // Launch kernel
    vector_add_kernel<<<grid, block>>>(args);
    
    // Copy result back
    hipMemcpy(h_c.data(), d_c, n * sizeof(float), hipMemcpyDeviceToHost);
    
    // Verify
    bool correct = true;
    for (int i = 0; i < std::min(10, n); i++) {
        if (h_c[i] != 3.0f) {
            correct = false;
            break;
        }
    }
    
    std::cout << "Result: " << (correct ? "CORRECT" : "WRONG") << "\n";
    std::cout << "First 5 elements: ";
    for (int i = 0; i < std::min(5, n); i++) {
        std::cout << h_c[i] << " ";
    }
    std::cout << "\n";
    
    // Cleanup
    hipFree(d_a);
    hipFree(d_b);
    hipFree(d_c);
}

// ============================================
// PART 4: Main
// ============================================
// Program entry: prints the banner, runs the vector-add demo for each
// configured size, then prints the list of concepts covered.
int main() {
    std::cout << "MareArts CK Tile Tutorial - Step 1: Vector Addition\n"
              << "==========================================\n";

    // One size that divides evenly into 256-thread blocks, one that does not.
    const int sizes[] = {1024, 10000};
    for (int count : sizes) {
        run_vector_add(count);
    }

    std::cout << "\nKey Concepts Demonstrated:\n"
              << "1. Kernel structure: Args → Kernel → operator()\n"
              << "2. Grid/Block configuration\n"
              << "3. Each thread processes one element\n"
              << "4. Bounds checking for safety\n";

    return 0;
}

...

Result

MareArts CK Tile Tutorial - Step 1: Vector Addition

==========================================

=== Step 1: Vector Addition ===

Adding two vectors of size 1024

Launching with grid(4), block(256)

Result: CORRECT

First 5 elements: 3 3 3 3 3

=== Step 1: Vector Addition ===

Adding two vectors of size 10000

Launching with grid(40), block(256)

Result: CORRECT

First 5 elements: 3 3 3 3 3

Key Concepts Demonstrated:

1. Kernel structure: Args → Kernel → operator()

2. Grid/Block configuration

3. Each thread processes one element

4. Bounds checking for safety

Pages

8/24/2025

CK Tile Tutorial Day 1 (AMD HIP programming) - Vector add.