
8/25/2025

CK Tile Tutorial Day 2 (AMD HIP Programming) - Simple GEMM

Concepts Added:

  • 2D grid/block configuration
  • Matrix multiplication basics
  • Each thread computes one output element

Key Pattern:

```cpp
// Each thread computes C[row][col]
for (int k = 0; k < K; k++) {
    sum += A[row][k] * B[k][col];
}
```
=== Thread Mapping Visualization ===
Each thread computes one C[i][j]:

  Block(0,0)        Block(1,0)
  ┌─────────┐      ┌─────────┐
  │T00 T01..│      │T00 T01..│
  │T10 T11..│      │T10 T11..│
  │... ... ..│      │... ... ..│
  └─────────┘      └─────────┘
       ↓                ↓
  C[0:16,0:16]    C[0:16,16:32]

Each thread's work:
  for k in 0..K:
    sum += A[row][k] * B[k][col]
  C[row][col] = sum

=== Step 2: Simple GEMM ===
Matrix multiply: (64x64) * (64x64) = (64x64)
Launching with grid(4,4), block(16,16)
Result: CORRECT
Time: 0.4232 ms
Performance: 1.23887 GFLOPS

=== Step 2: Simple GEMM ===
Matrix multiply: (128x128) * (128x128) = (128x128)
Launching with grid(8,8), block(16,16)
Result: CORRECT
Time: 0.03824 ms
Performance: 109.684 GFLOPS
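
Quick sanity check on the 128-cube numbers above, using the FLOP count from the code below:

FLOPs  = 2 * M * N * K = 2 * 128 * 128 * 128 = 4,194,304
Time   = 0.03824 ms = 3.824e-5 s
GFLOPS = 4,194,304 / 3.824e-5 / 1e9 ≈ 109.7 (matches the printed 109.684)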

Key Concepts Added:
1. 2D grid/block configuration
2. Each thread computes one output element
3. Row-major vs column-major layouts
4. Performance measurement (GFLOPS)
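
Concept 3 in a nutshell, before the full listing: the two index formulas used throughout this post (A and C are row-major, B is column-major). A small helper sketch, matching the conventions in the code below:

```cpp
// Index helpers matching the conventions used in the full listing below.
// A and C are row-major, B is column-major.
inline int row_major_idx(int i, int j, int num_cols) { return i * num_cols + j; }
inline int col_major_idx(int i, int j, int num_rows) { return i + j * num_rows; }

// Example: A[row][k] -> a_ptr[row_major_idx(row, k, K)]
//          B[k][col] -> b_ptr[col_major_idx(k, col, K)]
```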

```cpp
// Step 2: Simple GEMM (Matrix Multiplication)
// Building on Step 1, now each thread computes one output element

#include <hip/hip_runtime.h>
#include <algorithm>  // std::min
#include <iostream>
#include <vector>

// ============================================
// PART 1: Kernel Arguments
// ============================================
struct SimpleGemmKernelArgs {
    const float* a_ptr;  // M x K matrix
    const float* b_ptr;  // K x N matrix
    float* c_ptr;        // M x N matrix
    int M;
    int N;
    int K;

    SimpleGemmKernelArgs(const float* a, const float* b, float* c,
                         int m, int n, int k)
        : a_ptr(a), b_ptr(b), c_ptr(c), M(m), N(n), K(k) {}
};

// ============================================
// PART 2: The Kernel (One thread per output)
// ============================================
struct SimpleGemmKernel {
    static dim3 GridSize(const SimpleGemmKernelArgs& args) {
        // 16x16 threads per block
        int grid_m = (args.M + 15) / 16;
        int grid_n = (args.N + 15) / 16;
        return dim3(grid_n, grid_m, 1);  // Note: x=N, y=M
    }

    static dim3 BlockSize() {
        return dim3(16, 16, 1);  // 16x16 = 256 threads
    }

    __device__ void operator()(const SimpleGemmKernelArgs& args) const {
        // Each thread computes one element of C
        int col = blockIdx.x * blockDim.x + threadIdx.x;  // N dimension
        int row = blockIdx.y * blockDim.y + threadIdx.y;  // M dimension

        // Bounds check
        if (row >= args.M || col >= args.N) return;

        // Compute dot product for C[row][col]
        float sum = 0.0f;
        for (int k = 0; k < args.K; k++) {
            // A is row-major:    A[row][k] = A[row * K + k]
            // B is column-major: B[k][col] = B[k + col * K]
            float a_val = args.a_ptr[row * args.K + k];
            float b_val = args.b_ptr[k + col * args.K];
            sum += a_val * b_val;
        }

        // Store result (C is row-major)
        args.c_ptr[row * args.N + col] = sum;
    }
};

// ============================================
// PART 3: Host Code
// ============================================
__global__ void simple_gemm_kernel(SimpleGemmKernelArgs args) {
    SimpleGemmKernel kernel;
    kernel(args);
}

void run_simple_gemm(int M, int N, int K) {
    std::cout << "\n=== Step 2: Simple GEMM ===\n";
    std::cout << "Matrix multiply: (" << M << "x" << K << ") * ("
              << K << "x" << N << ") = (" << M << "x" << N << ")\n";

    // Allocate host memory
    std::vector<float> h_a(M * K);
    std::vector<float> h_b(K * N);
    std::vector<float> h_c(M * N, 0.0f);

    // Initialize with simple values
    for (int i = 0; i < M * K; i++) h_a[i] = 1.0f;
    for (int i = 0; i < K * N; i++) h_b[i] = 2.0f;

    // Allocate device memory
    float *d_a, *d_b, *d_c;
    hipMalloc(&d_a, M * K * sizeof(float));
    hipMalloc(&d_b, K * N * sizeof(float));
    hipMalloc(&d_c, M * N * sizeof(float));

    // Copy to device
    hipMemcpy(d_a, h_a.data(), M * K * sizeof(float), hipMemcpyHostToDevice);
    hipMemcpy(d_b, h_b.data(), K * N * sizeof(float), hipMemcpyHostToDevice);

    // Create kernel arguments
    SimpleGemmKernelArgs args(d_a, d_b, d_c, M, N, K);

    // Get launch configuration
    dim3 grid = SimpleGemmKernel::GridSize(args);
    dim3 block = SimpleGemmKernel::BlockSize();
    std::cout << "Launching with grid(" << grid.x << "," << grid.y
              << "), block(" << block.x << "," << block.y << ")\n";

    // Launch kernel and time it with HIP events
    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);

    hipEventRecord(start);
    simple_gemm_kernel<<<grid, block>>>(args);
    hipEventRecord(stop);
    hipEventSynchronize(stop);

    float milliseconds = 0;
    hipEventElapsedTime(&milliseconds, start, stop);

    // Copy result back
    hipMemcpy(h_c.data(), d_c, M * N * sizeof(float), hipMemcpyDeviceToHost);

    // Verify (each element should be K * 1.0 * 2.0 = 2K)
    float expected = 2.0f * K;
    bool correct = true;
    for (int i = 0; i < std::min(10, M * N); i++) {
        if (h_c[i] != expected) {
            correct = false;
            break;
        }
    }
    std::cout << "Result: " << (correct ? "CORRECT" : "WRONG") << "\n";
    std::cout << "Time: " << milliseconds << " ms\n";

    // Calculate FLOPS
    double flops = 2.0 * M * N * K;  // 2 ops per multiply-add
    double gflops = (flops / milliseconds) / 1e6;
    std::cout << "Performance: " << gflops << " GFLOPS\n";

    // Cleanup
    hipFree(d_a);
    hipFree(d_b);
    hipFree(d_c);
    hipEventDestroy(start);
    hipEventDestroy(stop);
}

// ============================================
// VISUALIZATION: How threads map to output
// ============================================
void visualize_thread_mapping() {
    std::cout << "\n=== Thread Mapping Visualization ===\n";
    std::cout << "Each thread computes one C[i][j]:\n\n";
    std::cout << "  Block(0,0)        Block(1,0)\n";
    std::cout << "  ┌─────────┐      ┌─────────┐\n";
    std::cout << "  │T00 T01..│      │T00 T01..│\n";
    std::cout << "  │T10 T11..│      │T10 T11..│\n";
    std::cout << "  │... ... ..│      │... ... ..│\n";
    std::cout << "  └─────────┘      └─────────┘\n";
    std::cout << "       ↓                ↓\n";
    std::cout << "  C[0:16,0:16]    C[0:16,16:32]\n\n";
    std::cout << "Each thread's work:\n";
    std::cout << "  for k in 0..K:\n";
    std::cout << "    sum += A[row][k] * B[k][col]\n";
    std::cout << "  C[row][col] = sum\n";
}

// ============================================
// PART 4: Main
// ============================================
int main() {
    std::cout << "MareArts CK Tile Tutorial - Step 2: Simple GEMM\n";
    std::cout << "======================================\n";

    visualize_thread_mapping();

    // Run with different sizes
    run_simple_gemm(64, 64, 64);
    run_simple_gemm(128, 128, 128);

    std::cout << "\nKey Concepts Added:\n";
    std::cout << "1. 2D grid/block configuration\n";
    std::cout << "2. Each thread computes one output element\n";
    std::cout << "3. Row-major vs column-major layouts\n";
    std::cout << "4. Performance measurement (GFLOPS)\n";
    std::cout << "\nProblem: Each thread reads K elements from A and B\n";
    std::cout << "         → Poor memory reuse!\n";
    std::cout << "Next: Add tiling and shared memory for efficiency\n";

    return 0;
}
```
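
As a preview of the "tiling and shared memory" step mentioned in the closing message, here is a minimal, hypothetical sketch of what the tiled kernel could look like. This is my sketch, not part of the tutorial code: it assumes 16x16 tiles, the same row-major A / column-major B convention as above, and M, N, K divisible by 16.

```cpp
#include <hip/hip_runtime.h>

#define TILE 16

// Hypothetical tiled GEMM: launch with grid(N/TILE, M/TILE), block(TILE, TILE).
// A and C are row-major, B is column-major, and M, N, K are divisible by TILE.
__global__ void tiled_gemm_kernel(const float* A, const float* B, float* C,
                                  int M, int N, int K) {
    __shared__ float As[TILE][TILE];
    __shared__ float Bs[TILE][TILE];

    int col = blockIdx.x * TILE + threadIdx.x;  // N dimension
    int row = blockIdx.y * TILE + threadIdx.y;  // M dimension

    float sum = 0.0f;
    for (int t = 0; t < K; t += TILE) {
        // Cooperative load: each thread stages one A and one B element into LDS
        As[threadIdx.y][threadIdx.x] = A[row * K + (t + threadIdx.x)];
        Bs[threadIdx.y][threadIdx.x] = B[(t + threadIdx.y) + col * K];
        __syncthreads();

        for (int k = 0; k < TILE; k++)
            sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        __syncthreads();
    }
    C[row * N + col] = sum;
}
```

Each A and B element is now staged through shared memory once per 16-wide tile instead of being re-read from global memory by every thread in the row or column, which is exactly the memory-reuse problem the next step addresses.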

🙇🏻‍♂️
MareArts

9/18/2024

AMD Distributed Training Overview


AMD's approach to distributed training leverages its high-performance CPUs and GPUs, along with software frameworks, to enable efficient scaling of machine learning workloads across multiple nodes. Key aspects include:

1. **Hardware Solutions:**
   - AMD EPYC CPUs: Provide high core counts and memory bandwidth.
   - AMD Instinct GPUs: Accelerators designed for HPC and AI workloads.
   - AMD Infinity Fabric: High-speed interconnect for multi-GPU and multi-node systems.

2. **Software Framework:**
   - ROCm (Radeon Open Compute): Open-source software stack for GPU computing.
   - HIP (Heterogeneous-Compute Interface for Portability): C++ runtime API for GPU programming.
   - AMD's optimized libraries for deep learning frameworks like TensorFlow and PyTorch.

3. **Distributed Training Techniques:**
   - Data Parallelism: Distributing batches of training data across multiple GPUs or nodes.
   - Model Parallelism: Splitting large models across multiple devices.
   - Pipeline Parallelism: Dividing model layers across devices and processing in a pipelined fashion.

4. **Communication Optimization:**
   - RCCL (ROCm Communication Collectives Library): Optimized multi-GPU and multi-node collective communications (see the RCCL sketch after this list).
   - Support for high-speed networking technologies like InfiniBand.

5. **Scalability:**
   - Support for scaling from single-node multi-GPU systems to large clusters.
   - Integration with job schedulers and resource managers for cluster environments.

6. **Ecosystem Integration:**
   - Compatibility with popular ML frameworks and distributed training tools.
   - Support for containers and orchestration platforms like Docker and Kubernetes.

7. **Performance Optimization:**
   - Mixed-precision training support.
   - Memory management techniques for large model training.
   - Automatic performance tuning tools.
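
To make item 4 concrete, here is a minimal, hypothetical single-process RCCL sketch. It assumes RCCL's NCCL-compatible API, the `rccl/rccl.h` header (older ROCm installs may use `rccl.h`), and at least one visible AMD GPU; it sums a buffer across all local GPUs with `ncclAllReduce`, the collective that underlies data-parallel gradient averaging.

```cpp
#include <hip/hip_runtime.h>
#include <rccl/rccl.h>   // NCCL-compatible API; older ROCm installs: <rccl.h>
#include <vector>
#include <iostream>

int main() {
    int nDev = 0;
    hipGetDeviceCount(&nDev);
    if (nDev < 1) { std::cerr << "No HIP devices found\n"; return 1; }

    const size_t count = 1 << 20;  // elements per GPU

    std::vector<int> devs(nDev);
    for (int i = 0; i < nDev; ++i) devs[i] = i;

    std::vector<float*> sendbuf(nDev), recvbuf(nDev);
    std::vector<hipStream_t> streams(nDev);
    for (int i = 0; i < nDev; ++i) {
        hipSetDevice(i);
        hipMalloc(&sendbuf[i], count * sizeof(float));
        hipMalloc(&recvbuf[i], count * sizeof(float));
        hipMemset(sendbuf[i], 0, count * sizeof(float));
        hipStreamCreate(&streams[i]);
    }

    // One communicator per local GPU, all managed by this single process
    std::vector<ncclComm_t> comms(nDev);
    ncclCommInitAll(comms.data(), nDev, devs.data());

    // Sum the buffers across all GPUs (the core of data-parallel gradient averaging)
    ncclGroupStart();
    for (int i = 0; i < nDev; ++i) {
        ncclAllReduce(sendbuf[i], recvbuf[i], count, ncclFloat, ncclSum,
                      comms[i], streams[i]);
    }
    ncclGroupEnd();

    for (int i = 0; i < nDev; ++i) {
        hipSetDevice(i);
        hipStreamSynchronize(streams[i]);
    }

    // Cleanup
    for (int i = 0; i < nDev; ++i) {
        ncclCommDestroy(comms[i]);
        hipSetDevice(i);
        hipFree(sendbuf[i]);
        hipFree(recvbuf[i]);
        hipStreamDestroy(streams[i]);
    }
    std::cout << "AllReduce across " << nDev << " GPU(s) completed\n";
    return 0;
}
```

Compilation is typically something like `hipcc allreduce.cpp -lrccl`, though the exact include and link flags depend on your ROCm install.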

AMD's distributed training solutions aim to provide high performance, scalability, and ease of use for researchers and organizations working on large-scale machine learning projects.

9/17/2024

HIP kernel for matrix multiplication that can leverage Matrix Cores

Here's an example of a custom HIP kernel for matrix multiplication that can leverage Matrix Cores:



```cpp
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <iostream>

// Define matrix dimensions
#define M 16
#define N 16
#define K 16

// HIP kernel for matrix multiplication
__global__ void matrixMulKernel(half* A, half* B, float* C) {
    // Shared memory for tiles of A and B
    __shared__ half As[M][K];
    __shared__ half Bs[K][N];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = K * M * by;
    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + K - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep = M;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = N * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep = K * N;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Load the matrices from device memory to shared memory
        As[ty][tx] = A[a + K * ty + tx];
        Bs[ty][tx] = B[b + N * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices
        #pragma unroll
        for (int k = 0; k < K; ++k) {
            Csub += __half2float(As[ty][k]) * __half2float(Bs[k][tx]);
        }

        // Synchronize to make sure that the preceding computation is done
        // before loading two new sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = N * M * by + M * bx;
    C[c + N * ty + tx] = Csub;
}

// Host function to launch the kernel
void launchMatrixMulKernel(half* A, half* B, float* C, int numBlocks) {
    dim3 threadsPerBlock(M, N);
    dim3 blocksPerGrid(numBlocks, numBlocks);
    hipLaunchKernelGGL(matrixMulKernel, blocksPerGrid, threadsPerBlock, 0, 0, A, B, C);
}

// Error checking macro
#define HIP_CHECK(call)                                                            \
    {                                                                              \
        hipError_t err = call;                                                     \
        if (err != hipSuccess) {                                                   \
            std::cerr << "HIP error: " << hipGetErrorString(err) << std::endl;     \
            exit(1);                                                               \
        }                                                                          \
    }

int main() {
    // Allocate memory
    half *A, *B;
    float *C;
    HIP_CHECK(hipMalloc(&A, M * K * sizeof(half)));
    HIP_CHECK(hipMalloc(&B, K * N * sizeof(half)));
    HIP_CHECK(hipMalloc(&C, M * N * sizeof(float)));

    // Initialize matrices (you would typically do this on the GPU)
    // ...

    // Launch kernel
    launchMatrixMulKernel(A, B, C, 1);  // Assuming 1 block for simplicity

    // Clean up
    HIP_CHECK(hipFree(A));
    HIP_CHECK(hipFree(B));
    HIP_CHECK(hipFree(C));

    return 0;
}

```


Key points about this example:

1. It uses `half` precision for input matrices A and B, which can potentially benefit from Matrix Core acceleration.

2. The kernel is designed for 16x16 matrices, which is a common size for Matrix Core operations.

3. Shared memory is used to improve performance by reducing global memory accesses.

4. The main computation loop uses `__half2float` conversions. On GPUs with native FP16 support, these conversions might be optimized out.

5. The kernel uses a tiled approach, which is generally efficient for matrix multiplication.

6. Error checking is included for HIP calls.

Important considerations:

1. This kernel doesn't guarantee the use of Matrix Cores. The actual use of Matrix Cores depends on the GPU architecture and the HIP compiler's optimizations.

2. For larger matrices, you'd need to implement a more sophisticated tiling strategy.

3. Performance tuning is crucial. You might need to experiment with different block sizes and memory access patterns for optimal performance.

4. The HIP runtime and compiler will attempt to optimize this code for the target GPU, potentially leveraging Matrix Cores if available.

5. For production use, you should implement proper error handling and potentially use more sophisticated synchronization methods.

To fully leverage Matrix Cores, you might need to use specific intrinsics or rely on compiler optimizations. The exact method can vary depending on the GPU architecture and HIP version. Always profile your code to ensure you're getting the expected performance benefits.
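
For reference, here is a minimal, hypothetical sketch of calling an MFMA compiler intrinsic directly. It assumes a CDNA GPU (MI100/MI200/MI300 class) and the `__builtin_amdgcn_mfma_f32_16x16x16f16` builtin; because the per-lane data layout of the fragments is architecture-defined, the fragments are filled with constants here purely so the result is easy to check, independent of layout.

```cpp
#include <hip/hip_runtime.h>
#include <iostream>

// Per-lane fragment types assumed by the MFMA builtin:
// 4 x fp16 inputs and a 4 x fp32 accumulator per lane of a 64-lane wavefront.
typedef _Float16 half4_t  __attribute__((ext_vector_type(4)));
typedef float    float4_t __attribute__((ext_vector_type(4)));

__global__ void mfma_16x16x16_demo(float* out) {
    // Constant fragments: every element of the 16x16 result should be
    // 16 * 1.0 * 2.0 = 32, regardless of the per-lane layout.
    half4_t a, b;
    float4_t acc = {0.f, 0.f, 0.f, 0.f};
    for (int i = 0; i < 4; ++i) {
        a[i] = (_Float16)1.0f;
        b[i] = (_Float16)2.0f;
    }

#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || \
    defined(__gfx941__) || defined(__gfx942__)
    // One wavefront-wide matrix instruction: C (16x16) += A (16x16) * B (16x16)
    acc = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, acc, 0, 0, 0);
#endif
    // On non-CDNA targets the guard above is skipped and the accumulator stays 0.

    // Each of the 64 lanes owns 4 of the 256 outputs.
    for (int i = 0; i < 4; ++i) {
        out[threadIdx.x * 4 + i] = acc[i];
    }
}

int main() {
    float* d_out;
    hipMalloc(&d_out, 256 * sizeof(float));

    // MFMA is a wavefront-wide operation, so launch exactly one 64-lane wavefront.
    hipLaunchKernelGGL(mfma_16x16x16_demo, dim3(1), dim3(64), 0, 0, d_out);
    hipDeviceSynchronize();

    float h_out[256];
    hipMemcpy(h_out, d_out, sizeof(h_out), hipMemcpyDeviceToHost);
    std::cout << "C[0] = " << h_out[0] << " (expect 32 on CDNA hardware)\n";

    hipFree(d_out);
    return 0;
}
```

A production kernel would additionally need the architecture-specific mapping between global memory and each lane's fragments; libraries such as rocWMMA and Composable Kernel wrap exactly that detail.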

8/22/2024

ROCm HIP asynchronous operation sample code

HIP (Heterogeneous-Compute Interface for Portability) provides similar functionality to CUDA streams for asynchronous execution. The concepts and usage are very similar, making it easier to port CUDA code to HIP. Here's an overview of HIP's equivalent features for asynchronous execution:

1. HIP Streams:
In HIP, streams are represented by the `hipStream_t` type, which is analogous to CUDA's `cudaStream_t`.

2. Creating and Destroying Streams:
```cpp
hipStream_t stream;
hipError_t hipStreamCreate(hipStream_t* stream);
hipError_t hipStreamDestroy(hipStream_t stream);
```

3. Asynchronous Memory Operations:
```cpp
hipError_t hipMemcpyAsync(void* dst, const void* src, size_t count, hipMemcpyKind kind, hipStream_t stream);
hipError_t hipMemsetAsync(void* dst, int value, size_t count, hipStream_t stream);
```

4. Launching Kernels on Streams:
```cpp
hipLaunchKernelGGL(kernel, dim3(gridSize), dim3(blockSize), 0, stream, /* kernel arguments */);
```

5. Stream Synchronization:
```cpp
hipError_t hipStreamSynchronize(hipStream_t stream);
hipError_t hipDeviceSynchronize();
```

6. Stream Query:
```cpp
hipError_t hipStreamQuery(hipStream_t stream);
```

7. Stream Callbacks:
```cpp
hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData, unsigned int flags);
```

8. Stream Priorities:
```cpp
hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority);
```

Here's a simple example demonstrating asynchronous execution with HIP streams:

```cpp
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <stdlib.h>  // rand
#include <math.h>    // fabs

#define N 1000000
#define STREAMS 4

__global__ void vectorAdd(float* a, float* b, float* c, int numElements) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < numElements) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    float *h_a, *h_b, *h_c;
    float *d_a, *d_b, *d_c;
    size_t size = N * sizeof(float);

    // Allocate host memory
    h_a = (float*)malloc(size);
    h_b = (float*)malloc(size);
    h_c = (float*)malloc(size);

    // Initialize host arrays
    for (int i = 0; i < N; i++) {
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
    }

    // Allocate device memory
    hipMalloc(&d_a, size);
    hipMalloc(&d_b, size);
    hipMalloc(&d_c, size);

    // Create streams
    hipStream_t streams[STREAMS];
    for (int i = 0; i < STREAMS; i++) {
        hipStreamCreate(&streams[i]);
    }

    // Launch kernel on multiple streams
    int streamSize = N / STREAMS;
    dim3 blockSize(256);
    dim3 gridSize((streamSize + blockSize.x - 1) / blockSize.x);

    for (int i = 0; i < STREAMS; i++) {
        int offset = i * streamSize;
        hipMemcpyAsync(&d_a[offset], &h_a[offset], streamSize * sizeof(float), hipMemcpyHostToDevice, streams[i]);
        hipMemcpyAsync(&d_b[offset], &h_b[offset], streamSize * sizeof(float), hipMemcpyHostToDevice, streams[i]);
        hipLaunchKernelGGL(vectorAdd, gridSize, blockSize, 0, streams[i], &d_a[offset], &d_b[offset], &d_c[offset], streamSize);
        hipMemcpyAsync(&h_c[offset], &d_c[offset], streamSize * sizeof(float), hipMemcpyDeviceToHost, streams[i]);
    }

    // Synchronize all streams
    for (int i = 0; i < STREAMS; i++) {
        hipStreamSynchronize(streams[i]);
    }

    // Verify results
    for (int i = 0; i < N; i++) {
        if (fabs(h_c[i] - (h_a[i] + h_b[i])) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(1);
        }
    }

    printf("Test PASSED\n");

    // Clean up
    for (int i = 0; i < STREAMS; i++) {
        hipStreamDestroy(streams[i]);
    }
    hipFree(d_a);
    hipFree(d_b);
    hipFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}
```

This example demonstrates how to use multiple streams to overlap computation and data transfer, mirroring the equivalent CUDA streams pattern. The key points are:

1. Creating multiple streams
2. Using `hipMemcpyAsync` for asynchronous data transfer
3. Launching kernels on specific streams
4. Synchronizing streams after all operations are queued

By using streams, you can potentially improve performance by overlapping operations and utilizing the GPU more efficiently.
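
One practical caveat: host buffers allocated with plain `malloc` are pageable, and `hipMemcpyAsync` generally cannot fully overlap transfers with kernel execution unless the host memory is pinned. A minimal, self-contained sketch of the pinned-memory variant, using `hipHostMalloc`/`hipHostFree` in place of `malloc`/`free`:

```cpp
#include <hip/hip_runtime.h>
#include <stdio.h>

int main() {
    const size_t size = 1000000 * sizeof(float);

    // Pinned (page-locked) host buffer: hipMemcpyAsync can overlap these
    // transfers with kernels running on other streams.
    float* h_a = nullptr;
    hipHostMalloc((void**)&h_a, size, hipHostMallocDefault);

    float* d_a = nullptr;
    hipMalloc(&d_a, size);

    hipStream_t stream;
    hipStreamCreate(&stream);

    // Asynchronous H2D copy; with pageable malloc memory this copy would
    // typically fall back to a synchronous staging path.
    hipMemcpyAsync(d_a, h_a, size, hipMemcpyHostToDevice, stream);
    hipStreamSynchronize(stream);

    hipStreamDestroy(stream);
    hipFree(d_a);
    hipHostFree(h_a);
    printf("pinned-memory copy done\n");
    return 0;
}
```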

hipcc install on CUDA system, version 2

Please follow this process:

1. First, add the ROCm repository (if you haven't already):
wget -qO - https://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add -
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list

2. Update your package list:
sudo apt update

3. Install only the HIP compiler and development tools:
sudo apt install hip-base hip-doc
This should install the basic HIP tools without the full runtime that caused issues before.

4. After installation, add the HIP binaries to your PATH. Add this line to your ~/.bashrc file:
export PATH=$PATH:/opt/rocm/bin

5. Then, apply the changes:
source ~/.bashrc

6. Verify the installation:
hipcc --version

Install HIP (ROCm) compiler on CUDA system.


Try this process.


1. First, add the ROCm repository to your system. For Ubuntu, you can use these commands:

wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add -
echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ ubuntu main' | sudo tee /etc/apt/sources.list.d/rocm.list


2. Update your package list:

sudo apt update


3. Install the HIP runtime and compiler for CUDA:

sudo apt install hip-runtime-nvidia hip-dev


4. Set up environment variables. Add these lines to your `~/.bashrc` file:

export HIP_PLATFORM=nvidia
export PATH=$PATH:/opt/rocm/bin
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib

Then run `source ~/.bashrc` to apply the changes.


5. Verify the installation:

hipconfig --full


6. Now try compiling your code again:

hipcc vector_add.cpp -o vector_add
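
If you don't have a test file handy, a minimal `vector_add.cpp` along these lines (a sketch, named to match the command above) is enough to confirm the toolchain works:

```cpp
#include <hip/hip_runtime.h>
#include <stdio.h>

__global__ void vector_add(const float* a, const float* b, float* c, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

int main() {
    const int n = 1 << 20;
    const size_t size = n * sizeof(float);

    float *h_a = (float*)malloc(size), *h_b = (float*)malloc(size), *h_c = (float*)malloc(size);
    for (int i = 0; i < n; i++) { h_a[i] = 1.0f; h_b[i] = 2.0f; }

    float *d_a, *d_b, *d_c;
    hipMalloc(&d_a, size);
    hipMalloc(&d_b, size);
    hipMalloc(&d_c, size);
    hipMemcpy(d_a, h_a, size, hipMemcpyHostToDevice);
    hipMemcpy(d_b, h_b, size, hipMemcpyHostToDevice);

    hipLaunchKernelGGL(vector_add, dim3((n + 255) / 256), dim3(256), 0, 0, d_a, d_b, d_c, n);
    hipMemcpy(h_c, d_c, size, hipMemcpyDeviceToHost);

    printf("c[0] = %f (expect 3.0)\n", h_c[0]);

    hipFree(d_a); hipFree(d_b); hipFree(d_c);
    free(h_a); free(h_b); free(h_c);
    return 0;
}
```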