MareArts Computer Vision Study.: GPU

Showing posts with label GPU. Show all posts

8/25/2025

CK Tile Tutorial Day 2 (AMD hip programming) - Simple GEMM.

Concepts Added:

2D grid/block configuration
Matrix multiplication basics
Each thread computes one output element

Key Pattern:

// Each thread computes C[row][col]
// Sketch of the per-thread inner loop only: in the full kernel listed below,
// `sum` starts at 0.0f and is stored to C[row][col] once the loop finishes.
for (int k = 0; k < K; k++) {
    // Dot product of row `row` of A with column `col` of B.
    sum += A[row][k] * B[k][col];
}

=== Thread Mapping Visualization ===

Each thread computes one C[i][j]:

Block(0,0) Block(1,0)

┌─────────┐ ┌─────────┐

│T00 T01..│ │T00 T01..│

│T10 T11..│ │T10 T11..│

│... ... ..│ │... ... ..│

└─────────┘ └─────────┘

↓ ↓

C[0:16,0:16] C[0:16,16:32]

Each thread's work:

for k in 0..K:

sum += A[row][k] * B[k][col]

C[row][col] = sum

=== Step 2: Simple GEMM ===

Matrix multiply: (64x64) * (64x64) = (64x64)

Launching with grid(4,4), block(16,16)

Result: CORRECT

Time: 0.4232 ms

Performance: 1.23887 GFLOPS

=== Step 2: Simple GEMM ===

Matrix multiply: (128x128) * (128x128) = (128x128)

Launching with grid(8,8), block(16,16)

Result: CORRECT

Time: 0.03824 ms

Performance: 109.684 GFLOPS

Key Concepts Added:

1. 2D grid/block configuration

2. Each thread computes one output element

3. Row-major vs column-major layouts

4. Performance measurement (GFLOPS)

code

// Step 2: Simple GEMM (Matrix Multiplication)
// Building on Step 1, now each thread computes one output element

#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>

// ============================================
// PART 1: Kernel Arguments
// ============================================
// Plain bundle of everything the GEMM kernel needs; it is passed to the
// __global__ entry point by value.
struct SimpleGemmKernelArgs {
    // Operand and result buffers.
    const float* a_ptr;  // input A, M x K
    const float* b_ptr;  // input B, K x N
    float* c_ptr;        // output C, M x N

    // Problem dimensions.
    int M;  // rows of A and C
    int N;  // columns of B and C
    int K;  // shared (reduction) dimension

    // Stores the three buffer pointers and the three extents unchanged.
    SimpleGemmKernelArgs(const float* a_in, const float* b_in, float* c_out,
                         int rows, int cols, int depth)
        : a_ptr{a_in},
          b_ptr{b_in},
          c_ptr{c_out},
          M{rows},
          N{cols},
          K{depth} {}
};

// ============================================
// PART 2: The Kernel (One thread per output)
// ============================================
// Naive GEMM functor: one thread produces one element of C.
//
// Launch contract: a 2D grid from GridSize() with 2D blocks from BlockSize().
// The x axis walks the columns of C (N) and the y axis walks the rows (M).
struct SimpleGemmKernel {

    // Edge length of the square thread block. GridSize() and BlockSize()
    // must agree on this value, so it is defined exactly once.
    static constexpr int kTile = 16;

    // Number of blocks needed to cover the M x N output (ceil-division per axis).
    static dim3 GridSize(const SimpleGemmKernelArgs& args) {
        int grid_m = (args.M + kTile - 1) / kTile;
        int grid_n = (args.N + kTile - 1) / kTile;
        return dim3(grid_n, grid_m, 1);  // Note: x=N, y=M
    }

    // Threads per block: kTile x kTile (16x16 = 256 threads).
    static dim3 BlockSize() {
        return dim3(kTile, kTile, 1);
    }

    // Computes C[row][col] = sum over k of A[row][k] * B[k][col] for the
    // (row, col) pair owned by the calling thread.
    __device__ void operator()(const SimpleGemmKernelArgs& args) const {
        // Each thread computes one element of C
        int col = blockIdx.x * blockDim.x + threadIdx.x;  // N dimension
        int row = blockIdx.y * blockDim.y + threadIdx.y;  // M dimension

        // Bounds check: the grid is rounded up, so edge blocks carry threads
        // that fall outside the matrix.
        if (row >= args.M || col >= args.N) return;

        // Offsets are formed in 64 bits: row * K, col * K and row * N exceed
        // the range of int once the matrices get large.
        // A is row-major:    A[row][k] = A[row * K + k]
        // B is column-major: B[k][col] = B[k + col * K]
        const float* a_row = args.a_ptr + static_cast<long long>(row) * args.K;
        const float* b_col = args.b_ptr + static_cast<long long>(col) * args.K;

        // Compute dot product for C[row][col]
        float sum = 0.0f;
        for (int k = 0; k < args.K; k++) {
            sum += a_row[k] * b_col[k];
        }

        // Store result (C is row-major)
        args.c_ptr[static_cast<long long>(row) * args.N + col] = sum;
    }
};

// ============================================
// PART 3: Host Code
// ============================================
// Device entry point: forwards the by-value argument bundle to the
// SimpleGemmKernel functor, which does the per-thread work.
__global__ void simple_gemm_kernel(SimpleGemmKernelArgs args) {
    SimpleGemmKernel{}(args);
}

void run_simple_gemm(int M, int N, int K) {
    std::cout << "\n=== Step 2: Simple GEMM ===\n";
    std::cout << "Matrix multiply: (" << M << "x" << K << ") * (" 
              << K << "x" << N << ") = (" << M << "x" << N << ")\n";
    
    // Allocate host memory
    std::vector<float> h_a(M * K);
    std::vector<float> h_b(K * N);
    std::vector<float> h_c(M * N, 0.0f);
    
    // Initialize with simple values
    for (int i = 0; i < M * K; i++) h_a[i] = 1.0f;
    for (int i = 0; i < K * N; i++) h_b[i] = 2.0f;
    
    // Allocate device memory
    float *d_a, *d_b, *d_c;
    hipMalloc(&d_a, M * K * sizeof(float));
    hipMalloc(&d_b, K * N * sizeof(float));
    hipMalloc(&d_c, M * N * sizeof(float));
    
    // Copy to device
    hipMemcpy(d_a, h_a.data(), M * K * sizeof(float), hipMemcpyHostToDevice);
    hipMemcpy(d_b, h_b.data(), K * N * sizeof(float), hipMemcpyHostToDevice);
    
    // Create kernel arguments
    SimpleGemmKernelArgs args(d_a, d_b, d_c, M, N, K);
    
    // Get launch configuration
    dim3 grid = SimpleGemmKernel::GridSize(args);
    dim3 block = SimpleGemmKernel::BlockSize();
    
    std::cout << "Launching with grid(" << grid.x << "," << grid.y 
              << "), block(" << block.x << "," << block.y << ")\n";
    
    // Launch kernel
    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);
    
    hipEventRecord(start);
    simple_gemm_kernel<<<grid, block>>>(args);
    hipEventRecord(stop);
    hipEventSynchronize(stop);
    
    float milliseconds = 0;
    hipEventElapsedTime(&milliseconds, start, stop);
    
    // Copy result back
    hipMemcpy(h_c.data(), d_c, M * N * sizeof(float), hipMemcpyDeviceToHost);
    
    // Verify (each element should be K * 1.0 * 2.0 = 2K)
    float expected = 2.0f * K;
    bool correct = true;
    for (int i = 0; i < std::min(10, M*N); i++) {
        if (h_c[i] != expected) {
            correct = false;
            break;
        }
    }
    
    std::cout << "Result: " << (correct ? "CORRECT" : "WRONG") << "\n";
    std::cout << "Time: " << milliseconds << " ms\n";
    
    // Calculate FLOPS
    double flops = 2.0 * M * N * K;  // 2 ops per multiply-add
    double gflops = (flops / milliseconds) / 1e6;
    std::cout << "Performance: " << gflops << " GFLOPS\n";
    
    // Cleanup
    hipFree(d_a);
    hipFree(d_b);
    hipFree(d_c);
    hipEventDestroy(start);
    hipEventDestroy(stop);
}

// ============================================
// VISUALIZATION: How threads map to output
// ============================================
// Prints an ASCII sketch of how thread blocks tile the output matrix,
// followed by the per-thread pseudo-code. Output only; no GPU work.
void visualize_thread_mapping() {
    // One entry per printed chunk, emitted in order.
    static const char* const kLines[] = {
        "\n=== Thread Mapping Visualization ===\n",
        "Each thread computes one C[i][j]:\n\n",
        "  Block(0,0)        Block(1,0)\n",
        "  ┌─────────┐      ┌─────────┐\n",
        "  │T00 T01..│      │T00 T01..│\n",
        "  │T10 T11..│      │T10 T11..│\n",
        "  │... ... ..│      │... ... ..│\n",
        "  └─────────┘      └─────────┘\n",
        "       ↓                ↓\n",
        "  C[0:16,0:16]    C[0:16,16:32]\n\n",
        "Each thread's work:\n",
        "  for k in 0..K:\n",
        "    sum += A[row][k] * B[k][col]\n",
        "  C[row][col] = sum\n",
    };

    for (const char* line : kLines) {
        std::cout << line;
    }
}

// ============================================
// PART 4: Main
// ============================================
// Entry point for the Step 2 demo: prints the banner and the thread-mapping
// sketch, runs two square GEMMs, then prints the summary text.
int main() {
    std::cout << "MareArts CK Tile Tutorial - Step 2: Simple GEMM\n"
              << "======================================\n";

    visualize_thread_mapping();

    // Square problems, smallest first.
    const int edge_lengths[] = {64, 128};
    for (int edge : edge_lengths) {
        run_simple_gemm(edge, edge, edge);
    }

    std::cout << "\nKey Concepts Added:\n"
              << "1. 2D grid/block configuration\n"
              << "2. Each thread computes one output element\n"
              << "3. Row-major vs column-major layouts\n"
              << "4. Performance measurement (GFLOPS)\n"
              << "\nProblem: Each thread reads K elements from A and B\n"
              << "         → Poor memory reuse!\n"
              << "Next: Add tiling and shared memory for efficiency\n";

    return 0;
}

🙇🏻‍♂️

MareArts

8/24/2025

CK Tile Tutorial Day 1 (AMD hip programming) - Vector add.

Concepts:

Basic kernel structure: Args → Kernel → operator()
Grid/Block configuration
One thread per element processing

Key Code:

// Condensed sketch of the kernel functor, not compilable as written: `args`
// has no type here, and a, b, c stand for the pointers carried in the
// argument struct. The full listing below also guards the access with
// `idx < n`.
struct VectorAddKernel {
    __device__ void operator()(args) {
        // Global index of this thread across the 1D grid.
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        // One element per thread.
        c[idx] = a[idx] + b[idx];
    }
};

code

// Step 1: Simplest CK Tile Kernel - Vector Addition
// This demonstrates the absolute basics of CK Tile

#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>

// ============================================
// PART 1: Kernel Arguments (Host → Device)
// ============================================
// Argument bundle handed to the vector-add kernel by value.
struct VectorAddKernelArgs {
    const float* a_ptr;  // first addend, read by the kernel
    const float* b_ptr;  // second addend, read by the kernel
    float* c_ptr;        // destination, written by the kernel
    int n;               // number of elements to process

    // Stores the three buffer pointers and the element count unchanged.
    VectorAddKernelArgs(const float* lhs, const float* rhs, float* out, int count)
        : a_ptr{lhs},
          b_ptr{rhs},
          c_ptr{out},
          n{count} {}
};

// ============================================
// PART 2: The Kernel
// ============================================
// Vector-add functor: one thread handles one element.
// Launch contract: 1D grid from GridSize() with 1D blocks from BlockSize().
struct VectorAddKernel {

    // Blocks required to cover args.n elements at 256 threads per block
    // (rounded up).
    static dim3 GridSize(const VectorAddKernelArgs& args) {
        const int block_count = (args.n + 255) / 256;
        return dim3(block_count, 1, 1);
    }

    // Threads per block.
    static dim3 BlockSize() {
        return dim3(256, 1, 1);
    }

    // Per-thread body: c[i] = a[i] + b[i] for this thread's global index i.
    __device__ void operator()(const VectorAddKernelArgs& args) const {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;

        // The grid is rounded up, so threads past the end have nothing to do.
        if (i >= args.n) {
            return;
        }

        const float lhs = args.a_ptr[i];
        const float rhs = args.b_ptr[i];
        args.c_ptr[i] = lhs + rhs;
    }
};

// ============================================
// PART 3: Host Launch Function
// ============================================
// Device entry point: forwards the by-value argument bundle to the
// VectorAddKernel functor, which does the per-thread work.
__global__ void vector_add_kernel(VectorAddKernelArgs args) {
    VectorAddKernel{}(args);
}

void run_vector_add(int n) {
    std::cout << "\n=== Step 1: Vector Addition ===\n";
    std::cout << "Adding two vectors of size " << n << "\n";
    
    // Allocate host memory
    std::vector<float> h_a(n, 1.0f);
    std::vector<float> h_b(n, 2.0f);
    std::vector<float> h_c(n, 0.0f);
    
    // Allocate device memory
    float *d_a, *d_b, *d_c;
    hipMalloc(&d_a, n * sizeof(float));
    hipMalloc(&d_b, n * sizeof(float));
    hipMalloc(&d_c, n * sizeof(float));
    
    // Copy to device
    hipMemcpy(d_a, h_a.data(), n * sizeof(float), hipMemcpyHostToDevice);
    hipMemcpy(d_b, h_b.data(), n * sizeof(float), hipMemcpyHostToDevice);
    
    // Create kernel arguments
    VectorAddKernelArgs args(d_a, d_b, d_c, n);
    
    // Get launch configuration
    dim3 grid = VectorAddKernel::GridSize(args);
    dim3 block = VectorAddKernel::BlockSize();
    
    std::cout << "Launching with grid(" << grid.x << "), block(" << block.x << ")\n";
    
    // Launch kernel
    vector_add_kernel<<<grid, block>>>(args);
    
    // Copy result back
    hipMemcpy(h_c.data(), d_c, n * sizeof(float), hipMemcpyDeviceToHost);
    
    // Verify
    bool correct = true;
    for (int i = 0; i < std::min(10, n); i++) {
        if (h_c[i] != 3.0f) {
            correct = false;
            break;
        }
    }
    
    std::cout << "Result: " << (correct ? "CORRECT" : "WRONG") << "\n";
    std::cout << "First 5 elements: ";
    for (int i = 0; i < std::min(5, n); i++) {
        std::cout << h_c[i] << " ";
    }
    std::cout << "\n";
    
    // Cleanup
    hipFree(d_a);
    hipFree(d_b);
    hipFree(d_c);
}

// ============================================
// PART 4: Main
// ============================================
// Entry point for the Step 1 demo: prints the banner, runs the vector add at
// two sizes, then prints the summary text.
int main() {
    std::cout << "MareArts CK Tile Tutorial - Step 1: Vector Addition\n"
              << "==========================================\n";

    // One size that divides evenly into 256-thread blocks and one that does not.
    const int sizes[] = {1024, 10000};
    for (int size : sizes) {
        run_vector_add(size);
    }

    std::cout << "\nKey Concepts Demonstrated:\n"
              << "1. Kernel structure: Args → Kernel → operator()\n"
              << "2. Grid/Block configuration\n"
              << "3. Each thread processes one element\n"
              << "4. Bounds checking for safety\n";

    return 0;
}

...

Result

CK Tile Tutorial - Step 1: Vector Addition

==========================================

=== Step 1: Vector Addition ===

Adding two vectors of size 1024

Launching with grid(4), block(256)

Result: CORRECT

First 5 elements: 3 3 3 3 3

=== Step 1: Vector Addition ===

Adding two vectors of size 10000

Launching with grid(40), block(256)

Result: CORRECT

First 5 elements: 3 3 3 3 3

Key Concepts Demonstrated:

1. Kernel structure: Args → Kernel → operator()

2. Grid/Block configuration

3. Each thread processes one element

4. Bounds checking for safety

3/07/2025

9/17/2024

IREE-Turbine is a package or toolset that combines PyTorch, Torch-MLIR, IREE, and additional tools to provide a comprehensive solution for compiling, optimizing, and executing PyTorch models using IREE's infrastructure. Based on the information in the image, IREE-Turbine offers the following key features:

1. AOT Export: This allows for Ahead-Of-Time compilation of PyTorch modules (nn.Modules) into deployment-ready artifacts. These compiled artifacts can then take full advantage of IREE's runtime features.

2. Eager Execution: It provides a torch.compile backend and a Turbine Tensor/Device for interactive PyTorch sessions. This enables users to work with PyTorch in a familiar environment while leveraging IREE's optimization capabilities.

3. Custom Ops: IREE-Turbine offers integration for defining custom PyTorch operations and implementing them using either IREE's backend IR or the Pythonic kernel language. This allows for extending PyTorch's functionality while maintaining compatibility with IREE's optimization pipeline.

In essence, IREE-Turbine acts as a bridge between PyTorch and IREE, allowing PyTorch users to benefit from IREE's advanced compilation and runtime features while maintaining a familiar PyTorch-based workflow. It aims to provide a seamless experience for compiling PyTorch models to run efficiently on various hardware targets supported by IREE.

HIP kernel for matrix multiplication that can leverage Matrix Cores

Here's an example of a custom HIP kernel for matrix multiplication that can leverage Matrix Cores:

```cpp
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <iostream>

// Define matrix dimensions
#define M 16
#define N 16
#define K 16

// Tiled half-precision matrix multiply with float accumulation.
//
// Each thread produces one element of C. The block stages one tile of A and
// one tile of B in shared memory, then every thread reduces over the tile.
//
// Preconditions:
//  - The block must be launched with threadIdx.x and threadIdx.y both in
//    [0, 16): the same indices address As[M][K] and Bs[K][N].
//  - The macros M, N, K serve as both the tile shape and the row strides of
//    A, B and C. With 16 x 16 buffers, only block (0, 0) stays inside them.
__global__ void matrixMulKernel(half* A, half* B, float* C) {
    // Block-local staging tiles for A and B.
    __shared__ half As[M][K];
    __shared__ half Bs[K][N];

    const int blockCol = blockIdx.x;
    const int blockRow = blockIdx.y;
    const int localCol = threadIdx.x;
    const int localRow = threadIdx.y;

    // Offsets of the first and last A tile handled by this block.
    const int aFirst = K * M * blockRow;
    const int aLast = aFirst + K - 1;
    // Offset of the first B tile handled by this block.
    const int bFirst = N * blockCol;

    // Running dot product for this thread's output element.
    float acc = 0;

    int aTile = aFirst;
    int bTile = bFirst;
    while (aTile <= aLast) {
        // Stage one element of each tile per thread.
        As[localRow][localCol] = A[aTile + K * localRow + localCol];
        Bs[localRow][localCol] = B[bTile + N * localRow + localCol];

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        #pragma unroll
        for (int k = 0; k < K; ++k) {
            acc += __half2float(As[localRow][k]) * __half2float(Bs[k][localCol]);
        }

        // Nobody may overwrite the tiles while another thread still reads them.
        __syncthreads();

        // Advance to the next pair of tiles.
        aTile += M;
        bTile += K * N;
    }

    // Each thread stores its single result element.
    const int cFirst = N * M * blockRow + M * blockCol;
    C[cFirst + N * localRow + localCol] = acc;
}

// Launches matrixMulKernel on a numBlocks x numBlocks grid of M x N thread
// blocks, with no dynamic shared memory, on the default stream. The launch
// is asynchronous and its status is not checked here.
void launchMatrixMulKernel(half* A, half* B, float* C, int numBlocks) {
    const dim3 block(M, N);
    const dim3 grid(numBlocks, numBlocks);
    matrixMulKernel<<<grid, block, 0, 0>>>(A, B, C);
}

// Error checking macro: evaluates `call` once; on any status other than
// hipSuccess it prints the HIP error string to std::cerr and exits with
// status 1.
// Wrapped in do { ... } while (0) so `HIP_CHECK(x);` is a single statement
// and stays correct inside an unbraced if/else. The local uses a reserved-
// looking name so it cannot shadow a variable used in `call`.
#define HIP_CHECK(call)                                                        \
    do {                                                                       \
        hipError_t hip_check_status_ = (call);                                 \
        if (hip_check_status_ != hipSuccess) {                                 \
            std::cerr << "HIP error: " << hipGetErrorString(hip_check_status_) \
                      << std::endl;                                            \
            exit(1);                                                           \
        }                                                                      \
    } while (0)

int main() {
    // Allocate memory
    half *A, *B;
    float *C;
    HIP_CHECK(hipMalloc(&A, M * K * sizeof(half)));
    HIP_CHECK(hipMalloc(&B, K * N * sizeof(half)));
    HIP_CHECK(hipMalloc(&C, M * N * sizeof(float)));

    // Initialize matrices (you would typically do this on the GPU)
    // ...

    // Launch kernel
    launchMatrixMulKernel(A, B, C, 1);  // Assuming 1 block for simplicity

    // Clean up
    HIP_CHECK(hipFree(A));
    HIP_CHECK(hipFree(B));
    HIP_CHECK(hipFree(C));

    return 0;
}

```

Key points about this example:

1. It uses `half` precision for input matrices A and B, which can potentially benefit from Matrix Core acceleration.

2. The kernel is designed for 16x16 matrices, which is a common size for Matrix Core operations.

3. Shared memory is used to improve performance by reducing global memory accesses.

4. The main computation loop uses `__half2float` conversions. On GPUs with native FP16 support, these conversions might be optimized out.

5. The kernel uses a tiled approach, which is generally efficient for matrix multiplication.

6. Error checking is included for HIP calls.

Important considerations:

1. This kernel doesn't guarantee the use of Matrix Cores. The actual use of Matrix Cores depends on the GPU architecture and the HIP compiler's optimizations.

2. For larger matrices, you'd need to implement a more sophisticated tiling strategy.

3. Performance tuning is crucial. You might need to experiment with different block sizes and memory access patterns for optimal performance.

4. The HIP runtime and compiler will attempt to optimize this code for the target GPU, potentially leveraging Matrix Cores if available.

5. For production use, you should implement proper error handling and potentially use more sophisticated synchronization methods.

To fully leverage Matrix Cores, you might need to use specific intrinsics or rely on compiler optimizations. The exact method can vary depending on the GPU architecture and HIP version. Always profile your code to ensure you're getting the expected performance benefits.

Creating a custom CUDA kernel that directly utilizes tensor cores

Creating a custom CUDA kernel that directly utilizes tensor cores is an advanced topic, as tensor cores are typically accessed through higher-level libraries like cuBLAS or cuDNN. However, NVIDIA does provide a way to use tensor cores in custom kernels through CUDA C++ itself, specifically the Warp Matrix Multiply-Accumulate (WMMA) API declared in `<mma.h>` (namespace `nvcuda::wmma`). Here's an overview of how to create a kernel that works on tensor cores:

1. Use the CUDA WMMA API:

   The WMMA API allows you to program tensor cores directly in your CUDA kernels.

2. Include necessary headers:

   ```cpp

   #include <mma.h>

   #include <cuda_fp16.h>

```

3. Use appropriate data types:

   Tensor cores work with specific data types like half precision floating point (`__half`).

4. Define matrix fragments:

   Use `nvcuda::wmma::fragment` to define matrix fragments that will be processed by tensor cores.

5. Load, compute, and store operations:

   Use WMMA load, multiply-accumulate, and store operations.

Here's an example kernel that uses tensor cores via the WMMA API:

```cuda

#include <mma.h>

#include <cuda_fp16.h>

// Define matrix dimensions

const int M = 16;

const int N = 16;

const int K = 16;

// Multiplies one 16x16x16 tile on tensor cores through the WMMA API.
//
// Layout contract (taken from the fragment and store declarations below):
//   a : M x K, column-major, half
//   b : K x N, column-major, half
//   c : M x N, row-major, float (overwritten with a * b)
//
// Must be launched with one warp (32 threads): the *_sync WMMA calls are
// warp-wide collectives. Requires a tensor-core GPU (sm_70 or newer).
__global__ void wmma_example(half *a, half *b, float *c) {
    namespace wmma = nvcuda::wmma;

    // Declare the fragments
    wmma::fragment<wmma::matrix_a, M, N, K, half, wmma::col_major> a_frag;
    wmma::fragment<wmma::matrix_b, M, N, K, half, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, M, N, K, float> c_frag;

    // Initialize the output to zero
    wmma::fill_fragment(c_frag, 0.0f);

    // Load the inputs. The last argument is the leading dimension: for a
    // column-major matrix that is the distance between consecutive columns,
    // i.e. the row count. That is M for the M x K matrix a and K for the
    // K x N matrix b.
    wmma::load_matrix_sync(a_frag, a, M);
    wmma::load_matrix_sync(b_frag, b, K);

    // Perform the matrix multiplication: c_frag = a_frag * b_frag + c_frag
    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);

    // Store the output row-major, so the leading dimension is the column count N
    wmma::store_matrix_sync(c, c_frag, N, wmma::mem_row_major);
}

// Host-side launcher: runs wmma_example on a single block holding one warp
// (32 threads). The launch is asynchronous and its status is not checked here.
void launch_wmma_kernel(half *a, half *b, float *c) {
    const dim3 grid(1);
    const dim3 block(32);  // One warp

    wmma_example<<<grid, block>>>(a, b, c);
}

```

Key points about this example:

1. We're using 16x16 matrices as this is a common size for tensor core operations.
2. The kernel uses `nvcuda::wmma::fragment` to define matrix fragments.
3. `load_matrix_sync`, `mma_sync`, and `store_matrix_sync` are used to load data, perform matrix multiplication, and store results using tensor cores.
4. The kernel operates on half-precision input (`half`) and produces single-precision output (`float`).

To use this kernel:

1. Compile with a CUDA compiler that supports tensor cores (CUDA 9.0 or later).
2. Use appropriate GPU architecture flags (e.g., `-arch=sm_70` for Volta, `-arch=sm_75` for Turing).
3. Allocate memory and copy data to the GPU before calling `launch_wmma_kernel`.

Important considerations:

1. Error checking is omitted for brevity but should be included in production code.
2. This is a basic example. Real-world usage often involves tiling and more complex memory access patterns for larger matrices.
3. Performance tuning is crucial. The exact dimensions and data types should be chosen based on your specific use case and target GPU architecture.
4. Not all operations can be efficiently mapped to tensor cores. They're most beneficial for large matrix multiplications common in deep learning workloads.

Remember, while this approach gives you direct control over tensor core usage, in many cases, using higher-level libraries like cuBLAS or cuDNN is more practical and can automatically leverage tensor cores when appropriate.

2/09/2023

AWS ec2 gpu instance comparison

Architecture	NVIDIA GPU	Instance type	Instance name	Number of GPUs	GPU Memory (per GPU)	GPU Interconnect (NVLink / PCIe)	Thermal Design Power (TDP) from nvidia-smi	Tensor Cores (mixed-precision)	Precision Support	CPU Type	Nitro based
Ampere	A100	P4	p4d.24xlarge	8	40 GB	NVLink gen 3 (600 GB/s)	400W	Tensor Cores (Gen 3)	FP64, FP32, FP16, INT8, BF16, TF32	Intel Xeon Scalable (Cascade Lake)	Yes
Ampere	A10G	G5	g5.xlarge	1	24 GB	NA (single GPU)	300W	Tensor Cores (Gen 3)	FP64, FP32, FP16, INT8, BF16, TF32	AMD EPYC	Yes
Ampere	A10G	G5	g5.2xlarge	1	24 GB	NA (single GPU)	300W	Tensor Cores (Gen 3)	FP64, FP32, FP16, INT8, BF16, TF32	AMD EPYC	Yes
Ampere	A10G	G5	g5.4xlarge	1	24 GB	NA (single GPU)	300W	Tensor Cores (Gen 3)	FP64, FP32, FP16, INT8, BF16, TF32	AMD EPYC	Yes
Ampere	A10G	G5	g5.8xlarge	1	24 GB	NA (single GPU)	300W	Tensor Cores (Gen 3)	FP64, FP32, FP16, INT8, BF16, TF32	AMD EPYC	Yes
Ampere	A10G	G5	g5.16xlarge	1	24 GB	NA (single GPU)	300W	Tensor Cores (Gen 3)	FP64, FP32, FP16, INT8, BF16, TF32	AMD EPYC	Yes
Ampere	A10G	G5	g5.12xlarge	4	24 GB	PCIe	300W	Tensor Cores (Gen 3)	FP64, FP32, FP16, INT8, BF16, TF32	AMD EPYC	Yes
Ampere	A10G	G5	g5.24xlarge	4	24 GB	PCIe	300W	Tensor Cores (Gen 3)	FP64, FP32, FP16, INT8, BF16, TF32	AMD EPYC	Yes
Ampere	A10G	G5	g5.48xlarge	8	24 GB	PCIe	300W	Tensor Cores (Gen 3)	FP64, FP32, FP16, INT8, BF16, TF32	AMD EPYC	Yes
Turing	T4G	G5	g5g.xlarge	1	16 GB	NA (single GPU)	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	AWS Graviton2	Yes
Turing	T4G	G5	g5g.2xlarge	1	16 GB	NA (single GPU)	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	AWS Graviton2	Yes
Turing	T4G	G5	g5g.4xlarge	1	16 GB	NA (single GPU)	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	AWS Graviton2	Yes
Turing	T4G	G5	g5g.8xlarge	1	16 GB	NA (single GPU)	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	AWS Graviton2	Yes
Turing	T4G	G5	g5g.16xlarge	2	16 GB	PCIe	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	AWS Graviton2	Yes
Turing	T4G	G5	g5g.metal	2	16 GB	PCIe	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	AWS Graviton2	Yes
Turing	T4	G4	g4dn.xlarge	1	16 GB	NA (single GPU)	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	Intel Xeon Scalable (Cascade Lake)	Yes
Turing	T4	G4	g4dn.2xlarge	1	16 GB	NA (single GPU)	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	Intel Xeon Scalable (Cascade Lake)	Yes
Turing	T4	G4	g4dn.4xlarge	1	16 GB	NA (single GPU)	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	Intel Xeon Scalable (Cascade Lake)	Yes
Turing	T4	G4	g4dn.8xlarge	1	16 GB	NA (single GPU)	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	Intel Xeon Scalable (Cascade Lake)	Yes
Turing	T4	G4	g4dn.16xlarge	1	16 GB	NA (single GPU)	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	Intel Xeon Scalable (Cascade Lake)	Yes
Turing	T4	G4	g4dn.12xlarge	4	16 GB	PCIe	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	Intel Xeon Scalable (Cascade Lake)	Yes
Turing	T4	G4	g4dn.metal	8	16 GB	PCIe	70W	Tensor Cores (Gen 2)	FP32, FP16, INT8	Intel Xeon Scalable (Cascade Lake)	Yes
Volta	V100	P3	p3.2xlarge	1	16 GB	NA (single GPU)	300W	Tensor Cores (Gen 1)	FP64, FP32, FP16	Intel Xeon (Broadwell)	No
Volta	V100	P3	p3.8xlarge	4	16 GB	NVLink gen 2 (300 GB/s)	300W	Tensor Cores (Gen 1)	FP64, FP32, FP16	Intel Xeon (Broadwell)	No
Volta	V100	P3	p3.16xlarge	8	16 GB	NVLink gen 2 (300 GB/s)	300W	Tensor Cores (Gen 1)	FP64, FP32, FP16	Intel Xeon (Broadwell)	No
Volta	V100*	P3	p3dn.24xlarge	8	32 GB	NVLink gen 2 (300 GB/s)	300W	Tensor Cores (Gen 1)	FP64, FP32, FP16	Intel Xeon (Skylake)	Yes
Kepler	K80	P2	p2.xlarge	1	12 GB	NA (single GPU)	149W	No	FP64, FP32	Intel Xeon (Broadwell)	No
Kepler	K80	P2	p2.8xlarge	8	12 GB	PCIe	149W	No	FP64, FP32	Intel Xeon (Broadwell)	No
Kepler	K80	P2	p2.16xlarge	16	12 GB	PCIe	149W	No	FP64, FP32	Intel Xeon (Broadwell)	No
Maxwell	M60	G3	g3s.xlarge	1	8 GB	PCIe	150W	No	FP32	Intel Xeon (Broadwell)	No
Maxwell	M60	G3	g3.4xlarge	1	8 GB	PCIe	150W	No	FP32	Intel Xeon (Broadwell)	No
Maxwell	M60	G3	g3.8xlarge	2	8 GB	PCIe	150W	No	FP32	Intel Xeon (Broadwell)	No
Maxwell	M60	G3	g3.16xlarge	4	8 GB	PCIe	150W	No	FP32	Intel Xeon (Broadwell)	No

MareArts Computer Vision Study.

Pages

8/25/2025

GPU memory vs shared memory

CK Tile Tutorial Day 2 (AMD hip programming) - Simple GEMM.

8/24/2025

CK Tile Tutorial Day 1 (AMD hip programming) - Vector add.

3/07/2025

Check my torch support GPU

9/17/2024

What is IREE turbine

HIP kernel for matrix multiplication that can leverage Matrix Cores

Creating a custom CUDA kernel that directly utilizes tensor cores

2/09/2023

AWS ec2 gpu instance comparison