8/25/2025

GPU memory vs shared memory



ceiling technic

 Add the maximum possible remainder (TILE-1) before dividing!


  Simple Example with 10:


  Let's use TILE=10 for easier understanding:


  Without ceiling (wrong):


  11 / 10 = 1 (need 2!) ❌

  19 / 10 = 1 (need 2!) ❌


  With ceiling technique:


  (11 + 9) / 10 = 20 / 10 = 2 ✓

  (19 + 9) / 10 = 28 / 10 = 2 ✓

  (20 + 9) / 10 = 29 / 10 = 2 ✓ (still 2, correct!)

  (21 + 9) / 10 = 30 / 10 = 3 ✓


  Why Add (TILE-1)?


  Think of it like this:


  If remainder = 0 (perfectly divisible):


  20 / 10 = 2.0

  (20 + 9) / 10 = 29 / 10 = 2.9 → 2 (same!)


  If remainder > 0 (needs extra block):


  21 / 10 = 2.1 (remainder 1)

  (21 + 9) / 10 = 30 / 10 = 3.0 → 3 (pushed to next!)


  The Magic:


  Adding (TILE-1):

  - Remainder 0: Adds 0.9999... → stays same integer

  - Remainder ≥1: Adds enough to reach next integer


  Visual Pattern:


  Value | +9 | /10 | Result

  ------|----|----|-------

  10    | 19 | 1.9| 1 ✓

  11    | 20 | 2.0| 2 ✓ (jumps up!)

  19    | 28 | 2.8| 2 ✓

  20    | 29 | 2.9| 2 ✓

  21    | 30 | 3.0| 3 ✓ (jumps up!)


  Formula Summary:


  // Ceiling division formula:

  ceil(A/B) = (A + B - 1) / B


  // For our GEMM tiles:

  num_blocks = (matrix_size + tile_size - 1) / tile_size


  It's simple: "Add almost one tile, then divide" - this guarantees rounding up!

CK Tile Tutorial Day 2 (AMD hip programming) - Simple GEMM.

 Concepts Added:

  • 2D grid/block configuration
  • Matrix multiplication basics
  • Each thread computes one output element

Key Pattern:

// Each thread computes C[row][col]
for (int k = 0; k < K; k++) {
    sum += A[row][k] * B[k][col];
}
.
=== Thread Mapping Visualization ===
Each thread computes one C[i][j]:

  Block(0,0)        Block(1,0)
  ┌─────────┐      ┌─────────┐
  │T00 T01..│      │T00 T01..│
  │T10 T11..│      │T10 T11..│
  │... ... ..│      │... ... ..│
  └─────────┘      └─────────┘
       ↓                ↓
  C[0:16,0:16]    C[0:16,16:32]

Each thread's work:
  for k in 0..K:
    sum += A[row][k] * B[k][col]
  C[row][col] = sum

=== Step 2: Simple GEMM ===
Matrix multiply: (64x64) * (64x64) = (64x64)
Launching with grid(4,4), block(16,16)
Result: CORRECT
Time: 0.4232 ms
Performance: 1.23887 GFLOPS

=== Step 2: Simple GEMM ===
Matrix multiply: (128x128) * (128x128) = (128x128)
Launching with grid(8,8), block(16,16)
Result: CORRECT
Time: 0.03824 ms
Performance: 109.684 GFLOPS

Key Concepts Added:
1. 2D grid/block configuration
2. Each thread computes one output element
3. Row-major vs column-major layouts
4. Performance measurement (GFLOPS)
..

code
.
// Step 2: Simple GEMM (Matrix Multiplication)
// Building on Step 1, now each thread computes one output element

#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>

// ============================================
// PART 1: Kernel Arguments
// ============================================
struct SimpleGemmKernelArgs {
const float* a_ptr; // M x K matrix
const float* b_ptr; // K x N matrix
float* c_ptr; // M x N matrix
int M;
int N;
int K;
SimpleGemmKernelArgs(const float* a, const float* b, float* c,
int m, int n, int k)
: a_ptr(a), b_ptr(b), c_ptr(c), M(m), N(n), K(k) {}
};

// ============================================
// PART 2: The Kernel (One thread per output)
// ============================================
struct SimpleGemmKernel {
static dim3 GridSize(const SimpleGemmKernelArgs& args) {
// 16x16 threads per block
int grid_m = (args.M + 15) / 16;
int grid_n = (args.N + 15) / 16;
return dim3(grid_n, grid_m, 1); // Note: x=N, y=M
}
static dim3 BlockSize() {
return dim3(16, 16, 1); // 16x16 = 256 threads
}
__device__ void operator()(const SimpleGemmKernelArgs& args) const {
// Each thread computes one element of C
int col = blockIdx.x * blockDim.x + threadIdx.x; // N dimension
int row = blockIdx.y * blockDim.y + threadIdx.y; // M dimension
// Bounds check
if (row >= args.M || col >= args.N) return;
// Compute dot product for C[row][col]
float sum = 0.0f;
for (int k = 0; k < args.K; k++) {
// A is row-major: A[row][k] = A[row * K + k]
// B is column-major: B[k][col] = B[k + col * K]
float a_val = args.a_ptr[row * args.K + k];
float b_val = args.b_ptr[k + col * args.K];
sum += a_val * b_val;
}
// Store result (C is row-major)
args.c_ptr[row * args.N + col] = sum;
}
};

// ============================================
// PART 3: Host Code
// ============================================
__global__ void simple_gemm_kernel(SimpleGemmKernelArgs args) {
SimpleGemmKernel kernel;
kernel(args);
}

void run_simple_gemm(int M, int N, int K) {
std::cout << "\n=== Step 2: Simple GEMM ===\n";
std::cout << "Matrix multiply: (" << M << "x" << K << ") * ("
<< K << "x" << N << ") = (" << M << "x" << N << ")\n";
// Allocate host memory
std::vector<float> h_a(M * K);
std::vector<float> h_b(K * N);
std::vector<float> h_c(M * N, 0.0f);
// Initialize with simple values
for (int i = 0; i < M * K; i++) h_a[i] = 1.0f;
for (int i = 0; i < K * N; i++) h_b[i] = 2.0f;
// Allocate device memory
float *d_a, *d_b, *d_c;
hipMalloc(&d_a, M * K * sizeof(float));
hipMalloc(&d_b, K * N * sizeof(float));
hipMalloc(&d_c, M * N * sizeof(float));
// Copy to device
hipMemcpy(d_a, h_a.data(), M * K * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(d_b, h_b.data(), K * N * sizeof(float), hipMemcpyHostToDevice);
// Create kernel arguments
SimpleGemmKernelArgs args(d_a, d_b, d_c, M, N, K);
// Get launch configuration
dim3 grid = SimpleGemmKernel::GridSize(args);
dim3 block = SimpleGemmKernel::BlockSize();
std::cout << "Launching with grid(" << grid.x << "," << grid.y
<< "), block(" << block.x << "," << block.y << ")\n";
// Launch kernel
hipEvent_t start, stop;
hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start);
simple_gemm_kernel<<<grid, block>>>(args);
hipEventRecord(stop);
hipEventSynchronize(stop);
float milliseconds = 0;
hipEventElapsedTime(&milliseconds, start, stop);
// Copy result back
hipMemcpy(h_c.data(), d_c, M * N * sizeof(float), hipMemcpyDeviceToHost);
// Verify (each element should be K * 1.0 * 2.0 = 2K)
float expected = 2.0f * K;
bool correct = true;
for (int i = 0; i < std::min(10, M*N); i++) {
if (h_c[i] != expected) {
correct = false;
break;
}
}
std::cout << "Result: " << (correct ? "CORRECT" : "WRONG") << "\n";
std::cout << "Time: " << milliseconds << " ms\n";
// Calculate FLOPS
double flops = 2.0 * M * N * K; // 2 ops per multiply-add
double gflops = (flops / milliseconds) / 1e6;
std::cout << "Performance: " << gflops << " GFLOPS\n";
// Cleanup
hipFree(d_a);
hipFree(d_b);
hipFree(d_c);
hipEventDestroy(start);
hipEventDestroy(stop);
}

// ============================================
// VISUALIZATION: How threads map to output
// ============================================
void visualize_thread_mapping() {
std::cout << "\n=== Thread Mapping Visualization ===\n";
std::cout << "Each thread computes one C[i][j]:\n\n";
std::cout << " Block(0,0) Block(1,0)\n";
std::cout << " ┌─────────┐ ┌─────────┐\n";
std::cout << " │T00 T01..│ │T00 T01..│\n";
std::cout << " │T10 T11..│ │T10 T11..│\n";
std::cout << " │... ... ..│ │... ... ..│\n";
std::cout << " └─────────┘ └─────────┘\n";
std::cout << " ↓ ↓\n";
std::cout << " C[0:16,0:16] C[0:16,16:32]\n\n";
std::cout << "Each thread's work:\n";
std::cout << " for k in 0..K:\n";
std::cout << " sum += A[row][k] * B[k][col]\n";
std::cout << " C[row][col] = sum\n";
}

// ============================================
// PART 4: Main
// ============================================
int main() {
std::cout << "MareArts CK Tile Tutorial - Step 2: Simple GEMM\n";
std::cout << "======================================\n";
visualize_thread_mapping();
// Run with different sizes
run_simple_gemm(64, 64, 64);
run_simple_gemm(128, 128, 128);
std::cout << "\nKey Concepts Added:\n";
std::cout << "1. 2D grid/block configuration\n";
std::cout << "2. Each thread computes one output element\n";
std::cout << "3. Row-major vs column-major layouts\n";
std::cout << "4. Performance measurement (GFLOPS)\n";
std::cout << "\nProblem: Each thread reads K elements from A and B\n";
std::cout << " → Poor memory reuse!\n";
std::cout << "Next: Add tiling and shared memory for efficiency\n";
return 0;
}
..

🙇🏻‍♂️
MareArts

8/24/2025

CK Tile Tutorial Day 1 (AMD hip programming) - Vector add.

.

Concepts:

  • Basic kernel structure: Args → Kernel → operator()
  • Grid/Block configuration
  • One thread per element processing

Key Code:

struct VectorAddKernel {
    __device__ void operator()(args) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        c[idx] = a[idx] + b[idx];
    }
};


..

code

..

// Step 1: Simplest CK Tile Kernel - Vector Addition
// This demonstrates the absolute basics of CK Tile

#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>

// ============================================
// PART 1: Kernel Arguments (Host → Device)
// ============================================
struct VectorAddKernelArgs {
const float* a_ptr;
const float* b_ptr;
float* c_ptr;
int n;
// Constructor from host arguments
VectorAddKernelArgs(const float* a, const float* b, float* c, int size)
: a_ptr(a), b_ptr(b), c_ptr(c), n(size) {}
};

// ============================================
// PART 2: The Kernel
// ============================================
struct VectorAddKernel {
// Static method to get grid size (how many blocks)
static dim3 GridSize(const VectorAddKernelArgs& args) {
// 256 threads per block, divide work
int blocks = (args.n + 255) / 256;
return dim3(blocks, 1, 1);
}
// Static method to get block size (threads per block)
static dim3 BlockSize() {
return dim3(256, 1, 1);
}
// The actual kernel function - called by each thread
__device__ void operator()(const VectorAddKernelArgs& args) const {
// Calculate global thread index
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Check bounds
if (idx < args.n) {
// Each thread does one element
args.c_ptr[idx] = args.a_ptr[idx] + args.b_ptr[idx];
}
}
};

// ============================================
// PART 3: Host Launch Function
// ============================================
__global__ void vector_add_kernel(VectorAddKernelArgs args) {
VectorAddKernel kernel;
kernel(args);
}

void run_vector_add(int n) {
std::cout << "\n=== Step 1: Vector Addition ===\n";
std::cout << "Adding two vectors of size " << n << "\n";
// Allocate host memory
std::vector<float> h_a(n, 1.0f);
std::vector<float> h_b(n, 2.0f);
std::vector<float> h_c(n, 0.0f);
// Allocate device memory
float *d_a, *d_b, *d_c;
hipMalloc(&d_a, n * sizeof(float));
hipMalloc(&d_b, n * sizeof(float));
hipMalloc(&d_c, n * sizeof(float));
// Copy to device
hipMemcpy(d_a, h_a.data(), n * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(d_b, h_b.data(), n * sizeof(float), hipMemcpyHostToDevice);
// Create kernel arguments
VectorAddKernelArgs args(d_a, d_b, d_c, n);
// Get launch configuration
dim3 grid = VectorAddKernel::GridSize(args);
dim3 block = VectorAddKernel::BlockSize();
std::cout << "Launching with grid(" << grid.x << "), block(" << block.x << ")\n";
// Launch kernel
vector_add_kernel<<<grid, block>>>(args);
// Copy result back
hipMemcpy(h_c.data(), d_c, n * sizeof(float), hipMemcpyDeviceToHost);
// Verify
bool correct = true;
for (int i = 0; i < std::min(10, n); i++) {
if (h_c[i] != 3.0f) {
correct = false;
break;
}
}
std::cout << "Result: " << (correct ? "CORRECT" : "WRONG") << "\n";
std::cout << "First 5 elements: ";
for (int i = 0; i < std::min(5, n); i++) {
std::cout << h_c[i] << " ";
}
std::cout << "\n";
// Cleanup
hipFree(d_a);
hipFree(d_b);
hipFree(d_c);
}

// ============================================
// PART 4: Main
// ============================================
int main() {
std::cout << "MareArts CK Tile Tutorial - Step 1: Vector Addition\n";
std::cout << "==========================================\n";
// Run with different sizes
run_vector_add(1024);
run_vector_add(10000);
std::cout << "\nKey Concepts Demonstrated:\n";
std::cout << "1. Kernel structure: Args → Kernel → operator()\n";
std::cout << "2. Grid/Block configuration\n";
std::cout << "3. Each thread processes one element\n";
std::cout << "4. Bounds checking for safety\n";
return 0;
}

...


Result

CK Tile Tutorial - Step 1: Vector Addition

==========================================


=== Step 1: Vector Addition ===

Adding two vectors of size 1024

Launching with grid(4), block(256)

Result: CORRECT

First 5 elements: 3 3 3 3 3 


=== Step 1: Vector Addition ===

Adding two vectors of size 10000

Launching with grid(40), block(256)

Result: CORRECT

First 5 elements: 3 3 3 3 3 


Key Concepts Demonstrated:

1. Kernel structure: Args → Kernel → operator()

2. Grid/Block configuration

3. Each thread processes one element

4. Bounds checking for safety



8/22/2025

ONNX Runtime with ROCm (AMD GPU) Setup Guide

ONNX Runtime with ROCm (AMD GPU) Setup Guide

Installation

Prerequisites

  • ROCm installed (6.0+ recommended)
  • Python 3.8-3.10

Install ONNX Runtime with ROCm Support

# 1. Remove existing ONNX Runtime (if any)
pip uninstall -y onnxruntime onnxruntime-gpu

# 2. Install from AMD ROCm repository
# For ROCm 6.4
pip install onnxruntime-rocm -f https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/

# For ROCm 6.2
pip install onnxruntime-rocm -f https://repo.radeon.com/rocm/manylinux/rocm-rel-6.2/

# For ROCm 6.0
pip install onnxruntime-rocm -f https://repo.radeon.com/rocm/manylinux/rocm-rel-6.0/

Verify Installation

import onnxruntime as ort

# Check available providers
print("Available providers:", ort.get_available_providers())

# Should show: ['MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'CPUExecutionProvider']

Simple Usage Example

import onnxruntime as ort
import numpy as np

# Load ONNX model with ROCm
providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
session = ort.InferenceSession("model.onnx", providers=providers)

# Check which provider is being used
print(f"Using: {session.get_providers()[0]}")

# Prepare input (example: batch_size=1, 3 channels, 640x640 image)
input_data = np.random.randn(1, 3, 640, 640).astype(np.float32)

# Run inference
input_name = session.get_inputs()[0].name
output = session.run(None, {input_name: input_data})

print(f"Output shape: {output[0].shape}")

Advanced: Using MIGraphX (AMD Optimized)

# MIGraphX is AMD's optimized graph execution provider
# It can be faster than ROCMExecutionProvider for some models

providers = [
    'MIGraphXExecutionProvider',  # Fastest on AMD
    'ROCMExecutionProvider',       # Standard ROCm
    'CPUExecutionProvider'         # Fallback
]

session = ort.InferenceSession("model.onnx", providers=providers)

Complete Example: Image Detection

import onnxruntime as ort
import numpy as np
import cv2

def load_model(model_path, use_gpu=True):
    """Load ONNX model with ROCm support"""
    if use_gpu:
        providers = ['MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'CPUExecutionProvider']
    else:
        providers = ['CPUExecutionProvider']
    
    session = ort.InferenceSession(model_path, providers=providers)
    print(f"Model loaded with: {session.get_providers()[0]}")
    return session

def preprocess_image(image_path, size=640):
    """Preprocess image for inference"""
    image = cv2.imread(image_path)
    resized = cv2.resize(image, (size, size))
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    normalized = rgb.astype(np.float32) / 255.0
    transposed = normalized.transpose(2, 0, 1)  # HWC to CHW
    batched = np.expand_dims(transposed, axis=0)  # Add batch dimension
    return batched, image

def run_inference(session, input_data):
    """Run model inference"""
    input_name = session.get_inputs()[0].name
    outputs = session.run(None, {input_name: input_data})
    return outputs

# Usage
model = load_model("rtdetr_fp32.onnx", use_gpu=True)
input_data, original_image = preprocess_image("test.jpg")
outputs = run_inference(model, input_data)

print(f"Detection output shape: {outputs[0].shape}")

Troubleshooting

1. ROCMExecutionProvider not available

# Check ROCm installation
import subprocess
result = subprocess.run(['rocm-smi'], capture_output=True, text=True)
print(result.stdout)

2. Fallback to CPU

If ONNX Runtime falls back to CPU despite having ROCm:

  • Check ROCm version compatibility
  • Verify GPU is visible: rocm-smi
  • Set environment variable: export HIP_VISIBLE_DEVICES=0

3. Performance Tips

  • Use MIGraphXExecutionProvider for best performance on AMD GPUs
  • FP16 models can be faster but may have slight accuracy loss
  • Batch processing improves throughput

Environment Variables

# Select specific GPU
export HIP_VISIBLE_DEVICES=0

# Enable verbose logging
export ORT_ROCM_VERBOSE_LEVEL=1

# Set memory limit (in MB)
export ORT_ROCM_MEM_LIMIT=4096

Performance Comparison

ProviderRelative SpeedUse Case
MIGraphXExecutionProviderFastestProduction, optimized models
ROCMExecutionProviderFastGeneral purpose
CPUExecutionProviderSlowestFallback, debugging

Notes

  • ONNX Runtime ROCm version should match your ROCm installation
  • Not all ONNX operators are supported on ROCm - unsupported ops fall back to CPU
  • For best performance, export models with static shapes

7/25/2025

Understanding C++ Templates, (class, member function and this)

 

template<int INT>
void AAA() {
std::cout << "INT = " << INT << std::endl;
}

How to Call It:

Normal Context (Outside Class):

AAA<2>(); // ✅ Just call directly
AAA<5>(); // ✅ Works fine
AAA<10>(); // ✅ No problem

Inside Template Class Context:

template<typename T>
class MyClass {
template<int INT>
void AAA() {
std::cout << "INT = " << INT << std::endl;
}

void some_function() {
// WRONG:
template AAA<2>(); // ❌ ERROR! Invalid syntax

// CORRECT:
this->template AAA<2>(); // ✅ Works!
AAA<2>(); // ✅ Usually works too
}
};

Key Point:

- template AAA<2>(); is INVALID C++ syntax
- this->template AAA<2>(); is VALID C++ syntax

The Rule:

- template keyword only goes after -> or . in template contexts
- You cannot start a statement with just template

Correct Examples:

this->template AAA<2>(); // ✅ In class context
obj.template AAA<2>(); // ✅ With object
ptr->template AAA<2>(); // ✅ With pointer
AAA<2>(); // ✅ Direct call (no template keyword needed)

C++ tuple vs vector, Tutorial

 

// ============================================================================
// SIMPLE COMPARISON: TUPLE vs VECTOR
// When to use which one and why they're different
// ============================================================================

#include <tuple>
#include <vector>
#include <iostream>
#include <string>

// ============================================================================
// VECTOR: Dynamic Array of SAME TYPE
// ============================================================================

void vector_example() {
std::cout << "=== VECTOR EXAMPLE ===" << std::endl;
// Vector holds multiple values of THE SAME TYPE
std::vector<int> numbers = {10, 20, 30, 40, 50};
std::cout << "Vector contents: ";
for (size_t i = 0; i < numbers.size(); i++) {
std::cout << numbers[i] << " ";
}
std::cout << std::endl;
// Add more elements
numbers.push_back(60);
numbers.push_back(70);
std::cout << "After adding elements: ";
for (int num : numbers) {
std::cout << num << " ";
}
std::cout << std::endl;
std::cout << "Vector size: " << numbers.size() << std::endl;
std::cout << "Can grow/shrink at runtime: YES" << std::endl;
std::cout << "All elements same type: YES (int)" << std::endl;
}

// ============================================================================
// TUPLE: Fixed Collection of DIFFERENT TYPES
// ============================================================================

void tuple_example() {
std::cout << "\n=== TUPLE EXAMPLE ===" << std::endl;
// Tuple holds DIFFERENT TYPES in fixed positions
std::tuple<int, float, std::string, bool> data = {42, 3.14f, "hello", true};
std::cout << "Tuple contents:" << std::endl;
std::cout << "Position 0 (int): " << std::get<0>(data) << std::endl;
std::cout << "Position 1 (float): " << std::get<1>(data) << std::endl;
std::cout << "Position 2 (string): " << std::get<2>(data) << std::endl;
std::cout << "Position 3 (bool): " << std::get<3>(data) << std::endl;
// Cannot add elements - size is fixed at compile time
// data.push_back(something); // ERROR! No such method
std::cout << "Tuple size: " << std::tuple_size_v<decltype(data)> << std::endl;
std::cout << "Can grow/shrink at runtime: NO" << std::endl;
std::cout << "All elements same type: NO (int, float, string, bool)" << std::endl;
}

// ============================================================================
// SIDE-BY-SIDE COMPARISON
// ============================================================================

void comparison() {
std::cout << "\n=== SIDE-BY-SIDE COMPARISON (marearts.com)===" << std::endl;
// VECTOR: All same type, dynamic size
std::vector<double> grades = {85.5, 92.0, 78.5, 95.0};
// TUPLE: Different types, fixed size
std::tuple<std::string, int, double, char> student = {"Alice", 20, 89.5, 'A'};
std::cout << "VECTOR (grades):" << std::endl;
std::cout << " Type: vector<double>" << std::endl;
std::cout << " Values: ";
for (double grade : grades) {
std::cout << grade << " ";
}
std::cout << std::endl;
std::cout << " Access: grades[0], grades[1], grades[2]..." << std::endl;
std::cout << " Can add more: YES (grades.push_back(88.0))" << std::endl;
std::cout << "\nTUPLE (student info):" << std::endl;
std::cout << " Type: tuple<string, int, double, char>" << std::endl;
std::cout << " Values: " << std::get<0>(student) << ", "
<< std::get<1>(student) << ", "
<< std::get<2>(student) << ", "
<< std::get<3>(student) << std::endl;
std::cout << " Access: std::get<0>(student), std::get<1>(student)..." << std::endl;
std::cout << " Can add more: NO (fixed at compile time)" << std::endl;
}

// ============================================================================
// WHEN TO USE WHICH?
// ============================================================================

void when_to_use() {
std::cout << "\n=== WHEN TO USE WHICH? ===" << std::endl;
std::cout << "USE VECTOR WHEN:" << std::endl;
std::cout << " ✓ All elements are the same type" << std::endl;
std::cout << " ✓ You don't know size at compile time" << std::endl;
std::cout << " ✓ Size can change during runtime" << std::endl;
std::cout << " ✓ You want to loop through elements" << std::endl;
std::cout << " ✓ Examples: list of numbers, array of names, dynamic data" << std::endl;
std::cout << "\nUSE TUPLE WHEN:" << std::endl;
std::cout << " ✓ Elements have different types" << std::endl;
std::cout << " ✓ Size is fixed and known at compile time" << std::endl;
std::cout << " ✓ Each position has specific meaning" << std::endl;
std::cout << " ✓ You want type safety for each element" << std::endl;
std::cout << " ✓ Examples: coordinates (x,y), person data (name,age,score), config settings" << std::endl;
}

// ============================================================================
// PRACTICAL EXAMPLES
// ============================================================================

void practical_examples() {
std::cout << "\n=== PRACTICAL EXAMPLES ===" << std::endl;
// VECTOR EXAMPLES (same type, dynamic)
std::cout << "VECTOR use cases:" << std::endl;
std::vector<int> test_scores; // Dynamic list of test scores
test_scores.push_back(85);
test_scores.push_back(92);
test_scores.push_back(78); // Can keep adding
std::vector<std::string> names = {"Alice", "Bob", "Charlie"}; // List of names
std::cout << " Test scores: ";
for (int score : test_scores) std::cout << score << " ";
std::cout << std::endl;
// TUPLE EXAMPLES (different types, fixed)
std::cout << "\nTUPLE use cases:" << std::endl;
std::tuple<int, int> coordinates = {10, 20}; // 2D point (x, y)
std::tuple<std::string, int, char> student = {"Bob", 19, 'B'}; // Name, age, grade
std::tuple<float, float, float> rgb_color = {0.8f, 0.2f, 0.5f}; // Red, green, blue
auto [x, y] = coordinates; // C++17 structured binding
std::cout << " Coordinates: (" << x << ", " << y << ")" << std::endl;
std::cout << " Student: " << std::get<0>(student) << ", age " << std::get<1>(student)
<< ", grade " << std::get<2>(student) << std::endl;
}

// ============================================================================
// MEMORY AND PERFORMANCE
// ============================================================================

void memory_and_performance() {
std::cout << "\n=== MEMORY & PERFORMANCE ===" << std::endl;
// Vector - dynamic allocation
std::vector<int> vec = {1, 2, 3, 4, 5};
// Tuple - compile-time known, usually stack allocated
std::tuple<int, int, int, int, int> tup = {1, 2, 3, 4, 5};
std::cout << "VECTOR:" << std::endl;
std::cout << " Memory: Heap allocated (dynamic)" << std::endl;
std::cout << " Access: Runtime indexing vec[i]" << std::endl;
std::cout << " Size overhead: Stores capacity + size info" << std::endl;
std::cout << " Performance: Slight overhead for bounds checking" << std::endl;
std::cout << "\nTUPLE:" << std::endl;
std::cout << " Memory: Usually stack allocated (compile-time size)" << std::endl;
std::cout << " Access: Compile-time indexing std::get<N>()" << std::endl;
std::cout << " Size overhead: None (just the data)" << std::endl;
std::cout << " Performance: Maximum efficiency (compile-time optimized)" << std::endl;
}

// ============================================================================
// CANNOT MIX - ERROR EXAMPLES
// ============================================================================

void error_examples() {
std::cout << "\n=== WHAT YOU CANNOT DO ===" << std::endl;
std::cout << "VECTOR limitations:" << std::endl;
std::cout << " ✗ Cannot mix types: vector<int, string> // COMPILER ERROR" << std::endl;
std::cout << " ✗ All elements must be same type" << std::endl;
std::cout << "\nTUPLE limitations:" << std::endl;
std::cout << " ✗ Cannot resize: tuple.push_back() // NO SUCH METHOD" << std::endl;
std::cout << " ✗ Cannot loop easily (no iterators)" << std::endl;
std::cout << " ✗ Must know position at compile time: tuple[i] // ERROR" << std::endl;
// Demonstrate what works
std::vector<int> good_vector = {1, 2, 3}; // ✓ Same type
// std::vector<int, string> bad_vector; // ✗ COMPILER ERROR
std::tuple<int, std::string> good_tuple = {42, "hello"}; // ✓ Different types
// good_tuple.push_back(123); // ✗ NO SUCH METHOD
std::cout << "\nWorking examples:" << std::endl;
std::cout << " Vector: " << good_vector[0] << ", " << good_vector[1] << std::endl;
std::cout << " Tuple: " << std::get<0>(good_tuple) << ", " << std::get<1>(good_tuple) << std::endl;
}

// ============================================================================
// MAIN FUNCTION
// ============================================================================

int main() {
std::cout << "TUPLE vs VECTOR - Simple Comparison" << std::endl;
std::cout << "====================================" << std::endl;
vector_example();
tuple_example();
comparison();
when_to_use();
practical_examples();
memory_and_performance();
error_examples();
std::cout << "\n=== QUICK SUMMARY ===" << std::endl;
std::cout << "VECTOR = Same type + Dynamic size + Runtime flexibility" << std::endl;
std::cout << "TUPLE = Different types + Fixed size + Compile-time safety" << std::endl;
std::cout << "\nThey solve different problems!" << std::endl;
return 0;
}

// ============================================================================
// KEY DIFFERENCES SUMMARY:
// ============================================================================
//
// VECTOR<T>:
// - All elements type T (same type)
// - Dynamic size (can grow/shrink)
// - Runtime access: vec[i]
// - Memory: heap allocated
// - Use for: lists, arrays, collections
//
// TUPLE<T1, T2, T3>:
// - Each element different type
// - Fixed size (compile-time)
// - Compile-time access: std::get<N>()
// - Memory: usually stack
// - Use for: structured data, coordinates, configs
//
// CHOOSE BASED ON:
// - Same vs different types?
// - Dynamic vs fixed size?
// - Runtime vs compile-time access?