6/02/2025

C++ Variadic Templates: example code for study

 

Example 1: Basic Variadic Template

#include <iostream>

// 1. Simple function that accepts any number of arguments
template <typename... Args>
void print_simple(Args... args) {
    std::cout << "Number of arguments: " << sizeof...(args) << std::endl;
}

int main() {
    print_simple();                    // 0 arguments
    print_simple(1);                   // 1 argument
    print_simple(1, 2.5, "hello");    // 3 arguments
    return 0;
}

Example 2: Print All Arguments (Recursion)

#include <iostream>

// Base case: no arguments left
void print() {
    std::cout << std::endl;
}

// Recursive case: print first, then rest
template <typename First, typename... Rest>
void print(First first, Rest... rest) {
    std::cout << first << " ";
    print(rest...);  // Call with remaining arguments
}

int main() {
    print(1, 2, 3);                        // Output: 1 2 3
    print("Hello", 42, 3.14, 'A');        // Output: Hello 42 3.14 A
    return 0;
}

Example 3: Simple Integer Sequence

#include <iostream>

// Our own simple Sequence class
template <int... Values>
struct Sequence {
    static constexpr int size = sizeof...(Values);
    
    static void print() {
        std::cout << "Sequence contains " << size << " values: ";
        ((std::cout << Values << " "), ...);  // C++17 fold expression
        std::cout << std::endl;
    }
};

int main() {
    using Seq1 = Sequence<1, 2, 3>;
    using Seq2 = Sequence<10>;
    using Seq3 = Sequence<>;  // empty
    
    Seq1::print();  // Output: Sequence contains 3 values: 1 2 3
    Seq2::print();  // Output: Sequence contains 1 values: 10
    Seq3::print();  // Output: Sequence contains 0 values:
    
    std::cout << "Seq1 size: " << Seq1::size << std::endl;  // Output: 3
    
    return 0;
}

Example 4: Sequence with Array Access

#include <iostream>
#include <array>

template <int... Values>
struct Sequence {
    static constexpr int size = sizeof...(Values);
    static constexpr std::array<int, size> values = {Values...};
    
    static constexpr int at(int index) {
        return values[index];
    }
    
    static void print_all() {
        for (int i = 0; i < size; ++i) {
            std::cout << "Index " << i << ": " << at(i) << std::endl;
        }
    }
};

int main() {
    using MySeq = Sequence<10, 20, 30, 40>;
    
    std::cout << "Size: " << MySeq::size << std::endl;
    std::cout << "Element at index 2: " << MySeq::at(2) << std::endl;
    
    MySeq::print_all();
    
    return 0;
}

Example 5: Type Sequence (Like std::tuple)

#include <iostream>
#include <typeinfo>
#include <string>

// Store multiple types
template <typename... Types>
struct TypeList {
    static constexpr int size = sizeof...(Types);
};

// Helper to print type names
template <typename T>
void print_type() {
    std::cout << typeid(T).name() << " ";
}

template <typename... Types>
void print_types() {
    std::cout << "Types (" << sizeof...(Types) << "): ";
    ((print_type<Types>()), ...);
    std::cout << std::endl;
}

int main() {
    using MyTypes = TypeList<int, double, char>;
    std::cout << "Number of types: " << MyTypes::size << std::endl;
    
    print_types<int, double, char>();
    print_types<std::string>();
    
    return 0;
}

Example 6: CK-Style Sequence Usage

#include <iostream>

// Simplified CK-style Sequence
template <int... Is>
struct Sequence {
    static constexpr int Size() { return sizeof...(Is); }
    
    template <int N>
    static constexpr int At() {
        constexpr int arr[] = {Is...};
        return arr[N];
    }
};

// Alias for convenience
template <int... Is>
using S = Sequence<Is...>;

// Function that uses sequences for dimension mapping
template <int... InputDims, int... OutputDims>
void describe_mapping(S<InputDims...>, S<OutputDims...>) {
    std::cout << "Mapping: ";
    std::cout << "Input dims [";
    ((std::cout << InputDims << " "), ...);
    std::cout << "] -> Output dims [";
    ((std::cout << OutputDims << " "), ...);
    std::cout << "]" << std::endl;
}

int main() {
    // Define dimension indices
    S<0> batch_dim{};      // Dimension 0: batch
    S<1> height_dim{};     // Dimension 1: height
    S<2> width_dim{};      // Dimension 2: width
    S<3> channel_dim{};    // Dimension 3: channel
    
    // Example: NHWC to NCHW transformation mapping
    describe_mapping(S<0,1,2,3>{}, S<0,3,1,2>{});
    // Output: Mapping: Input dims [0 1 2 3 ] -> Output dims [0 3 1 2 ]
    
    // Example: Merge height and width
    describe_mapping(S<0,1,2,3>{}, S<0,1,2>{});
    // Output: Mapping: Input dims [0 1 2 3 ] -> Output dims [0 1 2 ]
    
    // Access sequence properties
    using NHWC = S<0,1,2,3>;
    std::cout << "NHWC dimensions: " << NHWC::Size() << std::endl;
    std::cout << "Height dimension index: " << NHWC::At<1>() << std::endl;
    
    return 0;
}

Example 7: Advanced - Tensor Transformation Simulator

#include <iostream>
#include <vector>

template <int... Dims>
struct Sequence {
    static constexpr int size = sizeof...(Dims);
    static constexpr int dims[size] = {Dims...};
};

template <int... Is>
using S = Sequence<Is...>;

// Simplified tensor descriptor
struct TensorDesc {
    std::vector<int> shape;
    std::vector<std::string> dim_names;
    
    TensorDesc(std::initializer_list<int> s, std::initializer_list<std::string> n)
        : shape(s), dim_names(n) {}
    
    void print() const {
        std::cout << "Tensor shape: ";
        for (size_t i = 0; i < shape.size(); ++i) {
            std::cout << dim_names[i] << "=" << shape[i] << " ";
        }
        std::cout << std::endl;
    }
};

// Transform function that uses sequences
template <int... InputIndices, int... OutputIndices>
TensorDesc transform_tensor(const TensorDesc& input,
                           S<InputIndices...> in_mapping,
                           S<OutputIndices...> out_mapping) {
    std::vector<int> new_shape;
    std::vector<std::string> new_names;
    
    // Simple pass-through transformation: only the InputIndices pack is used here;
    // OutputIndices is accepted just to mirror the CK-style call signature.
    constexpr int in_dims[] = {InputIndices...};
    for (int idx : in_dims) {
        new_shape.push_back(input.shape[idx]);
        new_names.push_back(input.dim_names[idx]);
    }
    
    return TensorDesc{new_shape, new_names};
}

int main() {
    // Create NHWC tensor
    TensorDesc input_tensor({32, 224, 224, 3}, {"N", "H", "W", "C"});
    std::cout << "Input ";
    input_tensor.print();
    
    // Transform to NCH (drop W dimension)
    auto output = transform_tensor(input_tensor, 
                                  S<0,3,1>{},    // Take N, C, H
                                  S<0,1,2>{});   // Map to positions 0,1,2
    std::cout << "Output ";
    output.print();
    
    // Another example: reorder to NCHW
    auto reordered = transform_tensor(input_tensor,
                                     S<0,3,1,2>{},  // N,C,H,W order
                                     S<0,1,2,3>{}); // Same positions
    std::cout << "Reordered ";
    reordered.print();
    
    return 0;
}

Key Concepts to Remember:

  1. ... in a template parameter list: Declares a parameter pack that can accept any number of arguments
  2. ... after a pack name: Expands the parameter pack (see the sketch below)
  3. sizeof...(): Gets the number of elements in a pack
  4. Sequence: A compile-time container of values (usually integers)
  5. In CK: Sequences are used to specify which dimensions to transform and how to map them
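
To tie points 1-3 together, here is one more minimal sketch (assuming C++17, like the fold-expression examples above): it declares a pack, expands it in a binary fold to compute a sum, and reports the pack size with sizeof...(args).

#include <iostream>

// Declares a parameter pack (concept 1), expands it with a fold expression
// (concept 2), and queries its length with sizeof... (concept 3).
template <typename... Args>
auto sum_all(Args... args) {
    std::cout << "Summing " << sizeof...(args) << " values: ";
    return (args + ... + 0);  // C++17 binary fold; the trailing 0 handles an empty pack
}

int main() {
    std::cout << sum_all(1, 2, 3) << std::endl;   // Output: Summing 3 values: 6
    std::cout << sum_all(1.5, 2.5) << std::endl;  // Output: Summing 2 values: 4
    std::cout << sum_all() << std::endl;          // Output: Summing 0 values: 0
    return 0;
}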

5/03/2025

Google Coral USB Edge TPU Implementation Guide

1. Installation and Troubleshooting

1.1 Hardware Requirements

  • Google Coral USB Accelerator
  • USB 3.0 port (for best performance)
  • Linux host system (Ubuntu, Debian, etc.)

1.2 Installation Steps

# 1. Add the Coral package repository
echo "deb https://packages.cloud.google.com/apt coral-edgetpu-stable main" | sudo tee /etc/apt/sources.list.d/coral-edgetpu.list

# 2. Add the package key
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -

# 3. Update package lists
sudo apt-get update

# 4. Install the Edge TPU runtime
sudo apt-get install libedgetpu1-std

# 5. Create a conda environment with Python 3.9
conda create -n coral_env python=3.9
conda activate coral_env

# 6. Install necessary packages
pip install numpy pillow opencv-python

# 7. Install PyCoral from Google's repository
pip install --extra-index-url https://google-coral.github.io/py-repo/ pycoral

1.3 Common Issues and Solutions

1.3.1 "Failed to load delegate from libedgetpu.so.1" Error

This common error occurs when the Edge TPU library lacks executable permissions or when the user does not have permission to access the USB device.

Solution:

# Add executable permission to the library
sudo chmod +x /usr/lib/x86_64-linux-gnu/libedgetpu.so.1.0

# Add your user to the plugdev group
sudo usermod -aG plugdev $USER

# Create proper udev rules
sudo bash -c 'cat > /etc/udev/rules.d/99-edgetpu.rules << EOF
SUBSYSTEM=="usb", ATTRS{idVendor}=="1a6e", ATTRS{idProduct}=="089a", MODE="0664", GROUP="plugdev"
EOF'

# Reload udev rules
sudo udevadm control --reload-rules && sudo udevadm trigger

# Unplug and replug your device

1.3.2 Verifying Installation

Create a simple Python script to check if the Edge TPU is detected:

# check_coral.py
from pycoral.utils import edgetpu

print("Testing Edge TPU...")
devices = edgetpu.list_edge_tpus()
print(f"Available Edge TPUs: {devices}")
if devices:
    print("Edge TPU is working correctly!")
else:
    print("No Edge TPU detected.")

Run with:

python check_coral.py

2. Edge TPU Inference with YOLOv10/YOLOv11

2.1 Complete Implementation

import time
import cv2
import numpy as np
import os
import sys
from pycoral.utils import edgetpu
from pycoral.adapters import common


class YOLOv10EdgeTPU:

    def __init__(self, path, conf_thres=0.3, iou_thres=0.5, input_size=(640, 640)):
        self.conf_threshold = conf_thres
        self.iou_threshold = iou_thres
        # Set default input size
        self.default_input_height, self.default_input_width = input_size

        # Initialize model
        self.initialize_model(path)

    def __call__(self, image):
        return self.detect_objects(image)

    def initialize_model(self, path):
        # Load Edge TPU model
        self.interpreter = edgetpu.make_interpreter(path)
        self.interpreter.allocate_tensors()
        
        # Get model info
        self.get_input_details()
        self.get_output_details()

    def detect_objects(self, image):
        input_tensor = self.prepare_input(image)

        # Perform inference on the image
        outputs = self.inference(input_tensor)

        # Process outputs
        self.boxes, self.scores, self.class_ids = self.process_output(outputs)
        
        # Ensure boxes, scores, and class_ids have the same length
        if len(self.boxes) != len(self.scores) or len(self.boxes) != len(self.class_ids):
            return []   

        detections = []
        for i in range(len(self.boxes)):
            detection = {
                'box': self.boxes[i].tolist(),  # Convert numpy array to Python list
                'score': self.scores[i],
                'class_id': self.class_ids[i]
            }
            detections.append(detection)

        return detections

    def prepare_input(self, image):
        self.img_height, self.img_width = image.shape[:2]

        # Convert to RGB (OpenCV loads as BGR)
        input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Use the model's input dimensions
        input_width = self.input_width
        input_height = self.input_height

        # Resize input image
        input_img = cv2.resize(input_img, (input_width, input_height))

        # Scale input pixel values to 0 to 1
        input_img = input_img.astype(np.float32) / 255.0
        
        # Add batch dimension
        input_tensor = np.expand_dims(input_img, axis=0)

        return input_tensor

    def inference(self, input_tensor):
        start = time.perf_counter()
        
        # Set input tensor
        common.set_input(self.interpreter, input_tensor)
        
        # Run inference
        self.interpreter.invoke()
        
        # Get all output tensors
        outputs = []
        for i in range(len(self.output_details)):
            outputs.append(self.interpreter.get_tensor(self.output_details[i]['index']))
            
        infer_time = (time.perf_counter() - start) * 1000
        print(f"Inference time: {infer_time:.2f} ms")
        return outputs

    def process_output(self, outputs):
        """Process output for YOLOv10/YOLOv11 TFLite model with shape [1, 5, 8400]"""
        # Get the first output tensor
        predictions = outputs[0]  # Shape [1, 5, 8400]
        print(f"Output 0 shape: {predictions.shape}")
        
        # Transpose the predictions from [1, 5, 8400] to [8400, 5]
        predictions = predictions[0].T  # Now shape is [8400, 5]
        print(f"Transposed predictions shape: {predictions.shape}")
        
        # Extract confidence scores
        objectness_scores = predictions[:, 4]  # Last column is confidence
        
        # Filter by confidence threshold
        valid_indices = np.where(objectness_scores > self.conf_threshold)[0]
        if len(valid_indices) == 0:
            return [], [], []
        
        valid_boxes = predictions[valid_indices, :4]  # First 4 values are bbox coords
        valid_scores = predictions[valid_indices, 4]  # 5th value is confidence
        
        # For single-class model, all class IDs are 0
        valid_class_ids = np.zeros(len(valid_scores), dtype=np.int32)
        
        # Convert normalized coordinates to pixel coordinates
        # For YOLO, the output is in normalized format (0-1)
        valid_boxes[:, 0] *= self.img_width   # x
        valid_boxes[:, 1] *= self.img_height  # y
        valid_boxes[:, 2] *= self.img_width   # width
        valid_boxes[:, 3] *= self.img_height  # height
        
        # Convert from [cx, cy, width, height] to [x1, y1, x2, y2]
        boxes_xyxy = self.cxcywh_to_xyxy(valid_boxes)
        
        # Apply non-maximum suppression
        indices = self.nms(boxes_xyxy, valid_scores)
        
        print(f"After NMS: {len(indices)} detections")
        return boxes_xyxy[indices], valid_scores[indices], valid_class_ids[indices]
    
    def cxcywh_to_xyxy(self, boxes):
        """Convert boxes from center_x, center_y, width, height to x1, y1, x2, y2 format"""
        xyxy = np.zeros_like(boxes)
        xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2  # x1 = cx - w/2
        xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2  # y1 = cy - h/2
        xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2  # x2 = cx + w/2
        xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2  # y2 = cy + h/2
        return xyxy
    
    def nms(self, boxes, scores):
        """Apply non-maximum suppression"""
        # Get indices of boxes sorted by scores in descending order
        indices = np.argsort(scores)[::-1]
        
        keep = []
        while indices.size > 0:
            # Pick the box with highest score
            current = indices[0]
            keep.append(current)
            
            # If only one box left, break
            if indices.size == 1:
                break
                
            # Compute IoU of the picked box with the rest
            ious = self.box_iou(boxes[current:current+1], boxes[indices[1:]])
            
            # Remove boxes with IoU over threshold
            mask = ious < self.iou_threshold
            indices = indices[1:][mask]
            
        return keep
    
    def box_iou(self, box1, box2):
        """
        Calculate IoU between box1 and box2
        box1, box2: [x1, y1, x2, y2]
        """
        # Calculate intersection area
        x1 = np.maximum(box1[0, 0], box2[:, 0])
        y1 = np.maximum(box1[0, 1], box2[:, 1])
        x2 = np.minimum(box1[0, 2], box2[:, 2])
        y2 = np.minimum(box1[0, 3], box2[:, 3])
        
        intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
        
        # Calculate union area
        box1_area = (box1[0, 2] - box1[0, 0]) * (box1[0, 3] - box1[0, 1])
        box2_area = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
        union = box1_area + box2_area - intersection
        
        # Calculate IoU
        iou = intersection / union
        
        return iou

    def draw_detections(self, image, draw_scores=True, mask_alpha=0.4):
        """Draw detection boxes on image with scores if requested"""
        img_copy = image.copy()
        
        for i, box in enumerate(self.boxes):
            x1, y1, x2, y2 = [int(val) for val in box]
            
            # Draw box with thicker lines for visibility
            cv2.rectangle(img_copy, (x1, y1), (x2, y2), (0, 255, 0), 3)
            
            # Draw score if requested
            if draw_scores and i < len(self.scores):
                score = self.scores[i]
                class_id = self.class_ids[i] if i < len(self.class_ids) else 0
                
                # Single-class model, so every detection is labeled as a license plate
                label = f"License Plate: {score:.2f}"
                
                # Calculate text size and position
                text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)[0]
                
                # Draw text background
                cv2.rectangle(img_copy, (x1, y1 - text_size[1] - 5), (x1 + text_size[0], y1), (0, 255, 0), -1)
                
                # Draw text
                cv2.putText(img_copy, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 2)
        
        return img_copy

    def get_input_details(self):
        input_details = self.interpreter.get_input_details()
        self.input_details = input_details
        
        # Print input details for debugging
        print(f"Input details: {input_details}")
        
        # Get input shape
        self.input_shape = input_details[0]['shape']
        print(f"Input shape: {self.input_shape}")
        
        # Get input dimensions
        if len(self.input_shape) == 4:
            _, self.input_height, self.input_width, _ = self.input_shape
        else:
            self.input_height = self.default_input_height
            self.input_width = self.default_input_width
            
        print(f"Input dimensions: {self.input_width}x{self.input_height}")

    def get_output_details(self):
        self.output_details = self.interpreter.get_output_details()
        print(f"Output details: {self.output_details}")


def run_benchmark(detector, img, num_runs=10):
    """Run inference multiple times to benchmark performance"""
    print(f"\nRunning benchmark with {num_runs} iterations...")
    times = []
    
    # Prepare input once
    input_tensor = detector.prepare_input(img)
    common.set_input(detector.interpreter, input_tensor)
    
    for i in range(num_runs):
        start = time.perf_counter()
        detector.interpreter.invoke()
        inference_time = (time.perf_counter() - start) * 1000
        times.append(inference_time)
        print(f"Run {i+1}: {inference_time:.2f} ms")
    
    avg_time = sum(times) / len(times)
    print(f"\nAverage inference time: {avg_time:.2f} ms")
    print(f"Min: {min(times):.2f} ms, Max: {max(times):.2f} ms")
    print(f"FPS: {1000/avg_time:.2f}")
    
    return avg_time


if __name__ == '__main__':
    # Get model path from command line or use default
    model_path = "float32_edgetpu.tflite"
    img_path = "cs1.png"
    
    # Parse command line arguments
    if len(sys.argv) > 1:
        for i, arg in enumerate(sys.argv):
            if arg == "--model" and i+1 < len(sys.argv):
                model_path = sys.argv[i+1]
            elif arg == "--image" and i+1 < len(sys.argv):
                img_path = sys.argv[i+1]
    
    # Check if files exist
    if not os.path.exists(model_path):
        print(f"Error: Model file not found at {model_path}")
        sys.exit(1)
        
    if not os.path.exists(img_path):
        print(f"Error: Image file not found at {img_path}")
        sys.exit(1)

    # Initialize YOLOv10 EdgeTPU detector
    detector = YOLOv10EdgeTPU(model_path, conf_thres=0.3, iou_thres=0.5, input_size=(640, 640))

    # Load image
    img = cv2.imread(img_path)
    if img is None:
        print(f"Failed to load image from path: {img_path}")
        sys.exit(1)
    print(f"Using image: {img_path}")
    print(f"Image shape: {img.shape}")

    # Detect Objects
    detections = detector(img)
    print(f"Found {len(detections)} detections")
    
    # Log all detections
    for i, det in enumerate(detections):
        print(f"Detection {i+1}: box={det['box']}, score={det['score']:.4f}, class={det['class_id']}")
    
    # Draw detections
    output_img = detector.draw_detections(img, draw_scores=True)
    output_path = "edgetpu_detections.jpg"
    cv2.imwrite(output_path, output_img)
    print(f"Output saved to {output_path}")
    
    # Run benchmark if requested
    if "--benchmark" in sys.argv:
        run_benchmark(detector, img, num_runs=10)

2.2 Usage

# Basic inference
python edge_tpu_yolov10.py

# Specify custom model and image
python edge_tpu_yolov10.py --model /path/to/model_edgetpu.tflite --image /path/to/image.jpg

# Run benchmark
python edge_tpu_yolov10.py --benchmark

3. Comparison: Edge TPU vs GPU

The Edge TPU provides hardware acceleration for TensorFlow Lite models specifically compiled for it. Here's how it compares to GPU inference:

  • Edge TPU: ~150 ms inference time (drops on repeated runs, as the benchmark shows)
  • GPU (NVIDIA): Can be much faster, depending on the GPU model
  • Benefits of Edge TPU: Low power consumption, no need for a powerful GPU, portable

4. Tips and Best Practices

  1. Model Compilation:
    • Use the Edge TPU Compiler (edgetpu_compiler) to convert TFLite models for the Edge TPU; a quantization-and-compile sketch follows this list
    • INT8 quantized models work best with the Edge TPU
  2. Performance Optimization:
    • Use USB 3.0 ports for faster data transfer
    • Consider using the high-performance version (libedgetpu1-max) if you need more speed and can manage the heat
  3. Troubleshooting:
    • Always check permissions (chmod +x on the library file)
    • Make sure your user is in the plugdev group
    • Check the udev rules
  4. Hardware Limitations:
    • The Edge TPU has limited on-chip memory (about 8 MB for caching model parameters, shared between co-compiled models)
    • Only a subset of operations is supported, and they must be INT8 quantized
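
As a reference for points 1 and 4 above, here is a minimal sketch of post-training INT8 quantization with the TensorFlow Lite converter, before the model is handed to the Edge TPU Compiler. The SavedModel path, input size, and random representative dataset are placeholders; substitute your own model and real preprocessed images.

import numpy as np
import tensorflow as tf

# Placeholder path: the SavedModel you want to run on the Edge TPU
converter = tf.lite.TFLiteConverter.from_saved_model("yolo_saved_model")

# The representative dataset drives INT8 calibration.
# Random data is used here only as a placeholder; yield real preprocessed images in practice.
def representative_dataset():
    for _ in range(100):
        yield [np.random.rand(1, 640, 640, 3).astype(np.float32)]

converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.uint8   # integer input/output for the Edge TPU
converter.inference_output_type = tf.uint8

tflite_model = converter.convert()
with open("model_int8.tflite", "wb") as f:
    f.write(tflite_model)

# Then compile for the Edge TPU (shell command, not Python):
#   edgetpu_compiler model_int8.tflite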

This implementation provides a solid foundation for running YOLO object detection on the Google Coral Edge TPU with proper pre-processing and post-processing for optimal results.