Google Coral USB Edge TPU Implementation Guide
1. Installation and Troubleshooting
1.1 Hardware Requirements
- Google Coral USB Accelerator
- USB 3.0 port (for best performance)
- Linux host system (Ubuntu, Debian, etc.)
1.2 Installation Steps
# 1. Add the Coral package repository
echo "deb https://packages.cloud.google.com/apt coral-edgetpu-stable main" | sudo tee /etc/apt/sources.list.d/coral-edgetpu.list

# 2. Add the package key
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -

# 3. Update package lists
sudo apt-get update

# 4. Install the Edge TPU runtime
sudo apt-get install libedgetpu1-std

# 5. Create a conda environment with Python 3.9
conda create -n coral_env python=3.9
conda activate coral_env

# 6. Install necessary packages
pip install numpy pillow opencv-python

# 7. Install PyCoral from Google's repository
pip install --extra-index-url https://google-coral.github.io/py-repo/ pycoral
1.3 Common Issues and Solutions
1.3.1 "Failed to load delegate from libedgetpu.so.1" Error
This common error occurs when the Edge TPU shared library lacks executable permissions, or when your user does not have permission to access the USB device.
Solution:
# Add executable permission to the library
sudo chmod +x /usr/lib/x86_64-linux-gnu/libedgetpu.so.1.0

# Add your user to the plugdev group
sudo usermod -aG plugdev $USER

# Create proper udev rules
sudo bash -c 'cat > /etc/udev/rules.d/99-edgetpu.rules << EOF
SUBSYSTEM=="usb", ATTRS{idVendor}=="1a6e", ATTRS{idProduct}=="089a", MODE="0664", GROUP="plugdev"
EOF'

# Reload udev rules
sudo udevadm control --reload-rules && sudo udevadm trigger

# Unplug and replug your device
1.3.2 Verifying Installation
Create a simple Python script to check if the Edge TPU is detected:
# check_coral.py
from pycoral.utils import edgetpu

print("Testing Edge TPU...")
devices = edgetpu.list_edge_tpus()
print(f"Available Edge TPUs: {devices}")

if devices:
    print("Edge TPU is working correctly!")
else:
    print("No Edge TPU detected.")
Run with:
python check_coral.py
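Before (or in addition to) the Python check, you can confirm that the accelerator enumerates on the USB bus. A minimal sketch is below; the 1a6e:089a IDs come from the udev rule in section 1.3.1, and the note about the device re-enumerating under a Google ID after initialization is based on typical Coral behavior rather than anything specific to this setup.

# Quick sanity check: the accelerator should appear on the USB bus.
# Before the runtime first initializes it, it usually enumerates as
# "Global Unichip Corp." (1a6e:089a, the IDs from the udev rule above);
# after initialization it typically re-enumerates as a Google device (18d1:9302).
lsusb | grep -iE "1a6e|18d1|Global Unichip|Google"

# Confirm the runtime library is present and executable
ls -l /usr/lib/x86_64-linux-gnu/libedgetpu.so.1*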
2. Edge TPU Inference with YOLOv10/YOLOv11
2.1 Complete Implementation
import time
import cv2
import numpy as np
import os
import sys
from PIL import Image
from pycoral.utils import edgetpu
from pycoral.adapters import common


class YOLOv10EdgeTPU:
    def __init__(self, path, conf_thres=0.3, iou_thres=0.5, input_size=(640, 640)):
        self.conf_threshold = conf_thres
        self.iou_threshold = iou_thres

        # Set default input size
        self.default_input_height, self.default_input_width = input_size

        # Initialize model
        self.initialize_model(path)

    def __call__(self, image):
        return self.detect_objects(image)

    def initialize_model(self, path):
        # Load Edge TPU model
        self.interpreter = edgetpu.make_interpreter(path)
        self.interpreter.allocate_tensors()

        # Get model info
        self.get_input_details()
        self.get_output_details()

    def detect_objects(self, image):
        input_tensor = self.prepare_input(image)

        # Perform inference on the image
        outputs = self.inference(input_tensor)

        # Process outputs
        self.boxes, self.scores, self.class_ids = self.process_output(outputs)

        # Ensure boxes, scores, and class_ids have the same length
        if len(self.boxes) != len(self.scores) or len(self.boxes) != len(self.class_ids):
            return []

        detections = []
        for i in range(len(self.boxes)):
            detection = {
                'box': self.boxes[i].tolist(),  # Convert numpy array to Python list
                'score': self.scores[i],
                'class_id': self.class_ids[i]
            }
            detections.append(detection)

        return detections

    def prepare_input(self, image):
        self.img_height, self.img_width = image.shape[:2]

        # Convert to RGB (OpenCV loads as BGR)
        input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Use the model's input dimensions
        input_width = self.input_width
        input_height = self.input_height

        # Resize input image
        input_img = cv2.resize(input_img, (input_width, input_height))

        # Scale input pixel values to 0 to 1
        input_img = input_img.astype(np.float32) / 255.0

        # Add batch dimension
        input_tensor = np.expand_dims(input_img, axis=0)

        return input_tensor

    def inference(self, input_tensor):
        start = time.perf_counter()

        # Set input tensor
        common.set_input(self.interpreter, input_tensor)

        # Run inference
        self.interpreter.invoke()

        # Get all output tensors
        outputs = []
        for i in range(len(self.output_details)):
            outputs.append(self.interpreter.get_tensor(self.output_details[i]['index']))

        infer_time = (time.perf_counter() - start) * 1000
        print(f"Inference time: {infer_time:.2f} ms")

        return outputs

    def process_output(self, outputs):
        """Process output for YOLOv10/YOLOv11 TFLite model with shape [1, 5, 8400]"""
        # Get the first output tensor
        predictions = outputs[0]  # Shape [1, 5, 8400]
        print(f"Output 0 shape: {predictions.shape}")

        # Transpose the predictions from [1, 5, 8400] to [8400, 5]
        predictions = predictions[0].T  # Now shape is [8400, 5]
        print(f"Transposed predictions shape: {predictions.shape}")

        # Extract confidence scores
        objectness_scores = predictions[:, 4]  # Last column is confidence

        # Filter by confidence threshold
        valid_indices = np.where(objectness_scores > self.conf_threshold)[0]
        if len(valid_indices) == 0:
            return [], [], []

        valid_boxes = predictions[valid_indices, :4]   # First 4 values are bbox coords
        valid_scores = predictions[valid_indices, 4]   # 5th value is confidence

        # For single-class model, all class IDs are 0
        valid_class_ids = np.zeros(len(valid_scores), dtype=np.int32)

        # Convert normalized coordinates to pixel coordinates
        # For YOLO, the output is in normalized format (0-1)
        valid_boxes[:, 0] *= self.img_width    # x
        valid_boxes[:, 1] *= self.img_height   # y
        valid_boxes[:, 2] *= self.img_width    # width
        valid_boxes[:, 3] *= self.img_height   # height

        # Convert from [cx, cy, width, height] to [x1, y1, x2, y2]
        boxes_xyxy = self.cxcywh_to_xyxy(valid_boxes)

        # Apply non-maximum suppression
        indices = self.nms(boxes_xyxy, valid_scores)
        print(f"After NMS: {len(indices)} detections")

        return boxes_xyxy[indices], valid_scores[indices], valid_class_ids[indices]

    def cxcywh_to_xyxy(self, boxes):
        """Convert boxes from center_x, center_y, width, height to x1, y1, x2, y2 format"""
        xyxy = np.zeros_like(boxes)
        xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2  # x1 = cx - w/2
        xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2  # y1 = cy - h/2
        xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2  # x2 = cx + w/2
        xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2  # y2 = cy + h/2
        return xyxy

    def nms(self, boxes, scores):
        """Apply non-maximum suppression"""
        # Get indices of boxes sorted by scores in descending order
        indices = np.argsort(scores)[::-1]

        keep = []
        while indices.size > 0:
            # Pick the box with highest score
            current = indices[0]
            keep.append(current)

            # If only one box left, break
            if indices.size == 1:
                break

            # Compute IoU of the picked box with the rest
            ious = self.box_iou(boxes[current:current+1], boxes[indices[1:]])

            # Remove boxes with IoU over threshold
            mask = ious < self.iou_threshold
            indices = indices[1:][mask]

        return keep

    def box_iou(self, box1, box2):
        """
        Calculate IoU between box1 and box2
        box1, box2: [x1, y1, x2, y2]
        """
        # Calculate intersection area
        x1 = np.maximum(box1[0, 0], box2[:, 0])
        y1 = np.maximum(box1[0, 1], box2[:, 1])
        x2 = np.minimum(box1[0, 2], box2[:, 2])
        y2 = np.minimum(box1[0, 3], box2[:, 3])

        intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)

        # Calculate union area
        box1_area = (box1[0, 2] - box1[0, 0]) * (box1[0, 3] - box1[0, 1])
        box2_area = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
        union = box1_area + box2_area - intersection

        # Calculate IoU
        iou = intersection / union
        return iou

    def draw_detections(self, image, draw_scores=True, mask_alpha=0.4):
        """Draw detection boxes on image with scores if requested"""
        img_copy = image.copy()

        for i, box in enumerate(self.boxes):
            x1, y1, x2, y2 = [int(val) for val in box]

            # Draw box with thicker lines for visibility
            cv2.rectangle(img_copy, (x1, y1), (x2, y2), (0, 255, 0), 3)

            # Draw score if requested
            if draw_scores and i < len(self.scores):
                score = self.scores[i]
                class_id = self.class_ids[i] if i < len(self.class_ids) else 0

                # For your license plate detector
                label = f"License Plate: {score:.2f}"

                # Calculate text size and position
                text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)[0]

                # Draw text background
                cv2.rectangle(img_copy, (x1, y1 - text_size[1] - 5), (x1 + text_size[0], y1), (0, 255, 0), -1)

                # Draw text
                cv2.putText(img_copy, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 2)

        return img_copy

    def get_input_details(self):
        input_details = self.interpreter.get_input_details()
        self.input_details = input_details

        # Print input details for debugging
        print(f"Input details: {input_details}")

        # Get input shape
        self.input_shape = input_details[0]['shape']
        print(f"Input shape: {self.input_shape}")

        # Get input dimensions
        if len(self.input_shape) == 4:
            _, self.input_height, self.input_width, _ = self.input_shape
        else:
            self.input_height = self.default_input_height
            self.input_width = self.default_input_width

        print(f"Input dimensions: {self.input_width}x{self.input_height}")

    def get_output_details(self):
        self.output_details = self.interpreter.get_output_details()
        print(f"Output details: {self.output_details}")


def run_benchmark(detector, img, num_runs=10):
    """Run inference multiple times to benchmark performance"""
    print(f"\nRunning benchmark with {num_runs} iterations...")
    times = []

    # Prepare input once
    input_tensor = detector.prepare_input(img)
    common.set_input(detector.interpreter, input_tensor)

    for i in range(num_runs):
        start = time.perf_counter()
        detector.interpreter.invoke()
        inference_time = (time.perf_counter() - start) * 1000
        times.append(inference_time)
        print(f"Run {i+1}: {inference_time:.2f} ms")

    avg_time = sum(times) / len(times)
    print(f"\nAverage inference time: {avg_time:.2f} ms")
    print(f"Min: {min(times):.2f} ms, Max: {max(times):.2f} ms")
    print(f"FPS: {1000/avg_time:.2f}")

    return avg_time


if __name__ == '__main__':
    # Get model path from command line or use default
    model_path = "float32_edgetpu.tflite"
    img_path = "cs1.png"

    # Parse command line arguments
    if len(sys.argv) > 1:
        for i, arg in enumerate(sys.argv):
            if arg == "--model" and i+1 < len(sys.argv):
                model_path = sys.argv[i+1]
            elif arg == "--image" and i+1 < len(sys.argv):
                img_path = sys.argv[i+1]

    # Check if files exist
    if not os.path.exists(model_path):
        print(f"Error: Model file not found at {model_path}")
        sys.exit(1)
    if not os.path.exists(img_path):
        print(f"Error: Image file not found at {img_path}")
        sys.exit(1)

    # Initialize YOLOv10 EdgeTPU detector
    detector = YOLOv10EdgeTPU(model_path, conf_thres=0.3, iou_thres=0.5, input_size=(640, 640))

    # Load image
    img = cv2.imread(img_path)
    if img is None:
        print(f"Failed to load image from path: {img_path}")
        sys.exit(1)

    print(f"Using image: {img_path}")
    print(f"Image shape: {img.shape}")

    # Detect Objects
    detections = detector(img)
    print(f"Found {len(detections)} detections")

    # Log all detections
    for i, det in enumerate(detections):
        print(f"Detection {i+1}: box={det['box']}, score={det['score']:.4f}, class={det['class_id']}")

    # Draw detections
    output_img = detector.draw_detections(img, draw_scores=True)
    output_path = "edgetpu_detections.jpg"
    cv2.imwrite(output_path, output_img)
    print(f"Output saved to {output_path}")

    # Run benchmark if requested
    if "--benchmark" in sys.argv:
        run_benchmark(detector, img, num_runs=10)
2.2 Usage
# Basic inference
python edge_tpu_yolov10.py

# Specify custom model and image
python edge_tpu_yolov10.py --model /path/to/model_edgetpu.tflite --image /path/to/image.jpg

# Run benchmark
python edge_tpu_yolov10.py --benchmark
3. Comparison: Edge TPU vs GPU
The Edge TPU provides hardware acceleration for TensorFlow Lite models specifically compiled for it. Here's how it compares to GPU inference:
- Edge TPU: ~150 ms for a single end-to-end inference with this 640x640 model; the benchmark loop, which sets the input tensor once and only times interpreter.invoke(), reports lower numbers
- GPU (NVIDIA): Can be much faster, depending on the GPU model
- Benefits of Edge TPU: Low power consumption, no need for a powerful GPU, portable
4. Tips and Best Practices
- Model Compilation:
  - Use the Edge TPU Compiler (edgetpu_compiler) to convert TFLite models for the Edge TPU (a compile sketch follows this list)
  - Fully INT8-quantized models work best with the Edge TPU
- Performance Optimization:
  - Use a USB 3.0 port for faster data transfer
  - Consider the high-performance runtime (libedgetpu1-max) if you need more speed and can manage the extra heat (see the runtime-switch sketch after this list)
- Troubleshooting:
  - Always check permissions (chmod +x on the library file)
  - Make sure your user is in the plugdev group
  - Check the udev rules
- Hardware Limitations:
  - The Edge TPU has limited on-device memory (roughly 8 MB of SRAM for caching model parameters, shared between all models)
  - Not all operations are supported; only INT8-quantized operations can run on the TPU, and anything else falls back to the CPU
This implementation provides a solid foundation for running YOLO object detection on the Google Coral Edge TPU with proper pre-processing and post-processing for optimal results.