# Production Vision Systems

Comprehensive guide to deploying computer vision models in production environments.

## Table of Contents

- [Model Export and Optimization](#model-export-and-optimization)
- [TensorRT Deployment](#tensorrt-deployment)
- [ONNX Runtime Deployment](#onnx-runtime-deployment)
- [Edge Device Deployment](#edge-device-deployment)
- [Model Serving](#model-serving)
- [Video Processing Pipelines](#video-processing-pipelines)
- [Monitoring and Observability](#monitoring-and-observability)
- [Scaling and Performance](#scaling-and-performance)

---

## Model Export and Optimization

### PyTorch to ONNX Export

Basic export:

```python
import torch
import torch.onnx

def export_to_onnx(model, input_shape, output_path, dynamic_batch=True):
    """
    Export PyTorch model to ONNX format.

    Args:
        model: PyTorch model
        input_shape: (C, H, W) input dimensions
        output_path: Path to save .onnx file
        dynamic_batch: Allow variable batch sizes
    """
    model.eval()

    # Create dummy input
    dummy_input = torch.randn(1, *input_shape)

    # Dynamic axes for variable batch size
    dynamic_axes = None
    if dynamic_batch:
        dynamic_axes = {
            'input': {0: 'batch_size'},
            'output': {0: 'batch_size'}
        }

    # Export
    torch.onnx.export(
        model,
        dummy_input,
        output_path,
        export_params=True,
        opset_version=17,
        do_constant_folding=True,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes=dynamic_axes
    )

    print(f"Exported to {output_path}")
    return output_path
```

### ONNX Model Optimization

Simplify and optimize ONNX graph:

```python
import os

import onnx
from onnxsim import simplify

def optimize_onnx(input_path, output_path):
    """
    Simplify ONNX model for faster inference.
    """
    # Load model
    model = onnx.load(input_path)

    # Check validity
    onnx.checker.check_model(model)

    # Simplify
    model_simplified, check = simplify(model)

    if check:
        onnx.save(model_simplified, output_path)
        print(f"Simplified model saved to {output_path}")

        # Print size reduction
        original_size = os.path.getsize(input_path) / 1024 / 1024
        simplified_size = os.path.getsize(output_path) / 1024 / 1024
        print(f"Size: {original_size:.2f}MB -> {simplified_size:.2f}MB")
    else:
        print("Simplification failed, saving original")
        onnx.save(model, output_path)

    return output_path
```

### Model Size Analysis

```python
import onnx

def analyze_model(model_path):
    """
    Analyze ONNX model structure and size.
    """
    model = onnx.load(model_path)

    # Count parameters
    total_params = 0
    param_sizes = {}

    for initializer in model.graph.initializer:
        param_count = 1
        for dim in initializer.dims:
            param_count *= dim
        total_params += param_count
        param_sizes[initializer.name] = param_count

    # Print summary
    print(f"Total parameters: {total_params:,}")
    print(f"Model size: {total_params * 4 / 1024 / 1024:.2f} MB (FP32)")
    print(f"Model size: {total_params * 2 / 1024 / 1024:.2f} MB (FP16)")
    print(f"Model size: {total_params / 1024 / 1024:.2f} MB (INT8)")

    # Top 10 largest layers
    print("\nLargest layers:")
    sorted_params = sorted(param_sizes.items(), key=lambda x: x[1], reverse=True)
    for name, size in sorted_params[:10]:
        print(f"  {name}: {size:,} params")

    return total_params
```
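Putting the three helpers together — a minimal sketch, assuming torchvision is installed; the model choice and file names are illustrative placeholders:

```python
import torchvision

# Export a stock classifier, simplify the graph, and inspect the result.
model = torchvision.models.resnet50(weights="IMAGENET1K_V2")

raw_path = export_to_onnx(model, input_shape=(3, 224, 224), output_path="resnet50.onnx")
opt_path = optimize_onnx(raw_path, "resnet50_opt.onnx")
analyze_model(opt_path)
```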
---

## TensorRT Deployment

### TensorRT Engine Build

```python
import tensorrt as trt

def build_tensorrt_engine(onnx_path, engine_path, precision='fp16',
                          max_batch_size=8, workspace_gb=4):
    """
    Build TensorRT engine from ONNX model.

    Args:
        onnx_path: Path to ONNX model
        engine_path: Path to save TensorRT engine
        precision: 'fp32', 'fp16', or 'int8'
        max_batch_size: Maximum batch size
        workspace_gb: GPU memory workspace in GB
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Parse ONNX
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            raise RuntimeError("ONNX parsing failed")

    # Configure builder
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE,
                                 workspace_gb * 1024 * 1024 * 1024)

    # Set precision
    if precision == 'fp16':
        config.set_flag(trt.BuilderFlag.FP16)
    elif precision == 'int8':
        config.set_flag(trt.BuilderFlag.INT8)
        # Requires calibrator for INT8

    # Set optimization profile for dynamic shapes
    profile = builder.create_optimization_profile()
    input_name = network.get_input(0).name
    input_shape = network.get_input(0).shape

    # Min, optimal, max batch sizes
    min_shape = (1,) + tuple(input_shape[1:])
    opt_shape = (max_batch_size // 2,) + tuple(input_shape[1:])
    max_shape = (max_batch_size,) + tuple(input_shape[1:])
    profile.set_shape(input_name, min_shape, opt_shape, max_shape)
    config.add_optimization_profile(profile)

    # Build engine
    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        raise RuntimeError("Engine build failed")

    # Save engine
    with open(engine_path, 'wb') as f:
        f.write(serialized_engine)

    print(f"TensorRT engine saved to {engine_path}")
    return engine_path
```
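A typical call for the FP16 path — a sketch with placeholder file names; the same conversion can also be done from the command line with NVIDIA's `trtexec` tool (e.g. `trtexec --onnx=model.onnx --saveEngine=model.engine --fp16`):

```python
# Placeholder paths; the ONNX file is assumed to come from the export section above.
engine_path = build_tensorrt_engine(
    onnx_path="resnet50_opt.onnx",
    engine_path="resnet50_fp16.engine",
    precision="fp16",
    max_batch_size=8,
    workspace_gb=4,
)
```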
### TensorRT Inference

```python
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # initializes the CUDA context
import tensorrt as trt

class TensorRTInference:
    def __init__(self, engine_path):
        """
        Load TensorRT engine and prepare for inference.
        """
        self.logger = trt.Logger(trt.Logger.WARNING)

        # Load engine
        with open(engine_path, 'rb') as f:
            engine_data = f.read()

        runtime = trt.Runtime(self.logger)
        self.engine = runtime.deserialize_cuda_engine(engine_data)
        self.context = self.engine.create_execution_context()

        # Allocate buffers
        self.inputs = []
        self.outputs = []
        self.bindings = []
        self.stream = cuda.Stream()

        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            shape = self.engine.get_tensor_shape(name)
            size = trt.volume(shape)

            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))

            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                self.inputs.append({'host': host_mem, 'device': device_mem,
                                    'shape': shape, 'name': name})
            else:
                self.outputs.append({'host': host_mem, 'device': device_mem,
                                     'shape': shape, 'name': name})

    def infer(self, input_data):
        """
        Run inference on input data.

        Args:
            input_data: numpy array (batch, C, H, W)

        Returns:
            Output numpy array
        """
        # Copy input to host buffer
        np.copyto(self.inputs[0]['host'], input_data.ravel())

        # Transfer input to device
        cuda.memcpy_htod_async(
            self.inputs[0]['device'], self.inputs[0]['host'], self.stream
        )

        # Run inference (execute_async_v2 targets the TensorRT 8.x bindings API)
        self.context.execute_async_v2(
            bindings=self.bindings, stream_handle=self.stream.handle
        )

        # Transfer output from device
        cuda.memcpy_dtoh_async(
            self.outputs[0]['host'], self.outputs[0]['device'], self.stream
        )

        # Synchronize
        self.stream.synchronize()

        # Reshape output
        output = self.outputs[0]['host'].reshape(self.outputs[0]['shape'])
        return output
```

### INT8 Calibration

```python
import os

import numpy as np
import pycuda.driver as cuda
import tensorrt as trt

class Int8Calibrator(trt.IInt8EntropyCalibrator2):
    def __init__(self, calibration_data, cache_file, batch_size=8):
        """
        INT8 calibrator for TensorRT.

        Args:
            calibration_data: List of numpy arrays
            cache_file: Path to save calibration cache
            batch_size: Calibration batch size
        """
        super().__init__()
        self.calibration_data = calibration_data
        self.cache_file = cache_file
        self.batch_size = batch_size
        self.current_index = 0

        # Allocate device buffer
        self.device_input = cuda.mem_alloc(
            calibration_data[0].nbytes * batch_size
        )

    def get_batch_size(self):
        return self.batch_size

    def get_batch(self, names):
        if self.current_index + self.batch_size > len(self.calibration_data):
            return None

        # Get batch
        batch = self.calibration_data[
            self.current_index:self.current_index + self.batch_size
        ]
        batch = np.stack(batch, axis=0)

        # Copy to device
        cuda.memcpy_htod(self.device_input, batch)
        self.current_index += self.batch_size

        return [int(self.device_input)]

    def read_calibration_cache(self):
        if os.path.exists(self.cache_file):
            with open(self.cache_file, 'rb') as f:
                return f.read()
        return None

    def write_calibration_cache(self, cache):
        with open(self.cache_file, 'wb') as f:
            f.write(cache)
```
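`build_tensorrt_engine` above sets the INT8 flag but never attaches a calibrator. A hedged sketch of the missing wiring, assuming calibration images have already been preprocessed and saved as `.npy` arrays (the path is illustrative):

```python
import glob

import numpy as np

# A few hundred representative, preprocessed samples are usually enough.
calibration_data = [np.load(p).astype(np.float32)
                    for p in sorted(glob.glob("calib/*.npy"))[:512]]
calibrator = Int8Calibrator(calibration_data, cache_file="int8_calib.cache", batch_size=8)

# Inside build_tensorrt_engine, right after config.set_flag(trt.BuilderFlag.INT8):
#     config.int8_calibrator = calibrator
```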
""" import time dummy_input = np.random.randn(*input_shape).astype(np.float32) # Warmup for _ in range(warmup): self.infer(dummy_input) # Benchmark start = time.perf_counter() for _ in range(num_iterations): self.infer(dummy_input) end = time.perf_counter() avg_time = (end - start) / num_iterations * 1000 fps = 1000 / avg_time * input_shape[0] print(f"Average latency: {avg_time:.2f}ms") print(f"Throughput: {fps:.1f} images/sec") return avg_time, fps ``` --- ## Edge Device Deployment ### NVIDIA Jetson Optimization ```python def optimize_for_jetson(model_path, output_path, jetson_model='orin'): """ Optimize model for NVIDIA Jetson deployment. Args: model_path: Path to ONNX model output_path: Path to save optimized engine jetson_model: 'nano', 'xavier', 'orin' """ # Jetson-specific configurations configs = { 'nano': {'precision': 'fp16', 'workspace': 1, 'dla': False}, 'xavier': {'precision': 'fp16', 'workspace': 2, 'dla': True}, 'orin': {'precision': 'int8', 'workspace': 4, 'dla': True}, } config = configs[jetson_model] # Build engine with Jetson-optimized settings logger = trt.Logger(trt.Logger.WARNING) builder = trt.Builder(logger) network = builder.create_network( 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) ) parser = trt.OnnxParser(network, logger) with open(model_path, 'rb') as f: parser.parse(f.read()) builder_config = builder.create_builder_config() builder_config.set_memory_pool_limit( trt.MemoryPoolType.WORKSPACE, config['workspace'] * 1024 * 1024 * 1024 ) if config['precision'] == 'fp16': builder_config.set_flag(trt.BuilderFlag.FP16) elif config['precision'] == 'int8': builder_config.set_flag(trt.BuilderFlag.INT8) # Enable DLA if supported if config['dla'] and builder.num_DLA_cores > 0: builder_config.default_device_type = trt.DeviceType.DLA builder_config.DLA_core = 0 builder_config.set_flag(trt.BuilderFlag.GPU_FALLBACK) # Build and save serialized = builder.build_serialized_network(network, builder_config) with open(output_path, 'wb') as f: f.write(serialized) print(f"Jetson-optimized engine saved to {output_path}") ``` ### OpenVINO for Intel Devices ```python from openvino.runtime import Core class OpenVINOInference: def __init__(self, model_path, device='CPU'): """ Initialize OpenVINO inference. Args: model_path: Path to ONNX or OpenVINO IR model device: 'CPU', 'GPU', 'MYRIAD' (Intel NCS) """ self.core = Core() # Load and compile model self.model = self.core.read_model(model_path) self.compiled = self.core.compile_model(self.model, device) # Get input/output info self.input_layer = self.compiled.input(0) self.output_layer = self.compiled.output(0) print(f"Loaded model on {device}") print(f"Input shape: {self.input_layer.shape}") def infer(self, input_data): """ Run inference. """ result = self.compiled([input_data]) return result[self.output_layer] def benchmark(self, input_shape, num_iterations=100): """ Benchmark inference speed. """ import time dummy = np.random.randn(*input_shape).astype(np.float32) # Warmup for _ in range(10): self.infer(dummy) # Benchmark start = time.perf_counter() for _ in range(num_iterations): self.infer(dummy) elapsed = time.perf_counter() - start latency = elapsed / num_iterations * 1000 print(f"Latency: {latency:.2f}ms") return latency def convert_to_openvino(onnx_path, output_dir, precision='FP16'): """ Convert ONNX to OpenVINO IR format. 
""" from openvino.tools import mo mo.convert_model( onnx_path, output_model=f"{output_dir}/model.xml", compress_to_fp16=(precision == 'FP16') ) print(f"Converted to OpenVINO IR at {output_dir}") ``` ### CoreML for Apple Silicon ```python import coremltools as ct def convert_to_coreml(model_or_path, output_path, compute_units='ALL'): """ Convert to CoreML for Apple devices. Args: model_or_path: PyTorch model or ONNX path output_path: Path to save .mlpackage compute_units: 'ALL', 'CPU_AND_GPU', 'CPU_AND_NE' """ # Map compute units units_map = { 'ALL': ct.ComputeUnit.ALL, 'CPU_AND_GPU': ct.ComputeUnit.CPU_AND_GPU, 'CPU_AND_NE': ct.ComputeUnit.CPU_AND_NE, # Neural Engine } # Convert from ONNX if isinstance(model_or_path, str) and model_or_path.endswith('.onnx'): mlmodel = ct.convert( model_or_path, compute_units=units_map[compute_units], minimum_deployment_target=ct.target.macOS13 # or iOS16 ) else: # Convert from PyTorch traced = torch.jit.trace(model_or_path, torch.randn(1, 3, 640, 640)) mlmodel = ct.convert( traced, inputs=[ct.TensorType(shape=(1, 3, 640, 640))], compute_units=units_map[compute_units], ) mlmodel.save(output_path) print(f"CoreML model saved to {output_path}") ``` --- ## Model Serving ### Triton Inference Server Configuration file (`config.pbtxt`): ```protobuf name: "yolov8" platform: "onnxruntime_onnx" max_batch_size: 8 input [ { name: "images" data_type: TYPE_FP32 dims: [ 3, 640, 640 ] } ] output [ { name: "output0" data_type: TYPE_FP32 dims: [ 84, 8400 ] } ] instance_group [ { count: 2 kind: KIND_GPU } ] dynamic_batching { preferred_batch_size: [ 4, 8 ] max_queue_delay_microseconds: 100 } ``` Triton client: ```python import tritonclient.http as httpclient class TritonClient: def __init__(self, url='localhost:8000', model_name='yolov8'): self.client = httpclient.InferenceServerClient(url=url) self.model_name = model_name # Check model is ready if not self.client.is_model_ready(model_name): raise RuntimeError(f"Model {model_name} is not ready") def infer(self, images): """ Send inference request to Triton. 
---

## Model Serving

### Triton Inference Server

Configuration file (`config.pbtxt`):

```protobuf
name: "yolov8"
platform: "onnxruntime_onnx"
max_batch_size: 8

input [
  {
    name: "images"
    data_type: TYPE_FP32
    dims: [ 3, 640, 640 ]
  }
]

output [
  {
    name: "output0"
    data_type: TYPE_FP32
    dims: [ 84, 8400 ]
  }
]

instance_group [
  {
    count: 2
    kind: KIND_GPU
  }
]

dynamic_batching {
  preferred_batch_size: [ 4, 8 ]
  max_queue_delay_microseconds: 100
}
```

Triton client:

```python
import tritonclient.http as httpclient

class TritonClient:
    def __init__(self, url='localhost:8000', model_name='yolov8'):
        self.client = httpclient.InferenceServerClient(url=url)
        self.model_name = model_name

        # Check model is ready
        if not self.client.is_model_ready(model_name):
            raise RuntimeError(f"Model {model_name} is not ready")

    def infer(self, images):
        """
        Send inference request to Triton.

        Args:
            images: numpy array (batch, C, H, W)
        """
        # Create input
        inputs = [
            httpclient.InferInput("images", images.shape, "FP32")
        ]
        inputs[0].set_data_from_numpy(images)

        # Create output request
        outputs = [
            httpclient.InferRequestedOutput("output0")
        ]

        # Send request
        response = self.client.infer(
            model_name=self.model_name,
            inputs=inputs,
            outputs=outputs
        )

        return response.as_numpy("output0")
```

### TorchServe Deployment

Model handler (`handler.py`):

```python
from ts.torch_handler.base_handler import BaseHandler
import torch
import cv2
import numpy as np

class YOLOHandler(BaseHandler):
    def __init__(self):
        super().__init__()
        self.input_size = 640
        self.conf_threshold = 0.25
        self.iou_threshold = 0.45

    def preprocess(self, data):
        """Preprocess input images."""
        images = []
        for row in data:
            image = row.get("data") or row.get("body")
            if isinstance(image, (bytes, bytearray)):
                image = np.frombuffer(image, dtype=np.uint8)
                image = cv2.imdecode(image, cv2.IMREAD_COLOR)

            # Resize and normalize
            image = cv2.resize(image, (self.input_size, self.input_size))
            image = image.astype(np.float32) / 255.0
            image = np.transpose(image, (2, 0, 1))
            images.append(image)

        return torch.tensor(np.stack(images))

    def inference(self, data):
        """Run model inference."""
        with torch.no_grad():
            outputs = self.model(data)
        return outputs

    def postprocess(self, outputs):
        """Postprocess model outputs."""
        results = []
        for output in outputs:
            # Apply NMS and format results (self._nms is a project helper defined elsewhere)
            detections = self._nms(output, self.conf_threshold, self.iou_threshold)
            results.append(detections.tolist())
        return results
```

TorchServe configuration (`config.properties`):

```properties
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
number_of_netty_threads=4
job_queue_size=100
model_store=/opt/ml/model
load_models=yolov8.mar
```

### FastAPI Serving

```python
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import uvicorn
import numpy as np
import cv2

app = FastAPI(title="YOLO Detection API")

# Global model (ONNXInference comes from the ONNX Runtime Deployment section)
model = None

@app.on_event("startup")
async def load_model():
    global model
    model = ONNXInference("models/yolov8m.onnx", device='cuda')

@app.post("/detect")
async def detect(file: UploadFile = File(...), conf: float = 0.25):
    """
    Detect objects in uploaded image.
    """
    # Read image
    contents = await file.read()
    nparr = np.frombuffer(contents, np.uint8)
    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

    # Preprocess (preprocess_image / postprocess_detections are project helpers defined elsewhere)
    input_image = preprocess_image(image, 640)

    # Inference
    outputs = model.infer(input_image)

    # Postprocess
    detections = postprocess_detections(outputs, conf, 0.45)

    return JSONResponse({
        "detections": detections,
        "image_size": list(image.shape[:2])
    })

@app.get("/health")
async def health():
    return {"status": "healthy", "model_loaded": model is not None}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
```
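A minimal client for the endpoint above — a sketch assuming the server runs locally on port 8000 and `requests` is installed; the image path is a placeholder:

```python
import requests

with open("test.jpg", "rb") as f:
    response = requests.post(
        "http://localhost:8000/detect",
        files={"file": ("test.jpg", f, "image/jpeg")},
        params={"conf": 0.3},
    )

print(response.json()["detections"])
```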
---

## Video Processing Pipelines

### Real-Time Video Detection

```python
import time
from collections import deque

import cv2
import numpy as np

class VideoDetector:
    def __init__(self, model, conf_threshold=0.25, track=True):
        self.model = model
        self.conf_threshold = conf_threshold
        self.track = track
        # ByteTrack is assumed to come from an external tracking library
        self.tracker = ByteTrack() if track else None
        self.fps_buffer = deque(maxlen=30)

    def process_video(self, source, output_path=None, show=True):
        """
        Process video stream with detection.

        Args:
            source: Video file path, camera index, or RTSP URL
            output_path: Path to save output video
            show: Display results in window
        """
        cap = cv2.VideoCapture(source)

        if output_path:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            fps = cap.get(cv2.CAP_PROP_FPS)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        frame_count = 0
        start_time = time.time()

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Inference
            t0 = time.perf_counter()
            detections = self._detect(frame)

            # Tracking
            if self.track and len(detections) > 0:
                detections = self.tracker.update(detections)

            # Calculate FPS
            inference_time = time.perf_counter() - t0
            self.fps_buffer.append(1 / inference_time)
            avg_fps = sum(self.fps_buffer) / len(self.fps_buffer)

            # Draw results
            frame = self._draw_detections(frame, detections, avg_fps)

            # Output
            if output_path:
                writer.write(frame)
            if show:
                cv2.imshow('Detection', frame)
                if cv2.waitKey(1) == ord('q'):
                    break

            frame_count += 1

        # Cleanup
        cap.release()
        if output_path:
            writer.release()
        cv2.destroyAllWindows()

        # Print statistics
        total_time = time.time() - start_time
        print(f"Processed {frame_count} frames in {total_time:.1f}s")
        print(f"Average FPS: {frame_count / total_time:.1f}")

    def _detect(self, frame):
        """Run detection on single frame."""
        # Preprocess
        input_tensor = self._preprocess(frame)

        # Inference
        outputs = self.model.infer(input_tensor)

        # Postprocess
        detections = self._postprocess(outputs, frame.shape[:2])
        return detections

    def _preprocess(self, frame):
        """Preprocess frame for model input."""
        # Resize
        input_size = 640
        image = cv2.resize(frame, (input_size, input_size))

        # Normalize and transpose
        image = image.astype(np.float32) / 255.0
        image = np.transpose(image, (2, 0, 1))
        image = np.expand_dims(image, axis=0)
        return image

    def _draw_detections(self, frame, detections, fps):
        """Draw detections on frame."""
        for det in detections:
            x1, y1, x2, y2 = det['bbox']
            cls = det['class']
            conf = det['confidence']
            track_id = det.get('track_id', None)

            # Draw box
            color = self._get_color(cls)
            cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), color, 2)

            # Draw label
            label = f"{cls}: {conf:.2f}"
            if track_id is not None:
                label = f"ID:{track_id} {label}"
            cv2.putText(frame, label, (int(x1), int(y1) - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        # Draw FPS
        cv2.putText(frame, f"FPS: {fps:.1f}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        return frame
```

### Batch Video Processing

```python
import concurrent.futures
from pathlib import Path

def process_videos_batch(video_paths, model, output_dir, max_workers=4):
    """
    Process multiple videos in parallel.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    def process_single(video_path):
        detector = VideoDetector(model)
        output_path = output_dir / f"{Path(video_path).stem}_detected.mp4"
        detector.process_video(video_path, str(output_path), show=False)
        return output_path

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_single, vp): vp for vp in video_paths}

        for future in concurrent.futures.as_completed(futures):
            video_path = futures[future]
            try:
                output_path = future.result()
                print(f"Completed: {video_path} -> {output_path}")
            except Exception as e:
                print(f"Failed: {video_path} - {e}")
```
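Illustrative usage of both entry points — the stream URL, model path, and clip names are placeholders:

```python
# Single RTSP stream with tracking enabled.
model = ONNXInference("models/yolov8m.onnx", device="cuda")
detector = VideoDetector(model, conf_threshold=0.3, track=True)
detector.process_video("rtsp://camera.local/stream", output_path="annotated.mp4", show=False)

# Batch of recorded clips.
process_videos_batch(["clip_a.mp4", "clip_b.mp4"], model, output_dir="outputs", max_workers=2)
```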
""" output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) def process_single(video_path): detector = VideoDetector(model) output_path = output_dir / f"{Path(video_path).stem}_detected.mp4" detector.process_video(video_path, str(output_path), show=False) return output_path with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: futures = {executor.submit(process_single, vp): vp for vp in video_paths} for future in concurrent.futures.as_completed(futures): video_path = futures[future] try: output_path = future.result() print(f"Completed: {video_path} -> {output_path}") except Exception as e: print(f"Failed: {video_path} - {e}") ``` --- ## Monitoring and Observability ### Prometheus Metrics ```python from prometheus_client import Counter, Histogram, Gauge, start_http_server # Define metrics INFERENCE_COUNT = Counter( 'model_inference_total', 'Total number of inferences', ['model_name', 'status'] ) INFERENCE_LATENCY = Histogram( 'model_inference_latency_seconds', 'Inference latency in seconds', ['model_name'], buckets=[0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0] ) GPU_MEMORY = Gauge( 'gpu_memory_used_bytes', 'GPU memory usage in bytes', ['device'] ) DETECTIONS_COUNT = Counter( 'detections_total', 'Total detections by class', ['model_name', 'class_name'] ) class MetricsWrapper: def __init__(self, model, model_name='yolov8'): self.model = model self.model_name = model_name def infer(self, input_data): """Inference with metrics.""" start_time = time.perf_counter() try: result = self.model.infer(input_data) INFERENCE_COUNT.labels(self.model_name, 'success').inc() # Count detections by class for det in result: DETECTIONS_COUNT.labels(self.model_name, det['class']).inc() return result except Exception as e: INFERENCE_COUNT.labels(self.model_name, 'error').inc() raise finally: latency = time.perf_counter() - start_time INFERENCE_LATENCY.labels(self.model_name).observe(latency) # Update GPU memory if torch.cuda.is_available(): memory = torch.cuda.memory_allocated() GPU_MEMORY.labels('cuda:0').set(memory) # Start metrics server start_http_server(9090) ``` ### Logging Configuration ```python import logging import json from datetime import datetime class StructuredLogger: def __init__(self, name, level=logging.INFO): self.logger = logging.getLogger(name) self.logger.setLevel(level) # JSON formatter handler = logging.StreamHandler() handler.setFormatter(JsonFormatter()) self.logger.addHandler(handler) def log_inference(self, model_name, latency, num_detections, input_shape): self.logger.info(json.dumps({ 'event': 'inference', 'timestamp': datetime.utcnow().isoformat(), 'model_name': model_name, 'latency_ms': latency * 1000, 'num_detections': num_detections, 'input_shape': list(input_shape) })) def log_error(self, model_name, error, input_shape): self.logger.error(json.dumps({ 'event': 'inference_error', 'timestamp': datetime.utcnow().isoformat(), 'model_name': model_name, 'error': str(error), 'error_type': type(error).__name__, 'input_shape': list(input_shape) })) class JsonFormatter(logging.Formatter): def format(self, record): return record.getMessage() ``` --- ## Scaling and Performance ### Batch Processing Optimization ```python class BatchProcessor: def __init__(self, model, max_batch_size=8, max_wait_ms=100): self.model = model self.max_batch_size = max_batch_size self.max_wait_ms = max_wait_ms self.queue = [] self.lock = threading.Lock() self.results = {} async def process(self, image, request_id): """Add image to batch and wait for result.""" future = 
### Multi-GPU Inference

```python
import numpy as np
import torch
from torch.nn.parallel import DataParallel

class MultiGPUInference:
    def __init__(self, model, device_ids=None):
        """
        Wrap model for multi-GPU inference.

        Args:
            model: PyTorch model
            device_ids: List of GPU IDs, e.g., [0, 1, 2, 3]
        """
        if device_ids is None:
            device_ids = list(range(torch.cuda.device_count()))

        self.device = torch.device('cuda:0')
        self.model = DataParallel(model, device_ids=device_ids)
        self.model.to(self.device)
        self.model.eval()

    def infer(self, images):
        """
        Run inference across GPUs.
        """
        with torch.no_grad():
            images = torch.from_numpy(images).to(self.device)
            outputs = self.model(images)
        return outputs.cpu().numpy()
```

### Performance Benchmarking

```python
import time

import numpy as np

def comprehensive_benchmark(model, input_sizes, batch_sizes, num_iterations=100):
    """
    Benchmark model across different configurations.
    """
    results = []

    for input_size in input_sizes:
        for batch_size in batch_sizes:
            # Create input
            dummy = np.random.randn(batch_size, 3, input_size, input_size).astype(np.float32)

            # Warmup
            for _ in range(10):
                model.infer(dummy)

            # Benchmark
            latencies = []
            for _ in range(num_iterations):
                start = time.perf_counter()
                model.infer(dummy)
                latencies.append(time.perf_counter() - start)

            # Calculate statistics
            latencies = np.array(latencies) * 1000  # Convert to ms

            result = {
                'input_size': input_size,
                'batch_size': batch_size,
                'mean_latency_ms': np.mean(latencies),
                'std_latency_ms': np.std(latencies),
                'p50_latency_ms': np.percentile(latencies, 50),
                'p95_latency_ms': np.percentile(latencies, 95),
                'p99_latency_ms': np.percentile(latencies, 99),
                'throughput_fps': batch_size * 1000 / np.mean(latencies)
            }
            results.append(result)

            print(f"Size: {input_size}, Batch: {batch_size}")
            print(f"  Latency: {result['mean_latency_ms']:.2f}ms (p99: {result['p99_latency_ms']:.2f}ms)")
            print(f"  Throughput: {result['throughput_fps']:.1f} FPS")

    return results
```

---

## Resources

- [TensorRT Documentation](https://docs.nvidia.com/deeplearning/tensorrt/)
- [ONNX Runtime Documentation](https://onnxruntime.ai/docs/)
- [Triton Inference Server](https://github.com/triton-inference-server/server)
- [OpenVINO Documentation](https://docs.openvino.ai/)
- [CoreML Tools](https://coremltools.readme.io/)