yozkut committed on
Commit
d247227
·
verified ·
1 Parent(s): b5cddd7

Sync from GitHub via huggingface-sync-action

Browse files
Files changed (2) hide show
  1. README.md +17 -0
  2. benchmark_tensorrt.py +184 -0
README.md CHANGED
@@ -183,6 +183,23 @@ if torch.cuda.is_available():
183
  # Inference is now 2-3x faster! 🚀
184
  ```
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  ---
187
 
188
  ## 👁️ Computer Vision & Object Tracking
 
183
  # Inference is now 2-3x faster! 🚀
184
  ```
185
 
186
+ ### **Performance Benchmarks**
187
+
188
+ Measured on **NVIDIA Tesla T4** (Google Colab) for YOLO26m:
189
+
190
+ | Backend | Hardware | FPS | Latency | TensorRT Speedup | vs CPU |
191
+ |---------|----------|-----|---------|------------------|--------|
192
+ | **TensorRT** | NVIDIA T4 GPU | **132.7** | 7.5ms | **2.69x** | 121.4x |
193
+ | PyTorch | NVIDIA T4 GPU | 49.4 | 20.3ms | 1.0x | 45.1x |
194
+ | PyTorch | CPU | 1.1 | 914.3ms | - | 1.0x |
195
+
196
+ **Key Insights:**
197
+ - 🚀 **TensorRT optimization provides 2.69x speedup** over PyTorch on the same NVIDIA GPU
198
+ - ⚡ **NVIDIA GPU acceleration** provides 45x speedup over CPU (PyTorch)
199
+ - 🎯 **Combined effect**: 121x faster than CPU inference
200
+
201
+ *Real-time phone detection at 132+ FPS enables responsive, sub-8ms reaction times.*
202
+
203
  ---
204
 
205
  ## 👁️ Computer Vision & Object Tracking
benchmark_tensorrt.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ TensorRT vs PyTorch Benchmark Script - 3-Way Comparison
4
+ Tests: TensorRT GPU, PyTorch GPU, PyTorch CPU
5
+ """
6
+
7
+ import time
8
+ import numpy as np
9
+ import torch
10
+
11
def benchmark_yolo(model, num_frames=100, warmup_frames=10):
    """Benchmark YOLO detection speed on a synthetic frame.

    Args:
        model: Callable invoked as ``model(frame, verbose=False)``.
        num_frames: Number of timed inference calls; must be at least 1.
        warmup_frames: Number of untimed calls made before timing starts.

    Returns:
        Tuple ``(avg_ms, fps)``: mean wall-clock latency per call in
        milliseconds and throughput in frames per second.

    Raises:
        ValueError: If ``num_frames`` is less than 1.
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be >= 1, got {num_frames}")

    # Random 640x480 (width x height) 3-channel uint8 frame. randint's upper
    # bound is exclusive, so 256 is needed to cover the full 0-255 range.
    test_frame = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)

    # Warm up so one-time initialisation cost is not counted in the timing.
    for _ in range(warmup_frames):
        model(test_frame, verbose=False)

    # CUDA kernels are launched asynchronously; synchronise on both sides of
    # the timed region so only (and all of) the benchmarked work is measured.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.synchronize()

    # perf_counter is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    for _ in range(num_frames):
        model(test_frame, verbose=False)
    if use_cuda:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    avg_ms = (elapsed / num_frames) * 1000
    # Guard against a zero reading from a trivially fast model / coarse clock.
    fps = num_frames / elapsed if elapsed > 0 else float("inf")

    return avg_ms, fps
30
+
31
def _print_banner(title, rule_char):
    """Print ``title`` framed above and below by a 70-character rule."""
    print(rule_char * 70)
    print(title)
    print(rule_char * 70)


def _run_and_report(model):
    """Benchmark ``model`` with default settings, print the result, return ``(fps, avg_ms)``."""
    print(" Warming up (10 frames)...")
    print(" Running benchmark (100 frames)...")
    avg_ms, fps = benchmark_yolo(model)

    print()
    print(" Results:")
    print(f" FPS: {fps:.1f}")
    print(f" Latency: {avg_ms:.1f}ms")
    print()
    return fps, avg_ms


def _print_summary(results):
    """Print the comparison summary for whichever backends were measured.

    ``results`` maps 'tensorrt' / 'pytorch_gpu' / 'cpu' to ``(fps, avg_ms)``;
    'cpu' is always expected to be present.
    """
    _print_banner("SUMMARY", "=")
    print()

    fps_cpu, ms_cpu = results['cpu']

    if 'tensorrt' in results and 'pytorch_gpu' in results:
        fps_tensorrt, ms_tensorrt = results['tensorrt']
        fps_pytorch_gpu, ms_pytorch_gpu = results['pytorch_gpu']

        # Speedups are ratios of throughput (FPS).
        tensorrt_vs_pytorch = fps_tensorrt / fps_pytorch_gpu
        tensorrt_vs_cpu = fps_tensorrt / fps_cpu
        gpu_vs_cpu = fps_pytorch_gpu / fps_cpu

        print(f" TensorRT (NVIDIA GPU): {fps_tensorrt:6.1f} FPS ({ms_tensorrt:6.1f}ms)")
        print(f" PyTorch GPU: {fps_pytorch_gpu:6.1f} FPS ({ms_pytorch_gpu:6.1f}ms)")
        print(f" PyTorch CPU: {fps_cpu:6.1f} FPS ({ms_cpu:6.1f}ms)")
        print()
        print(f" 🚀 TensorRT vs PyTorch GPU: {tensorrt_vs_pytorch:.2f}x faster")
        print(f" 📊 GPU vs CPU (PyTorch): {gpu_vs_cpu:.1f}x faster")
        print(f" 🎯 TensorRT vs CPU (total): {tensorrt_vs_cpu:.1f}x faster")
        print()
        _print_banner("📋 Add this table to your README:", "=")
        print()
        print("| Backend | Hardware | FPS | Latency | TensorRT Speedup | vs CPU |")
        print("|---------|----------|-----|---------|------------------|--------|")
        print(f"| **TensorRT** | NVIDIA GPU | **{fps_tensorrt:.1f}** | {ms_tensorrt:.1f}ms | **{tensorrt_vs_pytorch:.2f}x** | {tensorrt_vs_cpu:.1f}x |")
        print(f"| PyTorch | NVIDIA GPU | {fps_pytorch_gpu:.1f} | {ms_pytorch_gpu:.1f}ms | 1.0x | {gpu_vs_cpu:.1f}x |")
        print(f"| PyTorch | CPU | {fps_cpu:.1f} | {ms_cpu:.1f}ms | - | 1.0x |")
        print()
        print(f"**TensorRT provides {tensorrt_vs_pytorch:.2f}x speedup over PyTorch on the same NVIDIA GPU!**")

    elif 'pytorch_gpu' in results:
        # GPU present but the TensorRT export/load failed: report only what
        # was really measured instead of labelling PyTorch numbers "TensorRT".
        fps_pytorch_gpu, ms_pytorch_gpu = results['pytorch_gpu']
        gpu_vs_cpu = fps_pytorch_gpu / fps_cpu

        print(f" PyTorch GPU: {fps_pytorch_gpu:6.1f} FPS ({ms_pytorch_gpu:6.1f}ms)")
        print(f" PyTorch CPU: {fps_cpu:6.1f} FPS ({ms_cpu:6.1f}ms)")
        print()
        print(f" 📊 GPU vs CPU (PyTorch): {gpu_vs_cpu:.1f}x faster")
        print()
        print(" ⚠️ TensorRT export failed, so no TensorRT speedup was measured.")

    else:
        print(f" PyTorch (CPU only): {fps_cpu:.1f} FPS ({ms_cpu:.1f}ms)")
        print()
        print(" ⚠️ Run on NVIDIA GPU to measure TensorRT speedup!")


def main():
    """Run the TensorRT GPU / PyTorch GPU / PyTorch CPU benchmark and print a summary.

    The two GPU tests run only when CUDA is available. If the TensorRT export
    or engine load fails, the TensorRT test is skipped rather than silently
    benchmarking the PyTorch model under the TensorRT label.
    """
    # Imported lazily so the module can be imported without ultralytics installed.
    from ultralytics import YOLO

    _print_banner("TensorRT vs PyTorch GPU vs PyTorch CPU Benchmark", "=")
    print()

    has_cuda = torch.cuda.is_available()

    # Check hardware
    print("Hardware Detection:")
    print(f" CUDA available: {has_cuda}")
    if has_cuda:
        print(f" GPU: {torch.cuda.get_device_name(0)}")
        print(f" CUDA version: {torch.version.cuda}")
    print(f" PyTorch version: {torch.__version__}")
    print()

    # Instantiate once up front so any weight download happens before timing.
    print("Downloading YOLO model if needed...")
    YOLO("yolo26m.pt")
    print()

    results = {}

    if has_cuda:
        # Test 1: TensorRT
        _print_banner("Test 1: TensorRT on NVIDIA GPU", "-")
        print(" Initializing TensorRT (will export on first run, ~1-2 min)...")

        model_tensorrt = None
        try:
            YOLO("yolo26m.pt").export(format='engine', device=0, half=True, workspace=4)
            print(" ✅ TensorRT export complete!")

            # Load the TensorRT engine
            model_tensorrt = YOLO("yolo26m.engine")
            print(" ✅ Loaded TensorRT engine")
        except Exception as e:
            # Deliberately broad: this is a script-level boundary and export
            # can fail in many ways (missing TensorRT, driver mismatch, ...).
            print(f" ⚠️ TensorRT export failed: {e}")
            print(" Skipping TensorRT benchmark (PyTorch GPU is measured in Test 2).")
            print()

        if model_tensorrt is not None:
            results['tensorrt'] = _run_and_report(model_tensorrt)

        # Test 2: PyTorch on GPU (without TensorRT)
        _print_banner("Test 2: PyTorch on NVIDIA GPU (no TensorRT)", "-")
        print(" Loading PyTorch model on GPU...")

        model_pytorch_gpu = YOLO("yolo26m.pt")
        model_pytorch_gpu.to('cuda')
        results['pytorch_gpu'] = _run_and_report(model_pytorch_gpu)

    # Test 3: PyTorch on CPU
    _print_banner("Test 3: PyTorch on CPU (baseline)", "-")
    print(" Loading PyTorch model on CPU...")

    model_cpu = YOLO("yolo26m.pt")
    model_cpu.to('cpu')
    results['cpu'] = _run_and_report(model_cpu)

    _print_summary(results)
183
# Script entry point: run the benchmark only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()