#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
XERV CRAYON V5.1.0 - OMNI-BACKEND DEMONSTRATION
================================================

This script demonstrates the "Smashing Experience" of Crayon's Omni-Backend.
It showcases:
1. Automatic hardware detection (Auto-Pilot Mode)
2. Manual device override
3. Profile hot-swapping
4. Latency and throughput benchmarks

Usage:
    python demo_omni.py

The script will automatically detect your hardware and run appropriate tests.
"""

import time
import sys
import os
import io
import traceback

# Fix Windows console encoding for emoji support
if sys.platform == "win32":
    try:
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
    except Exception:
        # Deliberately best-effort: if the streams cannot be re-wrapped
        # (e.g. they expose no ``.buffer``), continue without emoji support.
        pass

# Add src to path for development. The path is made absolute so the import
# keeps working regardless of the current working directory.
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "src"))

from crayon import CrayonVocab, check_backends, get_version, enable_verbose_logging  # noqa: E402

# Number of calls used for the quick CPU latency probe in the override demo.
_CPU_PROBE_ITERATIONS = 1000


def _rate(count, seconds):
    """Return ``count / seconds``, or ``inf`` when no time was measured.

    Very small batches can finish within one clock tick; dividing by a zero
    duration would otherwise abort the demo with ``ZeroDivisionError``.
    """
    return count / seconds if seconds > 0 else float("inf")


def _timed_tokenize(vocab, batch):
    """Tokenize ``batch`` once and return ``(duration_seconds, total_tokens)``.

    ``time.perf_counter`` is used because it is the clock intended for
    measuring short intervals.
    """
    start = time.perf_counter()
    results = vocab.tokenize(batch)
    duration = time.perf_counter() - start
    return duration, sum(len(r) for r in results)


def print_banner():
    """Print the demo banner."""
    print("=" * 70)
    print(f"🖍️ XERV CRAYON V{get_version()} - OMNI-BACKEND DEMO")
    print("=" * 70)
    print()


def demo_auto_mode():
    """
    AUTO MODE: The "It Just Works" Experience

    Crayon automatically detects your hardware and selects the best backend:
    - NVIDIA GPU → CUDA engine (parallel kernel execution)
    - AMD GPU → ROCm engine (HIP kernel execution)
    - Otherwise → CPU engine (AVX2/AVX-512 SIMD)

    Returns:
        The ``CrayonVocab`` instance with the 'lite' profile loaded. The
        caller is responsible for calling ``close()`` on it.
    """
    print("1️⃣ INITIALIZING IN AUTO MODE...")
    print("-" * 50)

    # Enable logging to see device detection
    enable_verbose_logging()

    # Create vocab with auto-detection
    vocab = CrayonVocab(device="auto")

    try:
        info = vocab.get_info()
        print("\n 📊 Detection Results:")
        print(f" ├─ Device: {info['device'].upper()}")
        print(f" ├─ Backend: {info['backend']}")
        print(f" ├─ State: {info['device_state']}")

        if 'hardware' in info:
            hardware = info['hardware']
            print(f" └─ Hardware: {hardware.get('name', 'Unknown')}")
            if hardware.get('vram_mb'):
                print(f" └─ VRAM: {hardware['vram_mb']} MB")

        # Show available backends
        backends = check_backends()
        available = [name for name, present in backends.items() if present]
        print(f"\n 🔌 Available Backends: {', '.join(available)}")

        # Load default profile
        print("\n 📦 Loading 'lite' profile...")
        vocab.load_profile("lite")
        print(f" ✅ Profile loaded ({vocab.vocab_size} tokens)")
    except Exception:
        # Ownership has not been handed to the caller yet, so release the
        # instance here before propagating the failure.
        vocab.close()
        raise

    return vocab


def demo_latency_test(vocab):
    """
    LATENCY TEST: The "Instant" Feel

    Measures single-string tokenization performance.
    CPU mode is optimized for latency with minimal overhead.

    Args:
        vocab: A ``CrayonVocab`` with a profile already loaded.

    Returns:
        The tokens produced for the sample sentence.
    """
    print("\n")
    print("2️⃣ LATENCY TEST (Single String)")
    print("-" * 50)

    text = "Crayon optimizes tokenization at the silicon level."

    # Warm-up (important for JIT and cache warming)
    for _ in range(100):
        vocab.tokenize(text)

    # Timed run
    iterations = 10000
    start = time.perf_counter()
    for _ in range(iterations):
        tokens = vocab.tokenize(text)
    end = time.perf_counter()

    avg_us = ((end - start) / iterations) * 1_000_000

    print(f"\n 📝 Input: '{text}'")
    print(f" 🔢 Tokens: {tokens}")
    print(f" 📊 Token Count: {len(tokens)}")
    print(f" ⚡ Average Latency: {avg_us:.2f} µs/call")
    print(f" 🔄 Iterations: {iterations:,}")

    return tokens


def demo_profile_hotswap(vocab):
    """
    PROFILE HOT-SWAP: The Context Manager

    Demonstrates switching vocabulary profiles on-the-fly.
    Useful when processing mixed content.

    Args:
        vocab: A ``CrayonVocab`` with the 'lite' profile loaded.
    """
    print("\n")
    print("3️⃣ CONTEXT SWITCHING (Profile Hot-Swap)")
    print("-" * 50)

    code_snippet = "def forward(self, x): return torch.matmul(x, w)"
    print(f"\n 📝 Code: '{code_snippet}'")

    # Tokenize with lite profile
    print("\n [LITE Profile] Tokenizing code...")
    tokens_lite = vocab.tokenize(code_snippet)
    print(f" └─ Result: {len(tokens_lite)} tokens")

    # Switch to standard profile for the duration of the ``with`` block
    print("\n [STANDARD Profile] Switching context...")
    with vocab.using_profile("standard"):
        tokens_std = vocab.tokenize(code_snippet)
        print(f" └─ Result: {len(tokens_std)} tokens")

    print("\n 🔄 Automatically reverted to 'lite' profile")

    # Verify we're back to lite
    current_info = vocab.get_info()
    print(f" └─ Current: {current_info.get('active_profile', 'unknown')}")


def demo_batch_throughput(vocab):
    """
    BATCH THROUGHPUT: The Parallel Processing Power

    Measures batch tokenization performance.
    GPU mode excels here with parallel kernel execution.

    Args:
        vocab: A ``CrayonVocab`` with a profile already loaded.
    """
    print("\n")
    print("4️⃣ BATCH THROUGHPUT TEST")
    print("-" * 50)

    # Create test batches
    base_text = "The quick brown fox jumps over the lazy dog."
    batch_sizes = [100, 1000, 10000]

    for batch_size in batch_sizes:
        batch = [base_text] * batch_size

        # Warm-up
        vocab.tokenize(batch[:10])

        # Timed run
        duration, total_tokens = _timed_tokenize(vocab, batch)
        throughput = _rate(batch_size, duration)
        tokens_per_sec = _rate(total_tokens, duration)

        print(f"\n 📦 Batch Size: {batch_size:,}")
        print(f" ⏱️ Duration: {duration:.4f}s")
        print(f" 🚀 Throughput: {throughput:,.0f} docs/sec")
        print(f" 📊 Token Rate: {tokens_per_sec:,.0f} tokens/sec")


def demo_gpu_smashing(vocab):
    """
    GPU SMASHING: The High-Throughput Experience

    If running on GPU, demonstrates the massive parallelism available.
    100K+ documents processed in seconds.

    Args:
        vocab: A ``CrayonVocab`` with a profile already loaded. The test is
            skipped when ``vocab.device`` is ``"cpu"``.
    """
    print("\n")
    print("5️⃣ GPU SMASH TEST")
    print("-" * 50)

    if vocab.device == "cpu":
        print("\n ℹ️ Running in CPU Mode - Skipping GPU stress test")
        print(" 💡 To enable: Run on a machine with NVIDIA/AMD GPU")
        return

    # Massive batch
    batch_size = 100_000
    base_text = "The quick brown fox jumps over the lazy dog."

    print(f"\n 🔧 Generating {batch_size:,} documents...")
    batch = [base_text] * batch_size

    print(" 🚀 Launching GPU kernel...")
    duration, total_tokens = _timed_tokenize(vocab, batch)
    throughput = _rate(batch_size, duration)
    tokens_per_sec = _rate(total_tokens, duration)

    print(f"\n ✅ Processed {batch_size:,} documents in {duration:.4f}s")
    print(f" 🔥 Document Throughput: {throughput:,.0f} docs/sec")
    print(f" 📊 Token Throughput: {tokens_per_sec:,.0f} tokens/sec")


def demo_encode_decode(vocab):
    """
    ENCODE/DECODE: Round-Trip Verification

    Demonstrates the decode() functionality for debugging
    and understanding tokenization behavior.

    Args:
        vocab: A ``CrayonVocab`` with a profile already loaded.
    """
    print("\n")
    print("6️⃣ ENCODE/DECODE ROUND-TRIP")
    print("-" * 50)

    test_text = "Hello, Crayon! Testing the tokenizer."
    print(f"\n 📝 Original: '{test_text}'")

    # Encode
    tokens = vocab.tokenize(test_text)
    print(f" 🔢 Tokens: {tokens}")

    # Decode (if JSON available); a RuntimeError is reported, not fatal.
    try:
        decoded = vocab.decode(tokens)
    except RuntimeError as e:
        print(f" ⚠️ Decode unavailable: {e}")
        return

    print(f" 📤 Decoded: '{decoded}'")
    if decoded == test_text:
        print(" ✅ Perfect round-trip!")
    else:
        print(" ⚠️ Minor differences (expected with subword tokenization)")


def _show_accelerator_instance(device, announcement):
    """Create a vocab pinned to ``device``, print its device, and close it."""
    print(announcement)
    vocab = CrayonVocab(device=device)
    try:
        vocab.load_profile("lite")
        info = vocab.get_info()
        print(f" └─ Device: {info['device']}")
    finally:
        # Release the instance even if loading or inspection fails.
        vocab.close()


def demo_device_override():
    """
    MANUAL OVERRIDE: Total Control

    Demonstrates explicitly selecting a device for specific use cases.
    A CPU instance is always created; CUDA and ROCm instances are created
    only when ``check_backends()`` reports them as available.
    """
    print("\n")
    print("7️⃣ MANUAL DEVICE OVERRIDE")
    print("-" * 50)

    backends = check_backends()
    print(f"\n 🔌 Available: {backends}")

    # Force CPU mode
    print("\n 🔵 Creating CPU-only instance...")
    cpu_vocab = CrayonVocab(device="cpu")
    try:
        cpu_vocab.load_profile("lite")

        info = cpu_vocab.get_info()
        print(f" └─ Device: {info['device']}")
        print(f" └─ Backend: {info['backend']}")

        # Quick latency test
        text = "Quick CPU test"
        start = time.perf_counter()
        for _ in range(_CPU_PROBE_ITERATIONS):
            cpu_vocab.tokenize(text)
        elapsed = time.perf_counter() - start
        avg_us = (elapsed / _CPU_PROBE_ITERATIONS) * 1_000_000
        print(f" └─ Latency: {avg_us:.2f} µs/call")
    finally:
        cpu_vocab.close()

    # Try CUDA if available
    if backends.get("cuda"):
        _show_accelerator_instance("cuda", "\n 🟢 Creating CUDA instance...")

    # Try ROCm if available
    if backends.get("rocm"):
        _show_accelerator_instance("rocm", "\n 🔴 Creating ROCm instance...")


def main():
    """Run the complete demo.

    Returns:
        ``0`` when every demo completes, ``1`` if any demo raises.
    """
    print_banner()

    try:
        # Main demos
        vocab = demo_auto_mode()
        try:
            demo_latency_test(vocab)
            demo_profile_hotswap(vocab)
            demo_batch_throughput(vocab)
            demo_gpu_smashing(vocab)
            demo_encode_decode(vocab)
        finally:
            # Cleanup main vocab, even when one of the demos fails
            vocab.close()

        # Device override demo
        demo_device_override()

        print("\n")
        print("=" * 70)
        print("✅ ALL DEMOS COMPLETED SUCCESSFULLY!")
        print("=" * 70)

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"\n❌ Demo failed: {e}")
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())