"""
Streaming AMD (Answering Machine Detection) — Local Training Script
=====================================================================

Fine-tunes WhisperForAudioClassification on telephony audio to classify:
    0: human             — Live person on the phone
    1: voicemail         — Voicemail greeting (human-recorded "leave a message" + beep)
    2: ivr               — IVR system (automated menu, DTMF, robotic TTS)
    3: answering_machine — Carrier/generic automated message + beep

WHY WHISPER:
    Voicemail greetings are recorded by real humans — acoustically identical
    to live speech. The model must understand WHAT is being said ("I'm not
    available, leave a message" vs "Hello? Who's calling?"). Whisper's encoder
    was trained on 680K hours of speech and understands content, not just
    acoustic patterns.

HARDWARE:
    Tested on RTX 5090 (32GB VRAM). Whisper-small uses ~3-4GB. You can run
    whisper-medium (~8GB) or even whisper-large-v3 (~16GB) on this card.

DATASET:
    AbijahKaj/telephony-amd-dataset (private)
    - 1,599 samples: 200 real human speech (MINDS14) + 1,399 TTS (edge-tts)
    - 16kHz mono, up to 10s, telephony channel effects applied
    - Balanced: ~400 per class

USAGE:
    pip install transformers datasets evaluate torch torchaudio soundfile accelerate

    # Login to HF for private dataset access:
    huggingface-cli login

    python train_local.py

    # Or with custom settings:
    python train_local.py --model openai/whisper-medium --epochs 30 --batch-size 16

    # Trade VRAM for speed by turning gradient checkpointing off:
    python train_local.py --no-gradient-checkpointing
"""

import argparse
import os
import sys
from collections import Counter

import numpy as np

# NOTE: torch and the Hugging Face stack (datasets, evaluate, transformers,
# huggingface_hub) are imported inside main(). They are slow to import, and
# deferring them keeps `--help` / argument errors instant and lets the pure
# helpers below be imported without the GPU stack installed.

# Class names; the position in this tuple is the integer class id.
LABELS = ("human", "voicemail", "ivr", "answering_machine")

# Sampling rate (Hz) the audio column is cast to before feature extraction.
SAMPLE_RATE = 16000


def parse_args(argv=None):
    """Parse command-line options for the training run.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script entry
            point relies on.

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, including a
            non-positive ``--max-audio-sec``.
    """
    p = argparse.ArgumentParser(description="Train Whisper AMD classifier")
    p.add_argument("--model", default="openai/whisper-small",
                   help="Base model (whisper-tiny/small/medium/large-v3)")
    p.add_argument("--dataset", default="AbijahKaj/telephony-amd-dataset")
    p.add_argument("--output-dir", default="./amd-checkpoints")
    p.add_argument("--hub-model-id", default="AbijahKaj/whisper-telephony-amd")
    p.add_argument("--push-to-hub", action="store_true", default=True)
    p.add_argument("--no-push", dest="push_to_hub", action="store_false")

    # Training
    p.add_argument("--epochs", type=int, default=20)
    p.add_argument("--batch-size", type=int, default=8)
    p.add_argument("--grad-accum", type=int, default=4,
                   help="Gradient accumulation steps")
    p.add_argument("--lr", type=float, default=3e-5)
    p.add_argument("--warmup-ratio", type=float, default=0.1)
    p.add_argument("--weight-decay", type=float, default=0.01)
    p.add_argument("--max-audio-sec", type=float, default=10.0,
                   help="Max audio length in seconds")

    # Model
    p.add_argument("--freeze-encoder", action="store_true", default=False,
                   help="Freeze entire encoder (only train projector + classifier)")
    p.add_argument("--gradient-checkpointing", action="store_true", default=True)
    # The flag above defaults to True, so without a negative form it could
    # never be switched off (same pattern as --push-to-hub / --no-push).
    p.add_argument("--no-gradient-checkpointing", dest="gradient_checkpointing",
                   action="store_false",
                   help="Disable gradient checkpointing (faster, uses more VRAM)")

    # Early stopping
    p.add_argument("--patience", type=int, default=5,
                   help="Early stopping patience (epochs without improvement)")

    args = p.parse_args(argv)
    if args.max_audio_sec <= 0:
        # A non-positive cap would clip every clip to zero samples.
        p.error("--max-audio-sec must be greater than 0")
    return args


def build_label_maps(labels):
    """Build the ``label2id`` / ``id2label`` mappings for the model config.

    Ids are plain ints (position in *labels*).

    Args:
        labels: Ordered sequence of class names.

    Returns:
        ``(label2id, id2label)`` as two dicts.
    """
    label2id = {name: idx for idx, name in enumerate(labels)}
    id2label = {idx: name for name, idx in label2id.items()}
    return label2id, id2label


def class_distribution(label_ids, label_names=LABELS):
    """Count how many samples fall into each class.

    Uses ``Counter`` over a plain iteration so it works for a Python list as
    well as any other iterable label column (no reliance on ``list.count``).

    Args:
        label_ids: Iterable of integer class ids.
        label_names: Class names indexed by class id.

    Returns:
        Dict mapping every class name to its count (0 when absent).
    """
    counts = Counter(label_ids)
    return {name: counts[idx] for idx, name in enumerate(label_names)}


def per_class_accuracy(predictions, references, label_names=LABELS):
    """Compute accuracy separately for each class present in *references*.

    Args:
        predictions: Predicted class ids (array-like of ints).
        references: Ground-truth class ids, same length as *predictions*.
        label_names: Class names indexed by class id.

    Returns:
        Dict with one ``"acc_<name>"`` float per class that has at least one
        reference sample; classes with no samples are omitted.
    """
    preds = np.asarray(predictions)
    refs = np.asarray(references)
    scores = {}
    for class_id, name in enumerate(label_names):
        mask = refs == class_id
        if mask.any():
            scores[f"acc_{name}"] = float((preds[mask] == class_id).mean())
    return scores


def main():
    """Run the full pipeline: load data, fine-tune, evaluate, save or push."""
    args = parse_args()

    # Heavy third-party imports are deferred until arguments are validated.
    import evaluate
    import torch
    from datasets import Audio, load_dataset
    from huggingface_hub import login
    from transformers import (
        AutoFeatureExtractor,
        EarlyStoppingCallback,
        Trainer,
        TrainingArguments,
        WhisperForAudioClassification,
    )

    # ====== Setup ======
    label2id, id2label = build_label_maps(LABELS)
    max_samples = int(args.max_audio_sec * SAMPLE_RATE)

    # Login if needed
    token = os.environ.get("HF_TOKEN")
    if token:
        login(token=token)

    # Device info
    if torch.cuda.is_available():
        gpu = torch.cuda.get_device_name(0)
        # The device-properties attribute is `total_memory` (bytes).
        vram = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"GPU: {gpu} ({vram:.1f} GB VRAM)")
    else:
        print("WARNING: No GPU detected. Training will be very slow on CPU.")

    # ====== Dataset ======
    print(f"\nLoading dataset: {args.dataset}")
    dataset = load_dataset(args.dataset)
    dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))

    print(f" Train: {len(dataset['train'])} samples")
    print(f" Test: {len(dataset['test'])} samples")

    # Print class distribution
    for split in ["train", "test"]:
        dist = class_distribution(dataset[split]["label"], LABELS)
        print(f" {split}: {dist}")

    # ====== Model ======
    print(f"\nLoading model: {args.model}")
    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model)

    model = WhisperForAudioClassification.from_pretrained(
        args.model,
        num_labels=len(LABELS),
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True,
    )

    # Freeze strategy: the encoder is always frozen first; unless a full
    # freeze was requested, the transformer layers are then re-enabled so only
    # the conv front-end stays frozen.
    model.freeze_encoder()
    if args.freeze_encoder:
        print(" Encoder fully frozen (training projector + classifier only)")
    else:
        for param in model.encoder.layers.parameters():
            param.requires_grad = True
        print(" Conv layers frozen, transformer layers + head trainable")

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f" Trainable: {trainable:,} / {total:,} ({100*trainable/total:.1f}%)")

    # ====== Preprocessing ======
    print("\nPreprocessing audio → mel spectrograms...")

    # The Whisper encoder only accepts features covering its full fixed window
    # (feature_extractor.n_samples of audio). So the --max-audio-sec cap is
    # applied to the raw waveform, and the extractor then pads every clip up
    # to its native window instead of to the shorter cap.
    clip_samples = min(max_samples, feature_extractor.n_samples)

    def preprocess(examples):
        audio_arrays = [x["array"][:clip_samples] for x in examples["audio"]]
        inputs = feature_extractor(
            audio_arrays,
            sampling_rate=SAMPLE_RATE,
            return_tensors="np",
            padding="max_length",  # no max_length → pads to the native window
            truncation=True,
        )
        return {"input_features": inputs.input_features}

    encoded = dataset.map(preprocess, remove_columns=["audio"], batched=True, batch_size=16)
    print(f" Done. Train: {len(encoded['train'])}, Test: {len(encoded['test'])}")

    # ====== Metrics ======
    accuracy_metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        preds = np.argmax(eval_pred.predictions, axis=1)
        metrics = accuracy_metric.compute(predictions=preds, references=eval_pred.label_ids)
        # Per-class accuracy
        metrics.update(per_class_accuracy(preds, eval_pred.label_ids, LABELS))
        return metrics

    # ====== Training ======
    use_fp16 = torch.cuda.is_available()

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        hub_model_id=args.hub_model_id if args.push_to_hub else None,
        push_to_hub=args.push_to_hub,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        gradient_accumulation_steps=args.grad_accum,
        learning_rate=args.lr,
        warmup_ratio=args.warmup_ratio,
        weight_decay=args.weight_decay,
        lr_scheduler_type="cosine",
        fp16=use_fp16,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        logging_strategy="steps",
        logging_steps=10,
        logging_first_step=True,
        disable_tqdm=False,  # Show progress bar locally
        save_total_limit=3,
        dataloader_num_workers=4,
        seed=42,
        gradient_checkpointing=args.gradient_checkpointing,
        # With the conv front-end frozen, the first checkpointed layer gets
        # inputs that do not require grad. Re-entrant checkpointing would then
        # cut the graph and leave the encoder layers without gradients, so use
        # the non-re-entrant implementation.
        gradient_checkpointing_kwargs=(
            {"use_reentrant": False} if args.gradient_checkpointing else None
        ),
    )

    callbacks = []
    if args.patience > 0:
        callbacks.append(EarlyStoppingCallback(early_stopping_patience=args.patience))

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded["train"],
        eval_dataset=encoded["test"],
        processing_class=feature_extractor,
        compute_metrics=compute_metrics,
        callbacks=callbacks,
    )

    print(f"\n{'='*60}")
    print(f"Training config:")
    print(f" Model: {args.model}")
    print(f" Epochs: {args.epochs} (early stopping patience={args.patience})")
    print(f" Batch: {args.batch_size} x {args.grad_accum} accum = {args.batch_size * args.grad_accum} effective")
    print(f" LR: {args.lr}")
    print(f" FP16: {use_fp16}")
    print(f" GradCkpt: {args.gradient_checkpointing}")
    print(f" Push to Hub: {args.push_to_hub}")
    print(f"{'='*60}\n")

    trainer.train()

    # ====== Evaluate ======
    print("\n" + "="*60)
    print("Final evaluation:")
    results = trainer.evaluate()
    for k, v in sorted(results.items()):
        if not k.startswith("eval_runtime"):
            print(f" {k}: {v:.4f}" if isinstance(v, float) else f" {k}: {v}")

    # ====== Save ======
    if args.push_to_hub:
        print(f"\nPushing to Hub: {args.hub_model_id}")
        trainer.push_to_hub(commit_message="Trained Whisper AMD telephony classifier")
        feature_extractor.push_to_hub(args.hub_model_id)
        print(f"Model: https://huggingface.co/{args.hub_model_id}")
    else:
        save_path = os.path.join(args.output_dir, "final")
        trainer.save_model(save_path)
        feature_extractor.save_pretrained(save_path)
        print(f"Model saved to: {save_path}")

    print("\nDone!")


if __name__ == "__main__":
    main()