Spaces:

SHAFISK17
/

sense-backend

Running

sense-backend / file_handlers /video_pipeline.py

SHAFI

improved video ocr

c36fd58 13 days ago

14 kB

	"""
	Video Pipeline — 4-channel text extraction for PII detection.

	Channels:
	1. Metadata — ffprobe JSON → flat key: value text
	2. Subtitles — embedded SRT/VTT tracks demuxed via ffmpeg
	3. Visual — EasyOCR (Text) + Presidio Image Analyzer (Entities) on keyframes
	4. Audio — ffmpeg WAV + faster-whisper (base) transcription

	Returns merged plain text with source attribution headers so the
	downstream NLP models receive clearly labelled, scannable content.

	Every ffmpeg/ffprobe subprocess call is guarded with a timeout; the in-process
	OCR and Whisper stages have no time limit of their own.
	"""
	from __future__ import annotations

	import glob
	import json
	import os
	import re
	import subprocess
	import tempfile
	from typing import Callable, Optional

	try:
	from PIL import Image
	except ImportError:
	pass

	# ── Lazy Whisper loader ──────────────────────────────────────────────────────

	WHISPER_MODEL_SIZE = "base"  # model size name handed to WhisperModel
	_whisper_model = None  # module-level cache, filled by the first _get_whisper() call


	def _get_whisper():
	    """Return the shared faster-whisper model, building it on first use.

	    The import of ``faster_whisper`` is deferred to the first call, and the
	    resulting model is cached in the module-level ``_whisper_model`` so that
	    later calls reuse it.

	    Returns:
	        The cached ``WhisperModel`` instance.

	    Raises:
	        RuntimeError: if an ``ImportError`` occurs while importing or building
	            the model; the original error is attached as ``__cause__``.
	    """
	    global _whisper_model
	    if _whisper_model is None:
	        try:
	            from faster_whisper import WhisperModel
	            _whisper_model = WhisperModel(
	                WHISPER_MODEL_SIZE,
	                device="cpu",
	                compute_type="int8",  # quantised — runs on CPU without VRAM
	            )
	        except ImportError as exc:
	            # Chain explicitly so the underlying import failure stays visible.
	            raise RuntimeError(
	                "faster-whisper is not installed. "
	                "Run: pip install faster-whisper"
	            ) from exc
	    return _whisper_model


	# ── Lazy Visual Loaders (EasyOCR & Presidio Image) ───────────────────────────

	# Module-level caches for the visual models. Each stays None until its loader
	# (_get_easyocr / _get_presidio_image_analyzer) has built the object once.
	_easyocr_reader = None
	_presidio_image_analyzer = None

	def _get_easyocr():
	    """Return the shared EasyOCR reader, building it on first use.

	    The reader is created for English on CPU with verbose output disabled and
	    cached in the module-level ``_easyocr_reader``.

	    Returns:
	        The cached ``easyocr.Reader`` instance.

	    Raises:
	        RuntimeError: if an ``ImportError`` occurs while importing or building
	            the reader; the original error is attached as ``__cause__``.
	    """
	    global _easyocr_reader
	    if _easyocr_reader is None:
	        try:
	            import easyocr
	            # Load English, run on CPU for wider compatibility
	            _easyocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False)
	        except ImportError as exc:
	            # Chain explicitly so the underlying import failure stays visible.
	            raise RuntimeError("easyocr is not installed.") from exc
	    return _easyocr_reader

	def _get_presidio_image_analyzer():
	    """Return the shared Presidio ``ImageAnalyzerEngine``, building it on first use.

	    The engine is constructed with its defaults and cached in the module-level
	    ``_presidio_image_analyzer``.

	    Returns:
	        The cached ``ImageAnalyzerEngine`` instance.

	    Raises:
	        RuntimeError: if an ``ImportError`` occurs while importing or building
	            the engine; the original error is attached as ``__cause__``.
	    """
	    global _presidio_image_analyzer
	    if _presidio_image_analyzer is None:
	        try:
	            from presidio_image_redactor import ImageAnalyzerEngine
	            _presidio_image_analyzer = ImageAnalyzerEngine()
	        except ImportError as exc:
	            # Chain explicitly so the underlying import failure stays visible.
	            raise RuntimeError("presidio-image-redactor is not installed.") from exc
	    return _presidio_image_analyzer


	# ── ffmpeg availability ───────────────────────────────────────────────────────

	def _check_ffmpeg() -> bool:
	"""Return True if ffmpeg/ffprobe are available in PATH."""
	try:
	subprocess.run(
	["ffmpeg", "-version"],
	capture_output=True, check=True, timeout=5,
	)
	return True
	except Exception:
	return False


	# ── Channel 1: Metadata ───────────────────────────────────────────────────────

	def _extract_metadata(video_path: str) -> str:
	"""Use ffprobe to extract all metadata tags as flat key: value text."""
	try:
	result = subprocess.run(
	[
	"ffprobe", "-v", "quiet",
	"-print_format", "json",
	"-show_format", "-show_streams",
	video_path,
	],
	capture_output=True, text=True, timeout=30, check=True,
	)
	data = json.loads(result.stdout)
	lines = ["[METADATA]"]

	fmt_tags = data.get("format", {}).get("tags", {})
	for key, val in fmt_tags.items():
	lines.append(f"{key}: {val}")

	for idx, stream in enumerate(data.get("streams", [])):
	stream_tags = stream.get("tags", {})
	for key, val in stream_tags.items():
	lines.append(f"stream{idx}_{key}: {val}")

	return "\n".join(lines) if len(lines) > 1 else ""
	except Exception as exc:
	return f"[METADATA]\n[Could not extract: {exc}]"


	# ── Channel 2: Subtitles ──────────────────────────────────────────────────────

	_TS_PATTERN = re.compile(r"(\d{2}:\d{2}:\d{2}),\d{3} --> ")
	_TS_CAPTURE = re.compile(r"(\d{2}:\d{2}:\d{2})")


	def _parse_srt(raw: str) -> list[str]:
	"""Convert raw SRT content to clean timestamped lines."""
	lines = raw.splitlines()
	result = []
	i = 0
	while i < len(lines):
	line = lines[i].strip()
	if _TS_PATTERN.match(line):
	ts_match = _TS_CAPTURE.match(line)
	label = ts_match.group(1) if ts_match else ""
	i += 1
	parts: list[str] = []
	while i < len(lines) and lines[i].strip():
	parts.append(lines[i].strip())
	i += 1
	if parts:
	result.append(f"[{label}] {' '.join(parts)}")
	i += 1
	return result


	def _extract_subtitles(video_path: str, tmp_dir: str) -> str:
	"""Demux the first embedded subtitle track; parse and return clean text."""
	srt_path = os.path.join(tmp_dir, "subs.srt")
	try:
	subprocess.run(
	["ffmpeg", "-y", "-i", video_path, "-map", "0:s:0", srt_path],
	capture_output=True, timeout=60, check=True,
	)
	except Exception:
	return "" # No subtitles is normal — not an error

	if not os.path.exists(srt_path) or os.path.getsize(srt_path) == 0:
	return ""

	with open(srt_path, encoding="utf-8", errors="replace") as fh:
	raw = fh.read()

	lines = _parse_srt(raw)
	if not lines:
	return ""
	return "[SUBTITLES]\n" + "\n".join(lines)


	# ── Channel 4: Audio Transcript ───────────────────────────────────────────────

	def _extract_audio_transcript(
	video_path: str,
	tmp_dir: str,
	progress_cb: Optional[Callable[[int, str], None]] = None,
	) -> str:
	"""
	Extract audio track with ffmpeg → transcribe with faster-whisper (base).
	progress_cb(percent: int, detail: str) is called throughout transcription.
	"""
	wav_path = os.path.join(tmp_dir, "audio.wav")

	# Step A: Extract audio as 16 kHz mono WAV (Whisper requirement)
	try:
	subprocess.run(
	[
	"ffmpeg", "-y", "-i", video_path,
	"-vn", # no video
	"-acodec", "pcm_s16le", # 16-bit PCM
	"-ar", "16000", # 16 kHz sample rate
	"-ac", "1", # mono
	wav_path,
	],
	capture_output=True, timeout=180, check=True,
	)
	except subprocess.TimeoutExpired:
	return "[AUDIO TRANSCRIPT]\n[Audio extraction timed out after 3 minutes]"
	except Exception as exc:
	return f"[AUDIO TRANSCRIPT]\n[Audio extraction failed: {exc}]"

	if not os.path.exists(wav_path) or os.path.getsize(wav_path) < 1024:
	return "[AUDIO TRANSCRIPT]\n[No audio track found in this video]"

	# Step B: Transcribe
	try:
	model = _get_whisper()
	if progress_cb:
	progress_cb(0, "Starting Whisper transcription…")

	segments_iter, info = model.transcribe(
	wav_path,
	beam_size=5,
	language=None, # auto-detect
	vad_filter=True, # skip silent sections — faster
	)
	duration = float(info.duration) if info.duration else 1.0

	transcript_lines = ["[AUDIO TRANSCRIPT]"]
	for seg in segments_iter:
	start_s = int(seg.start)
	end_s = int(seg.end)
	ts = f"{start_s // 60:02d}:{start_s % 60:02d} → {end_s // 60:02d}:{end_s % 60:02d}"
	transcript_lines.append(f"[{ts}] {seg.text.strip()}")

	if progress_cb:
	pct = min(int((seg.end / duration) * 100), 99)
	dur_fmt = f"{int(duration // 60):02d}:{int(duration % 60):02d}"
	progress_cb(pct, f"Transcribing audio… {ts} / {dur_fmt}")

	return "\n".join(transcript_lines)

	except Exception as exc:
	return f"[AUDIO TRANSCRIPT]\n[Transcription failed: {exc}]"


	# ── Channel 3: Visual Stream (OCR & Presidio Image) ──────────────────────────

	def _extract_visual_stream(
	video_path: str,
	tmp_dir: str,
	progress_cb: Optional[Callable[[int, str], None]] = None,
	) -> str:
	"""
	Extract 1 frame every 2 seconds.
	Run EasyOCR to get text.
	Run Presidio Image Redactor to find visual PII entities.
	"""
	frames_dir = os.path.join(tmp_dir, "frames")
	os.makedirs(frames_dir, exist_ok=True)

	try:
	subprocess.run(
	[
	"ffmpeg", "-y", "-i", video_path,
	"-vf", "fps=1/2",
	os.path.join(frames_dir, "frame_%04d.jpg")
	],
	capture_output=True, timeout=120, check=True
	)
	except Exception as exc:
	return f"[VISUAL STREAM]\n[Frame extraction failed: {exc}]"

	frame_files = sorted(glob.glob(os.path.join(frames_dir, "*.jpg")))
	if not frame_files:
	return ""

	try:
	reader = _get_easyocr()
	image_analyzer = _get_presidio_image_analyzer()
	except Exception as exc:
	return f"[VISUAL STREAM]\n[Failed to load visual models: {exc}]"

	total_frames = len(frame_files)
	visual_lines = ["[VISUAL STREAM OCR & PRESIDIO]"]
	seen_text = set()

	for idx, frame_path in enumerate(frame_files):
	if progress_cb:
	progress_cb(min(int((idx / total_frames) * 100), 99), f"Scanning visual frame {idx+1}/{total_frames}…")

	time_s = idx * 2
	ts = f"{time_s // 60:02d}:{time_s % 60:02d}"

	try:
	# 1. EasyOCR
	results = reader.readtext(frame_path)
	for (bbox, text, prob) in results:
	if prob > 0.5:
	clean_text = text.strip()
	if len(clean_text) > 3 and clean_text not in seen_text:
	seen_text.add(clean_text)
	visual_lines.append(f"[{ts}] [TEXT] {clean_text}")

	# 2. Presidio Image Redactor
	if 'Image' in globals():
	img = Image.open(frame_path)
	bboxes = image_analyzer.analyze(image=img)
	found_entities = set([b.entity_type for b in bboxes if b.score > 0.4])
	for entity in found_entities:
	visual_lines.append(f"[{ts}] [VISUAL PII (Presidio)] Detected {entity}")
	except Exception:
	continue

	if len(visual_lines) == 1:
	return ""
	return "\n".join(visual_lines)


	# ── Public entry point ────────────────────────────────────────────────────────

	def process_video(
	    video_path: str,
	    progress_cb: Optional[Callable[[int, str], None]] = None,
	) -> str:
	    """Run all four extraction channels on a video and merge their text.

	    Channels run in order: metadata, subtitles, visual stream, audio
	    transcript. Every non-empty section is kept and the sections are joined
	    with blank lines.

	    Args:
	        video_path: the video to analyse.
	        progress_cb: optional ``progress_cb(percent, detail)`` hook. Reported
	            percentages run from 3 up to 95; visual progress is remapped onto
	            the range starting at 15 and audio progress onto the range
	            starting at 50.

	    Returns:
	        The merged text, an ``[ERROR]`` message when ffmpeg is unavailable, or
	        a placeholder line when no channel produced any text.
	    """
	    if not _check_ffmpeg():
	        return (
	            "[ERROR] ffmpeg not found in PATH.\n"
	            "On HuggingFace Spaces add 'ffmpeg' to packages.txt.\n"
	            "Locally: https://ffmpeg.org/download.html"
	        )

	    def _notify(percent: int, message: str) -> None:
	        # Forward to the caller's hook only when one was supplied.
	        if progress_cb:
	            progress_cb(percent, message)

	    def _scaled(base: int, factor: float) -> Callable[[int, str], None]:
	        # Build a callback that maps a channel's 0-100 progress onto a slice
	        # of the overall progress bar.
	        def _relay(pct: int, detail: str) -> None:
	            _notify(base + int(pct * factor), detail)

	        return _relay

	    sections: list[str] = []

	    with tempfile.TemporaryDirectory() as tmp_dir:
	        # Channel 1: metadata
	        _notify(3, "Extracting video metadata…")
	        sections.append(_extract_metadata(video_path))

	        # Channel 2: embedded subtitles
	        _notify(8, "Demuxing embedded subtitles…")
	        sections.append(_extract_subtitles(video_path, tmp_dir))

	        # Channel 3: visual stream, reported as 15 % onwards
	        _notify(15, "Initialising Visual Models (EasyOCR & Presidio)…")
	        sections.append(
	            _extract_visual_stream(video_path, tmp_dir, _scaled(15, 0.35))
	        )

	        # Channel 4: audio transcript, reported as 50 % onwards
	        _notify(50, "Initialising Whisper model…")
	        sections.append(
	            _extract_audio_transcript(video_path, tmp_dir, _scaled(50, 0.43))
	        )

	        _notify(95, "Extraction complete — handing off to PII models…")

	    non_empty = [section for section in sections if section]
	    if not non_empty:
	        return "[No extractable text found in this video file]"
	    return "\n\n".join(non_empty)