"""
Transcription Quality Metrics for HTR Evaluation

Provides CER, WER, and character-level diff operations for comparing
transcriptions. Used by the comparison widget for engine evaluation.

Author: Claude Code
Date: 2025-11-05
"""

from dataclasses import dataclass
from enum import Enum
from typing import List, Tuple, Optional

import Levenshtein


class ComparisonMode(str, Enum):
    """Semantic mode for user-facing comparison metrics."""

    # The reference is ground truth: rates are reported as CER / WER.
    GROUND_TRUTH = "ground_truth"
    # Two engine outputs are compared: rates are symmetric disagreement rates.
    ENGINE_COMPARISON = "engine_comparison"


@dataclass
class DiffOperation:
    """
    Single character-level diff operation.

    Used for visualizing differences between reference and hypothesis.
    """

    operation: str  # 'equal', 'replace', 'insert', 'delete'
    ref_char: str   # Character from reference (empty for insertions)
    hyp_char: str   # Character from hypothesis (empty for deletions)
    ref_pos: int    # Position in reference string
    hyp_pos: int    # Position in hypothesis string


@dataclass
class LineMetrics:
    """
    Complete metrics for comparing a single line.

    Attributes:
        reference: Ground truth or baseline text
        hypothesis: Predicted text to compare against reference
        cer: Character Error Rate in percent (>= 0, may exceed 100 when the
            hypothesis is longer than the reference), meaningful as CER
            only with GT
        wer: Word Error Rate in percent (>= 0, may exceed 100), meaningful
            as WER only with GT
        match_percent: Percentage of matching characters (0-100)
        edit_distance: Levenshtein edit distance
        diff_ops: List of character-level diff operations (empty when the
            metrics were computed with ``include_diff=False``)
    """

    reference: str
    hypothesis: str
    cer: float
    wer: float
    match_percent: float
    edit_distance: int
    diff_ops: List[DiffOperation]


@dataclass(frozen=True)
class ComparisonDisplayLabels:
    """User-facing labels and notes for a comparison mode."""

    char_rate: str
    word_rate: str
    match_rate: str
    macro_char_rate: str
    micro_char_rate: str
    macro_word_rate: str
    char_rate_column: str
    word_rate_column: str
    macro_char_rate_note: str
    micro_char_rate_note: str
    macro_word_rate_note: str
    char_unit_label: str
    # NOTE(review): presumably the (good, bad) percentage cut-offs used for
    # colour coding in the GUI -- confirm against the comparison widget.
    color_thresholds: Tuple[float, float]


@dataclass(frozen=True)
class ComparisonDisplayMetrics:
    """User-facing metric values for a single comparison."""

    char_rate: float
    word_rate: float
    match_percent: float
    edit_distance: int


@dataclass(frozen=True)
class ComparisonSummary:
    """Aggregated user-facing metrics for multiple lines."""

    line_count: int
    total_edit_distance: int
    total_char_units: int
    macro_char_rate: float
    micro_char_rate: float
    macro_word_rate: float
    avg_match_percent: float


class TranscriptionMetrics:
    """
    Calculate HTR quality metrics.

    Uses python-Levenshtein for fast edit distance calculation.
    All methods are static and can be called without instantiation.

    Example:
        >>> cer = TranscriptionMetrics.calculate_cer("hello", "helo")
        >>> print(f"CER: {cer:.2f}%")
        CER: 20.00%
    """

    @staticmethod
    def calculate_cer(reference: str, hypothesis: str) -> float:
        """
        Calculate Character Error Rate using Levenshtein distance.

        CER = (insertions + deletions + substitutions) / total_characters

        The denominator is the reference length, so the result is NOT capped
        at 100: a hypothesis much longer than the reference yields more
        edits than reference characters.

        Args:
            reference: Ground truth text
            hypothesis: Predicted text

        Returns:
            CER as a percentage (>= 0.0). With an empty reference the result
            is 0.0 if the hypothesis is empty too, otherwise 100.0.

        Examples:
            >>> TranscriptionMetrics.calculate_cer("test", "test")
            0.0
            >>> TranscriptionMetrics.calculate_cer("test", "text")
            25.0
            >>> TranscriptionMetrics.calculate_cer("a", "abcd")
            300.0
        """
        # An empty reference has no characters to divide by.
        if not reference:
            return 100.0 if hypothesis else 0.0

        distance = Levenshtein.distance(reference, hypothesis)
        return (distance / len(reference)) * 100.0

    @staticmethod
    def calculate_wer(reference: str, hypothesis: str) -> float:
        """
        Calculate Word Error Rate using Levenshtein distance.

        WER = (insertions + deletions + substitutions) / total_words

        Words are split by whitespace. Typically 3-4x higher than CER
        for natural language text. The denominator is the reference word
        count, so the result is NOT capped at 100.

        Args:
            reference: Ground truth text
            hypothesis: Predicted text

        Returns:
            WER as a percentage (>= 0.0). With no reference words the result
            is 0.0 if the hypothesis has no words either, otherwise 100.0.

        Examples:
            >>> TranscriptionMetrics.calculate_wer("hello world", "hello earth")
            50.0
        """
        ref_words = reference.split()
        hyp_words = hypothesis.split()

        # No reference words means there is nothing to divide by.
        if not ref_words:
            return 100.0 if hyp_words else 0.0

        # Edit distance over word tokens rather than characters.
        distance = TranscriptionMetrics._sequence_distance(ref_words, hyp_words)
        return (distance / len(ref_words)) * 100.0

    @staticmethod
    def calculate_char_disagreement(reference: str, hypothesis: str) -> float:
        """
        Calculate a symmetric GT-free character disagreement rate.

        Uses the maximum character length as denominator so the result is
        bounded to 0-100 and remains symmetric when reference/hypothesis swap.

        Args:
            reference: First text (no ground-truth role implied)
            hypothesis: Second text

        Returns:
            Disagreement as a percentage (0.0-100.0); 0.0 when both are empty.
        """
        max_len = max(len(reference), len(hypothesis))
        if max_len == 0:
            return 0.0

        distance = Levenshtein.distance(reference, hypothesis)
        return (distance / max_len) * 100.0

    @staticmethod
    def calculate_word_disagreement(reference: str, hypothesis: str) -> float:
        """
        Calculate a symmetric GT-free word disagreement rate.

        Uses the maximum word count as denominator so the result is
        bounded to 0-100 and remains symmetric when reference/hypothesis swap.

        Args:
            reference: First text (no ground-truth role implied)
            hypothesis: Second text

        Returns:
            Disagreement as a percentage (0.0-100.0); 0.0 when neither text
            contains a word.
        """
        ref_words = reference.split()
        hyp_words = hypothesis.split()
        max_words = max(len(ref_words), len(hyp_words))
        if max_words == 0:
            return 0.0

        distance = TranscriptionMetrics._sequence_distance(ref_words, hyp_words)
        return (distance / max_words) * 100.0

    @staticmethod
    def calculate_match_percent(reference: str, hypothesis: str) -> float:
        """
        Calculate match percentage (inverse of normalized edit distance).

        This is more intuitive than CER for users: higher = better.
        Match% = (max_length - edit_distance) / max_length * 100

        Args:
            reference: Ground truth text
            hypothesis: Predicted text

        Returns:
            Match percentage (0.0-100.0)

        Examples:
            >>> TranscriptionMetrics.calculate_match_percent("test", "test")
            100.0
            >>> TranscriptionMetrics.calculate_match_percent("test", "text")
            75.0
        """
        max_len = max(len(reference), len(hypothesis))

        # Both empty = perfect match
        if max_len == 0:
            return 100.0

        distance = Levenshtein.distance(reference, hypothesis)
        return ((max_len - distance) / max_len) * 100.0

    @staticmethod
    def get_diff_operations(reference: str, hypothesis: str) -> List[DiffOperation]:
        """
        Get character-level diff operations for visualization.

        Uses Levenshtein edit operations to create a list of differences
        between reference and hypothesis. This is used for color-coded
        diff display in the GUI.

        Operation types:
            - 'equal': Characters match
            - 'replace': Character substitution
            - 'insert': Character present in hypothesis but not in reference
            - 'delete': Character present in reference but missing from
              hypothesis

        Args:
            reference: Ground truth text
            hypothesis: Predicted text

        Returns:
            List of DiffOperation objects, in left-to-right order

        Examples:
            >>> ops = TranscriptionMetrics.get_diff_operations("cat", "cut")
            >>> ops[1].operation
            'replace'
            >>> ops[1].ref_char
            'a'
            >>> ops[1].hyp_char
            'u'
        """
        ops: List[DiffOperation] = []

        # editops yields only the non-matching steps as
        # (operation, ref_pos, hyp_pos); the matching runs in between have
        # to be filled in here.
        editops = Levenshtein.editops(reference, hypothesis)

        # Next not-yet-emitted position in each string.
        ref_idx = 0
        hyp_idx = 0

        for op_type, ref_pos, hyp_pos in editops:
            # Emit the matching characters that precede this operation.
            while ref_idx < ref_pos and hyp_idx < hyp_pos:
                ops.append(DiffOperation(
                    operation='equal',
                    ref_char=reference[ref_idx],
                    hyp_char=hypothesis[hyp_idx],
                    ref_pos=ref_idx,
                    hyp_pos=hyp_idx,
                ))
                ref_idx += 1
                hyp_idx += 1

            if op_type == 'replace':
                ops.append(DiffOperation(
                    operation='replace',
                    ref_char=reference[ref_pos],
                    hyp_char=hypothesis[hyp_pos],
                    ref_pos=ref_pos,
                    hyp_pos=hyp_pos,
                ))
                # A substitution consumes one character on both sides.
                ref_idx = ref_pos + 1
                hyp_idx = hyp_pos + 1
            elif op_type == 'delete':
                ops.append(DiffOperation(
                    operation='delete',
                    ref_char=reference[ref_pos],
                    hyp_char='',
                    ref_pos=ref_pos,
                    hyp_pos=hyp_pos,
                ))
                # Only the reference advances; hyp_idx stays the same.
                ref_idx = ref_pos + 1
            elif op_type == 'insert':
                ops.append(DiffOperation(
                    operation='insert',
                    ref_char='',
                    hyp_char=hypothesis[hyp_pos],
                    ref_pos=ref_pos,
                    hyp_pos=hyp_pos,
                ))
                # Only the hypothesis advances; ref_idx stays the same.
                hyp_idx = hyp_pos + 1

        # Emit the matching tail after the last edit operation.
        while ref_idx < len(reference) and hyp_idx < len(hypothesis):
            ops.append(DiffOperation(
                operation='equal',
                ref_char=reference[ref_idx],
                hyp_char=hypothesis[hyp_idx],
                ref_pos=ref_idx,
                hyp_pos=hyp_idx,
            ))
            ref_idx += 1
            hyp_idx += 1

        return ops

    @staticmethod
    def get_display_labels(mode: ComparisonMode) -> ComparisonDisplayLabels:
        """
        Return user-facing labels for the chosen comparison mode.

        Args:
            mode: GROUND_TRUTH selects the CER/WER vocabulary; any other
                value selects the GT-free "disagreement" vocabulary.

        Returns:
            ComparisonDisplayLabels for the mode
        """
        if mode == ComparisonMode.GROUND_TRUTH:
            return ComparisonDisplayLabels(
                char_rate="CER",
                word_rate="WER",
                match_rate="Match",
                macro_char_rate="Macro CER",
                micro_char_rate="Micro CER",
                macro_word_rate="Macro WER",
                char_rate_column="CER (%)",
                word_rate_column="WER (%)",
                macro_char_rate_note="mean of per-line CERs",
                micro_char_rate_note="total edits / total ref chars (standard HTR metric)",
                macro_word_rate_note="mean of per-line WERs",
                char_unit_label="reference characters",
                color_thresholds=(5.0, 20.0),
            )

        return ComparisonDisplayLabels(
            char_rate="Char disagreement",
            word_rate="Word disagreement",
            match_rate="Match",
            macro_char_rate="Macro char disagreement",
            micro_char_rate="Micro char disagreement",
            macro_word_rate="Macro word disagreement",
            char_rate_column="Char disagreement (%)",
            word_rate_column="Word disagreement (%)",
            macro_char_rate_note="mean of per-line char disagreement rates",
            micro_char_rate_note="total edits / total max chars per line (symmetric, GT-free)",
            macro_word_rate_note="mean of per-line word disagreement rates",
            char_unit_label="comparison character units",
            color_thresholds=(15.0, 35.0),
        )

    @staticmethod
    def get_display_metrics(metrics: LineMetrics, mode: ComparisonMode) -> ComparisonDisplayMetrics:
        """
        Map raw edit-distance metrics to the correct user-facing semantics.

        Args:
            metrics: Raw per-line metrics from compare_lines
            mode: GROUND_TRUTH reports the stored CER/WER; any other value
                recomputes symmetric disagreement rates from the two texts.

        Returns:
            ComparisonDisplayMetrics carrying the mode-appropriate rates plus
            the unchanged match percentage and edit distance
        """
        if mode == ComparisonMode.GROUND_TRUTH:
            char_rate = metrics.cer
            word_rate = metrics.wer
        else:
            char_rate = TranscriptionMetrics.calculate_char_disagreement(
                metrics.reference,
                metrics.hypothesis,
            )
            word_rate = TranscriptionMetrics.calculate_word_disagreement(
                metrics.reference,
                metrics.hypothesis,
            )

        return ComparisonDisplayMetrics(
            char_rate=char_rate,
            word_rate=word_rate,
            match_percent=metrics.match_percent,
            edit_distance=metrics.edit_distance,
        )

    @staticmethod
    def compare_lines(
        reference: str,
        hypothesis: str,
        *,
        include_diff: bool = True,
    ) -> LineMetrics:
        """
        Perform complete comparison of two lines.

        This is the main entry point for line comparison. It calculates
        all metrics and generates diff operations in a single call.

        Args:
            reference: Ground truth or baseline transcription
            hypothesis: Engine output to compare
            include_diff: When False, skip building the per-character diff
                (``diff_ops`` is then an empty list). Useful for aggregate
                statistics where the diff would be thrown away.

        Returns:
            LineMetrics object with all metrics and diff operations

        Examples:
            >>> metrics = TranscriptionMetrics.compare_lines("hello", "helo")
            >>> print(f"CER: {metrics.cer:.2f}%, WER: {metrics.wer:.2f}%")
            CER: 20.00%, WER: 100.00%
        """
        cer = TranscriptionMetrics.calculate_cer(reference, hypothesis)
        wer = TranscriptionMetrics.calculate_wer(reference, hypothesis)
        match = TranscriptionMetrics.calculate_match_percent(reference, hypothesis)
        distance = Levenshtein.distance(reference, hypothesis)

        # The diff allocates one object per character, so it is optional.
        if include_diff:
            diff_ops = TranscriptionMetrics.get_diff_operations(reference, hypothesis)
        else:
            diff_ops = []

        return LineMetrics(
            reference=reference,
            hypothesis=hypothesis,
            cer=cer,
            wer=wer,
            match_percent=match,
            edit_distance=distance,
            diff_ops=diff_ops,
        )

    @staticmethod
    def calculate_summary_metrics(
        references: List[str],
        hypotheses: List[str],
        mode: ComparisonMode,
    ) -> ComparisonSummary:
        """
        Calculate aggregated metrics with GT-aware / GT-free semantics.

        Lines are paired by index. If the lists differ in length, the extra
        lines of the longer list are ignored (no error is raised).

        Args:
            references: Ground truth or baseline lines
            hypotheses: Lines to compare against the references
            mode: GROUND_TRUTH counts reference characters as the micro-rate
                denominator; any other value counts, per line, the longer of
                the two texts.

        Returns:
            ComparisonSummary. With no line pairs all rates are 0.0 and
            avg_match_percent is 100.0. The micro character rate is 0.0 when
            the total character-unit count is zero.
        """
        line_count = min(len(references), len(hypotheses))
        if line_count == 0:
            return ComparisonSummary(
                line_count=0,
                total_edit_distance=0,
                total_char_units=0,
                macro_char_rate=0.0,
                micro_char_rate=0.0,
                macro_word_rate=0.0,
                avg_match_percent=100.0,
            )

        # zip stops at the shorter list, matching line_count above.
        pairs = list(zip(references, hypotheses))

        # Only the scalar metrics are aggregated, so skip the per-char diff.
        raw_metrics = [
            TranscriptionMetrics.compare_lines(ref, hyp, include_diff=False)
            for ref, hyp in pairs
        ]
        display_metrics = [
            TranscriptionMetrics.get_display_metrics(metrics, mode)
            for metrics in raw_metrics
        ]

        total_edit_distance = sum(m.edit_distance for m in raw_metrics)
        if mode == ComparisonMode.GROUND_TRUTH:
            total_char_units = sum(len(ref) for ref, _ in pairs)
        else:
            total_char_units = sum(max(len(ref), len(hyp)) for ref, hyp in pairs)

        # Micro rate pools edits over all lines before dividing.
        micro_char_rate = (
            total_edit_distance / total_char_units * 100.0
            if total_char_units
            else 0.0
        )

        return ComparisonSummary(
            line_count=line_count,
            total_edit_distance=total_edit_distance,
            total_char_units=total_char_units,
            macro_char_rate=sum(m.char_rate for m in display_metrics) / line_count,
            micro_char_rate=micro_char_rate,
            macro_word_rate=sum(m.word_rate for m in display_metrics) / line_count,
            avg_match_percent=sum(m.match_percent for m in display_metrics) / line_count,
        )

    @staticmethod
    def calculate_overall_metrics(
        references: List[str],
        hypotheses: List[str]
    ) -> Tuple[float, float, float]:
        """
        Calculate overall metrics for multiple lines.

        Each value is the unweighted mean of the per-line values.

        Args:
            references: List of ground truth texts
            hypotheses: List of predicted texts (same length as references)

        Returns:
            Tuple of (average_cer, average_wer, average_match);
            (0.0, 0.0, 100.0) when both lists are empty

        Raises:
            ValueError: If lengths don't match

        Examples:
            >>> refs = ["hello", "world"]
            >>> hyps = ["helo", "world"]
            >>> cer, wer, match = TranscriptionMetrics.calculate_overall_metrics(refs, hyps)
            >>> print(f"Overall CER: {cer:.2f}%")
            Overall CER: 10.00%
        """
        if len(references) != len(hypotheses):
            raise ValueError(
                f"Reference and hypothesis lists must have same length "
                f"(got {len(references)} vs {len(hypotheses)})"
            )

        if not references:
            return 0.0, 0.0, 100.0

        total_cer = 0.0
        total_wer = 0.0
        total_match = 0.0

        for ref, hyp in zip(references, hypotheses):
            total_cer += TranscriptionMetrics.calculate_cer(ref, hyp)
            total_wer += TranscriptionMetrics.calculate_wer(ref, hyp)
            total_match += TranscriptionMetrics.calculate_match_percent(ref, hyp)

        n = len(references)
        return (total_cer / n, total_wer / n, total_match / n)

    @staticmethod
    def _sequence_distance(reference: List[str], hypothesis: List[str]) -> int:
        """
        Levenshtein distance for token sequences.

        Row-by-row dynamic programme that keeps only the previous row.

        Args:
            reference: Reference tokens
            hypothesis: Hypothesis tokens

        Returns:
            Minimum number of token insertions, deletions and substitutions
            turning ``reference`` into ``hypothesis``
        """
        if not reference:
            return len(hypothesis)
        if not hypothesis:
            return len(reference)

        # Row 0: cost of building each hypothesis prefix from nothing.
        previous_row = list(range(len(hypothesis) + 1))
        for i, ref_token in enumerate(reference, start=1):
            # Column 0: cost of deleting the first i reference tokens.
            current_row = [i]
            for j, hyp_token in enumerate(hypothesis, start=1):
                substitution_cost = 0 if ref_token == hyp_token else 1
                current_row.append(min(
                    previous_row[j] + 1,                      # deletion
                    current_row[j - 1] + 1,                   # insertion
                    previous_row[j - 1] + substitution_cost,  # match / substitution
                ))
            previous_row = current_row

        return previous_row[-1]


def _run_examples() -> None:
    """Print a few worked examples; used when the module is run as a script."""
    print("=" * 70)
    print("TRANSCRIPTION METRICS - Examples")
    print("=" * 70)
    print()

    # Example 1: Exact match
    ref1 = "hello world"
    hyp1 = "hello world"
    metrics1 = TranscriptionMetrics.compare_lines(ref1, hyp1)
    print("Example 1: Exact match")
    print(f" Reference: '{ref1}'")
    print(f" Hypothesis: '{hyp1}'")
    print(f" CER: {metrics1.cer:.2f}%")
    print(f" WER: {metrics1.wer:.2f}%")
    print(f" Match: {metrics1.match_percent:.2f}%")
    print()

    # Example 2: Single character error
    ref2 = "test"
    hyp2 = "text"
    metrics2 = TranscriptionMetrics.compare_lines(ref2, hyp2)
    print("Example 2: Single substitution")
    print(f" Reference: '{ref2}'")
    print(f" Hypothesis: '{hyp2}'")
    print(f" CER: {metrics2.cer:.2f}%")
    print(f" WER: {metrics2.wer:.2f}%")
    print(f" Match: {metrics2.match_percent:.2f}%")
    print(" Diff operations:")
    for op in metrics2.diff_ops:
        if op.operation != 'equal':
            print(f" {op.operation}: '{op.ref_char}' -> '{op.hyp_char}' "
                  f"at ref_pos={op.ref_pos}, hyp_pos={op.hyp_pos}")
    print()

    # Example 3: Cyrillic text (Church Slavonic)
    ref3 = "и идѣше поутемь"
    hyp3 = "и идѣше поутемь"
    metrics3 = TranscriptionMetrics.compare_lines(ref3, hyp3)
    print("Example 3: Cyrillic text (exact match)")
    print(f" Reference: '{ref3}'")
    print(f" Hypothesis: '{hyp3}'")
    print(f" CER: {metrics3.cer:.2f}%")
    print(f" WER: {metrics3.wer:.2f}%")
    print()

    # Example 4: Cyrillic with error
    ref4 = "гредоущоу же ѥмоу"
    hyp4 = "гредоущом же ѥмоу"
    metrics4 = TranscriptionMetrics.compare_lines(ref4, hyp4)
    print("Example 4: Cyrillic text (one character error)")
    print(f" Reference: '{ref4}'")
    print(f" Hypothesis: '{hyp4}'")
    print(f" CER: {metrics4.cer:.2f}%")
    print(f" WER: {metrics4.wer:.2f}%")
    print(f" Match: {metrics4.match_percent:.2f}%")
    print()

    # Example 5: Overall metrics
    refs = [ref1, ref2, ref3, ref4]
    hyps = [hyp1, hyp2, hyp3, hyp4]
    avg_cer, avg_wer, avg_match = TranscriptionMetrics.calculate_overall_metrics(refs, hyps)
    print(f"Example 5: Overall metrics for {len(refs)} lines")
    print(f" Average CER: {avg_cer:.2f}%")
    print(f" Average WER: {avg_wer:.2f}%")
    print(f" Average Match: {avg_match:.2f}%")
    print()
    print("=" * 70)


# Example usage
if __name__ == "__main__":
    _run_examples()