"""
Booktitle normalizer: maps verbose venue names to standard abbreviations.

Loads rules from data/abbr.tsv (regex → abbreviation).
"""

import csv
import logging
import re
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)


class BooktitleNormalizer:
    """Normalizes booktitle/journal names to standard abbreviations.

    Rules are kept in ``self.rules`` as ``(compiled_regex, abbreviation)``
    pairs, in the order they appear in the TSV file. The first rule whose
    pattern matches wins, so file order is significant.
    """

    def __init__(self, tsv_path: Optional[str] = None):
        """Create a normalizer and load its rules.

        Args:
            tsv_path: Path to a tab-separated rules file. When ``None``, the
                file ``data/abbr.tsv`` located two directory levels above
                this module's file is used.
        """
        if tsv_path is None:
            tsv_path = str(
                Path(__file__).resolve().parent.parent / "data" / "abbr.tsv"
            )
        self.rules: list[tuple[re.Pattern, str]] = []
        self._load_rules(tsv_path)

    def _load_rules(self, tsv_path: str):
        """Load regex → abbreviation rules from a TSV file into ``self.rules``.

        Each usable row has the regex in column 1 and the abbreviation in
        column 2; both are stripped of surrounding whitespace and any extra
        columns are ignored. Rows with fewer than two columns, rows whose
        pattern is empty, and rows whose pattern starts with ``#`` are
        skipped. Patterns are compiled case-insensitively.

        Loading is best-effort: a missing file leaves ``self.rules`` as it
        was, and a pattern that fails to compile is skipped. Both situations
        are reported through the module logger instead of raising.

        Args:
            tsv_path: Path to the rules file.
        """
        path = Path(tsv_path)
        if not path.is_file():
            logger.warning(
                "Abbreviation rules file not found: %s; no rules loaded", path
            )
            return

        # newline='' lets the csv module do its own line-ending handling,
        # as recommended by the csv documentation.
        with open(path, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            for row in reader:
                if len(row) < 2:
                    continue

                pattern_str = row[0].strip()
                abbr = row[1].strip()

                # Skip comments and empty lines
                if not pattern_str or pattern_str.startswith('#'):
                    continue

                try:
                    compiled = re.compile(pattern_str, re.IGNORECASE)
                except re.error as exc:
                    # Keep loading the remaining rules, but make the broken
                    # one visible so it can be fixed in the data file.
                    logger.warning(
                        "Skipping invalid regex at %s:%d (%r): %s",
                        path, reader.line_num, pattern_str, exc,
                    )
                    continue

                self.rules.append((compiled, abbr))

    def normalize(self, booktitle: str) -> Optional[str]:
        """Normalize a booktitle to its standard abbreviation.

        Rules are tried in load order and the pattern may match anywhere in
        ``booktitle`` (``search``, not ``fullmatch``).

        Args:
            booktitle: The venue name to normalize.

        Returns:
            The abbreviation of the first matching rule, or ``None`` when
            ``booktitle`` is empty/``None`` or no rule matches.
        """
        if not booktitle:
            return None

        for pattern, abbr in self.rules:
            if pattern.search(booktitle):
                return abbr
        return None