"""
BibTeX Sanitizer: structural and formatting checks for bib entries.

Runs as a pre-processing phase before metadata fetch-and-compare,
detecting and auto-fixing common formatting issues that crawlers
and copy-paste introduce into .bib files.
"""
import re
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

from .parser import BibEntry
from .utils import TextNormalizer

# Evaluated once, when this module is imported.
CURRENT_YEAR = datetime.now().year
|
|
|
|
@dataclass
class SanitizeFix:
    """Record of one sanitization fix (or warning) produced for a bib entry.

    Attributes:
        entry_key: Citation key of the entry the fix belongs to.
        category: Short machine-readable tag for the kind of fix
            (e.g. ``"dblp_id"``, ``"entry_type"``, ``"title_case"``).
        field: Name of the affected bib field (``"multiple"`` when several
            fields were touched at once).
        description: Human-readable explanation of what was changed or flagged.
        old_value: Value before the fix; empty when not applicable.
        new_value: Value after the fix; empty when the value was removed or
            the record is only a warning.
    """

    entry_key: str
    category: str
    field: str
    description: str
    old_value: str = ""
    new_value: str = ""
|
|
|
|
| |
# Lower-case substrings that mark a "journal" value as really being a
# conference venue. Consumers scan this list in order and report the first
# hit, so the order of the entries is significant.
CONFERENCE_KEYWORDS = [
    # Generic venue words
    "conference",
    "proceedings",
    "workshop",
    "symposium",
    # Machine learning / AI venue acronyms
    "iclr",
    "icml",
    "neurips",
    "nips",
    "aaai",
    "ijcai",
    # Natural language processing venue acronyms
    "acl",
    "emnlp",
    "naacl",
    "coling",
    "eacl",
    # Computer vision venue acronyms
    "cvpr",
    "iccv",
    "eccv",
    # Speech / signal processing venue acronyms
    "interspeech",
    "icassp",
    # Information retrieval / data mining / web venue acronyms
    "sigir",
    "kdd",
    "www",
    "wsdm",
    # Systems venue acronyms
    "osdi",
    "sosp",
    "nsdi",
    # Multi-word phrases
    "international conference",
    "annual meeting",
]
|
|
|
|
class BibSanitizer:
    """Performs structural and formatting sanity checks on BibEntry objects.

    Each ``_check_*`` / ``_clean_*`` method inspects one entry, applies its
    repair in place where it has one (mirroring the change into
    ``entry.raw_entry``), and returns the ``SanitizeFix`` records describing
    what it changed or flagged.
    """

    # ------------------------------------------------------------------
    # Title-capitalisation patterns (group 1 is the word to protect)
    # ------------------------------------------------------------------

    # Two or more capitals, an optional single trailing lower-case letter,
    # then optional '.'/'-' joined alphanumeric parts (e.g. "BERT", "GPT-4").
    _ACRONYM_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{2,}[a-z]?(?:[\.-][A-Za-z0-9]+)*)(?![A-Za-z0-9])')

    # A capitalised word followed by at least one more capital-initial
    # segment, optionally joined by '.' or '-'.
    _CAMELCASE_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z][a-z]+(?:[\.-]?[A-Z][a-z]*)+)(?![A-Za-z0-9])')

    # A capital-initial token that contains at least one digit.
    _MIXED_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z][A-Za-z0-9]*(?:[\.-][A-Za-z0-9]+)*\d[A-Za-z0-9]*(?:[\.-][A-Za-z0-9]+)*)(?![A-Za-z0-9])')

    # Lower-case field names to drop from ``raw_entry``, per entry type.
    FIELD_REMOVE_POLICY = {
        "inproceedings": [
            "address", "month", "abstract",
            "archiveprefix", "primaryclass",
            "biburl", "bibsource", "timestamp",
            "copyright", "issn", "isbn",
        ],
        "article": [
            "address", "month", "abstract",
            "archiveprefix", "primaryclass",
            "biburl", "bibsource", "timestamp",
            "copyright", "issn",
        ],
        "misc": [
            "address", "month", "abstract",
            "biburl", "bibsource", "timestamp",
            "copyright",
        ],
    }

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def sanitize_all(self, entries: "List[BibEntry]") -> "Dict[str, List[SanitizeFix]]":
        """
        Run all offline sanitization checks on a list of entries.

        Entries are modified in-place.

        Returns:
            ``{entry_key: [SanitizeFix, ...]}`` containing only keys that
            produced at least one fix. When several entries share a key,
            their fixes are accumulated under that key rather than the later
            entry overwriting the earlier one.
        """
        all_fixes = {}
        for entry in entries:
            fixes = []
            # Order matters: the entry type is corrected before field
            # cleanup, so cleanup uses the policy of the corrected type.
            for check in (
                self._check_dblp_ids,
                self._check_corporate_authors,
                self._check_entry_type,
                self._check_title_capitalization,
                self._check_future_year,
                self._clean_entry_fields,
            ):
                fixes.extend(check(entry))
            if fixes:
                all_fixes.setdefault(entry.key, []).extend(fixes)
        return all_fixes

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _store_field(entry: "BibEntry", name: str, value: str) -> None:
        """Set ``entry.<name>`` and mirror it into ``raw_entry`` if the key exists."""
        setattr(entry, name, value)
        if name in entry.raw_entry:
            entry.raw_entry[name] = value

    @staticmethod
    def _retype_as_inproceedings(entry: "BibEntry") -> None:
        """Switch the entry (and its raw ENTRYTYPE, when present) to inproceedings."""
        entry.entry_type = 'inproceedings'
        if 'ENTRYTYPE' in entry.raw_entry:
            entry.raw_entry['ENTRYTYPE'] = 'inproceedings'

    # ------------------------------------------------------------------
    # Author checks
    # ------------------------------------------------------------------

    def _check_dblp_ids(self, entry: "BibEntry") -> "List[SanitizeFix]":
        """Strip DBLP disambiguation IDs (4-digit suffixes) from author names."""
        fixes = []
        if not entry.author:
            return fixes

        cleaned_authors = []
        for author in TextNormalizer.parse_author_list(entry.author):
            author = author.strip()
            if TextNormalizer.has_dblp_disambiguation_id(author):
                cleaned = TextNormalizer.strip_dblp_disambiguation_id(author)
                fixes.append(SanitizeFix(
                    entry_key=entry.key,
                    category="dblp_id",
                    field="author",
                    description=f"Stripped DBLP disambiguation ID: '{author}' → '{cleaned}'",
                    old_value=author,
                    new_value=cleaned,
                ))
                author = cleaned
            cleaned_authors.append(author)

        # Only rewrite the field when at least one name actually changed.
        if fixes:
            self._store_field(entry, 'author', " and ".join(cleaned_authors))
        return fixes

    @staticmethod
    def _is_single_word_name(name: str) -> bool:
        """Return True when *name* is one capitalised token of 2+ characters.

        Whitespace, a comma ("Last,First") and a tie ("J.~Smith") all
        separate name parts, so a name containing any of them is a person
        name, not a single-word (corporate) name.
        """
        return (
            len(name) > 1
            and name[0].isupper()
            and re.search(r'[\s,~]', name) is None
        )

    def _check_corporate_authors(self, entry: "BibEntry") -> "List[SanitizeFix]":
        """
        Detect single-word author names and wrap in {{double braces}}.

        BibTeX treats single-word names as a last name, rendering e.g.
        "KimiTeam" as "K. Team". Wrapping in {{}} prevents this.
        """
        fixes = []
        if not entry.author:
            return fixes

        new_authors = []
        for author in TextNormalizer.parse_author_list(entry.author):
            author = author.strip()

            # Already brace-protected ({...} or {{...}}): leave untouched.
            if author.startswith('{') and author.endswith('}'):
                new_authors.append(author)
                continue

            stripped = author.strip('{}')
            if self._is_single_word_name(stripped):
                wrapped = '{{' + stripped + '}}'
                fixes.append(SanitizeFix(
                    entry_key=entry.key,
                    category="corporate_author",
                    field="author",
                    description=f"Corporate author protected: '{author}' → '{wrapped}'",
                    old_value=author,
                    new_value=wrapped,
                ))
                author = wrapped
            new_authors.append(author)

        if fixes:
            self._store_field(entry, 'author', " and ".join(new_authors))
        return fixes

    # ------------------------------------------------------------------
    # Entry type
    # ------------------------------------------------------------------

    def _check_entry_type(self, entry: "BibEntry") -> "List[SanitizeFix]":
        """
        Detect conference papers incorrectly typed as @article.

        Heuristics:
        - Has booktitle field → should be inproceedings
        - Journal field contains conference keywords → move to booktitle
        """
        if entry.entry_type.lower() != 'article':
            return []

        old_type = entry.entry_type

        # Heuristic 1: an @article carrying a booktitle is a proceedings paper.
        if entry.booktitle:
            self._retype_as_inproceedings(entry)
            return [SanitizeFix(
                entry_key=entry.key,
                category="entry_type",
                field="ENTRYTYPE",
                description=f"Entry has booktitle but was @{old_type} → @inproceedings",
                old_value=old_type,
                new_value='inproceedings',
            )]

        if not entry.journal:
            return []

        # Heuristic 2: first conference keyword (in list order) that occurs
        # anywhere in the lower-cased journal name.
        journal_lower = entry.journal.lower()
        matched_keyword = next(
            (kw for kw in CONFERENCE_KEYWORDS if kw in journal_lower), None
        )
        if not matched_keyword:
            return []

        old_journal = entry.journal
        entry.booktitle = old_journal
        entry.journal = ""
        self._retype_as_inproceedings(entry)

        # The venue moves from 'journal' to 'booktitle' in the raw record too.
        entry.raw_entry['booktitle'] = old_journal
        entry.raw_entry.pop('journal', None)

        return [SanitizeFix(
            entry_key=entry.key,
            category="entry_type",
            field="ENTRYTYPE",
            description=(
                f"@{old_type} → @inproceedings "
                f"(journal '{old_journal}' contains '{matched_keyword}', moved to booktitle)"
            ),
            old_value=old_type,
            new_value='inproceedings',
        )]

    # ------------------------------------------------------------------
    # DOI validation (fetch phase)
    # ------------------------------------------------------------------

    def check_doi_title_match(self, entry: "BibEntry", fetched_data: Any) -> "List[SanitizeFix]":
        """
        Validate that a DOI resolves to the same paper as the bib entry.

        Called during the fetch phase (requires network), not during
        the offline sanitize phase.

        If the DOI metadata title doesn't match the bib entry title,
        flag the DOI as potentially wrong and remove it.

        Args:
            entry: Entry whose ``doi`` and ``title`` are checked.
            fetched_data: Object exposing the resolved ``title`` attribute.

        Returns:
            One ``doi_mismatch`` fix when the DOI was removed, else ``[]``.
            Nothing is compared (and nothing removed) when the entry has no
            DOI, no title, or no fetched title is available.
        """
        fixes = []
        if not entry.doi or not fetched_data:
            return fixes

        fetched_title = getattr(fetched_data, 'title', '')
        # Without both titles there is nothing to compare; a missing bib
        # title must not be treated as evidence that the DOI is wrong.
        if not fetched_title or not entry.title:
            return fixes

        bib_title_norm = TextNormalizer.normalize_for_comparison(entry.title)
        doi_title_norm = TextNormalizer.normalize_for_comparison(fetched_title)

        similarity = TextNormalizer.similarity_ratio(bib_title_norm, doi_title_norm)
        # For shorter titles, also take the Levenshtein score and keep the best.
        if len(bib_title_norm) < 100:
            similarity = max(
                similarity,
                TextNormalizer.levenshtein_similarity(bib_title_norm, doi_title_norm),
            )

        if similarity >= 0.5:
            return fixes

        old_doi = entry.doi
        fixes.append(SanitizeFix(
            entry_key=entry.key,
            category="doi_mismatch",
            field="doi",
            description=(
                f"DOI '{old_doi}' resolves to a different title "
                f"('{fetched_title[:60]}...' vs '{entry.title[:60]}...'). "
                f"Similarity: {similarity:.0%}. DOI removed."
            ),
            old_value=old_doi,
            new_value="",
        ))
        entry.doi = ""
        entry.raw_entry.pop('doi', None)
        return fixes

    # ------------------------------------------------------------------
    # Title capitalisation
    # ------------------------------------------------------------------

    @staticmethod
    def _brace_depths(text: str) -> "List[int]":
        """Return, for each character index, the brace nesting depth before it."""
        depths = []
        depth = 0
        for ch in text:
            depths.append(depth)
            if ch == '{':
                depth += 1
            elif ch == '}' and depth:
                # Clamp at zero so a stray '}' cannot push depth negative.
                depth -= 1
        return depths

    @classmethod
    def _protect_title_words(cls, title: str) -> "Tuple[str, List[str]]":
        """Wrap case-sensitive words of *title* in braces.

        Candidate words are everything matched by the acronym, CamelCase and
        letter+digit patterns. Only occurrences outside any existing brace
        group are wrapped, so already-protected text is never nested.

        Returns:
            ``(new_title, wrapped_words)`` where ``wrapped_words`` lists each
            word that was wrapped at least once, longest first (ties in
            alphabetical order).
        """
        candidates = set()
        for pattern in (cls._ACRONYM_RE, cls._CAMELCASE_RE, cls._MIXED_RE):
            candidates.update(m.group(1) for m in pattern.finditer(title))
        if not candidates:
            return title, []

        # Longest first so "GPT-4" wins over "GPT" at the same position;
        # the secondary alphabetical key makes the order deterministic.
        ordered = sorted(candidates, key=lambda w: (-len(w), w))
        matcher = re.compile(r'\b(?:' + '|'.join(map(re.escape, ordered)) + r')\b')
        depths = cls._brace_depths(title)
        wrapped = set()

        def _wrap(match):
            word = match.group(0)
            if depths[match.start()] > 0:
                # Inside an existing {...} group: already protected.
                return word
            wrapped.add(word)
            return '{' + word + '}'

        new_title = matcher.sub(_wrap, title)
        return new_title, [w for w in ordered if w in wrapped]

    def _check_title_capitalization(self, entry: "BibEntry") -> "List[SanitizeFix]":
        """
        Wrap acronyms and proper nouns in {} to protect capitalization.

        IEEEtran's .bst forces titles to sentence case.
        Without braces, "SALMONN" becomes "salmonn".
        """
        fixes = []
        if not entry.title:
            return fixes

        title = entry.title
        new_title, protected_words = self._protect_title_words(title)
        if not protected_words:
            return fixes

        fixes.append(SanitizeFix(
            entry_key=entry.key,
            category="title_case",
            field="title",
            description=f"Protected capitalization: {', '.join(protected_words)}",
            old_value=title,
            new_value=new_title,
        ))
        self._store_field(entry, 'title', new_title)
        return fixes

    # ------------------------------------------------------------------
    # Year sanity
    # ------------------------------------------------------------------

    def _check_future_year(self, entry: "BibEntry") -> "List[SanitizeFix]":
        """
        Detect entries with year > current year.

        These are likely arXiv submission dates that will be wrong once
        the paper is published at a conference. Flag them for forced
        API lookup so the correct conference year can be found.

        Years before 1950 are reported as suspicious as well. The year
        itself is never modified here.
        """
        fixes = []
        year_str = str(entry.year).strip()
        # isdecimal() (not isdigit()) so that every accepted string is also
        # accepted by int(); isdigit() lets through characters such as '²'.
        if not year_str.isdecimal():
            return fixes

        year = int(year_str)

        if year > CURRENT_YEAR:
            # Marker attribute set on the entry to request a forced API lookup.
            entry._force_api_lookup = True
            fixes.append(SanitizeFix(
                entry_key=entry.key,
                category="future_year",
                field="year",
                description=(
                    f"Future year {year} detected (current: {CURRENT_YEAR}). "
                    f"Will force API lookup to find correct year."
                ),
                old_value=year_str,
                new_value="",
            ))
        elif year < 1950:
            # NOTE(review): reported under the "future_year" category as in
            # the original; kept in case consumers key on that string.
            fixes.append(SanitizeFix(
                entry_key=entry.key,
                category="future_year",
                field="year",
                description=f"Suspiciously old year: {year}",
                old_value=year_str,
                new_value="",
            ))

        return fixes

    # ------------------------------------------------------------------
    # Field cleanup
    # ------------------------------------------------------------------

    @staticmethod
    def _pop_policy_fields(raw_entry: dict, policy: "List[str]") -> "List[str]":
        """Delete from *raw_entry* every key matching *policy* case-insensitively.

        Returns the removed keys (original spelling) in policy order.
        """
        # Index the raw keys once instead of rescanning them per policy name.
        keys_by_lower = defaultdict(list)
        for raw_key in raw_entry:
            keys_by_lower[raw_key.lower()].append(raw_key)

        removed = []
        for field_name in policy:
            # pop() so a name listed twice in a policy is handled only once.
            for raw_key in keys_by_lower.pop(field_name.lower(), ()):
                if raw_key in ('ID', 'ENTRYTYPE'):
                    continue
                del raw_entry[raw_key]
                removed.append(raw_key)
        return removed

    def _clean_entry_fields(self, entry: "BibEntry") -> "List[SanitizeFix]":
        """
        Remove junk/noise fields that crawlers often include.
        These fields add clutter and can cause formatting issues.

        Which fields are dropped depends on the (lower-cased) entry type,
        see ``FIELD_REMOVE_POLICY``; types without a policy are left alone.
        Only ``entry.raw_entry`` is modified.
        """
        policy = self.FIELD_REMOVE_POLICY.get(entry.entry_type.lower(), [])
        removed_fields = self._pop_policy_fields(entry.raw_entry, policy)
        if not removed_fields:
            return []

        return [SanitizeFix(
            entry_key=entry.key,
            category="field_cleanup",
            field="multiple",
            description=f"Removed junk fields: {', '.join(removed_fields)}",
            old_value=", ".join(removed_fields),
            new_value="",
        )]

    # ------------------------------------------------------------------
    # Duplicate detection
    # ------------------------------------------------------------------

    @staticmethod
    def find_duplicates(entries: "List[BibEntry]") -> dict:
        """
        Find entries that share the same normalized title.

        Normalization drops all braces, lower-cases, turns punctuation into
        spaces and collapses whitespace. Entries with a missing or empty
        title are ignored.

        Returns {normalized_title: [key1, key2, ...]} for duplicates.
        """
        def _norm(t: str) -> str:
            # Remove every brace (also nested ones such as "{{B}}ERT") so
            # brace protection never changes the comparison key.
            t = re.sub(r'[{}]', '', t or '')
            t = re.sub(r'[^\w\s]', ' ', t.lower())
            return re.sub(r'\s+', ' ', t).strip()

        title_map = defaultdict(list)
        for entry in entries:
            normalized = _norm(entry.title)
            if normalized:
                title_map[normalized].append(entry.key)

        return {t: keys for t, keys in title_map.items() if len(keys) > 1}
|
|