Spaces:

voidful
/

RefCheck

Sleeping

App Files Files Community

voidful committed 6 days ago

Commit

11a28db

verified ·

1 Parent(s): ec88be4

Add RefCheck Gradio Space

Browse files

Files changed (25) hide show

.DS_Store +0 -0
.gitattributes +4 -0
.gitignore +24 -0
README.md +213 -6
app.py +87 -0
data/abbr.tsv +35 -0
data/index_shards/index_00.json +3 -0
data/index_shards/index_01.json +3 -0
data/index_shards/index_02.json +3 -0
data/index_shards/index_03.json +3 -0
main.py +561 -0
requirements.txt +7 -0
scripts/build_index.py +143 -0
scripts/refresh_db.sh +19 -0
scripts/update_db.py +131 -0
src/__init__.py +1 -0
src/comparator.py +326 -0
src/fetcher.py +254 -0
src/local_db.py +108 -0
src/normalizer.py +51 -0
src/parser.py +316 -0
src/sanitizer.py +493 -0
src/space_service.py +354 -0
src/ui.py +153 -0
src/utils.py +229 -0

.DS_Store ADDED Viewed

Binary file (8.2 kB). View file

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/index_shards/index_00.json filter=lfs diff=lfs merge=lfs -text
+data/index_shards/index_01.json filter=lfs diff=lfs merge=lfs -text
+data/index_shards/index_02.json filter=lfs diff=lfs merge=lfs -text
+data/index_shards/index_03.json filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,24 @@

+# Python
+__pycache__/
+*.pyc
+*.pyo
+.eggs/
+*.egg-info/
+dist/
+build/
+# Environment
+.env
+.venv/
+venv/
+# IDE
+.idea/
+.vscode/
+*.swp
+# DBLP raw data (regenerate with: python scripts/update_db.py)
+data/raw/
+# Legacy single-file index (replaced by sharded index)
+data/conference_index.json

README.md CHANGED Viewed

@@ -1,13 +1,220 @@
 ---
 title: RefCheck
-emoji: 🌖
-colorFrom: red
 colorTo: indigo
 sdk: gradio
-sdk_version: 6.16.0
-python_version: '3.13'
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: RefCheck
+emoji: 🔍
+colorFrom: blue
 colorTo: indigo
 sdk: gradio
 app_file: app.py
+python_version: 3.11
+suggested_hardware: cpu-basic
+fullWidth: true
+short_description: Upload BibTeX, validate citations, download fixes.
+tags:
+  - bibtex
+  - citations
+  - academic
+  - bibliography
 ---
+# RefCheck 🔍
+> **A Citation Hallucination Detector & Auto-Fixer**
+> Validate and automatically correct your BibTeX bibliography against multiple academic databases.
+[![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
+---
+## Why RefCheck?
+Academic papers often contain citation errors — wrong titles, incorrect authors, mismatched years, or even completely fabricated references (hallucinations from AI tools). **RefCheck** automatically:
+- ✅ **Validates** each citation against 6 academic databases
+- 🔧 **Auto-fixes** metadata mismatches (title, authors, year, DOI)
+- 🗑️ **Removes** unverifiable/hallucinated entries
+- 📊 **Reports** a clear verification summary
+---
+## Features
+### Multi-Source Verification
+RefCheck cross-references your citations against:
+| Source | Lookup Methods |
+|--------|----------------|
+| **arXiv** | arXiv ID, Title search |
+| **CrossRef** | DOI, Title search |
+| **DBLP** | Title search |
+| **Semantic Scholar** | DOI, Title search |
+| **OpenAlex** | DOI, Title search |
+| **Google Scholar** | Title search (disabled by default) |
+### Two-Pass Workflow
+1. **Pass 1 — Validate & Fix**: Checks each entry, auto-corrects metadata, removes invalid citations
+2. **Pass 2 — Verify**: Re-validates the cleaned file to confirm all entries are correct
+---
+## Installation
+```bash
+# Clone the repository
+git clone https://github.com/voidful/RefCheck.git
+cd RefCheck
+# Install dependencies
+pip install -r requirements.txt
+```
+### Requirements
+- Python 3.9+
+- Dependencies: `bibtexparser`, `requests`, `beautifulsoup4`, `rich`, `Unidecode`, `lxml`
+---
+## Usage
+### Hugging Face Space
+This repository is ready to run as a Gradio Space. Create a Hugging Face Space with the Gradio SDK, push these files, and the Space will launch `app.py`.
+The Space UI accepts a `.bib` upload and returns:
+- a corrected BibTeX file
+- a Markdown validation report
+- a list of entries that still need manual review
+### Basic Usage
+```bash
+# Validate and auto-fix a bib file
+python main.py --bib references.bib
+```
+### Command-Line Options
+| Option | Short | Description |
+|--------|-------|-------------|
+| `--bib` | `-b` | Path to your `.bib` file (required) |
+| `--output` | `-o` | Output report path (optional) |
+### Example
+```bash
+# Process your bibliography
+python main.py --bib paper/references.bib
+# With custom output path
+python main.py --bib refs.bib --output validation_report.md
+```
+---
+## How It Works
+```
+┌─────────────────┐
+│  Load .bib file │
+└────────┬────────┘
+         ▼
+┌─────────────────────────────────────────┐
+│  For each entry:                        │
+│  1. Query academic databases            │
+│  2. Compare metadata (title, author, yr)│
+│  3. Calculate confidence score          │
+└────────┬────────────────────────────────┘
+         ▼
+┌─────────────────────────────────────────┐
+│  Decision:                              │
+│  • confidence > 85% → Auto-fix metadata │
+│  • Match found      → Keep as-is        │
+│  • No match         → Remove entry      │
+└────────┬────────────────────────────────┘
+         ▼
+┌─────────────────────────────────────────┐
+│  Save updated .bib file                 │
+│  Run verification pass                  │
+└─────────────────────────────────────────┘
+```
+---
+## Output
+RefCheck displays real-time progress and a final summary:
+```
+📚 BibGuard - Auto-Fix & Verify
+   Target: references.bib
+Found 42 entries. Running validation and auto-fix...
+Validating & Fixing ━━━━━━━━━━━━━━━━━ 100% 42/42 ✓ 38 ⚠ 2 ✗ 2
+✏️  Updates:
+   - Fixed 2 entries (metadata updated)
+   - Removed 2 invalid/hallucinated entries
+✓ File saved.
+🔄 Double checking (Re-validation)...
+Verifying ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 40/40 ✓ 40
+==================================================
+📊 Final Status
+==================================================
+  Total:      40
+  ✓ Verified: 40
+  ⚠ Issues:   0
+  ✗ Not found: 0
+```
+### Status Meanings
+| Symbol | Meaning |
+|--------|---------|
+| ✅ Verified | Entry matches a known publication |
+| ⚠️ Fixed | Metadata was auto-corrected |
+| ❌ Removed | Entry could not be verified (likely hallucination) |
+---
+## Project Structure
+```
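+RefCheck/
+├── app.py               # Gradio Space entry point
+├── main.py              # CLI entry point & workflow orchestration
+├── requirements.txt     # Python dependencies
+├── README.md
+├── data/                # Venue abbreviation table & sharded local index
+├── scripts/             # Index build & database update scripts
+└── src/
+    ├── fetcher.py       # API clients for academic databases
+    ├── comparator.py    # Metadata comparison & scoring
+    ├── local_db.py      # Local conference database lookup
+    ├── normalizer.py
+    ├── parser.py        # BibTeX parsing & saving
+    ├── sanitizer.py     # Offline formatting checks & duplicate detection
+    ├── space_service.py # Service layer used by the Gradio Space
+    ├── ui.py            # Report display & manual review prompts
+    └── utils.py         # Progress display & text utilities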
+```
+---
+## License
+MIT License — see [LICENSE](LICENSE) for details.
+---
+## Contributing
+Contributions are welcome! Please feel free to submit a Pull Request.
+---
+## Acknowledgments
+Built with:
+- [bibtexparser](https://github.com/sciunto-org/python-bibtexparser) for BibTeX handling
+- [Rich](https://github.com/Textualize/rich) for beautiful terminal output
+- APIs from arXiv, CrossRef, DBLP, Semantic Scholar, and OpenAlex

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+import gradio as gr
+from src.space_service import RefCheckOptions, run_refcheck_file
+def _uploaded_path(uploaded: Any) -> str | None:
+    if not uploaded:
+        return None
+    if isinstance(uploaded, str):
+        return uploaded
+    if isinstance(uploaded, dict):
+        return uploaded.get("path") or uploaded.get("name")
+    name = getattr(uploaded, "name", None)
+    if name:
+        return str(name)
+    return None
+def process_bib(
+    uploaded: Any,
+    remove_unverified: bool,
+    enable_google_scholar: bool,
+    max_workers: int,
+) -> tuple[str, str | None, str | None]:
+    file_path = _uploaded_path(uploaded)
+    if not file_path:
+        return "## RefCheck Report\n\nNo BibTeX file was uploaded.", None, None
+    try:
+        options = RefCheckOptions(
+            remove_unverified=remove_unverified,
+            enable_google_scholar=enable_google_scholar,
+            max_workers=int(max_workers),
+        )
+        result = run_refcheck_file(Path(file_path), options)
+        return result.report_markdown, result.fixed_bib_path, result.report_path
+    except Exception as exc:
+        return f"## RefCheck Report\n\nProcessing failed: `{exc}`", None, None
+# Gradio UI definition: a two-column layout with inputs on the left and
+# results on the right. Built at import time so `demo` exists as a
+# module-level object.
+with gr.Blocks(title="RefCheck") as demo:
+    gr.Markdown("# RefCheck")
+    with gr.Row():
+        # Left column: upload widget and run options.
+        with gr.Column(scale=1):
+            bib_file = gr.File(
+                label="BibTeX file",
+                file_types=[".bib", ".txt"],
+                type="filepath",
+            )
+            remove_unverified = gr.Checkbox(
+                label="Remove unverifiable entries",
+                value=True,
+            )
+            enable_google_scholar = gr.Checkbox(
+                label="Google Scholar fallback",
+                value=False,
+            )
+            max_workers = gr.Slider(
+                label="Parallel lookups",
+                minimum=1,
+                maximum=8,
+                step=1,
+                value=4,
+            )
+            run_button = gr.Button("Run RefCheck", variant="primary")
+        # Right column: rendered report plus the two downloadable files.
+        with gr.Column(scale=2):
+            report = gr.Markdown(label="Report")
+            fixed_bib = gr.File(label="Fixed BibTeX")
+            report_file = gr.File(label="Markdown report")
+    # The order of `inputs` and `outputs` must match process_bib's parameter
+    # order and its returned 3-tuple.
+    run_button.click(
+        fn=process_bib,
+        inputs=[bib_file, remove_unverified, enable_google_scholar, max_workers],
+        outputs=[report, fixed_bib, report_file],
+        api_name="refcheck",
+    )
+# Launch only when executed as a script.
+if __name__ == "__main__":
+    demo.queue(default_concurrency_limit=2).launch()

data/abbr.tsv ADDED Viewed

	@@ -0,0 +1,35 @@

+# Pattern (regex, case-insensitive)	Abbreviation
+# Speech & Audio
+.*Interspeech.*	Interspeech
+.*IEEE.*International Conference.*Acoustics.*Speech.*Signal Processing.*	ICASSP
+.*IEEE.*Automatic Speech Recognition.*Understanding.*	ASRU
+.*IEEE Spoken Language Technology.*	SLT
+.*IEEE/ACM Transactions on Audio.*Speech.*Language.*	IEEE/ACM Trans. Audio Speech Lang. Process.
+# ML
+.*International Conference on Machine Learning.*	ICML
+.*Advances in Neural Information Processing.*	NeurIPS
+.*Conference on Neural Information Processing.*	NeurIPS
+.*International Conference on Learning Representations.*	ICLR
+.*AAAI Conference on Artificial Intelligence.*	AAAI
+.*International Joint Conference on Artificial Intelligence.*	IJCAI
+.*IEEE.*Conference on Computer Vision and Pattern Recognition.*	CVPR
+.*European Conference on Computer Vision.*	ECCV
+.*IEEE International Conference on Computer Vision[^a].*	ICCV
+# NLP
+.*Annual Meeting.*Association for Computational Linguistics.*	ACL
+.*Empirical Methods in Natural Language Processing.*	EMNLP
+.*North American Chapter.*Association for Computational Linguistics.*	NAACL
+.*European Chapter.*Association for Computational Linguistics.*	EACL
+.*Findings.*EMNLP.*	Findings of EMNLP
+.*Findings.*ACL.*	Findings of ACL
+.*International Conference on Computational Linguistics.*	COLING
+.*Conference.*Machine Translation.*	WMT
+.*Language Resources and Evaluation.*	LREC
+# IR / Web / Data
+.*ACM.*Information Retrieval.*	SIGIR
+.*Knowledge Discovery.*Data Mining.*	KDD
+.*World Wide Web.*	WWW
+.*Web Search and Data Mining.*	WSDM

data/index_shards/index_00.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2522c57f135bb5c3d581c824cb8538e9f84b786a01a3d0535b52457ef91b227
+size 26214218

data/index_shards/index_01.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3bd3dbc8999aed2796171312da9521a84548b105f32ecb7621e09c97a8c298c7
+size 26214151

data/index_shards/index_02.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:519a53fc56bbeb76feaa0be489bd4cc3727a5261b270846ffd6d1a97d42551b9
+size 26214343

data/index_shards/index_03.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6c32e3cc3dd2809c115e33f37030166e2b66eeb0dab7dd0b81010647f799ec93
+size 25401874

main.py ADDED Viewed

	@@ -0,0 +1,561 @@

+#!/usr/bin/env python3
+"""
+BibGuard - Citation Hallucination Detector
+Validates bibliography entries against multiple academic data sources:
+arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar
+Usage:
+    python main.py --bib references.bib
+    python main.py --bib references.bib --output report.md
+"""
+import argparse
+import sys
+from pathlib import Path
+from datetime import datetime
+from dataclasses import dataclass, field
+from typing import List, Optional
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+import copy
+from src.parser import BibParser
+from src.fetcher import (
+    ArxivFetcher, CrossRefFetcher, DBLPFetcher,
+    SemanticScholarFetcher, OpenAlexFetcher, ScholarFetcher
+)
+from src.comparator import MetadataComparator, EntryReport, resolve_year, CURRENT_YEAR
+from src.sanitizer import BibSanitizer
+from src.local_db import LocalConferenceDB
+from src.ui import BibUI
+from src.utils import ProgressDisplay, TextNormalizer
+@dataclass
+class WorkflowStep:
+    """One lookup step of the validation workflow."""
+    # Identifier that validate_entry matches on (e.g. "arxiv_id", "dblp").
+    name: str
+    # Disabled steps are filtered out by WorkflowConfig.get_enabled_steps.
+    enabled: bool = True
+    # Human-readable label for the step.
+    display_name: str = ""
+    # Sort key for get_enabled_steps; lower values come first.
+    priority: int = 0
+@dataclass
+class WorkflowConfig:
+    steps: List[WorkflowStep] = field(default_factory=list)
+    def get_enabled_steps(self) -> List[WorkflowStep]:
+        return sorted([s for s in self.steps if s.enabled], key=lambda x: x.priority)
+def get_default_workflow() -> WorkflowConfig:
+    return WorkflowConfig(steps=[
+        WorkflowStep("arxiv_id", True, "arXiv by ID", 0),
+        WorkflowStep("crossref_doi", True, "CrossRef by DOI", 1),
+        WorkflowStep("semantic_scholar", True, "Semantic Scholar", 2),
+        WorkflowStep("dblp", True, "DBLP", 3),
+        WorkflowStep("openalex", True, "OpenAlex", 4),
+        WorkflowStep("arxiv_title", True, "arXiv by Title", 5),
+        WorkflowStep("crossref_title", True, "CrossRef by Title", 6),
+        WorkflowStep("google_scholar", False, "Google Scholar", 7),
+    ])
+def main():
+    parser = argparse.ArgumentParser(
+        description="BibGuard: Citation Fixer & Validator",
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("--bib", "-b", required=True, help="Path to .bib file")
+    parser.add_argument("--output", "-o", help="Output report path (optional)")
+    args = parser.parse_args()
+    bib_path = Path(args.bib)
+    if not bib_path.exists():
+        print(f"Error: Bib file not found: {args.bib}")
+        sys.exit(1)
+    workflow = get_default_workflow()
+    try:
+        run_fix_and_verify(bib_path, workflow)
+    except KeyboardInterrupt:
+        print("\nCancelled")
+        sys.exit(130)
+def run_fix_and_verify(bib_path: Path, workflow):
+    """Run validation, auto-fix issues, and verify."""
+    progress = ProgressDisplay()
+    bib_parser = BibParser()
+    ui = BibUI()
+    print(f"📚 BibGuard - Auto-Fix & Verify")
+    print(f"   Target: {bib_path}\n")
+    # --- Pass 1: Validate & Fix ---
+    entries = bib_parser.parse_file(str(bib_path))
+    if not entries:
+        print("No entries found")
+        return
+    print(f"Found {len(entries)} entries. Running validation and auto-fix...\n")
+    # Initialize components
+    fetchers = {
+        'arxiv': ArxivFetcher(),
+        'crossref': CrossRefFetcher(),
+        'scholar': ScholarFetcher(),
+        'semantic': SemanticScholarFetcher(),
+        'openalex': OpenAlexFetcher(),
+        'dblp': DBLPFetcher(),
+    }
+    comparator = MetadataComparator()
+    sanitizer = BibSanitizer()
+    fixed_count = 0
+    updated_entries = []
+    fixed_details = {} # Key: entry_key, Value: list of changes
+    removed_details = [] # List of (entry_key, reason)
+    manual_review_queue = [] # List of (entry, best_result, candidates)
+    # --- Phase 0: Sanitize (Offline Checks) ---
+    print("🧹 Running formatting sanity checks...")
+    sanitize_fixes = sanitizer.sanitize_all(entries)
+    ui.show_sanitize_report(sanitize_fixes)
+    # If sanitization made changes, save immediately so Phase 1 works on clean data
+    if sanitize_fixes:
+        bib_parser.save_entries(str(bib_path), entries)
+        # Merge sanitize fixes into fixed_details for the final report
+        for key, fixes in sanitize_fixes.items():
+            if key not in fixed_details:
+                fixed_details[key] = []
+            for fix in fixes:
+                fixed_details[key].append(fix.description)
+            fixed_count += 1
+    # Duplicate detection
+    dupes = sanitizer.find_duplicates(entries)
+    if dupes:
+        print(f"\n⚠ Found {len(dupes)} duplicate title(s):")
+        for title, keys in dupes.items():
+            print(f"  {' / '.join(keys)}")
+        print()
+    # --- Phase 0.5: Local DB Lookup ---
+    local_db = LocalConferenceDB()
+    local_db_loaded = local_db.load()
+    api_needed_entries = entries  # Default: all entries need API
+    if local_db_loaded:
+        api_needed_entries = []
+        local_matched_count = 0
+        for entry in entries:
+            official = local_db.lookup(entry.title)
+            if official:
+                # Apply local DB fix
+                changes = apply_local_fix(entry, official)
+                if changes:
+                    local_matched_count += 1
+                    if entry.key not in fixed_details:
+                        fixed_details[entry.key] = []
+                    fixed_details[entry.key].extend(changes)
+                    fixed_count += 1
+            else:
+                api_needed_entries.append(entry)
+        if local_matched_count > 0:
+            print(f"  📚 Local DB matched: {local_matched_count}, API needed: {len(api_needed_entries)}")
+            bib_parser.save_entries(str(bib_path), entries)
+    # --- Phase 1: Analysis (API Fetch) ---
+    analysis_results = []
+    with progress.progress_context(len(api_needed_entries), "Analyzing Entries") as prog:
+        with ThreadPoolExecutor(max_workers=min(10, max(1, len(api_needed_entries)))) as executor:
+            futures = {executor.submit(validate_entry, e, workflow, fetchers, comparator): e for e in api_needed_entries}
+            for future in as_completed(futures):
+                entry = futures[future]
+                try:
+                    best_result, candidates = future.result()
+                    analysis_results.append((entry, best_result, candidates))
+                    prog.update(entry.key, "Analyzed", 1)
+                except Exception as e:
+                    prog.mark_error()
+                    prog.update(entry.key, "Failed", 1)
+                    # Keep valid entry even if fetch failed
+                    analysis_results.append((entry, None, []))
+    # --- Phase 2: Meaningful Report ---
+    # Categorize results
+    to_fix = []
+    to_review = []
+    to_remove = []
+    ok_entries = []
+    for entry, best_result, candidates in analysis_results:
+        if not best_result:
+            ok_entries.append(entry)
+            continue
+        # Entries flagged for forced API lookup (e.g., future year) always go to to_fix
+        if getattr(entry, '_force_api_lookup', False) and best_result.fetched_data:
+            to_fix.append((entry, best_result, candidates))
+        elif best_result.confidence > 0.85 and best_result.fetched_data:
+            to_fix.append((entry, best_result, candidates))
+        elif best_result.is_match:
+            ok_entries.append(entry)
+        elif candidates:
+            to_review.append((entry, best_result, candidates))
+        else:
+            to_remove.append(entry)
+    # Visualize Analysis Report
+    ui.show_analysis_report(ok_entries, to_fix, to_review, to_remove)
+    if not (to_fix or to_review or to_remove):
+        return
+    # --- Phase 3: Apply Fixes ---
+    print(f"\n🚀 Applying fixes...")
+    updated_entries = []
+    # Add OK entries first (preserve order if we cared, but we sort later usually)
+    updated_entries.extend(ok_entries)
+    # Process Fixes
+    for entry, best_result, candidates in to_fix:
+        changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates)
+        if changes:
+             fixed_count += 1
+             fixed_details[entry.key] = changes
+        updated_entries.append(entry)
+    # Process Removals
+    for entry in to_remove:
+        removed_details.append((entry, "No matching metadata found in any source"))
+        # Do NOT add to updated_entries
+    # Process Reviews (Add to queue)
+    for item in to_review:
+        manual_review_queue.append(item)
+        updated_entries.append(item[0]) # Add tentatively, filter later if removed
+    # --- Interactive Manual Review ---
+    if manual_review_queue:
+        print(f"\n\n🔍 Manual Review Required for {len(manual_review_queue)} entries:")
+        # Sort by key for consistent order
+        manual_review_queue.sort(key=lambda x: x[0].key)
+        entries_to_remove = set()
+        for entry, best_res, candidates in manual_review_queue:
+            ui.show_manual_review(entry, best_res, candidates, apply_fix)
+            while True:
+                choice = input(f"\nSelect [1-{len(candidates)}], (s)kip, (r)emove, or (q)uit: ").strip().lower()
+                if choice == 'q':
+                    print("Exiting manual review.")
+                    # Keep remaining in queue as is (already in updated_entries)
+                    break
+                elif choice == 's':
+                    print("Skipped.")
+                    break
+                elif choice == 'r':
+                    print("Marked for removal.")
+                    entries_to_remove.add(entry.key)
+                    removed_details.append((entry, "Removed by user during manual review"))
+                    break
+                elif choice.isdigit():
+                    idx = int(choice) - 1
+                    if 0 <= idx < len(candidates):
+                        selected = candidates[idx]
+                        changes = apply_fix(entry, selected.fetched_data)
+                        if changes:
+                            fixed_count += 1
+                            if entry.key not in fixed_details: fixed_details[entry.key] = []
+                            fixed_details[entry.key].extend(changes)
+                            print(f"Applied: {', '.join(changes)}")
+                        else:
+                            print("No changes needed for selected source.")
+                        break
+                    else:
+                        print("Invalid selection.")
+                else:
+                    print("Invalid input.")
+            if choice == 'q':
+                break
+        # Filter out removed entries
+        if entries_to_remove:
+            updated_entries = [e for e in updated_entries if e.key not in entries_to_remove]
+    # Overwrite file if changes made
+    # Overwrite file if changes made (beyond Phase 0 sanitization)
+    has_phase1_changes = any(k not in sanitize_fixes for k in fixed_details) or removed_details
+    if has_phase1_changes or fixed_count > len(sanitize_fixes):
+        bib_parser.save_entries(str(bib_path), updated_entries)
+    # --- Pass 2: Double Check ---
+    print("\n🔄 Double checking (Re-validation)...")
+    entries = bib_parser.parse_file(str(bib_path))
+    reports = []
+    with progress.progress_context(len(entries), "Verifying") as prog:
+         with ThreadPoolExecutor(max_workers=min(10, len(entries))) as executor:
+            # Note: validate_entry now returns tuple, need to handle
+            futures = {executor.submit(validate_entry, e, workflow, fetchers, comparator): e for e in entries}
+            for future in as_completed(futures):
+                entry = futures[future]
+                try:
+                    best_result, _ = future.result() # Ignore candidates in verify pass
+                    reports.append(EntryReport(entry=entry, comparison=best_result))
+                    if best_result.is_match:
+                        prog.mark_success()
+                    else:
+                        prog.mark_error()
+                    prog.update(entry.key, "Verified", 1)
+                except Exception:
+                    prog.mark_error()
+                    prog.update(entry.key, "Failed", 1)
+    # Summary
+    total = len(entries)
+    verified = sum(1 for r in reports if r.comparison and r.comparison.is_match)
+    issues = sum(1 for r in reports if r.comparison and r.comparison.has_issues)
+    not_found = sum(1 for r in reports if r.comparison and not r.comparison.is_match and not r.comparison.has_issues)
+    # Visual Final Status
+    ui.show_final_report(total, verified, issues, not_found, reports, fixed_count, fixed_details, removed_details)
+    print("")
+def apply_local_fix(entry, official) -> list:
+    """
+    Apply fixes from local conference DB (ground truth).
+    Only updates year, booktitle, and entry type — not authors or title,
+    since DBLP data for those may have different formatting conventions.
+    """
+    changes = []
+    # Year: conference year is ground truth
+    if official.year and official.year != entry.year:
+        year_int = int(official.year) if official.year.isdigit() else 0
+        if 1950 <= year_int <= CURRENT_YEAR:
+            changes.append(f"Year: {entry.year} -> {official.year} [local_db]")
+            entry.year = official.year
+    # Entry type upgrade: misc/article → inproceedings if booktitle exists
+    if official.booktitle and entry.entry_type.lower() in ('misc', 'article'):
+        old_type = entry.entry_type
+        entry.entry_type = 'inproceedings'
+        if 'ENTRYTYPE' in entry.raw_entry:
+            entry.raw_entry['ENTRYTYPE'] = 'inproceedings'
+        # Clear journal if it was arXiv
+        if entry.journal and 'arxiv' in entry.journal.lower():
+            entry.journal = ""
+            if 'journal' in entry.raw_entry:
+                del entry.raw_entry['journal']
+        changes.append(f"Type: @{old_type} → @inproceedings [local_db]")
+    # Booktitle: adopt from DB if missing or different
+    if official.booktitle and not entry.booktitle:
+        entry.booktitle = official.booktitle
+        entry.raw_entry['booktitle'] = official.booktitle
+        changes.append(f"Booktitle: [Added] {official.booktitle[:50]}... [local_db]")
+    # DOI: adopt if missing
+    if official.doi and not entry.doi:
+        entry.doi = official.doi
+        entry.raw_entry['doi'] = official.doi
+        changes.append(f"DOI: [Added] {official.doi} [local_db]")
+    return changes
+def apply_fix(entry, data, all_candidates=None) -> list:
+    """Update entry metadata from fetched data. Returns list of changes strings."""
+    changes = []
+    # Helper to clean string
+    def clean(s): return str(s).strip() if s else ""
+    # Title
+    new_title = clean(data.title)
+    if new_title and new_title.lower() != entry.title.lower():
+        changes.append(f"Title: {entry.title} -> {new_title}")
+        entry.title = new_title
+    # Year: Use resolve_year() if we have multiple candidates
+    if all_candidates:
+        best_year, year_src = resolve_year(all_candidates, bib_year=entry.year)
+        if best_year and best_year != entry.year:
+            if int(best_year) > CURRENT_YEAR:
+                changes.append(f"⚠ Skip suspicious future year {best_year} from {year_src}")
+            else:
+                changes.append(f"Year: {entry.year} -> {best_year} [{year_src}]")
+                entry.year = best_year
+    else:
+        # Single candidate fallback
+        new_year = clean(getattr(data, 'year', ''))
+        if new_year and new_year != entry.year:
+            if new_year.isdigit() and int(new_year) > CURRENT_YEAR:
+                changes.append(f"⚠ Skip suspicious future year {new_year}")
+            else:
+                changes.append(f"Year: {entry.year} -> {new_year}")
+                entry.year = new_year
+    # Author: Smart Merge Strategy
+    # Check for author initial conflict first
+    has_initial_conflict = False
+    if all_candidates:
+        for cand in all_candidates:
+            if hasattr(cand, 'author_initial_conflict') and cand.author_initial_conflict:
+                has_initial_conflict = True
+                break
+    if has_initial_conflict:
+        # Don't overwrite authors when initials conflict
+        changes.append(f"⚠ Author initial conflict detected — preserving bib authors")
+    else:
+        # Normal author merge logic
+        current_authors_raw = TextNormalizer.parse_author_list(entry.author)
+        current_authors_norm = [TextNormalizer.normalize_author_name(a) for a in current_authors_raw]
+        new_authors_list = getattr(data, 'authors', [])
+        if isinstance(new_authors_list, str):
+            new_authors_list = TextNormalizer.parse_author_list(new_authors_list)
+        # Strip DBLP disambiguation IDs from new authors
+        new_authors_list = [TextNormalizer.strip_dblp_disambiguation_id(str(a)) for a in new_authors_list]
+        # Also check if the EXISTING bib authors have DBLP disambiguation IDs baked in
+        for raw_auth in current_authors_raw:
+            if TextNormalizer.has_dblp_disambiguation_id(raw_auth.strip()):
+                changes.append(f"⚠ DBLP disambiguation ID detected in author: '{raw_auth.strip()}'")
+        final_authors = []
+        for new_auth in new_authors_list:
+            new_auth_str = str(new_auth).strip()
+            new_auth_norm = TextNormalizer.normalize_author_name(new_auth_str)
+            # Try to find a match in the existing list
+            match_found = False
+            for i, old_norm in enumerate(current_authors_norm):
+                if old_norm == new_auth_norm:
+                    # Found a match! Use the OLD format
+                    final_authors.append(current_authors_raw[i].strip())
+                    match_found = True
+                    break
+            if not match_found:
+                # New author, use the new string
+                final_authors.append(new_auth_str)
+        # Reconstruct the string
+        new_author_str = " and ".join(final_authors)
+        # Check if the result is effectively different from the original full string
+        def simple_norm(s): return s.lower().replace(" ", "").strip()
+        if simple_norm(new_author_str) != simple_norm(entry.author):
+             old_auth = (entry.author[:50] + '...') if len(entry.author) > 50 else entry.author
+             new_auth_disp = (new_author_str[:50] + '...') if len(new_author_str) > 50 else new_author_str
+             changes.append(f"Author: {old_auth} -> {new_auth_disp}")
+             entry.author = new_author_str
+    # Optional fields (doi, journal, etc.)
+    if hasattr(data, 'doi') and data.doi and not entry.doi:
+        changes.append(f"DOI: [Added] {data.doi}")
+        entry.doi = data.doi
+    return changes
def validate_entry(entry, workflow, fetchers, comparator):
    """Check one bib entry against every enabled lookup step of the workflow.

    Each enabled step queries one fetcher (by arXiv id, DOI or title). When
    metadata comes back it is scored with ``comparator.compare``.

    Args:
        entry: bib entry; ``title``, ``doi``, ``has_arxiv`` and ``arxiv_id`` are read.
        workflow: object whose ``get_enabled_steps()`` yields steps with a ``name``.
        fetchers: mapping with the keys 'arxiv', 'crossref', 'semantic', 'dblp',
            'openalex' and 'scholar'.
        comparator: object providing ``compare`` and ``create_unable_result``.

    Returns:
        ``(best_result, all_results)`` where ``best_result`` is the result with
        the highest ``confidence``. When no step produced a result, an "unable"
        result is returned together with an empty list.
    """
    from src.utils import TextNormalizer

    def by_doi_then_title(fetcher):
        # Exact DOI lookup first; fall back to a title search when it finds nothing.
        found = fetcher.fetch_by_doi(entry.doi) if entry.doi else None
        return found if found else fetcher.search_by_title(entry.title)

    def scored(metadata, source):
        # Only compare when the fetcher actually returned something.
        return comparator.compare(entry, metadata, source) if metadata else None

    collected = []
    for step in workflow.get_enabled_steps():
        outcome = None
        if step.name == "arxiv_id" and entry.has_arxiv:
            outcome = scored(fetchers['arxiv'].fetch_by_id(entry.arxiv_id), "arxiv")
        elif step.name == "crossref_doi" and entry.doi:
            metadata = fetchers['crossref'].search_by_doi(entry.doi)
            if metadata:
                # DOI cross-validation: check if the DOI actually resolves to this paper
                from src.sanitizer import BibSanitizer
                doi_fixes = BibSanitizer().check_doi_title_match(entry, metadata)
                # A non-empty fix list means the DOI points to a different work,
                # so this source is skipped. Per the original note the sanitizer
                # is expected to have cleared the bad DOI (not visible here).
                if not doi_fixes:
                    outcome = comparator.compare(entry, metadata, "crossref")
        elif step.name == "semantic_scholar" and entry.title:
            outcome = scored(by_doi_then_title(fetchers['semantic']), "semantic_scholar")
        elif step.name == "dblp" and entry.title:
            outcome = scored(fetchers['dblp'].search_by_title(entry.title), "dblp")
        elif step.name == "openalex" and entry.title:
            outcome = scored(by_doi_then_title(fetchers['openalex']), "openalex")
        elif step.name == "arxiv_title" and entry.title:
            candidates = fetchers['arxiv'].search_by_title(entry.title)
            if candidates:
                wanted = TextNormalizer.normalize_for_comparison(entry.title)
                top, top_score = None, 0
                for candidate in candidates:
                    score = TextNormalizer.similarity_ratio(
                        wanted, TextNormalizer.normalize_for_comparison(candidate.title)
                    )
                    if score > top_score:
                        top, top_score = candidate, score
                # Reject weak title hits outright instead of scoring them.
                if top and top_score > 0.5:
                    outcome = comparator.compare(entry, top, "arxiv")
        elif step.name == "crossref_title" and entry.title:
            outcome = scored(fetchers['crossref'].search_by_title(entry.title), "crossref")
        elif step.name == "google_scholar" and entry.title:
            outcome = scored(fetchers['scholar'].search_by_title(entry.title), "scholar")
        if outcome:
            collected.append(outcome)

    if not collected:
        # No results
        return comparator.create_unable_result(entry, "Not found in any data source"), []
    return max(collected, key=lambda r: r.confidence), collected
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+bibtexparser>=1.4.0
+requests>=2.31.0
+beautifulsoup4>=4.12.0
+rich>=13.7.0
+Unidecode>=1.3.0
+lxml>=5.0.0
+gradio>=4.44.0

scripts/build_index.py ADDED Viewed

	@@ -0,0 +1,143 @@

+#!/usr/bin/env python3
+"""
+Build a title-based index from downloaded DBLP bib files.
+Reads all .bib files in data/raw/ and produces sharded JSON files
+under data/index_shards/ (~25MB each) for GitHub-friendly storage.
+Usage:
+    python scripts/build_index.py
+"""
+import json
+import os
+import re
+import shutil
+import sys
+from pathlib import Path
+try:
+    import bibtexparser
+    from bibtexparser.bparser import BibTexParser
+    from bibtexparser.customization import convert_to_unicode
+except ImportError:
+    print("Error: bibtexparser required. Install: pip install bibtexparser")
+    sys.exit(1)
# Size budget for one JSON shard; write_shards converts it to bytes.
MAX_SHARD_MB = 25  # Target shard size in MB
def normalize_title(title: str) -> str:
    """Return the lookup key for a title.

    Brace groups are unwrapped, the text is lower-cased, every character that
    is neither a word character nor whitespace becomes a space, and runs of
    whitespace are collapsed to a single space.
    """
    # Unwrap BibTeX brace groups first so "{B}ERT" stays one word.
    unbraced = re.sub(r'\{([^}]*)\}', r'\1', title)
    depunctuated = re.sub(r'[^\w\s]', ' ', unbraced.lower())
    collapsed = re.sub(r'\s+', ' ', depunctuated)
    return collapsed.strip()
def write_shards(index: dict, shard_dir: Path):
    """Write ``index`` as numbered JSON shards and return how many were written.

    ``shard_dir`` is deleted and recreated first, so shards from a previous
    run never survive. Entries are packed in iteration order. A shard is
    closed once adding the next entry would push the running size estimate
    past ``MAX_SHARD_MB``.

    Args:
        index: mapping of normalized title -> record (JSON-serialisable).
        shard_dir: directory that receives ``index_NN.json`` files.

    Returns:
        Number of shard files written.
    """
    if shard_dir.exists():
        shutil.rmtree(shard_dir)
    shard_dir.mkdir(parents=True)

    byte_limit = MAX_SHARD_MB * 1024 * 1024

    def dump(number, pairs):
        # Serialise one shard and report its real on-disk size.
        target = shard_dir / f"index_{number:02d}.json"
        target.write_text(
            json.dumps(dict(pairs), ensure_ascii=False),
            encoding="utf-8"
        )
        mb = target.stat().st_size / 1024 / 1024
        print(f"  ✓ index_{number:02d}.json: {len(pairs):,} entries ({mb:.1f} MB)")

    written = 0
    pending, pending_bytes = [], 0
    for key, val in index.items():
        # Size of the entry serialised on its own: an estimate of its share of the shard.
        cost = len(json.dumps({key: val}, ensure_ascii=False).encode('utf-8'))
        if pending and pending_bytes + cost > byte_limit:
            dump(written, pending)
            written += 1
            pending, pending_bytes = [], 0
        pending.append((key, val))
        pending_bytes += cost

    # Last shard
    if pending:
        dump(written, pending)
        written += 1
    return written
def main():
    """Build the sharded title index from every ``.bib`` file in ``data/raw``.

    Exits with status 1 when the raw directory is missing or holds no ``.bib``
    files. Files that fail to parse are skipped and counted. When several
    entries share a normalized title, the first one seen is kept.
    """
    data_dir = Path(__file__).resolve().parent.parent / "data"
    raw_dir = data_dir / "raw"
    shard_dir = data_dir / "index_shards"

    if not raw_dir.exists():
        print(f"Error: {raw_dir} not found. Run: python scripts/update_db.py first")
        sys.exit(1)
    bib_files = sorted(raw_dir.glob("*.bib"))
    if not bib_files:
        print(f"No .bib files found in {raw_dir}")
        sys.exit(1)

    print(f"📦 Building index from {len(bib_files)} bib files...")
    # Fields copied verbatim from the bib entry (missing ones become "").
    copied_fields = ("author", "year", "booktitle", "journal", "doi", "url", "pages", "volume")
    index = {}
    skipped_files = 0
    for bib_file in bib_files:
        try:
            parser = BibTexParser(common_strings=True)
            parser.customization = convert_to_unicode
            with open(bib_file, encoding="utf-8", errors="replace") as f:
                db = bibtexparser.load(f, parser=parser)
        except Exception as e:
            print(f"  ⚠ Skip {bib_file.name}: {e}")
            skipped_files += 1
            continue

        for entry in db.entries:
            title = entry.get("title", "")
            key = normalize_title(title) if title else ""
            if not key or key in index:
                continue
            # Key order is kept stable because it determines the JSON output.
            record = {"title": title.rstrip('.')}
            record.update((name, entry.get(name, "")) for name in copied_fields)
            record["_type"] = entry.get("ENTRYTYPE", "inproceedings")
            record["_source"] = bib_file.stem
            index[key] = record

    print(f"\n📂 Writing sharded index...")
    n_shards = write_shards(index, shard_dir)
    total_bytes = sum(f.stat().st_size for f in shard_dir.glob("*.json"))
    total_mb = total_bytes / 1024 / 1024
    print(f"\n✅ Index: {len(index):,} unique entries → {n_shards} shards ({total_mb:.1f} MB total)")
    print(f"   Saved to: {shard_dir}/")
    if skipped_files:
        print(f"   ⚠ {skipped_files} file(s) skipped due to parse errors")
# Build the index only when this file is executed as a script.
if __name__ == "__main__":
    main()

scripts/refresh_db.sh ADDED Viewed

	@@ -0,0 +1,19 @@

#!/bin/bash
# Refresh the local DBLP conference database.
# Run this before paper submission to ensure the DB is up to date.
# Abort on the first failing step so a failed download never feeds the index build.
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
echo "🔄 Refreshing conference database..."
# Order matters:
#   1. update_db.py   - download bib files that are not on disk yet
#   2. build_index.py - rebuild the index from the downloaded files
for step in update_db.py build_index.py; do
    python "$SCRIPT_DIR/$step"
done
echo ""
echo "✅ DB refreshed."
echo "   Run: python main.py --bib your_paper.bib"

scripts/update_db.py ADDED Viewed

	@@ -0,0 +1,131 @@

+#!/usr/bin/env python3
+"""
+Download conference/journal proceedings from DBLP as BibTeX files.
+Uses the DBLP venue-based search API which is more reliable than
+the TOC-based .bht queries (which often return 404 or single entries).
+API format:
+  https://dblp.org/search/publ/api
+    ?q=venue:{VenueName}: year:{year}:
+    &h=1000        # max results per batch
+    &f={offset}    # pagination offset
+    &format=bib1   # BibTeX format
+Usage:
+    python scripts/update_db.py
+"""
+import requests
+import time
+import sys
+from pathlib import Path
# Endpoint of the DBLP publication search API queried by download_venue.
DBLP_API = "https://dblp.org/search/publ/api"
# (dblp_venue_name, output_prefix, years)
# dblp_venue_name: exact venue string used in DBLP's venue: filter
# output_prefix:   filename prefix for saved .bib files
# years:           iterable of years; range(2018, 2027) covers 2018 through 2026
CONFERENCES = [
    # ── Speech & Audio ──────────────────────────────────────────
    ("INTERSPEECH",  "interspeech",  range(2018, 2027)),
    ("ICASSP",       "icassp",       range(2018, 2027)),
    ("ASRU",         "asru",         [2019, 2021, 2023, 2025]),
    ("SLT",          "slt",          [2018, 2021, 2022, 2024]),
    # ── ML / AI ─────────────────────────────────────────────────
    ("ICML",         "icml",         range(2018, 2027)),
    ("NeurIPS",      "neurips",      range(2017, 2027)),
    ("ICLR",         "iclr",         range(2018, 2027)),
    ("AAAI",         "aaai",         range(2018, 2027)),
    ("IJCAI",        "ijcai",        range(2018, 2027)),
    ("CVPR",         "cvpr",         range(2018, 2027)),
    ("ECCV",         "eccv",         [2018, 2020, 2022, 2024]),
    ("ICCV",         "iccv",         [2019, 2021, 2023, 2025]),
    # ── NLP ─────────────────────────────────────────────────────
    ("ACL",          "acl",          range(2018, 2027)),       # includes Findings
    ("EMNLP",        "emnlp",        range(2018, 2027)),       # includes Findings
    ("NAACL",        "naacl",        range(2018, 2027)),
    ("EACL",         "eacl",         range(2018, 2027)),
    ("LREC/COLING",  "coling",       [2024, 2025]),
    # Older COLING uses different venue
    # ("COLING",       "coling",       [2018, 2020, 2022]),
    # ── IR / Web / Data ─────────────────────────────────────────
    ("SIGIR",        "sigir",        range(2018, 2027)),
    ("KDD",          "kdd",          range(2018, 2027)),
    ("WWW",          "www",          range(2018, 2027)),
    ("WSDM",         "wsdm",         range(2018, 2027)),
]
# Journals use venue search too
# Same (dblp_venue_name, output_prefix, years) layout as CONFERENCES.
JOURNALS = [
    ("IEEE ACM Trans Audio Speech Lang Process",  "taslp",  range(2018, 2027)),
    ("Trans. Assoc. Comput. Linguistics",         "tacl",   range(2018, 2027)),
]
def download_venue(venue_name: str, prefix: str, year: int, out_dir: Path):
    """Download one venue/year from DBLP into ``out_dir/{prefix}{year}.bib``.

    Results are fetched in pages of 1000 via the venue search API and joined
    into a single file. Nothing is downloaded when the output file already
    exists, so a written file is treated as complete on later runs.

    Args:
        venue_name: exact venue string for DBLP's ``venue:`` filter.
        prefix: filename prefix of the output file.
        year: publication year to request.
        out_dir: directory that receives the ``.bib`` file.

    Returns:
        None. Progress and failures are reported on stdout. On any request
        failure nothing is written.
    """
    out_file = out_dir / f"{prefix}{year}.bib"
    if out_file.exists():
        return  # Skip if already downloaded

    query = f"venue:{venue_name}: year:{year}:"
    page_size = 1000
    all_bib = []
    offset = 0
    while True:
        try:
            r = requests.get(DBLP_API, params={
                "q": query, "h": page_size, "f": offset,
                "format": "bib1",
            }, timeout=30, headers={"User-Agent": "BibGuard/1.0"})
            # An HTTP error (rate limit, server error) must not be mistaken for
            # "no more results". Otherwise a partial download would be written
            # and, because existing files are skipped, never repaired.
            r.raise_for_status()
            text = r.text.strip()
        except requests.RequestException as e:
            print(f"  ✗ {prefix}{year}: network error ({e})")
            return

        # Check for HTML error pages
        if not text or "<!DOCTYPE" in text[:100] or "@" not in text:
            break
        all_bib.append(text)
        # "@" occurrences approximate the entry count; a short page ends pagination.
        if text.count("@") < page_size:
            break
        offset += page_size
        time.sleep(1)

    if all_bib:
        total = sum(b.count("@") for b in all_bib)
        out_file.write_text("\n\n".join(all_bib), encoding="utf-8")
        print(f"  ✓ {prefix}{year}: {total} entries")
    else:
        print(f"  ✗ {prefix}{year}: not on DBLP yet")
def main():
    """Download every configured conference and journal year into ``data/raw``.

    The output directory is created when missing. Each venue/year is handed to
    ``download_venue``, with a half-second pause between requests.
    """
    out = Path(__file__).resolve().parent.parent / "data" / "raw"
    out.mkdir(parents=True, exist_ok=True)

    # Both tables share one layout, so a single loop handles them.
    batches = (
        ("📥 Downloading conference proceedings from DBLP...", CONFERENCES),
        ("\n📥 Downloading journal volumes from DBLP...", JOURNALS),
    )
    for banner, venues in batches:
        print(banner)
        for venue, prefix, years in venues:
            for y in years:
                download_venue(venue, prefix, y, out)
                time.sleep(0.5)
    print("\n✅ Done. Run: python scripts/build_index.py")
# Start the download only when this file is executed as a script.
if __name__ == "__main__":
    main()

src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Bibliography Checker Package"""

src/comparator.py ADDED Viewed

	@@ -0,0 +1,326 @@

+"""
+Metadata comparison between bib entries and fetched metadata.
+"""
+from datetime import datetime
+from dataclasses import dataclass
+from typing import Optional, List, Union, Any, Tuple
+from .parser import BibEntry
+from .utils import TextNormalizer
# Evaluated once at import time; resolve_year uses it to reject future years.
CURRENT_YEAR = datetime.now().year
# Year source priority: lower number = more trustworthy
# Sources missing from this table are ranked 50 by resolve_year.
YEAR_SOURCE_PRIORITY = {
    "crossref":          0,   # DOI-verified, most accurate
    "dblp":              1,   # Conference proceedings
    "openalex":          2,
    "semantic_scholar":   3,
    "arxiv_journal_ref": 4,   # arXiv's journal_ref field
    "scholar":           5,
    "arxiv":             99,  # arXiv submission date — last resort
}
def resolve_year(candidates: list, bib_year: str = "") -> Tuple[Optional[str], Optional[str]]:
    """
    Pick the most trustworthy publication year among the candidate results.

    Every candidate with an all-digit ``fetched_data.year`` contributes that
    year at the priority of its source (unknown sources rank 50). An arXiv
    candidate that also carries an all-digit ``conference_year`` contributes it
    as a separate "arxiv_journal_ref" entry. The lowest-priority-number year
    that is not in the future wins.

    Args:
        candidates: list of ComparisonResult objects (falsy items are ignored)
        bib_year: accepted for interface compatibility; not read by this function
    Returns:
        (best_year, best_source) or (None, None) when no usable year exists
    """
    ranked = []
    for cand in candidates:
        if not cand or not cand.fetched_data:
            continue
        origin = cand.source
        year_text = str(getattr(cand.fetched_data, 'year', '') or '').strip()
        if not year_text or not year_text.isdigit():
            continue
        # Check for conference_year from arXiv journal_ref
        conf_text = str(getattr(cand.fetched_data, 'conference_year', '') or '').strip()
        if origin == "arxiv" and conf_text and conf_text.isdigit():
            ranked.append((YEAR_SOURCE_PRIORITY.get("arxiv_journal_ref", 4), conf_text, "arxiv_journal_ref"))
        ranked.append((YEAR_SOURCE_PRIORITY.get(origin, 50), year_text, origin))

    # Tuples sort by priority first, then by year text, then by source name.
    ranked.sort()
    # First non-future year in priority order; (None, None) when there is none.
    return next(
        ((year, origin) for _, year, origin in ranked if int(year) <= CURRENT_YEAR),
        (None, None),
    )
@dataclass
class ComparisonResult:
    """Outcome of checking one bib entry against one metadata record."""
    # Key of the bib entry this result belongs to
    entry_key: str

    # Title: match verdict, similarity score, and both compared titles
    title_match: bool
    title_similarity: float
    bib_title: str
    fetched_title: str

    # Authors: match verdict, similarity score, and both author lists
    author_match: bool
    author_similarity: float
    bib_authors: list[str]
    fetched_authors: list[str]

    # Year: match verdict and both year strings
    year_match: bool
    bib_year: str
    fetched_year: str

    # Overall verdict, confidence score, issue messages, and data source label
    is_match: bool
    confidence: float
    issues: list[str]
    source: str

    # Fetched metadata object, kept for auto-fixing
    fetched_data: Any = None
    # Set when first-name initials conflict between bib and fetched authors
    author_initial_conflict: bool = False

    @property
    def has_issues(self) -> bool:
        """True when at least one issue message was recorded."""
        return len(self.issues) != 0
@dataclass
class EntryReport:
    """Bundles a bib entry with its comparison result and any evaluations."""
    entry: BibEntry
    comparison: Optional[ComparisonResult]
    # None means "not provided"; replaced by a fresh list per instance below.
    evaluations: list = None

    def __post_init__(self):
        # A list literal as the default would be shared between instances.
        self.evaluations = [] if self.evaluations is None else self.evaluations
class MetadataComparator:
    """Compares bibliography entries with fetched metadata."""

    # Minimum similarity for a title / an author list to count as matching
    TITLE_THRESHOLD = 0.8
    AUTHOR_THRESHOLD = 0.6

    def __init__(self):
        # The class itself is stored (not an instance); its helpers are called on it.
        self.normalizer = TextNormalizer

    def compare(self, bib_entry: BibEntry, fetched_data: Any, source_name: str) -> ComparisonResult:
        """
        Generic comparison method for any data source.

        Args:
            bib_entry: entry to check; ``key``, ``title``, ``author`` and
                ``year`` are read.
            fetched_data: metadata object with a ``title`` attribute. ``authors``
                (a list, or a comma-separated string) and ``year`` are read
                when present; missing or ``None`` values count as empty.
            source_name: label of the data source; stored on the result and
                used in the year-mismatch message.

        Returns:
            ComparisonResult with per-field verdicts, the issue messages, and a
            confidence of 0.5*title + 0.3*author + 0.2*(1.0 if the years are
            equal else 0.5), capped at 0.7 on an author-initial conflict.
        """
        normalizer = self.normalizer
        issues = []

        # --- Title Comparison ---
        bib_title_norm = normalizer.normalize_for_comparison(bib_entry.title)
        fetched_title_norm = normalizer.normalize_for_comparison(fetched_data.title)
        title_similarity = normalizer.similarity_ratio(bib_title_norm, fetched_title_norm)
        if len(bib_title_norm) < 100:
            # Titles under 100 characters also get the Levenshtein-based score;
            # the better of the two is kept.
            lev_sim = normalizer.levenshtein_similarity(bib_title_norm, fetched_title_norm)
            title_similarity = max(title_similarity, lev_sim)
        title_match = title_similarity >= self.TITLE_THRESHOLD
        if not title_match:
            issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")

        # --- Author Comparison ---
        bib_authors = normalizer.normalize_author_list(bib_entry.author)
        # Parsed once; reused for the disambiguation check and the conflict check.
        raw_author_list = normalizer.parse_author_list(bib_entry.author)
        # Check for DBLP disambiguation IDs in bib entry author names
        for raw_auth in raw_author_list:
            stripped = raw_auth.strip()
            if normalizer.has_dblp_disambiguation_id(stripped):
                issues.append(f"DBLP disambiguation ID in author: '{stripped}'")

        # Handle different author formats (list vs string); None counts as no authors
        fetched_authors_raw = getattr(fetched_data, 'authors', []) or []
        if isinstance(fetched_authors_raw, str):
            # Scholar style: "Author1, Author2"
            fetched_authors_raw = [a.strip() for a in fetched_authors_raw.split(',')]
        fetched_authors = [
            normalizer.normalize_author_name(str(a))
            for a in fetched_authors_raw
        ]
        author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
        author_match = author_similarity >= self.AUTHOR_THRESHOLD
        if not author_match:
            issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")

        # --- Year Comparison ---
        # `or ''` keeps a missing (None) year from turning into the text "None",
        # which would otherwise be reported as a year mismatch.
        bib_year = str(bib_entry.year or '').strip()
        fetched_year = str(getattr(fetched_data, 'year', '') or '').strip()
        year_match = bib_year == fetched_year
        if not year_match and bib_year and fetched_year:
            issues.append(f"Year mismatch: bib={bib_year}, {source_name}={fetched_year}")

        # --- Overall Assessment ---
        is_match = title_match and author_match
        # Simple weighted confidence score
        confidence = (
            title_similarity * 0.5 +
            author_similarity * 0.3 +
            (1.0 if year_match else 0.5) * 0.2
        )

        # --- Author Initial Conflict Detection ---
        author_initial_conflict = self._check_author_initial_conflict(
            bib_authors, fetched_authors,
            raw_author_list,
            fetched_authors_raw
        )
        if author_initial_conflict:
            issues.append("Author initial conflict detected (e.g., first-name initials differ)")
            # Cap confidence — don't auto-adopt these authors
            confidence = min(confidence, 0.7)

        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=title_match,
            title_similarity=title_similarity,
            bib_title=bib_entry.title,
            fetched_title=fetched_data.title,
            author_match=author_match,
            author_similarity=author_similarity,
            bib_authors=bib_authors,
            fetched_authors=fetched_authors,
            year_match=year_match,
            bib_year=bib_year,
            fetched_year=fetched_year,
            is_match=is_match,
            confidence=confidence,
            issues=issues,
            source=source_name,
            fetched_data=fetched_data,
            author_initial_conflict=author_initial_conflict
        )

    def create_unable_result(self, bib_entry: BibEntry, reason: str = "Unable to fetch metadata") -> ComparisonResult:
        """Create result when metadata couldn't be fetched.

        The result has zero similarity and confidence, ``source="unable"`` and
        ``reason`` as its only issue.
        """
        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=False, title_similarity=0.0,
            bib_title=bib_entry.title, fetched_title="",
            author_match=False, author_similarity=0.0,
            bib_authors=self.normalizer.normalize_author_list(bib_entry.author), fetched_authors=[],
            year_match=False, bib_year=bib_entry.year, fetched_year="",
            is_match=False, confidence=0.0,
            issues=[reason], source="unable",
            fetched_data=None
        )

    def _compare_author_lists(self, list1: list[str], list2: list[str]) -> float:
        """Return how well the authors of ``list1`` are covered by ``list2``.

        Each name in ``list1`` scores 1.0 when some name in ``list2`` matches it
        (see ``_names_match``), otherwise its best similarity ratio. The result
        is the mean over ``list1``: 1.0 when both lists are empty, 0.0 when
        exactly one is.
        """
        if not list1 and not list2:
            return 1.0
        if not list1 or not list2:
            return 0.0
        total_similarity = 0.0
        for author1 in list1:
            best_match = 0.0
            for author2 in list2:
                if self._names_match(author1, author2):
                    best_match = 1.0
                    break
                sim = self.normalizer.similarity_ratio(author1, author2)
                best_match = max(best_match, sim)
            total_similarity += best_match
        return total_similarity / len(list1)

    def _names_match(self, name1: str, name2: str) -> bool:
        """Check if two names match (handles abbreviated names).

        Names are lower-cased, dots are dropped, and the text is split on
        whitespace. The last words must be equal. When both names have more
        than one word, the first words must be equal, or share their first
        letter if either one is a single letter.
        """
        words1 = name1.lower().replace('.', '').split()
        words2 = name2.lower().replace('.', '').split()
        if not words1 or not words2:
            return False
        # Last name must match (assuming last word is last name)
        if words1[-1] != words2[-1]:
            return False
        # First name check:
        if len(words1) > 1 and len(words2) > 1:
            f1 = words1[0]
            f2 = words2[0]
            if len(f1) == 1 or len(f2) == 1:
                # If one is just an initial, compare first letters only
                if f1[0] != f2[0]:
                    return False
            elif f1 != f2:
                # Both full names - must match
                return False
        return True

    def _check_author_initial_conflict(
        self,
        bib_authors_norm: list[str],
        fetched_authors_norm: list[str],
        bib_authors_raw: list[str],
        fetched_authors_raw: list,
    ) -> bool:
        """
        Detect when first-name initials clearly conflict between
        bib entry and fetched data.
        e.g., "Y. Zhou" (bib) vs "Henry Zhou" (fetched) → True (Y ≠ H)
        This prevents blindly overwriting authors with wrong names.

        Authors are compared position by position over the shorter list. A pair
        is considered only when both names have at least two words and equal
        last words. The two ``*_raw`` arguments are accepted but not read.
        """
        # Compare by position — aligned authors (zip stops at the shorter list)
        for bib_name, fetched_name in zip(bib_authors_norm, fetched_authors_norm):
            bib_parts = bib_name.split()
            fetched_parts = fetched_name.split()
            if len(bib_parts) < 2 or len(fetched_parts) < 2:
                continue
            # Last name must match to consider this a potential conflict
            if bib_parts[-1] != fetched_parts[-1]:
                continue
            # If initials differ, it's a conflict
            if bib_parts[0][0] != fetched_parts[0][0]:
                return True
        return False

src/fetcher.py ADDED Viewed

	@@ -0,0 +1,254 @@

+"""
+Unified metadata fetchers for BibGuard.
+"""
+import re
+import time
+import random
+import requests
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass
+from typing import Optional, Any
+from urllib.parse import quote
+from bs4 import BeautifulSoup
+@dataclass
+class FetchResult:
+    """Unified fetch result."""
+    title: str = ""
+    authors: list[str] | str = ""
+    year: str = ""
+    doi: str = ""
+    url: str = ""
+    source: str = ""
+    conference_year: str = ""   # Year from journal_ref / conference proceedings
+    year_source: str = ""      # Where the year came from
+    def __post_init__(self):
+        if self.authors is None: self.authors = []
+        if isinstance(self.authors, str) and self.authors:
+            # Simple split if string provided
+            self.authors = [a.strip() for a in re.split(r',| and ', self.authors) if a.strip()]
class BaseFetcher:
    """Shared helper for fetchers that pace their outgoing requests."""

    def _rate_limit(self, delay: float, last_time: float) -> float:
        """Sleep until ``delay`` seconds have passed since ``last_time``.

        Returns the current time after any sleep, for the caller to store as
        the new "last request" timestamp.
        """
        remaining = delay - (time.time() - last_time)
        if remaining > 0:
            time.sleep(remaining)
        return time.time()
class ArxivFetcher(BaseFetcher):
    """Fetches metadata from arXiv API."""
    API_BASE = "http://export.arxiv.org/api/query"
    # XML namespaces of the Atom feed returned by the API
    _NS = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}

    def __init__(self):
        self._last_req = 0.0

    def fetch_by_id(self, arxiv_id: str) -> Optional[FetchResult]:
        """Look up one paper by arXiv id (an ``arXiv:`` prefix is stripped).

        Returns None when the request fails or the feed holds no usable entry.
        """
        self._last_req = self._rate_limit(3.0, self._last_req)
        clean_id = re.sub(r'^arXiv:', '', arxiv_id, flags=re.IGNORECASE).strip()
        try:
            resp = requests.get(self.API_BASE, params={'id_list': clean_id, 'max_results': 1}, timeout=30)
            return self._parse(resp.text)
        except Exception:
            # Deliberately best-effort: a failed lookup must not abort validation.
            return None

    def search_by_title(self, title: str) -> list[FetchResult]:
        """Search by title phrase; returns up to 3 results, [] on failure."""
        self._last_req = self._rate_limit(3.0, self._last_req)
        clean = re.sub(r'[^\w\s]', ' ', title).strip()
        try:
            resp = requests.get(self.API_BASE, params={'search_query': f'ti:"{clean}"', 'max_results': 3}, timeout=30)
            return self._parse(resp.text, multiple=True)
        except Exception:
            # Deliberately best-effort, see fetch_by_id.
            return []

    def _parse(self, xml: str, multiple=False) -> Optional[FetchResult] | list[FetchResult]:
        """Parse an Atom feed into FetchResult objects.

        Args:
            xml: feed text.
            multiple: return the full list instead of the first result.

        Returns:
            With ``multiple``: a list (possibly empty). Otherwise the first
            result or None. Unparseable XML yields the empty value.
        """
        empty = [] if multiple else None
        try:
            root = ET.fromstring(xml)
        except (ET.ParseError, TypeError, ValueError):
            return empty

        ns = self._NS
        results = []
        for entry in root.findall('atom:entry', ns):
            # findtext tolerates missing elements, so one incomplete entry no
            # longer throws away every other entry of the feed.
            # Atom text may contain line breaks and runs of spaces; collapse them.
            title = ' '.join((entry.findtext('atom:title', default='', namespaces=ns) or '').split())
            if not title:
                continue  # nothing to compare against
            names = (a.findtext('atom:name', namespaces=ns) for a in entry.findall('atom:author', ns))
            authors = [name for name in names if name]
            published = entry.findtext('atom:published', default='', namespaces=ns) or ''
            # Extract conference year from journal_ref if available
            conference_year = ""
            journal_ref = (entry.findtext('arxiv:journal_ref', default='', namespaces=ns) or '').strip()
            if journal_ref:
                found = re.search(r'\b(19|20)\d{2}\b', journal_ref)
                if found:
                    conference_year = found.group(0)
            results.append(FetchResult(
                title=title,
                authors=authors,
                year=published[:4],
                doi=entry.findtext('arxiv:doi', default='', namespaces=ns) or '',
                url=entry.findtext('atom:id', default='', namespaces=ns) or '',
                source="arxiv",
                conference_year=conference_year,
                year_source="arxiv_journal_ref" if conference_year else "arxiv_submission",
            ))

        if multiple:
            return results
        return results[0] if results else None
class CrossRefFetcher(BaseFetcher):
    """Client for the CrossRef works API."""
    API_BASE = "https://api.crossref.org/works"

    def __init__(self, email=None):
        self._last_req = 0.0
        # The contact address is embedded in the User-Agent sent with each request.
        self.headers = {'User-Agent': f'BibGuard/1.0 (mailto:{email or "user@example.com"})'}

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top bibliographic hit for ``title``, or None on no hit or any error."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(
                self.API_BASE,
                params={'query.bibliographic': title, 'rows': 1},
                headers=self.headers,
                timeout=10,
            )
            items = resp.json()['message']['items']
            return self._parse(items[0]) if items else None
        except Exception:
            return None

    def search_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the record registered for ``doi``, or None on any error."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(f"{self.API_BASE}/{quote(doi)}", headers=self.headers, timeout=10)
            message = resp.json()['message']
            return self._parse(message)
        except Exception:
            return None

    def _parse(self, item: dict) -> FetchResult:
        """Convert one CrossRef work item into a FetchResult.

        Raises on an unexpected item shape (e.g. an empty title list); both
        public callers turn that into None.
        """
        title = item.get('title', [''])[0]
        authors = []
        for person in item.get('author', []):
            authors.append(f"{person.get('given','')} {person.get('family','')}".strip())
        # NOTE(review): only 'published-print' is consulted, so items without a
        # print date get an empty year. Confirm this is intended.
        print_date = item.get('published-print', {})
        first_part = print_date.get('date-parts', [[None]])[0]
        year = str(first_part[0] or "")
        return FetchResult(
            title=title,
            authors=authors,
            year=year,
            doi=item.get('DOI', ''),
            url=item.get('URL', ''),
            source="crossref",
        )
class DBLPFetcher(BaseFetcher):
    """Client for the DBLP publication search API."""
    API_BASE = "https://dblp.org/search/publ/api"
    # DBLP disambiguation ID: 4-digit suffix appended to author names
    # e.g. "Tian Tan 0019", "Wei Li 0119"
    _DISAMBIG_RE = re.compile(r'\s+\d{4}\s*$')

    def __init__(self):
        self._last_req = 0.0

    @staticmethod
    def _strip_disambig(name: str) -> str:
        """Return ``name`` without a trailing 4-digit disambiguation suffix."""
        cleaned = DBLPFetcher._DISAMBIG_RE.sub('', name)
        return cleaned.strip()

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top DBLP hit for ``title``, or None on no hit or any error.

        Author names are returned without disambiguation suffixes, and a
        trailing period is removed from the title.
        """
        self._last_req = self._rate_limit(1.0, self._last_req)
        try:
            resp = requests.get(self.API_BASE, params={'q': title, 'format': 'json', 'h': 1}, timeout=10)
            hits = resp.json().get('result', {}).get('hits', {}).get('hit', [])
            if not hits:
                return None
            info = hits[0]['info']
            authors = info.get('authors', {}).get('author', [])
            # The author field is either one object or a list of objects.
            if isinstance(authors, dict):
                authors = [self._strip_disambig(authors.get('text', ''))]
            elif isinstance(authors, list):
                authors = [self._strip_disambig(person.get('text', '')) for person in authors]
            return FetchResult(
                info.get('title', '').rstrip('.'),
                authors,
                info.get('year', ''),
                info.get('doi', ''),
                info.get('url', ''),
                "dblp",
            )
        except Exception:
            return None
+class SemanticScholarFetcher(BaseFetcher):
+    """Fetches from Semantic Scholar.
+    Both lookups return a FetchResult tagged "semantic_scholar", or None when
+    the request fails, the API reports an error, or nothing usable came back.
+    """
+    API_BASE = "https://api.semanticscholar.org/graph/v1/paper"
+    # DOIs are requested through ``externalIds``.
+    # NOTE(review): the Graph API field list appears to have no top-level
+    # ``doi`` field - confirm against the API documentation.
+    _FIELDS = 'title,authors,year,externalIds,url'
+    def __init__(self):
+        self._last_req = 0.0
+    def search_by_title(self, title: str) -> Optional[FetchResult]:
+        """Return the first search hit for ``title``, or None."""
+        return self._fetch(f"{self.API_BASE}/search", {'query': title, 'limit': 1, 'fields': self._FIELDS})
+    def fetch_by_doi(self, doi: str) -> Optional[FetchResult]:
+        """Return the paper registered under ``doi``, or None."""
+        return self._fetch(f"{self.API_BASE}/DOI:{doi}", {'fields': self._FIELDS})
+    def _fetch(self, url, params) -> Optional[FetchResult]:
+        """Issue one rate-limited GET and convert the JSON body.
+        Handles both response shapes: a search result (``{'data': [...]}``)
+        and a single paper object.
+        """
+        self._last_req = self._rate_limit(2.0, self._last_req)
+        try:
+            resp = requests.get(url, params=params, timeout=10)
+            # Error bodies (rate limit, not found) must not be parsed as a paper.
+            if resp.status_code != 200:
+                return None
+            data = resp.json()
+            if 'data' in data:
+                # Search response: an empty hit list means "no match", not an
+                # empty paper record.
+                if not data['data']:
+                    return None
+                data = data['data'][0]
+            if 'error' in data or not data.get('title'):
+                return None
+            authors = [a['name'] for a in data.get('authors') or [] if a.get('name')]
+            # JSON null would otherwise be rendered as the string "None".
+            year = data.get('year')
+            doi = (data.get('externalIds') or {}).get('DOI') or data.get('doi') or ''
+            return FetchResult(data['title'], authors, str(year) if year else '', doi, data.get('url') or '', "semantic_scholar")
+        except Exception:
+            return None
+class OpenAlexFetcher(BaseFetcher):
+    """Fetches from OpenAlex.
+    Both lookups return a FetchResult tagged "openalex", or None when the
+    request fails or yields no work.
+    """
+    API_BASE = "https://api.openalex.org/works"
+    def __init__(self):
+        self._last_req = 0.0
+    def search_by_title(self, title: str) -> Optional[FetchResult]:
+        """Return the first search hit for ``title``, or None."""
+        self._last_req = self._rate_limit(0.2, self._last_req)
+        try:
+            resp = requests.get(self.API_BASE, params={'search': title, 'per-page': 1}, timeout=10)
+            data = resp.json().get('results', [])
+            if data: return self._parse(data[0])
+        except Exception: pass
+        return None
+    def fetch_by_doi(self, doi: str) -> Optional[FetchResult]:
+        """Return the work registered under ``doi``, or None."""
+        self._last_req = self._rate_limit(0.2, self._last_req)
+        try:
+            resp = requests.get(f"{self.API_BASE}/https://doi.org/{doi}", timeout=10)
+            # An error body must not be turned into an empty result.
+            if resp.status_code != 200:
+                return None
+            return self._parse(resp.json())
+        except Exception: return None
+    def _parse(self, data: dict) -> FetchResult:
+        """Convert one OpenAlex work object into a FetchResult.
+        Fields that are present but null are treated like missing ones;
+        ``dict.get(key, default)`` alone does not cover that case.
+        """
+        authors = []
+        for authorship in data.get('authorships') or []:
+            name = (authorship.get('author') or {}).get('display_name')
+            if name:
+                authors.append(name)
+        # A null 'doi' used to raise AttributeError on .replace, and the
+        # callers' except clause then discarded the whole work.
+        doi = (data.get('doi') or '').replace('https://doi.org/', '')
+        year = data.get('publication_year')
+        return FetchResult(data.get('title') or '', authors, str(year) if year else '', doi, data.get('id') or '', "openalex")
+class ScholarFetcher(BaseFetcher):
+    """Google Scholar Scraper (Fallback).
+    Scrapes the first hit of an exact-title search. Once Scholar answers
+    with HTTP 429 or an "unusual traffic" page the instance marks itself
+    blocked and every later search returns None immediately.
+    """
+    SEARCH_URL = "https://scholar.google.com/scholar"
+    def __init__(self):
+        self._last_req = 0.0
+        self._session = requests.Session()
+        self._blocked = False
+    def search_by_title(self, title: str) -> Optional[FetchResult]:
+        """Return the first Scholar hit for ``title``, or None."""
+        if self._blocked: return None
+        self._last_req = self._rate_limit(5.0 + random.random() * 3, self._last_req) # Polite delay
+        try:
+            headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
+            resp = self._session.get(self.SEARCH_URL, params={'q': f'"{title}"', 'hl': 'en', 'num': 1}, headers=headers, timeout=30)
+            if resp.status_code == 429 or 'unusual traffic' in resp.text:
+                self._blocked = True
+                return None
+            return self._parse(resp.text)
+        except Exception: return None
+    def _parse(self, html: str) -> Optional[FetchResult]:
+        """Extract title, authors, year and URL from a result page.
+        Returns None when the page has no result entry or the entry has no
+        title heading. Authors are returned as a list of names, matching
+        the other fetchers in this module.
+        """
+        soup = BeautifulSoup(html, 'lxml')
+        entry = soup.find('div', class_='gs_ri')
+        if not entry: return None
+        title_tag = entry.find('h3', class_='gs_rt')
+        if not title_tag: return None
+        title = title_tag.get_text(strip=True).replace('[PDF]', '').replace('[HTML]', '').strip()
+        link = title_tag.find('a')
+        url = link.get('href', '') if link else ""
+        meta_tag = entry.find('div', class_='gs_a')
+        # get_text(strip=True) trims every text fragment and can glue
+        # "Author" and "- Venue" together, losing the " - " separator.
+        # NOTE(review): Scholar seems to put a non-breaking space before the
+        # first hyphen; it is normalised here so the split below works.
+        meta = meta_tag.get_text().replace('\xa0', ' ').strip() if meta_tag else ""
+        # Attempt to extract year
+        year_match = re.search(r'\b(19|20)\d{2}\b', meta)
+        year = year_match.group(0) if year_match else ""
+        # Attempt to extract authors (comma-separated, before " - ")
+        author_part = meta.split(' - ')[0]
+        authors = [name.strip().strip('…').strip() for name in author_part.split(',')]
+        authors = [name for name in authors if name]
+        return FetchResult(title, authors, year, "", url, "scholar")

src/local_db.py ADDED Viewed

	@@ -0,0 +1,108 @@

+"""
+Local Conference Database: fast, offline title lookup against DBLP index.
+This module provides a local database of conference/journal proceedings
+downloaded from DBLP. It serves as a "ground truth" source that eliminates
+the need for network API calls for entries that match known publications.
+"""
+import json
+import re
+from pathlib import Path
+from typing import Optional
+from dataclasses import dataclass
+def _normalize(title: str) -> str:
+    """Reduce a title to its index key (must match build_index.py).
+    Braces around a group are dropped, the text is lower-cased, every
+    character that is neither a word character nor whitespace becomes a
+    space, and whitespace runs collapse to single spaces.
+    """
+    unbraced = re.sub(r'\{([^}]*)\}', r'\1', title)
+    lowered = unbraced.lower()
+    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
+    collapsed = re.sub(r'\s+', ' ', without_punctuation)
+    return collapsed.strip()
+@dataclass
+class LocalMatch:
+    """Result from a local DB lookup.
+    LocalConferenceDB.lookup fills every field from the matched index
+    record; a key missing from the record yields the default noted below.
+    """
+    title: str        # record key "title", default ""
+    author: str       # record key "author", default ""
+    year: str         # record key "year", default ""
+    booktitle: str    # record key "booktitle", default ""
+    journal: str      # record key "journal", default ""
+    doi: str          # record key "doi", default ""
+    url: str          # record key "url", default ""
+    pages: str        # record key "pages", default ""
+    volume: str       # record key "volume", default ""
+    entry_type: str   # record key "_type", default "inproceedings"
+    source_file: str  # record key "_source", default ""
+class LocalConferenceDB:
+    """In-memory title index built from locally cached DBLP proceedings."""
+    def __init__(self, index_dir: str = None):
+        """Remember where the index lives; nothing is read until load().
+        Args:
+            index_dir: Directory holding the ``index_*.json`` shards. When
+                omitted, ``data/index_shards`` two levels above this file
+                is used. The legacy single-file index is looked for next
+                to the shard directory.
+        """
+        if index_dir is not None:
+            self._shard_dir = Path(index_dir)
+            data_root = self._shard_dir.parent
+        else:
+            data_root = Path(__file__).resolve().parent.parent / "data"
+            self._shard_dir = data_root / "index_shards"
+        self._legacy_path = data_root / "conference_index.json"
+        self._idx: dict = {}
+        self._loaded = False
+    def load(self) -> bool:
+        """Read the sharded index, or else the legacy file, into memory.
+        Returns:
+            True when an index was loaded; False when none exists or reading
+            failed (a message is printed in both failure cases).
+        """
+        try:
+            # Shards take precedence over the legacy single file.
+            shards = sorted(self._shard_dir.glob("index_*.json")) if self._shard_dir.exists() else []
+            if shards:
+                for shard in shards:
+                    self._idx.update(json.loads(shard.read_text(encoding="utf-8")))
+                self._loaded = True
+                print(f"  📚 Local DB: {len(self._idx):,} entries loaded ({len(shards)} shards).")
+                return True
+            if self._legacy_path.exists():
+                self._idx = json.loads(self._legacy_path.read_text(encoding="utf-8"))
+                self._loaded = True
+                print(f"  📚 Local DB: {len(self._idx):,} entries loaded.")
+                return True
+        except Exception as e:
+            print(f"  ⚠ Failed to load local DB: {e}")
+            return False
+        print("  ⚠ Local DB not found. Run: python scripts/update_db.py && python scripts/build_index.py")
+        return False
+    @property
+    def is_loaded(self) -> bool:
+        """True once load() succeeded and the index is non-empty."""
+        return self._loaded and len(self._idx) > 0
+    def lookup(self, title: str) -> Optional[LocalMatch]:
+        """
+        Find the index record whose normalized title equals that of ``title``.
+        Returns a LocalMatch, or None when the index is not loaded or holds
+        no such title.
+        """
+        if not self._loaded:
+            return None
+        record = self._idx.get(_normalize(title))
+        if not record:
+            return None
+        same_named = ("title", "author", "year", "booktitle", "journal", "doi", "url", "pages", "volume")
+        values = {name: record.get(name, "") for name in same_named}
+        return LocalMatch(
+            entry_type=record.get("_type", "inproceedings"),
+            source_file=record.get("_source", ""),
+            **values,
+        )

src/normalizer.py ADDED Viewed

	@@ -0,0 +1,51 @@

+"""
+Booktitle normalizer: maps verbose venue names to standard abbreviations.
+Loads rules from data/abbr.tsv (regex → abbreviation).
+"""
+import re
+import csv
+from pathlib import Path
+from typing import Optional
+class BooktitleNormalizer:
+    """Maps booktitle/journal names onto standard abbreviations via regex rules."""
+    def __init__(self, tsv_path: str = None):
+        """Load the rule table.
+        Args:
+            tsv_path: Path of the rule file. Defaults to ``data/abbr.tsv``
+                two levels above this file.
+        """
+        if tsv_path is None:
+            tsv_path = str(Path(__file__).resolve().parent.parent / "data" / "abbr.tsv")
+        self.rules: list[tuple[re.Pattern, str]] = []
+        self._load_rules(tsv_path)
+    def _load_rules(self, tsv_path: str):
+        """Append one (compiled pattern, abbreviation) pair per usable TSV row.
+        A missing file leaves the rule list untouched. Rows with fewer than
+        two columns, an empty or '#'-prefixed first column, or a pattern that
+        does not compile are skipped.
+        """
+        source = Path(tsv_path)
+        if not source.exists():
+            return
+        with open(source, 'r', encoding='utf-8') as handle:
+            for row in csv.reader(handle, delimiter='\t'):
+                if len(row) < 2:
+                    continue
+                pattern_str, abbr = row[0].strip(), row[1].strip()
+                # Comment lines and rows without a pattern carry no rule.
+                if not pattern_str or pattern_str.startswith('#'):
+                    continue
+                try:
+                    compiled = re.compile(pattern_str, re.IGNORECASE)
+                except re.error:
+                    continue  # invalid regex: ignore the row
+                self.rules.append((compiled, abbr))
+    def normalize(self, booktitle: str) -> Optional[str]:
+        """
+        Return the abbreviation of the first rule whose pattern is found in
+        ``booktitle``; None for an empty input or when no rule matches.
+        """
+        if not booktitle:
+            return None
+        matching = (abbr for pattern, abbr in self.rules if pattern.search(booktitle))
+        return next(matching, None)

src/parser.py ADDED Viewed

	@@ -0,0 +1,316 @@

+"""
+BibTeX file parser.
+"""
+import re
+from dataclasses import dataclass, field
+from typing import Optional
+from pathlib import Path
+import bibtexparser
+from bibtexparser.bparser import BibTexParser
+from bibtexparser.customization import convert_to_unicode
+@dataclass
+class BibEntry:
+    """One bibliography entry; every textual field defaults to an empty string."""
+    key: str
+    entry_type: str
+    title: str = ""
+    author: str = ""
+    year: str = ""
+    abstract: str = ""
+    url: str = ""
+    doi: str = ""
+    arxiv_id: str = ""
+    journal: str = ""
+    booktitle: str = ""
+    publisher: str = ""
+    pages: str = ""
+    volume: str = ""
+    number: str = ""
+    raw_entry: dict = field(default_factory=dict)
+    @property
+    def has_arxiv(self) -> bool:
+        """Whether an arXiv identifier is set on this entry."""
+        return True if self.arxiv_id else False
+    @property
+    def search_query(self) -> str:
+        """Text to search with: the title, or the citation key when the title is empty."""
+        if self.title:
+            return self.title
+        return self.key
+class BibParser:
+    """Parser for .bib files.
+    Keeps the entries of the most recent parse in ``self.entries``.
+    """
+    # Patterns for extracting arXiv IDs from free text. _parse_arxiv_id tries
+    # them in list order and returns group 1 of the first one that matches.
+    ARXIV_PATTERNS = [
+        # New format: 2301.00001 or 2301.00001v1
+        r'(\d{4}\.\d{4,5}(?:v\d+)?)',
+        # Old format: hep-th/9901001 or math.GT/0309136
+        r'([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)',
+        # arXiv: prefix
+        r'arXiv:(\d{4}\.\d{4,5}(?:v\d+)?)',
+        r'arXiv:([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)',
+    ]
+    # URL patterns for arXiv (abs/ and pdf/ links, new and old ID formats).
+    # _extract_arxiv_id applies them case-insensitively to the url field.
+    ARXIV_URL_PATTERNS = [
+        r'arxiv\.org/abs/(\d{4}\.\d{4,5}(?:v\d+)?)',
+        r'arxiv\.org/abs/([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)',
+        r'arxiv\.org/pdf/(\d{4}\.\d{4,5}(?:v\d+)?)(?:\.pdf)?',
+        r'arxiv\.org/pdf/([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)(?:\.pdf)?',
+    ]
+    def __init__(self):
+        # Replaced on every parse_content call.
+        self.entries: list[BibEntry] = []
+    def parse_file(self, filepath: str) -> list[BibEntry]:
+        """Read ``filepath`` as UTF-8 (undecodable bytes replaced) and parse it.
+        Raises:
+            FileNotFoundError: if ``filepath`` does not exist.
+        """
+        bib_path = Path(filepath)
+        if not bib_path.exists():
+            raise FileNotFoundError(f"Bib file not found: {filepath}")
+        text = bib_path.read_text(encoding='utf-8', errors='replace')
+        return self.parse_content(text)
+    def parse_content(self, content: str) -> list[BibEntry]:
+        """Parse BibTeX source text into BibEntry objects.
+        The result replaces ``self.entries`` and is also returned.
+        Raises:
+            ValueError: if bibtexparser fails on ``content``.
+        """
+        bib_parser = BibTexParser(common_strings=True)
+        bib_parser.customization = convert_to_unicode
+        try:
+            database = bibtexparser.loads(content, parser=bib_parser)
+        except Exception as e:
+            raise ValueError(f"Failed to parse bib content: {e}")
+        self.entries = []
+        self.entries.extend(self._convert_entry(raw) for raw in database.entries)
+        return self.entries
+    def _convert_entry(self, entry: dict) -> BibEntry:
+        """Build a BibEntry from one bibtexparser record.
+        Missing fields become empty strings, a shallow copy of the record is
+        kept as ``raw_entry``, and the arXiv ID is derived from the record.
+        """
+        copied_fields = (
+            'title', 'author', 'year', 'abstract', 'url', 'doi', 'journal',
+            'booktitle', 'publisher', 'pages', 'volume', 'number',
+        )
+        values = {name: entry.get(name, '') for name in copied_fields}
+        converted = BibEntry(
+            key=entry.get('ID', ''),
+            entry_type=entry.get('ENTRYTYPE', ''),
+            raw_entry=entry.copy(),
+            **values,
+        )
+        converted.arxiv_id = self._extract_arxiv_id(entry)
+        return converted
+    def _extract_arxiv_id(self, entry: dict) -> str:
+        """Return the first arXiv ID found in the entry, or "".
+        Fields are consulted in this order: eprint, arxiv, url, journal
+        (only when it mentions "arxiv"), note.
+        """
+        # Dedicated identifier fields come first.
+        for field_name in ('eprint', 'arxiv'):
+            value = entry.get(field_name, '')
+            if value:
+                found = self._parse_arxiv_id(value)
+                if found:
+                    return found
+        # URLs have their own patterns and are matched case-insensitively.
+        url = entry.get('url', '')
+        if url:
+            for url_pattern in self.ARXIV_URL_PATTERNS:
+                hit = re.search(url_pattern, url, re.IGNORECASE)
+                if hit:
+                    return hit.group(1)
+        # Covers "arXiv preprint arXiv:XXXX.XXXXX" style journal values.
+        journal = entry.get('journal', '')
+        if journal and 'arxiv' in journal.lower():
+            found = self._parse_arxiv_id(journal)
+            if found:
+                return found
+        note = entry.get('note', '')
+        if note:
+            found = self._parse_arxiv_id(note)
+            if found:
+                return found
+        return ""
+    def _parse_arxiv_id(self, text: str) -> str:
+        """Return group 1 of the first ARXIV_PATTERNS entry found in ``text``, or ""."""
+        attempts = (re.search(pattern, text) for pattern in self.ARXIV_PATTERNS)
+        return next((hit.group(1) for hit in attempts if hit), "")
+    def get_entry_by_key(self, key: str) -> Optional[BibEntry]:
+        """Return the first parsed entry whose citation key equals ``key``, else None."""
+        return next((candidate for candidate in self.entries if candidate.key == key), None)
+    def filter_file(self, input_path: str, output_path: str, keys_to_keep: set[str]):
+        """
+        Write to ``output_path`` a copy of ``input_path`` reduced to the
+        entries whose citation key is in ``keys_to_keep``.
+        The text is filtered by _filter_content; both files are handled as
+        UTF-8.
+        """
+        original_text = Path(input_path).read_text(encoding='utf-8')
+        kept_text = self._filter_content(original_text, keys_to_keep)
+        Path(output_path).write_text(kept_text, encoding='utf-8')
+    def _filter_content(self, content: str, keys_to_keep: set[str]) -> str:
+        """Return ``content`` without the entries whose key is not wanted.
+        Everything outside removed entries (comments, @string, @preamble,
+        @comment, free text) is preserved verbatim. An entry without a comma
+        after its key is never removed.
+        Args:
+            content: Raw BibTeX source.
+            keys_to_keep: Citation keys of the entries to retain.
+        Returns:
+            The filtered source text.
+        """
+        length = len(content)
+        ranges_to_remove = []
+        i = 0
+        while i < length:
+            if content[i] != '@':
+                i += 1
+                continue
+            start = i
+            brace_open = content.find('{', i)
+            if brace_open == -1:
+                i += 1
+                continue
+            entry_type = content[i+1:brace_open].strip().lower()
+            # A stray '@' in free text (e.g. an e-mail address) is followed by
+            # something that is not a plain entry-type word; skip it instead
+            # of treating the text up to the next brace as an entry header.
+            if not entry_type or not all(c.isalnum() or c in '-_' for c in entry_type):
+                i += 1
+                continue
+            # Skip comments
+            if entry_type == 'comment':
+                i = brace_open + 1
+                continue
+            end = self._find_entry_end(content, brace_open)
+            # Only standard entries have a key; @string and @preamble are kept.
+            if entry_type not in ('string', 'preamble'):
+                key, comma, _ = content[brace_open+1:end].partition(',')
+                if comma and key.strip() not in keys_to_keep:
+                    ranges_to_remove.append((start, end))
+            i = end
+        # Reconstruct content
+        pieces = []
+        cursor = 0
+        for start, end in ranges_to_remove:
+            pieces.append(content[cursor:start])
+            # Also drop trailing blanks and one newline after a removed entry.
+            cursor = end
+            while cursor < length and content[cursor] in ' \t\r':
+                cursor += 1
+            if cursor < length and content[cursor] == '\n':
+                cursor += 1
+        pieces.append(content[cursor:])
+        return "".join(pieces)
+    @staticmethod
+    def _find_entry_end(content: str, brace_open: int) -> int:
+        """Return the index just past the brace closing the entry opened at ``brace_open``.
+        A double quote delimits a value only at the entry's top level. Inside
+        a braced value it is an ordinary character, so ``{5" disks}`` no
+        longer makes the entry run to the end of the file. Inside a quoted
+        value, nested braces are tracked so that a quote within them does not
+        end the value. Returns ``len(content)`` for an unterminated entry.
+        """
+        length = len(content)
+        depth = 1          # brace depth outside quoted values
+        quote_depth = 0    # brace depth inside the current quoted value
+        in_quote = False
+        j = brace_open + 1
+        while j < length and depth > 0:
+            char = content[j]
+            # Handle escaped characters
+            if char == '\\':
+                j += 2
+                continue
+            if in_quote:
+                if char == '{':
+                    quote_depth += 1
+                elif char == '}' and quote_depth > 0:
+                    quote_depth -= 1
+                elif char == '"' and quote_depth == 0:
+                    in_quote = False
+            elif char == '{':
+                depth += 1
+            elif char == '}':
+                depth -= 1
+            elif char == '"' and depth == 1:
+                in_quote = True
+            j += 1
+        return min(j, length)
+    def save_entries(self, filepath: str, entries: list[BibEntry]):
+        """Write ``entries`` to ``filepath`` as a .bib file.
+        Each record starts from a copy of the entry's ``raw_entry`` (so custom
+        fields survive) and is overlaid with the entry's non-empty attributes.
+        """
+        overlay_fields = (
+            'title', 'author', 'year', 'journal', 'booktitle',
+            'publisher', 'pages', 'volume', 'number',
+        )
+        records = []
+        for entry in entries:
+            record = entry.raw_entry.copy()
+            record['ID'] = entry.key
+            record['ENTRYTYPE'] = entry.entry_type
+            for name in overlay_fields:
+                value = getattr(entry, name)
+                if value:
+                    record[name] = value
+            if entry.doi:
+                record['doi'] = entry.doi
+            else:
+                # An emptied DOI (e.g. by the DOI mismatch sanitizer) must not
+                # come back from the raw record.
+                record.pop('doi', None)
+            if entry.url:
+                record['url'] = entry.url
+            # Entry type consistency: inproceedings uses booktitle rather than
+            # journal, article uses journal rather than booktitle.
+            kind = entry.entry_type.lower()
+            if kind == 'inproceedings' and not entry.journal:
+                record.pop('journal', None)
+            elif kind == 'article' and not entry.booktitle:
+                record.pop('booktitle', None)
+            records.append(record)
+        db = bibtexparser.bibdatabase.BibDatabase()
+        db.entries = records
+        with open(filepath, 'w', encoding='utf-8') as f:
+            bibtexparser.dump(db, f)

src/sanitizer.py ADDED Viewed

	@@ -0,0 +1,493 @@

+"""
+BibTeX Sanitizer: Structural and formatting checks for bib entries.
+Runs as a pre-processing phase before metadata fetch-and-compare,
+detecting and auto-fixing common formatting issues that crawlers
+and copy-paste introduce into .bib files.
+"""
+import re
+from datetime import datetime
+from dataclasses import dataclass, field
+from typing import List, Optional, Any
+CURRENT_YEAR = datetime.now().year
+from .parser import BibEntry
+from .utils import TextNormalizer
+@dataclass
+class SanitizeFix:
+    """Describes a single sanitization fix applied to a bib entry."""
+    entry_key: str     # citation key of the entry that was changed
+    category: str      # e.g., "dblp_id", "corporate_author", "entry_type", "title_case", "doi_mismatch"
+    field: str         # which field was affected
+    description: str   # human-readable description
+    old_value: str = ""  # value before the fix
+    new_value: str = ""  # value after the fix ("" when the value was removed)
+# Known conference name keywords for entry type detection.
+# All entries are lower-case: BibSanitizer._check_entry_type compares them
+# with the lower-cased journal field of @article entries and uses the first
+# keyword (in list order) that matches.
+CONFERENCE_KEYWORDS = [
+    "conference", "proceedings", "workshop", "symposium",
+    # Top ML/AI
+    "iclr", "icml", "neurips", "nips", "aaai", "ijcai",
+    # NLP
+    "acl", "emnlp", "naacl", "coling", "eacl",
+    # Vision
+    "cvpr", "iccv", "eccv",
+    # Speech
+    "interspeech", "icassp",
+    # IR/Data
+    "sigir", "kdd", "www", "wsdm",
+    # Systems
+    "osdi", "sosp", "nsdi",
+    # General
+    "international conference", "annual meeting",
+]
+class BibSanitizer:
+    """Performs structural and formatting sanity checks on BibEntry objects."""
+    def sanitize_all(self, entries: List[BibEntry]) -> dict:
+        """
+        Apply every offline check to each entry, in a fixed order.
+        Returns dict: {entry_key: [SanitizeFix, ...]} containing only the
+        entries for which at least one fix was reported.
+        Entries are modified in-place.
+        """
+        check_names = (
+            '_check_dblp_ids',
+            '_check_corporate_authors',
+            '_check_entry_type',
+            '_check_title_capitalization',
+            '_check_future_year',
+            '_clean_entry_fields',
+        )
+        report = {}
+        for entry in entries:
+            collected = []
+            for name in check_names:
+                collected.extend(getattr(self, name)(entry))
+            if collected:
+                report[entry.key] = collected
+        return report
+    # ------------------------------------------------------------------
+    # Check 1: DBLP Disambiguation ID Cleanup
+    # ------------------------------------------------------------------
+    def _check_dblp_ids(self, entry: BibEntry) -> List[SanitizeFix]:
+        """Remove DBLP disambiguation IDs from the entry's author names.
+        Returns one SanitizeFix per cleaned name. When anything changed, the
+        author string is rebuilt with " and " and written to the entry (and
+        to ``raw_entry`` if it already has an author).
+        """
+        fixes = []
+        if not entry.author:
+            return fixes
+        names = []
+        for raw_name in TextNormalizer.parse_author_list(entry.author):
+            name = raw_name.strip()
+            if not TextNormalizer.has_dblp_disambiguation_id(name):
+                names.append(name)
+                continue
+            cleaned = TextNormalizer.strip_dblp_disambiguation_id(name)
+            fixes.append(SanitizeFix(
+                entry_key=entry.key,
+                category="dblp_id",
+                field="author",
+                description=f"Stripped DBLP disambiguation ID: '{name}' → '{cleaned}'",
+                old_value=name,
+                new_value=cleaned,
+            ))
+            names.append(cleaned)
+        # A recorded fix means at least one name changed.
+        if fixes:
+            rebuilt = " and ".join(names)
+            entry.author = rebuilt
+            # Keep raw_entry in sync so save_entries doesn't re-introduce the IDs
+            if 'author' in entry.raw_entry:
+                entry.raw_entry['author'] = rebuilt
+        return fixes
+    # ------------------------------------------------------------------
+    # Check 2: Corporate / Institutional Author Protection
+    # ------------------------------------------------------------------
+    def _check_corporate_authors(self, entry: BibEntry) -> List[SanitizeFix]:
+        """
+        Wrap single-word, capitalised author names in {{double braces}}.
+        The aim is to stop BibTeX from abbreviating institutional names such
+        as "KimiTeam". Names already enclosed in braces are left alone.
+        Returns one SanitizeFix per wrapped name.
+        """
+        fixes = []
+        if not entry.author:
+            return fixes
+        names = []
+        for raw_name in TextNormalizer.parse_author_list(entry.author):
+            name = raw_name.strip()
+            # Brace-enclosed names (single or double braces) stay unchanged.
+            if name.startswith('{') and name.endswith('}'):
+                names.append(name)
+                continue
+            # Candidate: one word of 2+ characters starting with an uppercase
+            # letter, e.g., "KimiTeam", "OpenAI", "Google"
+            bare = name.strip('{}')
+            is_single_word = bool(bare) and ' ' not in bare
+            if not (is_single_word and bare[0].isupper() and len(bare) > 1):
+                names.append(name)
+                continue
+            wrapped = '{{' + bare + '}}'
+            fixes.append(SanitizeFix(
+                entry_key=entry.key,
+                category="corporate_author",
+                field="author",
+                description=f"Corporate author protected: '{name}' → '{wrapped}'",
+                old_value=name,
+                new_value=wrapped,
+            ))
+            names.append(wrapped)
+        # A recorded fix means at least one name changed.
+        if fixes:
+            rebuilt = " and ".join(names)
+            entry.author = rebuilt
+            if 'author' in entry.raw_entry:
+                entry.raw_entry['author'] = rebuilt
+        return fixes
+    # ------------------------------------------------------------------
+    # Check 3: Entry Type Correction (article → inproceedings)
+    # ------------------------------------------------------------------
+    def _check_entry_type(self, entry: BibEntry) -> List[SanitizeFix]:
+        """
+        Detect conference papers incorrectly typed as @article.
+        Heuristics:
+        - Has booktitle field → should be inproceedings
+        - Journal field contains a conference keyword as a whole word
+          (optionally pluralised) → move journal to booktitle
+        Journals whose own title starts with "Proceedings of ..." are left as
+        @article. The entry and its raw_entry are modified in-place.
+        Returns a list with at most one SanitizeFix.
+        """
+        fixes = []
+        if entry.entry_type.lower() != 'article':
+            return fixes
+        # Case 1: Has booktitle but typed as article
+        if entry.booktitle:
+            old_type = entry.entry_type
+            entry.entry_type = 'inproceedings'
+            if 'ENTRYTYPE' in entry.raw_entry:
+                entry.raw_entry['ENTRYTYPE'] = 'inproceedings'
+            fixes.append(SanitizeFix(
+                entry_key=entry.key,
+                category="entry_type",
+                field="ENTRYTYPE",
+                description=f"Entry has booktitle but was @{old_type} → @inproceedings",
+                old_value=old_type,
+                new_value='inproceedings',
+            ))
+            return fixes
+        # Case 2: Journal field contains conference keywords
+        if not entry.journal:
+            return fixes
+        journal_lower = entry.journal.lower()
+        journal_key = ' '.join(journal_lower.split())
+        # Real journals that merely carry "Proceedings" in their title. The
+        # IEEE one is matched exactly because conference proceedings also
+        # start with "Proceedings of the IEEE ...".
+        journal_prefixes = (
+            "proceedings of the national academy of sciences",
+            "proceedings of the royal society",
+            "proceedings of the vldb endowment",
+        )
+        if journal_key == "proceedings of the ieee" or journal_key.startswith(journal_prefixes):
+            return fixes
+        matched_keyword = None
+        for keyword in CONFERENCE_KEYWORDS:
+            # Letters may not touch the keyword (so "acl" no longer matches
+            # "oracle"); digits and punctuation may ("acl2020", "nips'17").
+            pattern = r'(?<![a-z])' + re.escape(keyword) + r'(?:e?s)?(?![a-z])'
+            if re.search(pattern, journal_lower):
+                matched_keyword = keyword
+                break
+        if not matched_keyword:
+            return fixes
+        old_type = entry.entry_type
+        old_journal = entry.journal
+        # Move journal → booktitle
+        entry.booktitle = entry.journal
+        entry.journal = ""
+        entry.entry_type = 'inproceedings'
+        # Update raw_entry
+        if 'ENTRYTYPE' in entry.raw_entry:
+            entry.raw_entry['ENTRYTYPE'] = 'inproceedings'
+        entry.raw_entry['booktitle'] = old_journal
+        if 'journal' in entry.raw_entry:
+            del entry.raw_entry['journal']
+        fixes.append(SanitizeFix(
+            entry_key=entry.key,
+            category="entry_type",
+            field="ENTRYTYPE",
+            description=(
+                f"@{old_type} → @inproceedings "
+                f"(journal '{old_journal}' contains '{matched_keyword}', moved to booktitle)"
+            ),
+            old_value=old_type,
+            new_value='inproceedings',
+        ))
+        return fixes
+    # ------------------------------------------------------------------
+    # Check 4: DOI-Title Cross-Validation
+    # ------------------------------------------------------------------
+    def check_doi_title_match(self, entry: BibEntry, fetched_data: Any) -> List[SanitizeFix]:
+        """
+        Validate that a DOI resolves to the same paper as the bib entry.
+        Called during the fetch phase (requires network), not during
+        the offline sanitize phase.
+        If the DOI metadata title doesn't match the bib entry title
+        (similarity below 0.5), the DOI is removed from the entry and from
+        raw_entry, and one SanitizeFix is returned.
+        Nothing is changed when either title is missing or normalizes to an
+        empty string: without two titles there is nothing to compare, and
+        the DOI must not be deleted on that basis.
+        """
+        fixes = []
+        if not entry.doi or not fetched_data:
+            return fixes
+        fetched_title = getattr(fetched_data, 'title', '')
+        if not fetched_title:
+            return fixes
+        # An entry without a title previously scored 0 similarity and lost
+        # its DOI.
+        if not entry.title or not entry.title.strip():
+            return fixes
+        bib_title_norm = TextNormalizer.normalize_for_comparison(entry.title)
+        doi_title_norm = TextNormalizer.normalize_for_comparison(fetched_title)
+        if not bib_title_norm or not doi_title_norm:
+            return fixes
+        similarity = TextNormalizer.similarity_ratio(bib_title_norm, doi_title_norm)
+        # For titles under 100 normalized characters, also take the
+        # Levenshtein score and keep the better of the two.
+        if len(bib_title_norm) < 100:
+            lev_sim = TextNormalizer.levenshtein_similarity(bib_title_norm, doi_title_norm)
+            similarity = max(similarity, lev_sim)
+        if similarity < 0.5:
+            old_doi = entry.doi
+            fixes.append(SanitizeFix(
+                entry_key=entry.key,
+                category="doi_mismatch",
+                field="doi",
+                description=(
+                    f"DOI '{old_doi}' resolves to a different title "
+                    f"('{fetched_title[:60]}...' vs '{entry.title[:60]}...'). "
+                    f"Similarity: {similarity:.0%}. DOI removed."
+                ),
+                old_value=old_doi,
+                new_value="",
+            ))
+            entry.doi = ""
+            if 'doi' in entry.raw_entry:
+                del entry.raw_entry['doi']
+        return fixes
+    # ------------------------------------------------------------------
+    # Check 5: Title Capitalization Protection (for IEEEtran)
+    # ------------------------------------------------------------------
+    # All three patterns are bracketed by (?<![A-Za-z0-9]) and (?![A-Za-z0-9]),
+    # so a match must start and end on an alphanumeric boundary, and each
+    # captures the whole token in group 1.
+    # Pattern: 2+ uppercase letters, optionally one trailing lowercase letter and
+    # further '.'/'-' joined alphanumeric parts
+    # (acronyms like MMAU, SALMONN, GPT, BEATs, CREMA-D)
+    _ACRONYM_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{2,}[a-z]?(?:[\.-][A-Za-z0-9]+)*)(?![A-Za-z0-9])')
+    # Pattern: CamelCase words (SpeechT5, HuBERT, ChatGPT, AudioPaLM)
+    # A capitalized word followed by one or more capital-led segments, each
+    # optionally joined by '.' or '-'; hyphenated Title-Case also matches.
+    _CAMELCASE_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z][a-z]+(?:[\.-]?[A-Z][a-z]*)+)(?![A-Za-z0-9])')
+    # Pattern: Word with mixed case + digits, optionally with dots/hyphens (GPT-4o, Llama3, Qwen2.5-Omni)
+    # Must start with an uppercase letter and contain at least one digit.
+    _MIXED_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z][A-Za-z0-9]*(?:[\.-][A-Za-z0-9]+)*\d[A-Za-z0-9]*(?:[\.-][A-Za-z0-9]+)*)(?![A-Za-z0-9])')
+    def _check_title_capitalization(self, entry: BibEntry) -> List[SanitizeFix]:
+        """
+        Wrap acronyms and proper nouns in {} to protect capitalization.
+        IEEEtran's .bst forces titles to sentence case.
+        Without braces, "SALMONN" becomes "salmonn".
+        """
+        fixes = []
+        if not entry.title:
+            return fixes
+        title = entry.title
+        words_to_protect = set()
+        # Find acronyms (e.g., MMAU, CREMA-D, SALMONN)
+        for m in self._ACRONYM_RE.finditer(title):
+            word = m.group(1)
+            # Skip very common short words that might be false positives
+            if word in ('AI', 'ML', 'NLP', 'CV', 'LLM', 'ASR', 'TTS', 'NER',
+                        'QA', 'MT', 'IR', 'RL', 'GAN', 'VAE', 'RNN', 'CNN',
+                        'GPU', 'CPU', 'TPU', 'API', 'URL', 'PDF', 'HTML',
+                        'II', 'III', 'IV', 'VI', 'VII', 'VIII', 'IX', 'XI',
+                        'USB', 'RAM', 'ROM', 'SSD', 'TCP', 'HTTP', 'SSL',
+                        'BERT', 'GPT', 'LSTM', 'MLP', 'FFN', 'LLM'):
+                # Still protect these! They're valid acronyms
+                words_to_protect.add(word)
+            elif len(word) >= 2:
+                words_to_protect.add(word)
+        # Find CamelCase (e.g., SpeechT5, HuBERT, ChatGPT, BEATs)
+        for m in self._CAMELCASE_RE.finditer(title):
+            words_to_protect.add(m.group(1))
+        # Find mixed-case+digit patterns (e.g., GPT4, Llama3)
+        for m in self._MIXED_RE.finditer(title):
+            words_to_protect.add(m.group(1))
+        if not words_to_protect:
+            return fixes
+        # Apply protection: wrap each word in {} if not already braced
+        new_title = title
+        protected_words = []
+        for word in sorted(words_to_protect, key=len, reverse=True):
+            # Check if this word is already inside braces
+            # Look for {word} already in the title
+            if '{' + word + '}' in new_title:
+                continue
+            if '{{' + word + '}}' in new_title:
+                continue
+            # Replace the bare word with {word}
+            # Use word boundary to avoid partial matches
+            pattern = re.compile(r'(?<!\{)\b' + re.escape(word) + r'\b(?!\})')
+            if pattern.search(new_title):
+                new_title = pattern.sub('{' + word + '}', new_title)
+                protected_words.append(word)
+        if protected_words and new_title != title:
+            fixes.append(SanitizeFix(
+                entry_key=entry.key,
+                category="title_case",
+                field="title",
+                description=f"Protected capitalization: {', '.join(protected_words)}",
+                old_value=title,
+                new_value=new_title,
+            ))
+            entry.title = new_title
+            if 'title' in entry.raw_entry:
+                entry.raw_entry['title'] = new_title
+        return fixes
+    # ------------------------------------------------------------------
+    # Check 6: Future Year Detection
+    # ------------------------------------------------------------------
+    def _check_future_year(self, entry: BibEntry) -> List[SanitizeFix]:
+        """
+        Detect entries with year > current year.
+        These are likely arXiv submission dates that will be wrong once
+        the paper is published at a conference. Flag them for forced
+        API lookup so the correct conference year can be found.
+        """
+        fixes = []
+        year_str = str(entry.year).strip()
+        if not year_str or not year_str.isdigit():
+            return fixes
+        year = int(year_str)
+        if year > CURRENT_YEAR:
+            # Flag the entry for forced API lookup
+            entry._force_api_lookup = True
+            fixes.append(SanitizeFix(
+                entry_key=entry.key,
+                category="future_year",
+                field="year",
+                description=(
+                    f"Future year {year} detected (current: {CURRENT_YEAR}). "
+                    f"Will force API lookup to find correct year."
+                ),
+                old_value=year_str,
+                new_value="",  # Will be resolved by API
+            ))
+        elif year < 1950:
+            fixes.append(SanitizeFix(
+                entry_key=entry.key,
+                category="future_year",
+                field="year",
+                description=f"Suspiciously old year: {year}",
+                old_value=year_str,
+                new_value="",
+            ))
+        return fixes
+    # ------------------------------------------------------------------
+    # Check 7: Field Cleanup Policy
+    # ------------------------------------------------------------------
+    # Fields to remove per entry type
+    # Keys are lowercase BibTeX entry types; values are lowercase field names.
+    # _clean_entry_fields looks the entry type up here, and entry types that are
+    # not listed have nothing removed.
+    # "inproceedings" has the longest list (it alone drops isbn); "misc" keeps
+    # archiveprefix/primaryclass and issn.
+    FIELD_REMOVE_POLICY = {
+        "inproceedings": [
+            "address", "month", "abstract",
+            "archiveprefix", "primaryclass",
+            "biburl", "bibsource", "timestamp",
+            "copyright", "issn", "isbn",
+        ],
+        "article": [
+            "address", "month", "abstract",
+            "archiveprefix", "primaryclass",
+            "biburl", "bibsource", "timestamp",
+            "copyright", "issn",
+        ],
+        "misc": [
+            "address", "month", "abstract",
+            "biburl", "bibsource", "timestamp",
+            "copyright",
+        ],
+    }
+    def _clean_entry_fields(self, entry: BibEntry) -> List[SanitizeFix]:
+        """
+        Remove junk/noise fields that crawlers often include.
+        These fields add clutter and can cause formatting issues.
+        """
+        fixes = []
+        entry_type = entry.entry_type.lower()
+        to_remove = self.FIELD_REMOVE_POLICY.get(entry_type, [])
+        removed_fields = []
+        for field_name in to_remove:
+            # Check in raw_entry (case-insensitive)
+            for raw_key in list(entry.raw_entry.keys()):
+                if raw_key.lower() == field_name.lower() and raw_key not in ('ID', 'ENTRYTYPE'):
+                    del entry.raw_entry[raw_key]
+                    removed_fields.append(raw_key)
+        if removed_fields:
+            fixes.append(SanitizeFix(
+                entry_key=entry.key,
+                category="field_cleanup",
+                field="multiple",
+                description=f"Removed junk fields: {', '.join(removed_fields)}",
+                old_value=", ".join(removed_fields),
+                new_value="",
+            ))
+        return fixes
+    # ------------------------------------------------------------------
+    # Standalone: Duplicate Detection
+    # ------------------------------------------------------------------
+    @staticmethod
+    def find_duplicates(entries: List[BibEntry]) -> dict:
+        """
+        Find entries that share the same normalized title.
+        Returns {normalized_title: [key1, key2, ...]} for duplicates.
+        """
+        import re as _re
+        from collections import defaultdict
+        def _norm(t: str) -> str:
+            t = _re.sub(r'\{([^}]*)\}', r'\1', t)
+            t = _re.sub(r'[^\w\s]', ' ', t.lower())
+            return _re.sub(r'\s+', ' ', t).strip()
+        title_map = defaultdict(list)
+        for entry in entries:
+            key = _norm(entry.title)
+            if key:
+                title_map[key].append(entry.key)
+        return {t: keys for t, keys in title_map.items() if len(keys) > 1}

src/space_service.py ADDED Viewed

	@@ -0,0 +1,354 @@

+"""
+Non-interactive RefCheck workflow for Hugging Face Spaces.
+"""
+from __future__ import annotations
+import tempfile
+from dataclasses import dataclass, field
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from main import (
+    apply_fix,
+    apply_local_fix,
+    get_default_workflow,
+    validate_entry,
+)
+from src.comparator import EntryReport, MetadataComparator
+from src.fetcher import (
+    ArxivFetcher,
+    CrossRefFetcher,
+    DBLPFetcher,
+    OpenAlexFetcher,
+    ScholarFetcher,
+    SemanticScholarFetcher,
+)
+from src.local_db import LocalConferenceDB
+from src.parser import BibEntry, BibParser
+from src.sanitizer import BibSanitizer, SanitizeFix
+@dataclass
+class RefCheckOptions:
+    """Options for a non-interactive RefCheck run."""
+    # When True, entries that run_refcheck_file classifies as "remove" are left
+    # out of the output; when False they are kept and listed for review.
+    remove_unverified: bool = True
+    # Copied onto the `enabled` flag of the workflow step named "google_scholar".
+    enable_google_scholar: bool = False
+    # Upper bound on validation threads; the pool size is clamped to at least 1
+    # and at most the number of entries being validated.
+    max_workers: int = 4
+@dataclass
+class RefCheckResult:
+    """Artifacts and summary produced by a Space run."""
+    # Number of entries parsed from the uploaded file.
+    total_input: int = 0
+    # Number of entries written to the fixed .bib file.
+    total_output: int = 0
+    # Counts from re-validating the written file; entries without a
+    # comparison result are counted in none of the three.
+    verified: int = 0
+    issues: int = 0
+    not_found: int = 0
+    # entry key -> change descriptions (sanitizer, local DB and API fixes).
+    fixed_details: dict[str, list[str]] = field(default_factory=dict)
+    # (entry key, title, reason) for each entry dropped from the output.
+    removed_details: list[tuple[str, str, str]] = field(default_factory=list)
+    # One dict per entry needing review: "key", "title", "reason", "candidates".
+    review_details: list[dict[str, Any]] = field(default_factory=list)
+    # Duplicate-title map as returned by BibSanitizer.find_duplicates.
+    duplicate_details: dict[str, list[str]] = field(default_factory=dict)
+    # Raw result of BibSanitizer.sanitize_all.
+    sanitize_fixes: dict[str, list[SanitizeFix]] = field(default_factory=dict)
+    # Number of entries resolved from the local conference DB.
+    local_matches: int = 0
+    # Whether the local conference DB reported itself as loaded.
+    local_db_loaded: bool = False
+    # Paths of the generated artifacts, plus the report text itself.
+    fixed_bib_path: str = ""
+    report_path: str = ""
+    report_markdown: str = ""
+def run_refcheck_file(file_path: str | Path, options: RefCheckOptions | None = None) -> RefCheckResult:
+    """Validate and fix an uploaded BibTeX file without interactive prompts.
+    Pipeline: parse -> offline sanitize -> local DB lookup -> API analysis of
+    the remaining entries -> apply one action per entry (fix / keep / review /
+    remove) -> write the fixed .bib -> re-parse and re-validate the written
+    file -> build and write the Markdown report.
+    Entries are mutated in place by the sanitizer, the local DB fix and the
+    API fix, so the order of these phases matters.
+    Args:
+        file_path: Path of the BibTeX file to process.
+        options: Run options; defaults to ``RefCheckOptions()`` when omitted.
+    Returns:
+        A ``RefCheckResult`` with counts, per-entry details and the paths of
+        the written .bib and report files.
+    """
+    options = options or RefCheckOptions()
+    source_path = Path(file_path)
+    parser = BibParser()
+    entries = parser.parse_file(str(source_path))
+    result = RefCheckResult(total_input=len(entries))
+    # Nothing parsed: still produce both artifacts so callers get valid paths.
+    if not entries:
+        result.report_markdown = "## RefCheck Report\n\nNo BibTeX entries were found."
+        result.report_path = _write_report(result.report_markdown)
+        result.fixed_bib_path = _write_bib(parser, [], source_path.stem)
+        return result
+    # Phase 1: offline sanitization and duplicate detection.
+    sanitizer = BibSanitizer()
+    result.sanitize_fixes = sanitizer.sanitize_all(entries)
+    _record_sanitize_fixes(result.fixed_details, result.sanitize_fixes)
+    result.duplicate_details = sanitizer.find_duplicates(entries)
+    # Phase 2: local DB. Entries matched there are fixed immediately; only the
+    # returned api_entries go on to the network-backed analysis.
+    result.local_db_loaded, api_entries, result.local_matches = _apply_local_db(entries, result.fixed_details)
+    fetchers = _build_fetchers()
+    workflow = get_default_workflow()
+    for step in workflow.steps:
+        if step.name == "google_scholar":
+            step.enabled = options.enable_google_scholar
+    comparator = MetadataComparator()
+    analysis = _analyze_entries(api_entries, workflow, fetchers, comparator, options.max_workers)
+    # Phase 3: choose one action per analysed entry, keyed by entry key.
+    # The first matching rule wins:
+    #   no result at all                   -> keep
+    #   forced lookup with fetched data    -> fix
+    #   confidence > 0.85 with fetched data -> fix
+    #   comparison is a match              -> keep
+    #   candidates exist                   -> review
+    #   otherwise                          -> remove
+    actions: dict[str, tuple[str, Any, list[Any]]] = {}
+    for entry, best_result, candidates in analysis:
+        if not best_result:
+            actions[entry.key] = ("keep", None, [])
+        elif getattr(entry, "_force_api_lookup", False) and best_result.fetched_data:
+            actions[entry.key] = ("fix", best_result, candidates)
+        elif best_result.confidence > 0.85 and best_result.fetched_data:
+            actions[entry.key] = ("fix", best_result, candidates)
+        elif best_result.is_match:
+            actions[entry.key] = ("keep", best_result, candidates)
+        elif candidates:
+            actions[entry.key] = ("review", best_result, candidates)
+        else:
+            actions[entry.key] = ("remove", best_result, candidates)
+    # Phase 4: apply the actions in original file order. Entries without an
+    # action (for example local DB matches) default to "keep".
+    updated_entries: list[BibEntry] = []
+    for entry in entries:
+        action, best_result, candidates = actions.get(entry.key, ("keep", None, []))
+        if action == "fix":
+            changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates)
+            if changes:
+                result.fixed_details.setdefault(entry.key, []).extend(changes)
+            updated_entries.append(entry)
+        elif action == "review":
+            result.review_details.append(_review_payload(entry, best_result, candidates))
+            updated_entries.append(entry)
+        elif action == "remove":
+            # Either drop the entry or, if removal is disabled, keep it and
+            # list it for review with the same reason text.
+            if options.remove_unverified:
+                result.removed_details.append((entry.key, entry.title, "No matching metadata found in any source"))
+            else:
+                result.review_details.append(
+                    {
+                        "key": entry.key,
+                        "title": entry.title,
+                        "reason": "No matching metadata found in any source",
+                        "candidates": [],
+                    }
+                )
+                updated_entries.append(entry)
+        else:
+            updated_entries.append(entry)
+    result.total_output = len(updated_entries)
+    fixed_path = _write_bib(parser, updated_entries, source_path.stem)
+    result.fixed_bib_path = fixed_path
+    # Phase 5: re-parse the written file and validate it again, so the summary
+    # counts describe the output file rather than the input.
+    verified_entries = parser.parse_file(fixed_path)
+    verification_reports = _verify_entries(
+        verified_entries,
+        workflow,
+        fetchers,
+        comparator,
+        options.max_workers,
+    )
+    # All three counters require a comparison object; reports whose comparison
+    # is None are counted in none of them.
+    result.verified = sum(1 for r in verification_reports if r.comparison and r.comparison.is_match)
+    result.issues = sum(1 for r in verification_reports if r.comparison and r.comparison.has_issues)
+    result.not_found = sum(
+        1
+        for r in verification_reports
+        if r.comparison and not r.comparison.is_match and not r.comparison.has_issues
+    )
+    result.report_markdown = _build_report(result, verification_reports)
+    result.report_path = _write_report(result.report_markdown)
+    return result
+def _build_fetchers() -> dict[str, Any]:
+    return {
+        "arxiv": ArxivFetcher(),
+        "crossref": CrossRefFetcher(),
+        "scholar": ScholarFetcher(),
+        "semantic": SemanticScholarFetcher(),
+        "openalex": OpenAlexFetcher(),
+        "dblp": DBLPFetcher(),
+    }
+def _analyze_entries(
+    entries: list[BibEntry],
+    workflow: Any,
+    fetchers: dict[str, Any],
+    comparator: MetadataComparator,
+    max_workers: int,
+) -> list[tuple[BibEntry, Any, list[Any]]]:
+    if not entries:
+        return []
+    analysis: list[tuple[BibEntry, Any, list[Any]]] = []
+    worker_count = min(max(1, max_workers), len(entries))
+    with ThreadPoolExecutor(max_workers=worker_count) as executor:
+        futures = {
+            executor.submit(validate_entry, entry, workflow, fetchers, comparator): entry
+            for entry in entries
+        }
+        for future in as_completed(futures):
+            entry = futures[future]
+            try:
+                best_result, candidates = future.result()
+            except Exception:
+                best_result, candidates = None, []
+            analysis.append((entry, best_result, candidates))
+    return analysis
+def _verify_entries(
+    entries: list[BibEntry],
+    workflow: Any,
+    fetchers: dict[str, Any],
+    comparator: MetadataComparator,
+    max_workers: int,
+) -> list[EntryReport]:
+    reports: list[EntryReport] = []
+    for entry, best_result, _ in _analyze_entries(entries, workflow, fetchers, comparator, max_workers):
+        reports.append(EntryReport(entry=entry, comparison=best_result))
+    return reports
+def _record_sanitize_fixes(
+    fixed_details: dict[str, list[str]],
+    sanitize_fixes: dict[str, list[SanitizeFix]],
+) -> None:
+    for key, fixes in sanitize_fixes.items():
+        fixed_details.setdefault(key, [])
+        fixed_details[key].extend(fix.description for fix in fixes)
+def _apply_local_db(
+    entries: list[BibEntry],
+    fixed_details: dict[str, list[str]],
+) -> tuple[bool, list[BibEntry], int]:
+    local_db = _load_local_db()
+    if not local_db.is_loaded:
+        return False, entries, 0
+    api_entries = []
+    match_count = 0
+    for entry in entries:
+        official = local_db.lookup(entry.title)
+        if not official:
+            api_entries.append(entry)
+            continue
+        changes = apply_local_fix(entry, official)
+        match_count += 1
+        if changes:
+            fixed_details.setdefault(entry.key, []).extend(changes)
+    return True, api_entries, match_count
+@lru_cache(maxsize=1)
+def _load_local_db() -> LocalConferenceDB:
+    local_db = LocalConferenceDB()
+    local_db.load()
+    return local_db
+def _review_payload(entry: BibEntry, best_result: Any, candidates: list[Any]) -> dict[str, Any]:
+    return {
+        "key": entry.key,
+        "title": entry.title,
+        "reason": "; ".join(best_result.issues) if best_result and best_result.issues else "Ambiguous match",
+        "candidates": [
+            {
+                "source": candidate.source,
+                "confidence": candidate.confidence,
+                "title": getattr(candidate.fetched_data, "title", ""),
+                "year": getattr(candidate.fetched_data, "year", ""),
+                "doi": getattr(candidate.fetched_data, "doi", ""),
+            }
+            for candidate in candidates[:5]
+        ],
+    }
+def _write_bib(parser: BibParser, entries: list[BibEntry], original_stem: str) -> str:
+    out_dir = Path(tempfile.mkdtemp(prefix="refcheck_"))
+    out_path = out_dir / f"{original_stem or 'references'}_refcheck_fixed.bib"
+    parser.save_entries(str(out_path), entries)
+    return str(out_path)
+def _write_report(markdown: str) -> str:
+    out_dir = Path(tempfile.mkdtemp(prefix="refcheck_report_"))
+    out_path = out_dir / "refcheck_report.md"
+    out_path.write_text(markdown, encoding="utf-8")
+    return str(out_path)
+def _build_report(result: RefCheckResult, reports: list[EntryReport]) -> str:
+    lines = [
+        "## RefCheck Report",
+        "",
+        "### Summary",
+        "",
+        f"- Input entries: {result.total_input}",
+        f"- Output entries: {result.total_output}",
+        f"- Verified after fix: {result.verified}",
+        f"- Remaining issues: {result.issues}",
+        f"- Not found after fix: {result.not_found}",
+        f"- Local DB loaded: {'yes' if result.local_db_loaded else 'no'}",
+        f"- Local DB matches: {result.local_matches}",
+        "",
+    ]
+    if result.removed_details:
+        lines.extend(["### Removed", ""])
+        for key, title, reason in result.removed_details:
+            lines.append(f"- `{key}`: {title} ({reason})")
+        lines.append("")
+    if result.fixed_details:
+        lines.extend(["### Fixed", ""])
+        for key, changes in sorted(result.fixed_details.items()):
+            lines.append(f"- `{key}`")
+            for change in changes:
+                lines.append(f"  - {change}")
+        lines.append("")
+    if result.duplicate_details:
+        lines.extend(["### Duplicate Titles", ""])
+        for title, keys in result.duplicate_details.items():
+            lines.append(f"- `{', '.join(keys)}`: {title}")
+        lines.append("")
+    if result.review_details:
+        lines.extend(["### Needs Review", ""])
+        for item in result.review_details:
+            lines.append(f"- `{item['key']}`: {item['title']}")
+            lines.append(f"  - Reason: {item['reason']}")
+            for candidate in item["candidates"]:
+                lines.append(
+                    "  - Candidate: "
+                    f"{candidate['source']} "
+                    f"(confidence {candidate['confidence']:.2f}) "
+                    f"{candidate['title']} "
+                    f"{candidate['year']} "
+                    f"{candidate['doi']}".strip()
+                )
+        lines.append("")
+    remaining = [
+        report
+        for report in reports
+        if report.comparison and not report.comparison.is_match
+    ]
+    if remaining:
+        lines.extend(["### Verification Issues", ""])
+        for report in remaining:
+            comparison = report.comparison
+            issues = "; ".join(comparison.issues) if comparison.issues else "Not matched"
+            lines.append(
+                f"- `{report.entry.key}` via {comparison.source} "
+                f"(confidence {comparison.confidence:.2f}): {issues}"
+            )
+        lines.append("")
+    return "\n".join(lines).strip() + "\n"

src/ui.py ADDED Viewed

	@@ -0,0 +1,153 @@

+from rich.console import Console
+from rich.table import Table
+from rich.panel import Panel
+from rich.tree import Tree
+import copy
+class BibUI:
+    """Handles all terminal UI interactions for BibGuard."""
+    def __init__(self):
+        self.console = Console()
+    def show_analysis_report(self, ok_entries, to_fix, to_review, to_remove):
+        """Display the initial analysis summary table."""
+        table = Table(title="📊 Analysis Report", show_header=True, header_style="bold magenta")
+        table.add_column("Category", style="cyan")
+        table.add_column("Count", justify="right")
+        table.add_column("Description")
+        table.add_row("✅ Correct", str(len(ok_entries)), "Entries match valid metadata")
+        table.add_row("🛠️  To Fix", str(len(to_fix)), "[green]High confidence auto-fixes[/green]")
+        table.add_row("🔍 Review", str(len(to_review)), "[yellow]Ambiguous or low confidence[/yellow]")
+        table.add_row("🗑️  Remove", str(len(to_remove)), "[red]No metadata found (Hallucinations)[/red]")
+        self.console.print(table)
+        if not (to_fix or to_review or to_remove):
+            self.console.print(Panel("[green]✓ No issues found. All entries are valid.[/green]", title="Status"))
+    def show_manual_review(self, entry, best_res, candidates, apply_fix_func):
+        """Display manual review table for a single entry."""
+        self.console.print(f"\n[bold]Entry: {entry.key}[/bold]")
+        self.console.print(f"Title: {entry.title}")
+        self.console.print(f"Year:  {entry.year}")
+        self.console.print(f"Auth:  {entry.author}")
+        cand_table = Table(show_header=True, header_style="bold blue")
+        cand_table.add_column("#", style="dim", width=4)
+        cand_table.add_column("Source", style="cyan", width=12)
+        cand_table.add_column("Conf", justify="right")
+        cand_table.add_column("Candidate Metadata (Fetched)", style="white")
+        cand_table.add_column("Proposed Changes", style="green")
+        for i, cand in enumerate(candidates, 1):
+            # We need to simulate the fix to show changes
+            # We pass the apply_fix function to avoid circular dependency or logic duplication
+            temp_entry = copy.deepcopy(entry)
+            changes = apply_fix_func(temp_entry, cand.fetched_data)
+            change_desc = "\n".join(changes) if changes else "[dim]No changes[/dim]"
+            conf_style = "green" if cand.confidence > 0.7 else "yellow" if cand.confidence > 0.4 else "red"
+            # Format the candidate's actual metadata
+            fd = cand.fetched_data
+            meta_lines = []
+            if getattr(fd, 'title', None):
+                meta_lines.append(f"[bold]Title:[/bold] {fd.title[:60] + '...' if len(fd.title) > 60 else fd.title}")
+            if getattr(fd, 'authors', None):
+                a_str = " and ".join(fd.authors)
+                meta_lines.append(f"[bold]Authors:[/bold] {a_str[:60] + '...' if len(a_str) > 60 else a_str}")
+            if getattr(fd, 'year', None):
+                meta_lines.append(f"[bold]Year:[/bold] {fd.year}")
+            if getattr(fd, 'doi', None):
+                meta_lines.append(f"[bold]DOI:[/bold] {fd.doi}")
+            meta_desc = "\n".join(meta_lines) if meta_lines else "[dim]No metadata details[/dim]"
+            cand_table.add_row(
+                str(i),
+                cand.source,
+                f"[{conf_style}]{cand.confidence:.2f}[/{conf_style}]",
+                meta_desc,
+                change_desc
+            )
+        self.console.print(cand_table)
+    def show_final_report(self, total, verified, issues, not_found, reports, fixed_count, fixed_details, removed_details):
+        """Display the verification status and modification tree."""
+        # Visual Final Status
+        status_table = Table(box=None, padding=(0, 2))
+        status_table.add_column("Metric", style="bold")
+        status_table.add_column("Value", justify="right")
+        status_table.add_row("Total Entries", str(total))
+        status_table.add_row("Verified", f"[green]{verified}[/green]")
+        status_table.add_row("Issues", f"[red]{issues}[/red]" if issues > 0 else "0")
+        status_table.add_row("Not Found", f"[yellow]{not_found}[/yellow]" if not_found > 0 else "0")
+        self.console.print(Panel(status_table, title="📊 Final Status", expand=False))
+        if issues > 0:
+            self.console.print("\n[bold red]⚠ Remaining Issues (Not Auto-Fixed):[/bold red]")
+            for r in reports:
+                if r.comparison and r.comparison.has_issues:
+                    self.console.print(f"  - [bold]{r.entry.key}[/bold] (Conf: {r.comparison.confidence:.2f}): {', '.join(r.comparison.issues)}")
+        # Report fixes and removals
+        if fixed_count > 0 or removed_details:
+            tree = Tree("✏️  Modifications Report")
+            if removed_details:
+                rem_node = tree.add(f"[red]Removed {len(removed_details)} entries[/red]")
+                for entry, reason in removed_details:
+                     rem_node.add(f"[bold]{entry.key}[/bold]: \"{entry.title}\" ([italic]{reason}[/italic])")
+            if fixed_count > 0:
+                fix_node = tree.add(f"[green]Fixed {fixed_count} entries[/green]")
+                for key, changes in fixed_details.items():
+                    entry_node = fix_node.add(f"[bold]{key}[/bold]")
+                    for change in changes:
+                        entry_node.add(change)
+            self.console.print(tree)
+            self.console.print("\n[green]✓ Changes applied and saved to file.[/green]")
+        else:
+            self.console.print("\n[green]✓ No changes were needed.[/green]")
+    def show_sanitize_report(self, sanitize_fixes: dict):
+        """Display sanitization results as a rich tree."""
+        if not sanitize_fixes:
+            self.console.print("[green]✓ No formatting issues found.[/green]\n")
+            return
+        # Category display info
+        category_info = {
+            "dblp_id": ("🔢", "DBLP Disambiguation ID Cleanup", "red"),
+            "corporate_author": ("🏢", "Corporate Author Protection", "yellow"),
+            "entry_type": ("📋", "Entry Type Correction", "cyan"),
+            "title_case": ("🔤", "Title Capitalization Protection", "blue"),
+            "doi_mismatch": ("🔗", "DOI Mismatch", "red"),
+            "future_year": ("📅", "Future Year Detection", "magenta"),
+            "field_cleanup": ("🧹", "Junk Field Removal", "dim"),
+        }
+        total_fixes = sum(len(fixes) for fixes in sanitize_fixes.values())
+        tree = Tree(f"🧹 Sanitization Report ({total_fixes} fixes in {len(sanitize_fixes)} entries)")
+        # Group fixes by category across all entries
+        by_category = {}
+        for entry_key, fixes in sanitize_fixes.items():
+            for fix in fixes:
+                if fix.category not in by_category:
+                    by_category[fix.category] = []
+                by_category[fix.category].append(fix)
+        for cat, fixes in by_category.items():
+            icon, label, color = category_info.get(cat, ("❓", cat, "white"))
+            cat_node = tree.add(f"{icon} [{color}]{label} ({len(fixes)})[/{color}]")
+            for fix in fixes:
+                cat_node.add(f"[bold]{fix.entry_key}[/bold]: {fix.description}")
+        self.console.print(tree)
+        self.console.print("")

src/utils.py ADDED Viewed

	@@ -0,0 +1,229 @@

+"""
+Utilities for BibGuard: Normalization and Progress Display.
+"""
+import re
+import unicodedata
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import Optional, List
+from unidecode import unidecode
+from rich.console import Console
+from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeElapsedColumn
+class TextNormalizer:
+    """Utility class for normalizing text for comparison."""
+    # DBLP disambiguation ID pattern: 4-digit number at end of author name
+    # e.g. "Tian Tan 0019", "Wei Li 0119", "Zejun Ma 0001"
+    # The match covers the whitespace before the digits (at least one whitespace
+    # character is required) and any trailing whitespace up to end of string.
+    DBLP_DISAMBIG_PATTERN = re.compile(r'\s+\d{4}\s*$')
+    # LaTeX command patterns
+    # (regex, replacement) pairs that unwrap a formatting command and keep only
+    # its argument; \href keeps its second argument (the link text).
+    # The argument is matched with [^}]*, so it ends at the first closing brace
+    # and nested braces inside an argument are not handled.
+    LATEX_COMMANDS = [
+        (r'\\textbf\{([^}]*)\}', r'\1'),
+        (r'\\textit\{([^}]*)\}', r'\1'),
+        (r'\\emph\{([^}]*)\}', r'\1'),
+        (r'\\textrm\{([^}]*)\}', r'\1'),
+        (r'\\texttt\{([^}]*)\}', r'\1'),
+        (r'\\textsf\{([^}]*)\}', r'\1'),
+        (r'\\textsc\{([^}]*)\}', r'\1'),
+        (r'\\text\{([^}]*)\}', r'\1'),
+        (r'\\mathrm\{([^}]*)\}', r'\1'),
+        (r'\\mathbf\{([^}]*)\}', r'\1'),
+        (r'\\mathit\{([^}]*)\}', r'\1'),
+        (r'\\url\{([^}]*)\}', r'\1'),
+        (r'\\href\{[^}]*\}\{([^}]*)\}', r'\1'),
+    ]
+    # LaTeX special character mappings
+    LATEX_CHARS = {
+        r'\&': '&',
+        r'\%': '%',
+        r'\$': '$',
+        r'\#': '#',
+        r'\_': '_',
+        r'\{': '{',
+        r'\}': '}',
+        r'\~': '~',
+        r'\^': '^',
+        r'``': '"',
+        r"''": '"',
+        r'`': "'",
+        r"'": "'",
+        r'--': '–',
+        r'---': '—',
+    }
+    # LaTeX accent commands
+    LATEX_ACCENTS = [
+        (r"\\'([aeiouAEIOU])", r'\1'),  # acute
+        (r'\\`([aeiouAEIOU])', r'\1'),   # grave
+        (r'\\^([aeiouAEIOU])', r'\1'),   # circumflex
+        (r'\\"([aeiouAEIOU])', r'\1'),   # umlaut
+        (r'\\~([nNaAoO])', r'\1'),       # tilde
+        (r'\\c\{([cC])\}', r'\1'),       # cedilla
+        (r"\\'{([aeiouAEIOU])}", r'\1'),
+        (r'\\`{([aeiouAEIOU])}', r'\1'),
+        (r'\\^{([aeiouAEIOU])}', r'\1'),
+        (r'\\"{([aeiouAEIOU])}', r'\1'),
+        (r'\\~{([nNaAoO])}', r'\1'),
+    ]
+    @classmethod
+    def normalize_latex(cls, text: str) -> str:
+        """Remove LaTeX formatting commands."""
+        if not text: return ""
+        result = text
+        for pattern, replacement in cls.LATEX_COMMANDS:
+            result = re.sub(pattern, replacement, result)
+        for pattern, replacement in cls.LATEX_ACCENTS:
+            result = re.sub(pattern, replacement, result)
+        for latex_char, normal_char in cls.LATEX_CHARS.items():
+            result = result.replace(latex_char, normal_char)
+        return re.sub(r'[{}]', '', result)
+    @classmethod
+    def normalize_unicode(cls, text: str) -> str:
+        """Normalize Unicode characters to ASCII."""
+        if not text: return ""
+        text = unicodedata.normalize('NFKD', text)
+        return unidecode(text)
+    @classmethod
+    def normalize_for_comparison(cls, text: str) -> str:
+        """Full normalization pipeline for text comparison."""
+        if not text: return ""
+        text = cls.normalize_latex(text)
+        text = cls.normalize_unicode(text)
+        text = text.lower()
+        text = re.sub(r'\s+', ' ', text).strip()
+        return re.sub(r'[^\w\s]', '', text)
+    @classmethod
+    def strip_dblp_disambiguation_id(cls, name: str) -> str:
+        """Strip DBLP disambiguation suffix (4-digit number) from author name.
+        DBLP appends codes like '0001', '0019' to disambiguate homonymous authors.
+        e.g. 'Tian Tan 0019' -> 'Tian Tan'
+             'Wei Li 0119'   -> 'Wei Li'
+        """
+        if not name:
+            return name
+        return cls.DBLP_DISAMBIG_PATTERN.sub('', name).strip()
+    @classmethod
+    def has_dblp_disambiguation_id(cls, name: str) -> bool:
+        """Check if an author name contains a DBLP disambiguation ID."""
+        if not name:
+            return False
+        return bool(cls.DBLP_DISAMBIG_PATTERN.search(name))
+    @classmethod
+    def normalize_author_name(cls, name: str) -> str:
+        """Normalize author name format."""
+        if not name: return ""
+        name = cls.normalize_latex(name)
+        name = cls.normalize_unicode(name)
+        # Strip DBLP disambiguation IDs before further processing
+        name = cls.strip_dblp_disambiguation_id(name)
+        name = re.sub(r'\s+', ' ', name).strip()
+        if ',' in name:
+            parts = name.split(',', 1)
+            if len(parts) == 2:
+                name = f"{parts[1].strip()} {parts[0].strip()}"
+        name = name.lower()
+        return re.sub(r'[^\w\s]', '', name)
+    @classmethod
+    def parse_author_list(cls, authors: str) -> list[str]:
+        """Parse author string into a list of raw author names."""
+        if not authors: return []
+        # Split by ' and ', keeping original formatting
+        return re.split(r'\s+and\s+', authors, flags=re.IGNORECASE)
+    @classmethod
+    def normalize_author_list(cls, authors: str) -> list[str]:
+        """Parse and normalize a list of authors."""
+        if not authors: return []
+        author_list = cls.parse_author_list(authors)
+        normalized = []
+        for author in author_list:
+            norm = cls.normalize_author_name(author.strip())
+            if norm: normalized.append(norm)
+        return normalized
+    @classmethod
+    def similarity_ratio(cls, text1: str, text2: str) -> float:
+        """Calculate Jaccard similarity between two strings."""
+        if not text1 or not text2: return 0.0
+        words1, words2 = set(text1.split()), set(text2.split())
+        if not words1 and not words2: return 1.0
+        if not words1 or not words2: return 0.0
+        return len(words1 & words2) / len(words1 | words2)
+    @classmethod
+    def levenshtein_similarity(cls, s1: str, s2: str) -> float:
+        """Calculate normalized Levenshtein similarity."""
+        if not s1 and not s2: return 1.0
+        if not s1 or not s2: return 0.0
+        m, n = len(s1), len(s2)
+        dp = [list(range(n + 1))] + [[i] + [0]*n for i in range(1, m + 1)]
+        for i in range(1, m + 1):
+            for j in range(1, n + 1):
+                dp[i][j] = dp[i-1][j-1] if s1[i-1] == s2[j-1] else min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1]) + 1
+        return 1.0 - (dp[m][n] / max(m, n))
+@dataclass
+class ProgressStats:
+    """Statistics for progress display."""
+    total: int = 0
+    processed: int = 0
+    success: int = 0
+    warnings: int = 0
+    errors: int = 0
+class ProgressDisplay:
+    """Rich terminal progress display."""
+    def __init__(self):
+        self.console = Console()
+        self.stats = ProgressStats()
+        self._progress: Optional[Progress] = None
+        self._task = None
+    @contextmanager
+    def progress_context(self, total: int, description: str = "Processing"):
+        """Context manager for progress display."""
+        self.stats.total = total
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(bar_width=40),
+            TaskProgressColumn(),
+            TimeElapsedColumn(),
+            console=self.console,
+            transient=False
+        ) as progress:
+            self._progress = progress
+            self._task = progress.add_task(description, total=total)
+            try:
+                yield self
+            finally:
+                self._progress = None
+                self._task = None
+    def update(self, entry_key: str = "", task: str = "", advance: int = 0):
+        """Update progress display."""
+        if self._progress and self._task is not None:
+            desc = f"[cyan]{entry_key}[/cyan] - {task}" if entry_key else task
+            self._progress.update(self._task, description=desc, advance=advance)
+            self.stats.processed += advance
+    def mark_success(self): self.stats.success += 1
+    def mark_warning(self): self.stats.warnings += 1
+    def mark_error(self): self.stats.errors += 1
+    def print_error(self, message: str):
+        self.console.print(f"  [red]✗[/red] {message}")