voidful committed on
Commit
11a28db
·
verified ·
1 Parent(s): ec88be4

Add RefCheck Gradio Space

Browse files
.DS_Store ADDED
Binary file (8.2 kB). View file
 
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/index_shards/index_00.json filter=lfs diff=lfs merge=lfs -text
37
+ data/index_shards/index_01.json filter=lfs diff=lfs merge=lfs -text
38
+ data/index_shards/index_02.json filter=lfs diff=lfs merge=lfs -text
39
+ data/index_shards/index_03.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ .eggs/
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+
10
+ # Environment
11
+ .env
12
+ .venv/
13
+ venv/
14
+
15
+ # IDE
16
+ .idea/
17
+ .vscode/
18
+ *.swp
19
+
20
+ # DBLP raw data (regenerate with: python scripts/update_db.py)
21
+ data/raw/
22
+
23
+ # Legacy single-file index (replaced by sharded index)
24
+ data/conference_index.json
README.md CHANGED
@@ -1,13 +1,220 @@
1
  ---
2
  title: RefCheck
3
- emoji: 🌖
4
- colorFrom: red
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 6.16.0
8
- python_version: '3.13'
9
  app_file: app.py
10
- pinned: false
 
 
 
 
 
 
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: RefCheck
3
+ emoji: 🔍
4
+ colorFrom: blue
5
  colorTo: indigo
6
  sdk: gradio
 
 
7
  app_file: app.py
8
+ python_version: 3.11
9
+ suggested_hardware: cpu-basic
10
+ fullWidth: true
11
+ short_description: Upload BibTeX, validate citations, download fixes.
12
+ tags:
13
+ - bibtex
14
+ - citations
15
+ - academic
16
+ - bibliography
17
  ---
18
 
19
+ # RefCheck 🔍
20
+
21
+ > **A Citation Hallucination Detector & Auto-Fixer**
22
+ > Validate and automatically correct your BibTeX bibliography against multiple academic databases.
23
+
24
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
25
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
26
+
27
+ ---
28
+
29
+ ## Why RefCheck?
30
+
31
+ Academic papers often contain citation errors — wrong titles, incorrect authors, mismatched years, or even completely fabricated references (hallucinations from AI tools). **RefCheck** automatically:
32
+
33
+ - ✅ **Validates** each citation against 6 academic databases
34
+ - 🔧 **Auto-fixes** metadata mismatches (title, authors, year, DOI)
35
+ - 🗑️ **Removes** unverifiable/hallucinated entries
36
+ - 📊 **Reports** a clear verification summary
37
+
38
+ ---
39
+
40
+ ## Features
41
+
42
+ ### Multi-Source Verification
43
+
44
+ RefCheck cross-references your citations against:
45
+
46
+ | Source | Lookup Methods |
47
+ |--------|----------------|
48
+ | **arXiv** | arXiv ID, Title search |
49
+ | **CrossRef** | DOI, Title search |
50
+ | **DBLP** | Title search |
51
+ | **Semantic Scholar** | DOI, Title search |
52
+ | **OpenAlex** | DOI, Title search |
53
+ | **Google Scholar** | Title search (disabled by default) |
54
+
55
+ ### Two-Pass Workflow
56
+
57
+ 1. **Pass 1 — Validate & Fix**: Checks each entry, auto-corrects metadata, removes invalid citations
58
+ 2. **Pass 2 — Verify**: Re-validates the cleaned file to confirm all entries are correct
59
+
60
+ ---
61
+
62
+ ## Installation
63
+
64
+ ```bash
65
+ # Clone the repository
66
+ git clone https://github.com/voidful/RefCheck.git
67
+ cd RefCheck
68
+
69
+ # Install dependencies
70
+ pip install -r requirements.txt
71
+ ```
72
+
73
+ ### Requirements
74
+
75
+ - Python 3.9+
76
+ - Dependencies: `bibtexparser`, `requests`, `beautifulsoup4`, `rich`, `Unidecode`, `lxml`, `gradio` (the last one is only needed for the Space UI in `app.py`)
77
+
78
+ ---
79
+
80
+ ## Usage
81
+
82
+ ### Hugging Face Space
83
+
84
+ This repository is ready to run as a Gradio Space. Create a Hugging Face Space with the Gradio SDK, push these files, and the Space will launch `app.py`.
85
+
86
+ The Space UI accepts a `.bib` upload and returns:
87
+
88
+ - a corrected BibTeX file
89
+ - a Markdown validation report
90
+ - a list of entries that still need manual review
91
+
92
+ ### Basic Usage
93
+
94
+ ```bash
95
+ # Validate and auto-fix a bib file
96
+ python main.py --bib references.bib
97
+ ```
98
+
99
+ ### Command-Line Options
100
+
101
+ | Option | Short | Description |
102
+ |--------|-------|-------------|
103
+ | `--bib` | `-b` | Path to your `.bib` file (required) |
104
+ | `--output` | `-o` | Output report path (optional) |
105
+
106
+ ### Example
107
+
108
+ ```bash
109
+ # Process your bibliography
110
+ python main.py --bib paper/references.bib
111
+
112
+ # With custom output path
113
+ python main.py --bib refs.bib --output validation_report.md
114
+ ```
115
+
116
+ ---
117
+
118
+ ## How It Works
119
+
120
+ ```
121
+ ┌─────────────────┐
122
+ │ Load .bib file │
123
+ └────────┬────────┘
124
+
125
+ ┌─────────────────────────────────────────┐
126
+ │ For each entry: │
127
+ │ 1. Query academic databases │
128
+ │ 2. Compare metadata (title, author, yr)│
129
+ │ 3. Calculate confidence score │
130
+ └────────┬────────────────────────────────┘
131
+
132
+ ┌─────────────────────────────────────────┐
133
+ │ Decision: │
134
+ │ • confidence > 85% → Auto-fix metadata │
135
+ │ • Match found → Keep as-is │
136
+ │ • No match → Remove entry │
137
+ └────────┬────────────────────────────────┘
138
+
139
+ ┌─────────────────────────────────────────┐
140
+ │ Save updated .bib file │
141
+ │ Run verification pass │
142
+ └─────────────────────────────────────────┘
143
+ ```
144
+
145
+ ---
146
+
147
+ ## Output
148
+
149
+ RefCheck displays real-time progress and a final summary:
150
+
151
+ ```
152
+ 📚 BibGuard - Auto-Fix & Verify
153
+ Target: references.bib
154
+
155
+ Found 42 entries. Running validation and auto-fix...
156
+
157
+ Validating & Fixing ━━━━━━━━━━━━━━━━━ 100% 42/42 ✓ 38 ⚠ 2 ✗ 2
158
+
159
+ ✏️ Updates:
160
+ - Fixed 2 entries (metadata updated)
161
+ - Removed 2 invalid/hallucinated entries
162
+ ✓ File saved.
163
+
164
+ 🔄 Double checking (Re-validation)...
165
+
166
+ Verifying ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 40/40 ✓ 40
167
+
168
+ ==================================================
169
+ 📊 Final Status
170
+ ==================================================
171
+ Total: 40
172
+ ✓ Verified: 40
173
+ ⚠ Issues: 0
174
+ ✗ Not found: 0
175
+ ```
176
+
177
+ ### Status Meanings
178
+
179
+ | Symbol | Meaning |
180
+ |--------|---------|
181
+ | ✅ Verified | Entry matches a known publication |
182
+ | ⚠️ Fixed | Metadata was auto-corrected |
183
+ | ❌ Removed | Entry could not be verified (likely hallucination) |
184
+
185
+ ---
186
+
187
+ ## Project Structure
188
+
189
+ ```
190
+ RefCheck/
191
+ ├── main.py # Entry point & workflow orchestration
192
+ ├── requirements.txt # Python dependencies
193
+ ├── README.md
194
+ └── src/
195
+ ├── fetcher.py # API clients for academic databases
196
+ ├── comparator.py # Metadata comparison & scoring
197
+ ├── parser.py # BibTeX parsing & saving
198
+ └── utils.py # Progress display & text utilities
199
+ ```
200
+
201
+ ---
202
+
203
+ ## License
204
+
205
+ MIT License — see [LICENSE](LICENSE) for details.
206
+
207
+ ---
208
+
209
+ ## Contributing
210
+
211
+ Contributions are welcome! Please feel free to submit a Pull Request.
212
+
213
+ ---
214
+
215
+ ## Acknowledgments
216
+
217
+ Built with:
218
+ - [bibtexparser](https://github.com/sciunto-org/python-bibtexparser) for BibTeX handling
219
+ - [Rich](https://github.com/Textualize/rich) for beautiful terminal output
220
+ - APIs from arXiv, CrossRef, DBLP, Semantic Scholar, and OpenAlex
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ import gradio as gr
7
+
8
+ from src.space_service import RefCheckOptions, run_refcheck_file
9
+
10
+
11
+ def _uploaded_path(uploaded: Any) -> str | None:
12
+ if not uploaded:
13
+ return None
14
+ if isinstance(uploaded, str):
15
+ return uploaded
16
+ if isinstance(uploaded, dict):
17
+ return uploaded.get("path") or uploaded.get("name")
18
+ name = getattr(uploaded, "name", None)
19
+ if name:
20
+ return str(name)
21
+ return None
22
+
23
+
24
def process_bib(
    uploaded: Any,
    remove_unverified: bool,
    enable_google_scholar: bool,
    max_workers: int,
) -> tuple[str, str | None, str | None]:
    """Run RefCheck on an uploaded BibTeX file and return the UI outputs.

    Args:
        uploaded: Upload value from the file widget; resolved to a path by
            ``_uploaded_path``.
        remove_unverified: Forwarded to ``RefCheckOptions``.
        enable_google_scholar: Forwarded to ``RefCheckOptions``.
        max_workers: Forwarded to ``RefCheckOptions`` after coercion to
            ``int`` (the slider may deliver a float).

    Returns:
        ``(report_markdown, fixed_bib_path, report_path)``. When no file was
        uploaded or processing raised, the first element carries the message
        and both paths are ``None``.
    """
    bib_location = _uploaded_path(uploaded)
    if not bib_location:
        return "## RefCheck Report\n\nNo BibTeX file was uploaded.", None, None

    # This is the UI boundary: any failure is turned into a report message
    # instead of propagating into the Gradio event handler.
    try:
        settings = RefCheckOptions(
            remove_unverified=remove_unverified,
            enable_google_scholar=enable_google_scholar,
            max_workers=int(max_workers),
        )
        outcome = run_refcheck_file(Path(bib_location), settings)
    except Exception as exc:
        return f"## RefCheck Report\n\nProcessing failed: `{exc}`", None, None
    else:
        try:
            return outcome.report_markdown, outcome.fixed_bib_path, outcome.report_path
        except Exception as exc:
            return f"## RefCheck Report\n\nProcessing failed: `{exc}`", None, None
44
+
45
+
46
# Single-page UI: inputs in the narrow left column, results in the wider
# right column.
with gr.Blocks(title="RefCheck") as demo:
    gr.Markdown("# RefCheck")

    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath" -- presumably the callback then receives the
            # upload as a path string; _uploaded_path() also tolerates other
            # shapes in case it does not.
            bib_file = gr.File(
                label="BibTeX file",
                file_types=[".bib", ".txt"],
                type="filepath",
            )
            remove_unverified = gr.Checkbox(
                label="Remove unverifiable entries",
                value=True,
            )
            # Off by default, matching the "disabled by default" note for
            # Google Scholar in the README.
            enable_google_scholar = gr.Checkbox(
                label="Google Scholar fallback",
                value=False,
            )
            max_workers = gr.Slider(
                label="Parallel lookups",
                minimum=1,
                maximum=8,
                step=1,
                value=4,
            )
            run_button = gr.Button("Run RefCheck", variant="primary")

        with gr.Column(scale=2):
            report = gr.Markdown(label="Report")
            fixed_bib = gr.File(label="Fixed BibTeX")
            report_file = gr.File(label="Markdown report")

    # The order of `inputs` matches process_bib's parameters and the order of
    # `outputs` matches its returned (markdown, fixed .bib path, report path).
    run_button.click(
        fn=process_bib,
        inputs=[bib_file, remove_unverified, enable_google_scholar, max_workers],
        outputs=[report, fixed_bib, report_file],
        api_name="refcheck",
    )


if __name__ == "__main__":
    # Queue requests and launch the app when this file is run as a script.
    demo.queue(default_concurrency_limit=2).launch()
data/abbr.tsv ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pattern (regex, case-insensitive) Abbreviation
2
+ # Speech & Audio
3
+ .*Interspeech.* Interspeech
4
+ .*IEEE.*International Conference.*Acoustics.*Speech.*Signal Processing.* ICASSP
5
+ .*IEEE.*Automatic Speech Recognition.*Understanding.* ASRU
6
+ .*IEEE Spoken Language Technology.* SLT
7
+ .*IEEE/ACM Transactions on Audio.*Speech.*Language.* IEEE/ACM Trans. Audio Speech Lang. Process.
8
+
9
+ # ML
10
+ .*International Conference on Machine Learning.* ICML
11
+ .*Advances in Neural Information Processing.* NeurIPS
12
+ .*Conference on Neural Information Processing.* NeurIPS
13
+ .*International Conference on Learning Representations.* ICLR
14
+ .*AAAI Conference on Artificial Intelligence.* AAAI
15
+ .*International Joint Conference on Artificial Intelligence.* IJCAI
16
+ .*IEEE.*Conference on Computer Vision and Pattern Recognition.* CVPR
17
+ .*European Conference on Computer Vision.* ECCV
18
+ .*IEEE International Conference on Computer Vision[^a].* ICCV
19
+
20
+ # NLP
21
+ .*Annual Meeting.*Association for Computational Linguistics.* ACL
22
+ .*Empirical Methods in Natural Language Processing.* EMNLP
23
+ .*North American Chapter.*Association for Computational Linguistics.* NAACL
24
+ .*European Chapter.*Association for Computational Linguistics.* EACL
25
+ .*Findings.*EMNLP.* Findings of EMNLP
26
+ .*Findings.*ACL.* Findings of ACL
27
+ .*International Conference on Computational Linguistics.* COLING
28
+ .*Conference.*Machine Translation.* WMT
29
+ .*Language Resources and Evaluation.* LREC
30
+
31
+ # IR / Web / Data
32
+ .*ACM.*Information Retrieval.* SIGIR
33
+ .*Knowledge Discovery.*Data Mining.* KDD
34
+ .*World Wide Web.* WWW
35
+ .*Web Search and Data Mining.* WSDM
data/index_shards/index_00.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2522c57f135bb5c3d581c824cb8538e9f84b786a01a3d0535b52457ef91b227
3
+ size 26214218
data/index_shards/index_01.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bd3dbc8999aed2796171312da9521a84548b105f32ecb7621e09c97a8c298c7
3
+ size 26214151
data/index_shards/index_02.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:519a53fc56bbeb76feaa0be489bd4cc3727a5261b270846ffd6d1a97d42551b9
3
+ size 26214343
data/index_shards/index_03.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c32e3cc3dd2809c115e33f37030166e2b66eeb0dab7dd0b81010647f799ec93
3
+ size 25401874
main.py ADDED
@@ -0,0 +1,561 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ BibGuard - Citation Hallucination Detector
4
+
5
+ Validates bibliography entries against multiple academic data sources:
6
+ arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar
7
+
8
+ Usage:
9
+ python main.py --bib references.bib
10
+ python main.py --bib references.bib --output report.md
11
+ """
12
+ import argparse
13
+ import sys
14
+ from pathlib import Path
15
+ from datetime import datetime
16
+ from dataclasses import dataclass, field
17
+ from typing import List, Optional
18
+ from concurrent.futures import ThreadPoolExecutor, as_completed
19
+ import threading
20
+ import copy
21
+
22
+ from src.parser import BibParser
23
+ from src.fetcher import (
24
+ ArxivFetcher, CrossRefFetcher, DBLPFetcher,
25
+ SemanticScholarFetcher, OpenAlexFetcher, ScholarFetcher
26
+ )
27
+ from src.comparator import MetadataComparator, EntryReport, resolve_year, CURRENT_YEAR
28
+ from src.sanitizer import BibSanitizer
29
+ from src.local_db import LocalConferenceDB
30
+ from src.ui import BibUI
31
+ from src.utils import ProgressDisplay, TextNormalizer
32
@dataclass
class WorkflowStep:
    """One lookup step of the validation workflow."""

    # Identifier that validate_entry() dispatches on (e.g. "arxiv_id", "dblp").
    name: str
    # Disabled steps are filtered out by WorkflowConfig.get_enabled_steps().
    enabled: bool = True
    # Human-readable label for the step.
    display_name: str = ""
    # Sort key used by WorkflowConfig.get_enabled_steps(); lower runs first.
    priority: int = 0
38
+
39
@dataclass
class WorkflowConfig:
    """Ordered collection of workflow steps."""

    steps: List["WorkflowStep"] = field(default_factory=list)

    def get_enabled_steps(self) -> List["WorkflowStep"]:
        """Return the enabled steps, sorted by ascending ``priority``."""
        active_steps = (step for step in self.steps if step.enabled)
        return sorted(active_steps, key=lambda step: step.priority)
44
+
45
def get_default_workflow() -> WorkflowConfig:
    """Build the default lookup workflow.

    Identifier-based lookups (arXiv ID, DOI) come first, followed by the
    title searches; Google Scholar is included but disabled.
    """
    # (name, enabled, display_name), listed in execution order. The position
    # in this table is used as the step's priority.
    ordered_sources = (
        ("arxiv_id", True, "arXiv by ID"),
        ("crossref_doi", True, "CrossRef by DOI"),
        ("semantic_scholar", True, "Semantic Scholar"),
        ("dblp", True, "DBLP"),
        ("openalex", True, "OpenAlex"),
        ("arxiv_title", True, "arXiv by Title"),
        ("crossref_title", True, "CrossRef by Title"),
        ("google_scholar", False, "Google Scholar"),
    )
    return WorkflowConfig(steps=[
        WorkflowStep(name, enabled, label, rank)
        for rank, (name, enabled, label) in enumerate(ordered_sources)
    ])
56
+
57
def main():
    """Command-line entry point: parse arguments and run fix-and-verify.

    Exit status:
        1   ``--bib`` does not name an existing regular file.
        130 the run was interrupted with Ctrl-C.
    """
    parser = argparse.ArgumentParser(
        description="BibGuard: Citation Fixer & Validator",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--bib", "-b", required=True, help="Path to .bib file")
    parser.add_argument("--output", "-o", help="Output report path (optional)")
    args = parser.parse_args()

    bib_path = Path(args.bib)
    # is_file() rather than exists(): a directory "exists" but cannot be
    # parsed as a bibliography.
    if not bib_path.is_file():
        print(f"Error: Bib file not found: {args.bib}", file=sys.stderr)
        sys.exit(1)

    if args.output:
        # NOTE(review): --output is parsed but nothing below receives it;
        # run_fix_and_verify() takes no report path. Tell the user instead of
        # silently ignoring the option until report writing is wired up.
        print(
            f"Warning: --output {args.output} is currently ignored; "
            "the report is only printed to the terminal.",
            file=sys.stderr,
        )

    workflow = get_default_workflow()

    try:
        run_fix_and_verify(bib_path, workflow)
    except KeyboardInterrupt:
        print("\nCancelled")
        sys.exit(130)
80
+
81
+
82
def run_fix_and_verify(bib_path: Path, workflow):
    """Run validation, auto-fix issues, and verify.

    The .bib file at ``bib_path`` is rewritten in place. Stages:

    * Phase 0   - offline sanitising and duplicate-title detection.
    * Phase 0.5 - local conference DB lookup; matched entries skip the APIs.
    * Phase 1   - parallel API lookups for the remaining entries.
    * Phase 2   - categorise into ok / auto-fix / manual review / remove.
    * Phase 3   - apply fixes, run the interactive manual review, save.
    * Pass 2    - re-parse the saved file, re-validate, print the summary.

    Args:
        bib_path: Path of the BibTeX file to process.
        workflow: Object whose ``get_enabled_steps()`` drives
            ``validate_entry``.
    """
    progress = ProgressDisplay()
    bib_parser = BibParser()
    ui = BibUI()

    print(f"📚 BibGuard - Auto-Fix & Verify")
    print(f" Target: {bib_path}\n")

    # --- Pass 1: Validate & Fix ---
    entries = bib_parser.parse_file(str(bib_path))
    if not entries:
        print("No entries found")
        return

    print(f"Found {len(entries)} entries. Running validation and auto-fix...\n")

    # Initialize components
    fetchers = {
        'arxiv': ArxivFetcher(),
        'crossref': CrossRefFetcher(),
        'scholar': ScholarFetcher(),
        'semantic': SemanticScholarFetcher(),
        'openalex': OpenAlexFetcher(),
        'dblp': DBLPFetcher(),
    }
    comparator = MetadataComparator()
    sanitizer = BibSanitizer()

    fixed_count = 0           # number of entries that received at least one fix
    fixed_details = {}        # entry key -> list of change descriptions
    removed_details = []      # list of (entry, reason)
    manual_review_queue = []  # list of (entry, best_result, candidates)

    # --- Phase 0: Sanitize (Offline Checks) ---
    print("🧹 Running formatting sanity checks...")
    sanitize_fixes = sanitizer.sanitize_all(entries)
    ui.show_sanitize_report(sanitize_fixes)

    # If sanitization made changes, save immediately so Phase 1 works on clean data
    if sanitize_fixes:
        bib_parser.save_entries(str(bib_path), entries)
        # Merge sanitize fixes into fixed_details for the final report
        for key, fixes in sanitize_fixes.items():
            fixed_details.setdefault(key, []).extend(fix.description for fix in fixes)
            fixed_count += 1
    # Snapshot so the final save can tell whether anything changed after Phase 0.
    phase0_fixed_count = fixed_count

    # Duplicate detection
    dupes = sanitizer.find_duplicates(entries)
    if dupes:
        print(f"\n⚠ Found {len(dupes)} duplicate title(s):")
        for keys in dupes.values():
            print(f" {' / '.join(keys)}")
        print()

    # --- Phase 0.5: Local DB Lookup ---
    local_db = LocalConferenceDB()
    local_db_loaded = local_db.load()

    api_needed_entries = entries  # Default: all entries need API
    if local_db_loaded:
        api_needed_entries = []
        local_matched_count = 0
        for entry in entries:
            official = local_db.lookup(entry.title)
            if official:
                # Apply local DB fix
                changes = apply_local_fix(entry, official)
                if changes:
                    local_matched_count += 1
                    fixed_details.setdefault(entry.key, []).extend(changes)
                    fixed_count += 1
            else:
                api_needed_entries.append(entry)

        if local_matched_count > 0:
            print(f" 📚 Local DB matched: {local_matched_count}, API needed: {len(api_needed_entries)}")
            bib_parser.save_entries(str(bib_path), entries)

    # --- Phase 1: Analysis (API Fetch) ---
    analysis_results = []

    with progress.progress_context(len(api_needed_entries), "Analyzing Entries") as prog:
        with ThreadPoolExecutor(max_workers=min(10, max(1, len(api_needed_entries)))) as executor:
            futures = {
                executor.submit(validate_entry, e, workflow, fetchers, comparator): e
                for e in api_needed_entries
            }

            for future in as_completed(futures):
                entry = futures[future]
                try:
                    best_result, candidates = future.result()
                except Exception:
                    prog.mark_error()
                    prog.update(entry.key, "Failed", 1)
                    # Keep the entry untouched when the lookup itself failed:
                    # a network error is not evidence of a bad citation.
                    analysis_results.append((entry, None, []))
                else:
                    analysis_results.append((entry, best_result, candidates))
                    prog.update(entry.key, "Analyzed", 1)

    # --- Phase 2: Meaningful Report ---
    # Categorize results
    to_fix = []
    to_review = []
    to_remove = []
    ok_entries = []

    for entry, best_result, candidates in analysis_results:
        if not best_result:
            ok_entries.append(entry)
            continue

        # Entries flagged for forced API lookup (e.g., future year) always go to to_fix
        if getattr(entry, '_force_api_lookup', False) and best_result.fetched_data:
            to_fix.append((entry, best_result, candidates))
        elif best_result.confidence > 0.85 and best_result.fetched_data:
            to_fix.append((entry, best_result, candidates))
        elif best_result.is_match:
            ok_entries.append(entry)
        elif candidates:
            to_review.append((entry, best_result, candidates))
        else:
            to_remove.append(entry)

    # Visualize Analysis Report
    ui.show_analysis_report(ok_entries, to_fix, to_review, to_remove)

    if not (to_fix or to_review or to_remove):
        return

    # --- Phase 3: Apply Fixes ---
    print(f"\n🚀 Applying fixes...")

    # Entries are dropped by object identity, so everything not explicitly
    # removed survives the final save -- including entries resolved by the
    # local DB, which never appear in analysis_results.
    removed_ids = set()

    # Process Fixes
    for entry, best_result, candidates in to_fix:
        changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates)
        if changes:
            fixed_count += 1
            # Extend, do not assign: the key may already hold Phase 0 or
            # local-DB descriptions that belong in the final report.
            fixed_details.setdefault(entry.key, []).extend(changes)

    # Process Removals
    for entry in to_remove:
        removed_details.append((entry, "No matching metadata found in any source"))
        removed_ids.add(id(entry))

    # Process Reviews (entries stay in the file unless the user removes them)
    manual_review_queue.extend(to_review)

    # --- Interactive Manual Review ---
    if manual_review_queue:
        print(f"\n\n🔍 Manual Review Required for {len(manual_review_queue)} entries:")

        # Sort by key for consistent order
        manual_review_queue.sort(key=lambda x: x[0].key)

        for entry, best_res, candidates in manual_review_queue:
            ui.show_manual_review(entry, best_res, candidates, apply_fix)

            while True:
                choice = input(f"\nSelect [1-{len(candidates)}], (s)kip, (r)emove, or (q)uit: ").strip().lower()

                if choice == 'q':
                    print("Exiting manual review.")
                    # Remaining queued entries are kept as they are.
                    break
                elif choice == 's':
                    print("Skipped.")
                    break
                elif choice == 'r':
                    print("Marked for removal.")
                    removed_ids.add(id(entry))
                    removed_details.append((entry, "Removed by user during manual review"))
                    break
                elif choice.isdigit():
                    idx = int(choice) - 1
                    if 0 <= idx < len(candidates):
                        selected = candidates[idx]
                        changes = apply_fix(entry, selected.fetched_data)
                        if changes:
                            fixed_count += 1
                            fixed_details.setdefault(entry.key, []).extend(changes)
                            print(f"Applied: {', '.join(changes)}")
                        else:
                            print("No changes needed for selected source.")
                        break
                    else:
                        print("Invalid selection.")
                else:
                    print("Invalid input.")

            if choice == 'q':
                break

    # Original file order, minus everything removed automatically or by the user.
    updated_entries = [e for e in entries if id(e) not in removed_ids]

    # Overwrite file if changes were made beyond Phase 0 sanitization
    if removed_details or fixed_count > phase0_fixed_count:
        bib_parser.save_entries(str(bib_path), updated_entries)

    # --- Pass 2: Double Check ---
    print("\n🔄 Double checking (Re-validation)...")

    entries = bib_parser.parse_file(str(bib_path))
    reports = []

    with progress.progress_context(len(entries), "Verifying") as prog:
        # max(1, ...): ThreadPoolExecutor rejects max_workers=0, which would
        # otherwise happen when every entry was removed in Pass 1.
        with ThreadPoolExecutor(max_workers=min(10, max(1, len(entries)))) as executor:
            futures = {
                executor.submit(validate_entry, e, workflow, fetchers, comparator): e
                for e in entries
            }

            for future in as_completed(futures):
                entry = futures[future]
                try:
                    best_result, _ = future.result()  # Ignore candidates in verify pass
                    reports.append(EntryReport(entry=entry, comparison=best_result))

                    if best_result.is_match:
                        prog.mark_success()
                    else:
                        prog.mark_error()
                    prog.update(entry.key, "Verified", 1)
                except Exception:
                    prog.mark_error()
                    prog.update(entry.key, "Failed", 1)

    # Summary
    total = len(entries)
    verified = sum(1 for r in reports if r.comparison and r.comparison.is_match)
    issues = sum(1 for r in reports if r.comparison and r.comparison.has_issues)
    not_found = sum(
        1 for r in reports
        if r.comparison and not r.comparison.is_match and not r.comparison.has_issues
    )

    # Visual Final Status
    ui.show_final_report(total, verified, issues, not_found, reports, fixed_count, fixed_details, removed_details)
    print("")
337
+
338
def apply_local_fix(entry, official) -> list:
    """
    Apply fixes from local conference DB (ground truth).

    Only updates year, entry type, booktitle and DOI -- not authors or title,
    since DBLP data for those may have different formatting conventions.

    Args:
        entry: Parsed bib entry; mutated in place (``year``, ``entry_type``,
            ``journal``, ``booktitle``, ``doi`` and its ``raw_entry`` dict).
        official: Local DB record exposing ``year``, ``booktitle`` and ``doi``.

    Returns:
        Human-readable descriptions of the changes made (empty if none).
    """
    changes = []

    # Year: conference year is ground truth, but only within a plausible range
    if official.year and official.year != entry.year:
        year_int = int(official.year) if official.year.isdigit() else 0
        if 1950 <= year_int <= CURRENT_YEAR:
            changes.append(f"Year: {entry.year} -> {official.year} [local_db]")
            entry.year = official.year

    # Entry type upgrade: misc/article → inproceedings if booktitle exists
    if official.booktitle and entry.entry_type.lower() in ('misc', 'article'):
        old_type = entry.entry_type
        entry.entry_type = 'inproceedings'
        if 'ENTRYTYPE' in entry.raw_entry:
            entry.raw_entry['ENTRYTYPE'] = 'inproceedings'
        # Clear journal if it was arXiv
        if entry.journal and 'arxiv' in entry.journal.lower():
            entry.journal = ""
            if 'journal' in entry.raw_entry:
                del entry.raw_entry['journal']
        changes.append(f"Type: @{old_type} → @inproceedings [local_db]")

    # Booktitle: adopt from DB only when the entry has none
    if official.booktitle and not entry.booktitle:
        entry.booktitle = official.booktitle
        entry.raw_entry['booktitle'] = official.booktitle
        # Add the ellipsis only when the title really was truncated.
        shown = official.booktitle
        if len(shown) > 50:
            shown = shown[:50] + "..."
        changes.append(f"Booktitle: [Added] {shown} [local_db]")

    # DOI: adopt if missing
    if official.doi and not entry.doi:
        entry.doi = official.doi
        entry.raw_entry['doi'] = official.doi
        changes.append(f"DOI: [Added] {official.doi} [local_db]")

    return changes
379
+
380
+
381
def apply_fix(entry, data, all_candidates=None) -> list:
    """Update entry metadata from fetched data. Returns list of changes strings.

    Args:
        entry: Parsed bib entry; ``title``, ``year``, ``author`` and ``doi``
            may be updated in place.
        data: Fetched metadata record (``title``, and optionally ``year``,
            ``authors``, ``doi``).
        all_candidates: Optional list of comparison results. When given, the
            year is chosen with ``resolve_year`` across all of them and any
            candidate flagged ``author_initial_conflict`` blocks the author
            merge.

    Returns:
        Descriptions of applied changes. Entries starting with "⚠" are
        warnings about something that was deliberately NOT changed.
    """
    changes = []

    # Helper to clean string
    def clean(s):
        return str(s).strip() if s else ""

    # Title
    new_title = clean(data.title)
    current_title = entry.title or ""
    if new_title and new_title.lower() != current_title.lower():
        changes.append(f"Title: {entry.title} -> {new_title}")
        entry.title = new_title

    # Year: Use resolve_year() if we have multiple candidates
    if all_candidates:
        best_year, year_src = resolve_year(all_candidates, bib_year=entry.year)
        if best_year and best_year != entry.year:
            # Guard with isdigit() like the single-candidate path below, so a
            # non-numeric year cannot raise ValueError from int().
            if str(best_year).isdigit() and int(best_year) > CURRENT_YEAR:
                changes.append(f"⚠ Skip suspicious future year {best_year} from {year_src}")
            else:
                changes.append(f"Year: {entry.year} -> {best_year} [{year_src}]")
                entry.year = best_year
    else:
        # Single candidate fallback
        new_year = clean(getattr(data, 'year', ''))
        if new_year and new_year != entry.year:
            if new_year.isdigit() and int(new_year) > CURRENT_YEAR:
                changes.append(f"⚠ Skip suspicious future year {new_year}")
            else:
                changes.append(f"Year: {entry.year} -> {new_year}")
                entry.year = new_year

    # Author: Smart Merge Strategy
    # Check for author initial conflict first
    has_initial_conflict = any(
        getattr(cand, 'author_initial_conflict', False)
        for cand in (all_candidates or [])
    )

    if has_initial_conflict:
        # Don't overwrite authors when initials conflict
        changes.append("⚠ Author initial conflict detected — preserving bib authors")
    else:
        # Normal author merge logic
        current_authors_raw = TextNormalizer.parse_author_list(entry.author)
        current_authors_norm = [TextNormalizer.normalize_author_name(a) for a in current_authors_raw]

        # ``or []`` also covers a record whose ``authors`` is None.
        new_authors_list = getattr(data, 'authors', None) or []
        if isinstance(new_authors_list, str):
            new_authors_list = TextNormalizer.parse_author_list(new_authors_list)

        # Strip DBLP disambiguation IDs from new authors
        new_authors_list = [TextNormalizer.strip_dblp_disambiguation_id(str(a)) for a in new_authors_list]

        # Also check if the EXISTING bib authors have DBLP disambiguation IDs baked in
        for raw_auth in current_authors_raw:
            if TextNormalizer.has_dblp_disambiguation_id(raw_auth.strip()):
                changes.append(f"⚠ DBLP disambiguation ID detected in author: '{raw_auth.strip()}'")

        # Merge only when the source actually supplied authors; otherwise the
        # merged list would be empty and would wipe the entry's author field.
        if new_authors_list:
            final_authors = []

            for new_auth in new_authors_list:
                new_auth_str = str(new_auth).strip()
                new_auth_norm = TextNormalizer.normalize_author_name(new_auth_str)

                # Prefer the bib file's own spelling when the author is
                # already present; otherwise take the fetched string.
                for i, old_norm in enumerate(current_authors_norm):
                    if old_norm == new_auth_norm:
                        final_authors.append(current_authors_raw[i].strip())
                        break
                else:
                    final_authors.append(new_auth_str)

            # Reconstruct the string
            new_author_str = " and ".join(final_authors)

            # Check if the result is effectively different from the original full string
            def simple_norm(s):
                return s.lower().replace(" ", "").strip()

            if simple_norm(new_author_str) != simple_norm(entry.author):
                old_auth = (entry.author[:50] + '...') if len(entry.author) > 50 else entry.author
                new_auth_disp = (new_author_str[:50] + '...') if len(new_author_str) > 50 else new_author_str
                changes.append(f"Author: {old_auth} -> {new_auth_disp}")
                entry.author = new_author_str

    # Optional fields: DOI is only added, never overwritten
    fetched_doi = getattr(data, 'doi', None)
    if fetched_doi and not entry.doi:
        changes.append(f"DOI: [Added] {fetched_doi}")
        entry.doi = fetched_doi

    return changes
479
+
480
+
481
def validate_entry(entry, workflow, fetchers, comparator):
    """Validate a single entry against configured data sources. Returns (best_result, all_results).

    Every enabled workflow step is tried in order. Each step that yields
    metadata contributes one comparison result; the result with the highest
    ``confidence`` is returned together with the full list. When no step
    produced a result, an "unable" result from the comparator and an empty
    list are returned.
    """

    def compared(found, source):
        # Compare only when the fetcher returned something.
        return comparator.compare(entry, found, source) if found else None

    def doi_then_title(fetcher):
        # Prefer the exact DOI lookup; fall back to a title search.
        found = fetcher.fetch_by_doi(entry.doi) if entry.doi else None
        return found or fetcher.search_by_title(entry.title)

    collected = []

    for step in workflow.get_enabled_steps():
        kind = step.name
        outcome = None

        if kind == "arxiv_id":
            if entry.has_arxiv:
                outcome = compared(fetchers['arxiv'].fetch_by_id(entry.arxiv_id), "arxiv")

        elif kind == "crossref_doi":
            if entry.doi:
                found = fetchers['crossref'].search_by_doi(entry.doi)
                # DOI cross-validation: a non-empty fix list means the DOI
                # points to a different work, so this source is skipped.
                if found and not BibSanitizer().check_doi_title_match(entry, found):
                    outcome = comparator.compare(entry, found, "crossref")

        elif kind == "semantic_scholar":
            if entry.title:
                outcome = compared(doi_then_title(fetchers['semantic']), "semantic_scholar")

        elif kind == "dblp":
            if entry.title:
                outcome = compared(fetchers['dblp'].search_by_title(entry.title), "dblp")

        elif kind == "openalex":
            if entry.title:
                outcome = compared(doi_then_title(fetchers['openalex']), "openalex")

        elif kind == "arxiv_title":
            if entry.title:
                hits = fetchers['arxiv'].search_by_title(entry.title)
                if hits:
                    wanted = TextNormalizer.normalize_for_comparison(entry.title)
                    top_meta, top_sim = None, 0
                    for hit in hits:
                        score = TextNormalizer.similarity_ratio(
                            wanted, TextNormalizer.normalize_for_comparison(hit.title)
                        )
                        if score > top_sim:
                            top_meta, top_sim = hit, score
                    # Ignore title hits that are not at least half similar.
                    if top_meta and top_sim > 0.5:
                        outcome = comparator.compare(entry, top_meta, "arxiv")

        elif kind == "crossref_title":
            if entry.title:
                outcome = compared(fetchers['crossref'].search_by_title(entry.title), "crossref")

        elif kind == "google_scholar":
            if entry.title:
                outcome = compared(fetchers['scholar'].search_by_title(entry.title), "scholar")

        if outcome:
            collected.append(outcome)

    if not collected:
        # No results
        return comparator.create_unable_result(entry, "Not found in any data source"), []

    return max(collected, key=lambda r: r.confidence), collected
555
+
556
+
557
+
558
+
559
+
560
+ if __name__ == "__main__":
561
+ main()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ bibtexparser>=1.4.0
2
+ requests>=2.31.0
3
+ beautifulsoup4>=4.12.0
4
+ rich>=13.7.0
5
+ Unidecode>=1.3.0
6
+ lxml>=5.0.0
7
+ gradio>=4.44.0
scripts/build_index.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Build a title-based index from downloaded DBLP bib files.
4
+
5
+ Reads all .bib files in data/raw/ and produces sharded JSON files
6
+ under data/index_shards/ (~25MB each) for GitHub-friendly storage.
7
+
8
+ Usage:
9
+ python scripts/build_index.py
10
+ """
11
+ import json
12
+ import os
13
+ import re
14
+ import shutil
15
+ import sys
16
+ from pathlib import Path
17
+
18
+ try:
19
+ import bibtexparser
20
+ from bibtexparser.bparser import BibTexParser
21
+ from bibtexparser.customization import convert_to_unicode
22
+ except ImportError:
23
+ print("Error: bibtexparser required. Install: pip install bibtexparser")
24
+ sys.exit(1)
25
+
26
+ MAX_SHARD_MB = 25 # Target shard size in MB
27
+
28
+
29
def normalize_title(title: str) -> str:
    """Return the canonical index key for a title.

    BibTeX protective braces are unwrapped, the text is lower-cased, every
    character that is neither a word character nor whitespace is replaced by
    a space, and whitespace runs are collapsed to a single space.
    """
    unbraced = re.sub(r'\{([^}]*)\}', r'\1', title)
    lowered = unbraced.lower()
    depunctuated = re.sub(r'[^\w\s]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', depunctuated)
    return collapsed.strip()
34
+
35
+
36
def _write_shard(shard_dir: Path, shard_num: int, items: list) -> None:
    """Write one shard as ``index_NN.json`` and print a summary line for it."""
    path = shard_dir / f"index_{shard_num:02d}.json"
    path.write_text(
        json.dumps(dict(items), ensure_ascii=False),
        encoding="utf-8"
    )
    mb = path.stat().st_size / 1024 / 1024
    print(f" ✓ index_{shard_num:02d}.json: {len(items):,} entries ({mb:.1f} MB)")


def write_shards(index: dict, shard_dir: Path, max_bytes=None):
    """Split *index* into JSON shard files of roughly ``max_bytes`` each.

    Any existing *shard_dir* is deleted and recreated, so stale shards from a
    previous build never survive.

    Args:
        index: mapping of normalized title -> record dict.
        shard_dir: directory that receives ``index_00.json``, ``index_01.json``, ...
        max_bytes: soft size limit per shard in bytes. Defaults to
            ``MAX_SHARD_MB`` megabytes. A single entry larger than the limit
            still gets a shard of its own.

    Returns:
        The number of shard files written (0 for an empty index).
    """
    if max_bytes is None:
        max_bytes = MAX_SHARD_MB * 1024 * 1024

    if shard_dir.exists():
        shutil.rmtree(shard_dir)
    shard_dir.mkdir(parents=True)

    shard_num = 0
    shard_items = []
    shard_size = 0

    for key, val in index.items():
        # Size of the entry serialized on its own; a cheap upper bound for
        # what it contributes to the shard file.
        entry_size = len(json.dumps({key: val}, ensure_ascii=False).encode('utf-8'))

        # Flush before adding an entry that would overflow the current shard.
        if shard_items and shard_size + entry_size > max_bytes:
            _write_shard(shard_dir, shard_num, shard_items)
            shard_num += 1
            shard_items = []
            shard_size = 0

        shard_items.append((key, val))
        shard_size += entry_size

    # Last (possibly partial) shard
    if shard_items:
        _write_shard(shard_dir, shard_num, shard_items)
        shard_num += 1

    return shard_num
77
+
78
+
79
def main():
    """Build the sharded title index from every ``.bib`` file in ``data/raw``.

    Exits with status 1 when the raw directory is missing or holds no .bib
    files. Files that fail to parse are reported and skipped. When several
    entries share a normalized title, the first one encountered wins.
    """
    data_dir = Path(__file__).resolve().parent.parent / "data"
    raw_dir = data_dir / "raw"
    shard_dir = data_dir / "index_shards"

    if not raw_dir.exists():
        print(f"Error: {raw_dir} not found. Run: python scripts/update_db.py first")
        sys.exit(1)

    bib_files = sorted(raw_dir.glob("*.bib"))
    if not bib_files:
        print(f"No .bib files found in {raw_dir}")
        sys.exit(1)

    print(f"📦 Building index from {len(bib_files)} bib files...")

    # Plain string fields copied verbatim from each bib entry (missing -> "").
    copied_fields = ("author", "year", "booktitle", "journal",
                     "doi", "url", "pages", "volume")

    index = {}
    skipped_files = 0

    for bib_file in bib_files:
        try:
            parser = BibTexParser(common_strings=True)
            parser.customization = convert_to_unicode
            with open(bib_file, encoding="utf-8", errors="replace") as f:
                db = bibtexparser.load(f, parser=parser)
        except Exception as e:
            print(f" ⚠ Skip {bib_file.name}: {e}")
            skipped_files += 1
            continue

        for entry in db.entries:
            title = entry.get("title", "")
            if not title:
                continue

            key = normalize_title(title)
            if not key or key in index:
                # Untitled after normalization, or an earlier file already
                # supplied this title.
                continue

            record = {"title": title.rstrip('.')}
            for field_name in copied_fields:
                record[field_name] = entry.get(field_name, "")
            record["_type"] = entry.get("ENTRYTYPE", "inproceedings")
            record["_source"] = bib_file.stem
            index[key] = record

    print(f"\n📂 Writing sharded index...")
    n_shards = write_shards(index, shard_dir)

    total_bytes = sum(f.stat().st_size for f in shard_dir.glob("*.json"))
    total_mb = total_bytes / 1024 / 1024
    print(f"\n✅ Index: {len(index):,} unique entries → {n_shards} shards ({total_mb:.1f} MB total)")
    print(f" Saved to: {shard_dir}/")
    if skipped_files:
        print(f" ⚠ {skipped_files} file(s) skipped due to parse errors")
140
+
141
+
142
+ if __name__ == "__main__":
143
+ main()
scripts/refresh_db.sh ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Refresh the local DBLP conference database.
# Run this before paper submission to ensure the DB is up to date.
#
# -e: stop at the first failing step (a failed download must not be followed
#     by an index rebuild); -u: unset variables are errors;
# pipefail: a pipeline fails when any stage fails.
set -euo pipefail

# Resolve the directory of this script even when it is invoked via a
# relative path or sourced.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

echo "🔄 Refreshing conference database..."

# 1. Download new bib files (only those not yet downloaded)
python "$SCRIPT_DIR/update_db.py"

# 2. Rebuild the index
python "$SCRIPT_DIR/build_index.py"

echo ""
echo "✅ DB refreshed."
echo " Run: python main.py --bib your_paper.bib"
scripts/update_db.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Download conference/journal proceedings from DBLP as BibTeX files.
4
+
5
+ Uses the DBLP venue-based search API which is more reliable than
6
+ the TOC-based .bht queries (which often return 404 or single entries).
7
+
8
+ API format:
9
+ https://dblp.org/search/publ/api
10
+ ?q=venue:{VenueName}: year:{year}:
11
+ &h=1000 # max results per batch
12
+ &f={offset} # pagination offset
13
+ &format=bib1 # BibTeX format
14
+
15
+ Usage:
16
+ python scripts/update_db.py
17
+ """
18
+ import requests
19
+ import time
20
+ import sys
21
+ from pathlib import Path
22
+
23
+ DBLP_API = "https://dblp.org/search/publ/api"
24
+
25
+ # (dblp_venue_name, output_prefix, years)
26
+ # dblp_venue_name: exact venue string used in DBLP's venue: filter
27
+ # output_prefix: filename prefix for saved .bib files
28
+ CONFERENCES = [
29
+ # ── Speech & Audio ──────────────────────────────────────────
30
+ ("INTERSPEECH", "interspeech", range(2018, 2027)),
31
+ ("ICASSP", "icassp", range(2018, 2027)),
32
+ ("ASRU", "asru", [2019, 2021, 2023, 2025]),
33
+ ("SLT", "slt", [2018, 2021, 2022, 2024]),
34
+
35
+ # ── ML / AI ─────────────────────────────────────────────────
36
+ ("ICML", "icml", range(2018, 2027)),
37
+ ("NeurIPS", "neurips", range(2017, 2027)),
38
+ ("ICLR", "iclr", range(2018, 2027)),
39
+ ("AAAI", "aaai", range(2018, 2027)),
40
+ ("IJCAI", "ijcai", range(2018, 2027)),
41
+ ("CVPR", "cvpr", range(2018, 2027)),
42
+ ("ECCV", "eccv", [2018, 2020, 2022, 2024]),
43
+ ("ICCV", "iccv", [2019, 2021, 2023, 2025]),
44
+
45
+ # ── NLP ─────────────────────────────────────────────────────
46
+ ("ACL", "acl", range(2018, 2027)), # includes Findings
47
+ ("EMNLP", "emnlp", range(2018, 2027)), # includes Findings
48
+ ("NAACL", "naacl", range(2018, 2027)),
49
+ ("EACL", "eacl", range(2018, 2027)),
50
+ ("LREC/COLING", "coling", [2024, 2025]),
51
+ # Older COLING uses different venue
52
+ # ("COLING", "coling", [2018, 2020, 2022]),
53
+
54
+ # ── IR / Web / Data ─────────────────────────────────────────
55
+ ("SIGIR", "sigir", range(2018, 2027)),
56
+ ("KDD", "kdd", range(2018, 2027)),
57
+ ("WWW", "www", range(2018, 2027)),
58
+ ("WSDM", "wsdm", range(2018, 2027)),
59
+ ]
60
+
61
+ # Journals use venue search too
62
+ JOURNALS = [
63
+ ("IEEE ACM Trans Audio Speech Lang Process", "taslp", range(2018, 2027)),
64
+ ("Trans. Assoc. Comput. Linguistics", "tacl", range(2018, 2027)),
65
+ ]
66
+
67
+
68
def download_venue(venue_name: str, prefix: str, year: int, out_dir: Path):
    """Download one venue/year from DBLP and save it as ``{prefix}{year}.bib``.

    Results are fetched in pages of 1000 entries via the venue search API.
    Nothing is written when the download fails part-way, because an existing
    output file is treated as "already downloaded" on later runs and a
    truncated file would therefore never be repaired.

    Args:
        venue_name: venue string used in DBLP's ``venue:`` filter.
        prefix: filename prefix of the output file.
        year: publication year to query.
        out_dir: directory that receives the .bib file.
    """
    out_file = out_dir / f"{prefix}{year}.bib"
    if out_file.exists():
        return  # Skip if already downloaded

    query = f"venue:{venue_name}: year:{year}:"
    page_size = 1000
    all_bib = []
    offset = 0

    while True:
        try:
            r = requests.get(DBLP_API, params={
                "q": query, "h": page_size, "f": offset,
                "format": "bib1",
            }, timeout=30, headers={"User-Agent": "BibGuard/1.0"})
        except requests.RequestException as e:
            print(f" ✗ {prefix}{year}: network error ({e})")
            return

        # Rate limiting / server failures are transient: abort without
        # writing so the next run retries instead of keeping a partial file.
        if r.status_code == 429 or r.status_code >= 500:
            print(f" ✗ {prefix}{year}: HTTP {r.status_code}, will retry on next run")
            return

        text = r.text.strip()

        # Empty body, an HTML error page, or no BibTeX entry at all means
        # there is nothing (more) to fetch.
        if not text or "<!DOCTYPE" in text[:100] or "@" not in text:
            break

        all_bib.append(text)
        # A short page is the last page. Counting "@" may over-count, which
        # only costs one extra (empty) request.
        if text.count("@") < page_size:
            break
        offset += page_size
        time.sleep(1)

    if all_bib:
        total = sum(b.count("@") for b in all_bib)
        out_file.write_text("\n\n".join(all_bib), encoding="utf-8")
        print(f" ✓ {prefix}{year}: {total} entries")
    else:
        print(f" ✗ {prefix}{year}: not on DBLP yet")
106
+
107
+
108
def main():
    """Download every configured conference and journal year into ``data/raw``.

    Venue/years whose .bib file already exists are skipped without the
    politeness delay, so a refresh only waits for real downloads.
    """
    out = Path(__file__).resolve().parent.parent / "data" / "raw"
    out.mkdir(parents=True, exist_ok=True)

    batches = (
        ("📥 Downloading conference proceedings from DBLP...", CONFERENCES),
        ("\n📥 Downloading journal volumes from DBLP...", JOURNALS),
    )
    for banner, venues in batches:
        print(banner)
        for venue, prefix, years in venues:
            for y in years:
                if (out / f"{prefix}{y}.bib").exists():
                    continue  # cached; no request will be made
                download_venue(venue, prefix, y, out)
                time.sleep(0.5)  # be polite to the DBLP API between venues

    print(f"\n✅ Done. Run: python scripts/build_index.py")
128
+
129
+
130
+ if __name__ == "__main__":
131
+ main()
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Bibliography Checker Package"""
src/comparator.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Metadata comparison between bib entries and fetched metadata.
3
+ """
4
+ from datetime import datetime
5
+ from dataclasses import dataclass
6
+ from typing import Optional, List, Union, Any, Tuple
7
+
8
+ from .parser import BibEntry
9
+ from .utils import TextNormalizer
10
+
11
CURRENT_YEAR = datetime.now().year

# Year source priority: lower number = more trustworthy
YEAR_SOURCE_PRIORITY = {
    "crossref": 0,           # DOI-verified, most accurate
    "dblp": 1,               # Conference proceedings
    "openalex": 2,
    "semantic_scholar": 3,
    "arxiv_journal_ref": 4,  # arXiv's journal_ref field
    "scholar": 5,
    "arxiv": 99,             # arXiv submission date — last resort
}


def _is_plain_year(text: str) -> bool:
    """Return True when *text* consists only of ASCII digits.

    ``str.isdigit`` alone also accepts characters such as superscript digits
    that ``int()`` rejects, so the ASCII check keeps ``int(text)`` safe.
    """
    return text.isascii() and text.isdigit()


def resolve_year(candidates: list, bib_year: str = "") -> Tuple[Optional[str], Optional[str]]:
    """
    Pick the best year across all candidate results using source priority.
    Conference/journal year always beats arXiv submission year.
    Never returns a future year.

    Args:
        candidates: list of ComparisonResult objects; falsy items and items
            without ``fetched_data`` are ignored.
        bib_year: accepted for call-site compatibility; currently not used
            in the decision.
    Returns:
        (best_year, best_source), or (None, None) when no candidate offers a
        usable year that is not in the future.
    """
    pool = []
    for cand in candidates:
        if not cand or not cand.fetched_data:
            continue
        source = cand.source
        fetched_year = str(getattr(cand.fetched_data, 'year', '') or '').strip()

        if not _is_plain_year(fetched_year):
            continue

        # An arXiv record may additionally carry a conference year parsed
        # from its journal_ref; it enters the pool with its own priority.
        conf_year = str(getattr(cand.fetched_data, 'conference_year', '') or '').strip()
        if source == "arxiv" and _is_plain_year(conf_year):
            pool.append((YEAR_SOURCE_PRIORITY.get("arxiv_journal_ref", 4), conf_year, "arxiv_journal_ref"))

        # Unknown sources rank between the curated ones and raw arXiv.
        priority = YEAR_SOURCE_PRIORITY.get(source, 50)
        pool.append((priority, fetched_year, source))

    if not pool:
        return None, None

    pool.sort()

    # Pick best year that isn't in the future
    for _, year, source in pool:
        if int(year) <= CURRENT_YEAR:
            return year, source

    # All years are future — return None
    return None, None
67
+
68
+
69
@dataclass
class ComparisonResult:
    """Outcome of comparing one bib entry against one fetched record."""
    entry_key: str

    # --- title ---
    title_match: bool
    title_similarity: float
    bib_title: str
    fetched_title: str

    # --- authors ---
    author_match: bool
    author_similarity: float
    bib_authors: list[str]
    fetched_authors: list[str]

    # --- year ---
    year_match: bool
    bib_year: str
    fetched_year: str

    # --- verdict ---
    is_match: bool
    confidence: float
    issues: list[str]
    source: str

    # The fetched record itself, kept so fixes can be applied from it.
    fetched_data: Any = None

    # Set when a first-name initial disagrees between bib and fetched authors.
    author_initial_conflict: bool = False

    @property
    def has_issues(self) -> bool:
        """True when at least one issue was recorded."""
        return len(self.issues) != 0
106
+
107
@dataclass
class EntryReport:
    """Everything collected for one bib entry."""
    entry: BibEntry
    comparison: Optional[ComparisonResult]
    # None is replaced by a fresh list per instance in __post_init__.
    evaluations: list = None

    def __post_init__(self):
        self.evaluations = [] if self.evaluations is None else self.evaluations
117
+
118
+
119
+
120
class MetadataComparator:
    """Compares bibliography entries with fetched metadata."""

    # Minimum similarity for a title / an author list to count as a match.
    TITLE_THRESHOLD = 0.8
    AUTHOR_THRESHOLD = 0.6

    def __init__(self):
        self.normalizer = TextNormalizer

    def compare(self, bib_entry: BibEntry, fetched_data: Any, source_name: str) -> ComparisonResult:
        """
        Generic comparison method for any data source.

        Args:
            bib_entry: the bibliography entry being checked.
            fetched_data: record from a data source; must expose ``title``
                and may expose ``authors`` (list or comma-separated string)
                and ``year``.
            source_name: label of the data source, stored in the result and
                used in issue messages.
        Returns:
            A ComparisonResult with per-field similarities, the list of
            issues found and a weighted confidence (title 0.5, authors 0.3,
            year 0.2).
        """
        issues = []

        # --- Title Comparison ---
        bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title)
        fetched_title_norm = self.normalizer.normalize_for_comparison(fetched_data.title)

        title_similarity = self.normalizer.similarity_ratio(bib_title_norm, fetched_title_norm)
        # Levenshtein is only consulted for shorter titles; the better of the
        # two scores is kept.
        if len(bib_title_norm) < 100:
            lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, fetched_title_norm)
            title_similarity = max(title_similarity, lev_sim)

        title_match = title_similarity >= self.TITLE_THRESHOLD
        if not title_match:
            issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")

        # --- Author Comparison ---
        bib_authors = self.normalizer.normalize_author_list(bib_entry.author)

        # Check for DBLP disambiguation IDs in bib entry author names
        raw_author_list = self.normalizer.parse_author_list(bib_entry.author)
        for raw_auth in raw_author_list:
            if self.normalizer.has_dblp_disambiguation_id(raw_auth.strip()):
                issues.append(f"DBLP disambiguation ID in author: '{raw_auth.strip()}'")

        # Handle different author formats (list vs string); a missing or
        # None value counts as "no authors".
        fetched_authors_raw = getattr(fetched_data, 'authors', []) or []
        if isinstance(fetched_authors_raw, str):
            # Scholar style: "Author1, Author2"; empty fragments are dropped
            fetched_authors_raw = [
                a.strip() for a in fetched_authors_raw.split(',') if a.strip()
            ]

        fetched_authors = [
            self.normalizer.normalize_author_name(str(a))
            for a in fetched_authors_raw
        ]

        author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
        author_match = author_similarity >= self.AUTHOR_THRESHOLD

        if not author_match:
            issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")

        # --- Year Comparison ---
        bib_year = str(bib_entry.year).strip()
        fetched_year = str(getattr(fetched_data, 'year', '') or '').strip()
        # Some fetchers stringify a missing year as "None"; treat that as
        # absent rather than reporting a bogus mismatch.
        if fetched_year == "None":
            fetched_year = ""
        year_match = bib_year == fetched_year

        if not year_match and bib_year and fetched_year:
            issues.append(f"Year mismatch: bib={bib_year}, {source_name}={fetched_year}")

        # --- Overall Assessment ---
        is_match = title_match and author_match
        # Simple weighted confidence score; a year mismatch costs half of the
        # year weight rather than all of it.
        confidence = (
            title_similarity * 0.5 +
            author_similarity * 0.3 +
            (1.0 if year_match else 0.5) * 0.2
        )

        # --- Author Initial Conflict Detection ---
        author_initial_conflict = self._check_author_initial_conflict(
            bib_authors, fetched_authors,
            raw_author_list,
            fetched_authors_raw
        )
        if author_initial_conflict:
            issues.append("Author initial conflict detected (e.g., first-name initials differ)")
            # Cap confidence — don't auto-adopt these authors
            confidence = min(confidence, 0.7)

        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=title_match,
            title_similarity=title_similarity,
            bib_title=bib_entry.title,
            fetched_title=fetched_data.title,
            author_match=author_match,
            author_similarity=author_similarity,
            bib_authors=bib_authors,
            fetched_authors=fetched_authors,
            year_match=year_match,
            bib_year=bib_year,
            fetched_year=fetched_year,
            is_match=is_match,
            confidence=confidence,
            issues=issues,
            source=source_name,
            fetched_data=fetched_data,
            author_initial_conflict=author_initial_conflict
        )

    def create_unable_result(self, bib_entry: BibEntry, reason: str = "Unable to fetch metadata") -> ComparisonResult:
        """Create the zero-confidence result used when no metadata was fetched.

        *reason* becomes the single entry of ``issues``; ``source`` is
        ``"unable"``.
        """
        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=False, title_similarity=0.0,
            bib_title=bib_entry.title, fetched_title="",
            author_match=False, author_similarity=0.0,
            bib_authors=self.normalizer.normalize_author_list(bib_entry.author), fetched_authors=[],
            year_match=False, bib_year=bib_entry.year, fetched_year="",
            is_match=False, confidence=0.0,
            issues=[reason], source="unable",
            fetched_data=None
        )

    def _compare_author_lists(self, list1: list[str], list2: list[str]) -> float:
        """Return the mean best-match score of each *list1* author in *list2*.

        Two empty lists score 1.0; exactly one empty list scores 0.0. The
        measure is not symmetric: extra authors in *list2* are not penalized.
        """
        if not list1 and not list2:
            return 1.0
        if not list1 or not list2:
            return 0.0

        total_similarity = 0.0
        for author1 in list1:
            best_match = 0.0
            for author2 in list2:
                if self._names_match(author1, author2):
                    best_match = 1.0
                    break
                sim = self.normalizer.similarity_ratio(author1, author2)
                best_match = max(best_match, sim)
            total_similarity += best_match

        return total_similarity / len(list1)

    def _names_match(self, name1: str, name2: str) -> bool:
        """Check if two names match (handles abbreviated first names).

        The last word of each name is taken as the family name and must be
        equal. When both names have more than one word, the first words must
        be equal, or share their initial if either is a single letter.
        """
        def split_name(n):
            return n.lower().replace('.', '').split()

        words1 = split_name(name1)
        words2 = split_name(name2)
        if not words1 or not words2:
            return False

        # Last name must match (assuming last word is last name)
        if words1[-1] != words2[-1]:
            return False

        # First name check:
        if len(words1) > 1 and len(words2) > 1:
            f1 = words1[0]
            f2 = words2[0]

            # If one is just an initial
            if len(f1) == 1 or len(f2) == 1:
                if f1[0] != f2[0]:
                    return False
            else:
                # Both full names - must match
                if f1 != f2:
                    return False

        return True

    def _check_author_initial_conflict(
        self,
        bib_authors_norm: list[str],
        fetched_authors_norm: list[str],
        bib_authors_raw: list[str],
        fetched_authors_raw: list,
    ) -> bool:
        """
        Detect when first-name initials clearly conflict between
        bib entry and fetched data.

        e.g., "Y. Zhou" (bib) vs "Henry Zhou" (fetched) → True (Y ≠ H)
        This prevents blindly overwriting authors with wrong names.

        Authors are compared position by position using the normalized
        lists; the two ``*_raw`` arguments are accepted but not consulted.
        """
        # Compare by position — aligned authors
        min_len = min(len(bib_authors_norm), len(fetched_authors_norm))
        if min_len == 0:
            return False

        for i in range(min_len):
            bib_parts = bib_authors_norm[i].split()
            fetched_parts = fetched_authors_norm[i].split()

            # Need at least a first and a last name on both sides.
            if len(bib_parts) < 2 or len(fetched_parts) < 2:
                continue

            # Last name must match to consider this a potential conflict
            if bib_parts[-1] != fetched_parts[-1]:
                continue

            bib_first = bib_parts[0]
            fetched_first = fetched_parts[0]

            # Both have first name info (not empty)
            if not bib_first or not fetched_first:
                continue

            # If initials differ, it's a conflict
            if bib_first[0] != fetched_first[0]:
                return True

        return False
src/fetcher.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unified metadata fetchers for BibGuard.
3
+ """
4
+ import re
5
+ import time
6
+ import random
7
+ import requests
8
+ import xml.etree.ElementTree as ET
9
+ from dataclasses import dataclass
10
+ from typing import Optional, Any
11
+ from urllib.parse import quote
12
+ from bs4 import BeautifulSoup
13
+
14
+ @dataclass
15
+ class FetchResult:
16
+ """Unified fetch result."""
17
+ title: str = ""
18
+ authors: list[str] | str = ""
19
+ year: str = ""
20
+ doi: str = ""
21
+ url: str = ""
22
+ source: str = ""
23
+ conference_year: str = "" # Year from journal_ref / conference proceedings
24
+ year_source: str = "" # Where the year came from
25
+
26
+ def __post_init__(self):
27
+ if self.authors is None: self.authors = []
28
+ if isinstance(self.authors, str) and self.authors:
29
+ # Simple split if string provided
30
+ self.authors = [a.strip() for a in re.split(r',| and ', self.authors) if a.strip()]
31
+
32
class BaseFetcher:
    """Common helpers shared by the fetchers."""

    def _rate_limit(self, delay: float, last_time: float) -> float:
        """Sleep until *delay* seconds have passed since *last_time*.

        Returns the current time, which callers store as the new
        "last request" timestamp.
        """
        remaining = delay - (time.time() - last_time)
        if remaining > 0:
            time.sleep(remaining)
        return time.time()
39
+
40
class ArxivFetcher(BaseFetcher):
    """Fetches metadata from the arXiv Atom API."""
    API_BASE = "https://export.arxiv.org/api/query"

    # XML namespaces used by the arXiv Atom feed.
    _NS = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}

    def __init__(self):
        self._last_req = 0.0

    def fetch_by_id(self, arxiv_id: str) -> Optional[FetchResult]:
        """Return the record for *arxiv_id* (with or without an ``arXiv:`` prefix), or None."""
        self._last_req = self._rate_limit(3.0, self._last_req)
        # Strip whitespace first so a padded "arXiv:" prefix is still removed.
        clean_id = re.sub(r'^arXiv:', '', arxiv_id.strip(), flags=re.IGNORECASE).strip()
        try:
            resp = requests.get(self.API_BASE, params={'id_list': clean_id, 'max_results': 1}, timeout=30)
        except requests.RequestException:
            return None
        return self._parse(resp.text)

    def search_by_title(self, title: str) -> list[FetchResult]:
        """Return up to three records whose title matches *title* (possibly empty)."""
        self._last_req = self._rate_limit(3.0, self._last_req)
        clean = re.sub(r'[^\w\s]', ' ', title).strip()
        try:
            resp = requests.get(self.API_BASE, params={'search_query': f'ti:"{clean}"', 'max_results': 3}, timeout=30)
        except requests.RequestException:
            return []
        return self._parse(resp.text, multiple=True)

    @staticmethod
    def _text(elem) -> str:
        """Return the whitespace-collapsed text of *elem*, or "" when absent.

        Collapsing matters for titles, which the feed may wrap over several
        lines.
        """
        if elem is None or not elem.text:
            return ""
        return " ".join(elem.text.split())

    def _parse(self, xml: str, multiple=False) -> Optional[FetchResult] | list[FetchResult]:
        """Parse an Atom feed into FetchResult objects.

        Returns a list when *multiple* is true, otherwise the first result or
        None. Unparseable XML yields the empty value; entries without a title
        and arXiv error entries are skipped individually.
        """
        empty = [] if multiple else None
        try:
            root = ET.fromstring(xml)
        except (ET.ParseError, ValueError, TypeError):
            return empty

        ns = self._NS
        results = []
        for entry in root.findall('atom:entry', ns):
            id_txt = self._text(entry.find('atom:id', ns))
            title = self._text(entry.find('atom:title', ns))
            # The API reports errors as a feed entry whose id points at
            # .../api/errors; that is not a paper.
            if not title or '/api/errors' in id_txt:
                continue

            authors = [
                name for name in
                (self._text(a.find('atom:name', ns)) for a in entry.findall('atom:author', ns))
                if name
            ]
            pub = self._text(entry.find('atom:published', ns))
            year = pub[:4]
            doi = self._text(entry.find('arxiv:doi', ns))

            # Extract conference year from journal_ref if available
            conference_year = ""
            jr_text = self._text(entry.find('arxiv:journal_ref', ns))
            if jr_text:
                jr_year = re.search(r'\b(19|20)\d{2}\b', jr_text)
                if jr_year:
                    conference_year = jr_year.group(0)

            results.append(FetchResult(
                title=title,
                authors=authors,
                year=year,
                doi=doi,
                url=id_txt,
                source="arxiv",
                conference_year=conference_year,
                year_source="arxiv_journal_ref" if conference_year else "arxiv_submission",
            ))

        if multiple:
            return results
        return results[0] if results else None
103
+
104
class CrossRefFetcher(BaseFetcher):
    """Fetches from CrossRef API."""
    API_BASE = "https://api.crossref.org/works"

    # Date fields consulted for the year, in order. Print publication stays
    # first; the others cover records that have no print date.
    _DATE_FIELDS = ('published-print', 'published', 'published-online', 'issued')

    # Failures that mean "no usable answer" rather than a programming error.
    _SOFT_ERRORS = (requests.RequestException, ValueError, KeyError, TypeError)

    def __init__(self, email=None):
        self._last_req = 0.0
        self.headers = {'User-Agent': f'BibGuard/1.0 (mailto:{email or "user@example.com"})'}

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top bibliographic match for *title*, or None."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(self.API_BASE, params={'query.bibliographic': title, 'rows': 1}, headers=self.headers, timeout=10)
            items = resp.json()['message']['items']
            if items:
                return self._parse(items[0])
        except self._SOFT_ERRORS:
            pass
        return None

    def search_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the record registered for *doi*, or None."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(f"{self.API_BASE}/{quote(doi)}", headers=self.headers, timeout=10)
            return self._parse(resp.json()['message'])
        except self._SOFT_ERRORS:
            return None

    # Same lookup under the name the other fetchers use.
    fetch_by_doi = search_by_doi

    def _parse(self, item: dict) -> FetchResult:
        """Convert one CrossRef work item into a FetchResult."""
        titles = item.get('title') or ['']
        title = titles[0] or ''

        authors = []
        for a in item.get('author') or []:
            # Organisational authors carry 'name' instead of given/family.
            name = f"{a.get('given', '')} {a.get('family', '')}".strip() or a.get('name', '')
            if name:
                authors.append(name)

        year = ""
        for field_name in self._DATE_FIELDS:
            parts = (item.get(field_name) or {}).get('date-parts') or [[None]]
            if parts[0] and parts[0][0]:
                year = str(parts[0][0])
                break

        return FetchResult(title, authors, year, item.get('DOI', ''), item.get('URL', ''), "crossref")
133
+
134
class DBLPFetcher(BaseFetcher):
    """Fetches from DBLP."""
    API_BASE = "https://dblp.org/search/publ/api"

    # DBLP disambiguation ID: 4-digit suffix appended to author names
    # e.g. "Tian Tan 0019", "Wei Li 0119"
    _DISAMBIG_RE = re.compile(r'\s+\d{4}\s*$')

    def __init__(self):
        self._last_req = 0.0

    @staticmethod
    def _strip_disambig(name: str) -> str:
        """Return *name* without a trailing DBLP disambiguation number."""
        cleaned = DBLPFetcher._DISAMBIG_RE.sub('', name)
        return cleaned.strip()

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return DBLP's top hit for *title*, or None when there is none or the lookup fails."""
        self._last_req = self._rate_limit(1.0, self._last_req)
        try:
            query = {'q': title, 'format': 'json', 'h': 1}
            resp = requests.get(self.API_BASE, params=query, timeout=10)
            payload = resp.json()
            hits = payload.get('result', {}).get('hits', {}).get('hit', [])
            if not hits:
                return None

            info = hits[0]['info']
            authors = info.get('authors', {}).get('author', [])
            # A single author arrives as one object, several as a list.
            if isinstance(authors, dict):
                authors = [self._strip_disambig(authors.get('text', ''))]
            elif isinstance(authors, list):
                authors = [self._strip_disambig(a.get('text', '')) for a in authors]

            clean_title = info.get('title', '').rstrip('.')
            return FetchResult(
                clean_title,
                authors,
                info.get('year', ''),
                info.get('doi', ''),
                info.get('url', ''),
                "dblp",
            )
        except Exception:
            return None
163
+
164
class SemanticScholarFetcher(BaseFetcher):
    """Fetches from Semantic Scholar."""
    API_BASE = "https://api.semanticscholar.org/graph/v1/paper"

    # The Graph API exposes the DOI inside `externalIds`; there is no
    # top-level `doi` field to request.
    _FIELDS = 'title,authors,year,externalIds,url'

    def __init__(self):
        self._last_req = 0.0

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top search hit for *title*, or None."""
        return self._fetch(f"{self.API_BASE}/search", {'query': title, 'limit': 1, 'fields': self._FIELDS})

    def fetch_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the paper registered under *doi*, or None."""
        return self._fetch(f"{self.API_BASE}/DOI:{doi}", {'fields': self._FIELDS})

    def _fetch(self, url, params) -> Optional[FetchResult]:
        """Issue one rate-limited request and convert the answer.

        Returns None for transport errors, non-200 responses, error payloads,
        empty search results and records without a title.
        """
        self._last_req = self._rate_limit(2.0, self._last_req)
        try:
            resp = requests.get(url, params=params, timeout=10)
            if resp.status_code != 200:
                return None
            data = resp.json()
        except (requests.RequestException, ValueError):
            return None

        if not isinstance(data, dict) or 'error' in data:
            return None

        # Search responses wrap the hits in 'data'; an empty list is "not
        # found", not a paper with empty fields.
        if 'data' in data:
            hits = data['data']
            if not hits:
                return None
            data = hits[0]

        title = data.get('title') or ''
        if not title:
            return None

        authors = [a['name'] for a in (data.get('authors') or []) if a.get('name')]
        year = data.get('year')
        doi = (data.get('externalIds') or {}).get('DOI') or data.get('doi') or ''
        return FetchResult(title, authors, str(year) if year else "", doi, data.get('url') or '', "semantic_scholar")
188
+
189
class OpenAlexFetcher(BaseFetcher):
    """Fetches from OpenAlex."""
    API_BASE = "https://api.openalex.org/works"

    def __init__(self):
        self._last_req = 0.0

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top search hit for *title*, or None."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(self.API_BASE, params={'search': title, 'per-page': 1}, timeout=10)
            if resp.status_code != 200:
                return None
            results = resp.json().get('results') or []
        except (requests.RequestException, ValueError, AttributeError):
            return None
        return self._parse(results[0]) if results else None

    def fetch_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the work registered under *doi*, or None."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(f"{self.API_BASE}/https://doi.org/{doi}", timeout=10)
            # An error payload must not be parsed as if it were a work.
            if resp.status_code != 200:
                return None
            data = resp.json()
        except (requests.RequestException, ValueError):
            return None
        return self._parse(data) if isinstance(data, dict) else None

    def _parse(self, data: dict) -> FetchResult:
        """Convert one OpenAlex work object into a FetchResult.

        Fields the API returns as null (doi, title, year, author) are mapped
        to empty values instead of raising.
        """
        authors = []
        for authorship in data.get('authorships') or []:
            name = (authorship.get('author') or {}).get('display_name')
            if name:
                authors.append(name)

        doi = (data.get('doi') or '').replace('https://doi.org/', '')
        year = data.get('publication_year')
        return FetchResult(
            data.get('title') or '',
            authors,
            str(year) if year else '',
            doi,
            data.get('id') or '',
            "openalex",
        )
216
+
217
class ScholarFetcher(BaseFetcher):
    """Google Scholar Scraper (Fallback)."""
    SEARCH_URL = "https://scholar.google.com/scholar"

    def __init__(self):
        self._last_req = 0.0
        self._session = requests.Session()
        # Set once Scholar signals throttling; later calls return None
        # without sending a request.
        self._blocked = False

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the first Scholar hit for the exact *title*, or None."""
        if self._blocked:
            return None
        self._last_req = self._rate_limit(5.0 + random.random() * 3, self._last_req)  # Polite delay
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
            resp = self._session.get(self.SEARCH_URL, params={'q': f'"{title}"', 'hl': 'en', 'num': 1}, headers=headers, timeout=30)
            if resp.status_code == 429 or 'unusual traffic' in resp.text:
                self._blocked = True
                return None
            return self._parse(resp.text)
        except Exception:
            # Deliberately broad: scraping is best-effort and parser errors
            # (including a missing lxml backend) must not abort the run.
            return None

    def _parse(self, html: str) -> Optional[FetchResult]:
        """Extract title, authors, year and URL from the first result block.

        Returns None when the page has no result block or the block has no
        title heading.
        """
        soup = BeautifulSoup(html, 'lxml')
        entry = soup.find('div', class_='gs_ri')
        if not entry:
            return None

        title_tag = entry.find('h3', class_='gs_rt')
        if title_tag is None:
            return None
        title = title_tag.get_text(strip=True).replace('[PDF]', '').replace('[HTML]', '').strip()
        link = title_tag.find('a')
        url = link.get('href', '') if link else ""

        meta_tag = entry.find('div', class_='gs_a')
        # Join on whitespace so non-breaking spaces become plain spaces and
        # the " - " separators survive around linked author names.
        meta = " ".join(meta_tag.get_text().split()) if meta_tag else ""

        # Attempt to extract year
        year_found = re.search(r'\b(19|20)\d{2}\b', meta)
        year = year_found.group(0) if year_found else ""
        # Attempt to extract authors (before " - "); drop the ellipsis Scholar
        # appends to truncated author lists.
        authors = meta.split(' - ')[0].replace('…', '').strip()

        return FetchResult(title, authors, year, "", url, "scholar")
src/local_db.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Local Conference Database: fast, offline title lookup against DBLP index.
3
+
4
+ This module provides a local database of conference/journal proceedings
5
+ downloaded from DBLP. It serves as a "ground truth" source that eliminates
6
+ the need for network API calls for entries that match known publications.
7
+ """
8
+ import json
9
+ import re
10
+ from pathlib import Path
11
+ from typing import Optional
12
+ from dataclasses import dataclass
13
+
14
+
15
+ def _normalize(title: str) -> str:
16
+ """Normalize a title for index lookup (must match build_index.py)."""
17
+ title = re.sub(r'\{([^}]*)\}', r'\1', title)
18
+ title = re.sub(r'[^\w\s]', ' ', title.lower())
19
+ return re.sub(r'\s+', ' ', title).strip()
20
+
21
+
22
@dataclass
class LocalMatch:
    """One record returned by a successful local index lookup.

    All fields are plain values copied from the index entry; a field that the
    index entry does not carry is filled with a default by the lookup code.
    """
    # Core bibliographic identity
    title: str
    author: str
    year: str
    # Venue: conference proceedings name and/or journal name
    booktitle: str
    journal: str
    # Locators
    doi: str
    url: str
    pages: str
    volume: str
    # BibTeX entry type, read from the index entry's "_type" key
    entry_type: str
    # Provenance, read from the index entry's "_source" key
    source_file: str
36
+
37
+
38
class LocalConferenceDB:
    """Title-based lookup against locally cached DBLP proceedings.

    The index maps a normalized title (see ``_normalize``) to a dict of
    bibliographic fields. It is read from a directory of ``index_*.json``
    shards, or from a legacy single file ``conference_index.json`` when no
    shard is present.
    """

    def __init__(self, index_dir: Optional[str] = None):
        """Remember where the index lives; nothing is read until ``load()``.

        Args:
            index_dir: Directory holding the ``index_*.json`` shards. When
                omitted, ``<repo>/data/index_shards`` relative to this module
                is used. The legacy file is looked for next to that directory.
        """
        if index_dir is None:
            base = Path(__file__).resolve().parent.parent / "data"
            self._shard_dir = base / "index_shards"
            self._legacy_path = base / "conference_index.json"
        else:
            self._shard_dir = Path(index_dir)
            self._legacy_path = Path(index_dir).parent / "conference_index.json"
        self._idx: dict = {}
        self._loaded = False

    @staticmethod
    def _read_index(path: Path) -> dict:
        """Read one JSON index file and make sure it holds a JSON object."""
        data = json.loads(path.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            raise ValueError(f"{path.name}: expected a JSON object, got {type(data).__name__}")
        return data

    def load(self) -> bool:
        """Load index from shards or legacy single file. Returns True if successful.

        The index is assembled in a temporary dict and swapped in only after
        every file was read, so a failure never leaves a half-populated index
        behind and a reload never keeps keys from a previous load.
        """
        try:
            # Try sharded index first
            shard_files = []
            if self._shard_dir.exists():
                shard_files = sorted(self._shard_dir.glob("index_*.json"))
            if shard_files:
                merged: dict = {}
                for shard_path in shard_files:
                    merged.update(self._read_index(shard_path))
                self._idx = merged
                self._loaded = True
                print(f" 📚 Local DB: {len(self._idx):,} entries loaded ({len(shard_files)} shards).")
                return True

            # Fallback: legacy single file
            if self._legacy_path.exists():
                self._idx = self._read_index(self._legacy_path)
                self._loaded = True
                print(f" 📚 Local DB: {len(self._idx):,} entries loaded.")
                return True

            print(" ⚠ Local DB not found. Run: python scripts/update_db.py && python scripts/build_index.py")
            return False
        except Exception as e:
            # Best-effort: the local DB is an optional accelerator, so report
            # the problem and let the caller continue without it.
            print(f" ⚠ Failed to load local DB: {e}")
            return False

    @property
    def is_loaded(self) -> bool:
        """True when a load succeeded and the index holds at least one entry."""
        return self._loaded and len(self._idx) > 0

    def lookup(self, title: str) -> Optional["LocalMatch"]:
        """
        Look up an entry by title.
        Returns LocalMatch if found, None otherwise (also when the index is
        not loaded or *title* is empty).
        """
        if not self._loaded or not title:
            return None

        data = self._idx.get(_normalize(title))
        if not data:
            return None

        return LocalMatch(
            title=data.get("title", ""),
            author=data.get("author", ""),
            year=data.get("year", ""),
            booktitle=data.get("booktitle", ""),
            journal=data.get("journal", ""),
            doi=data.get("doi", ""),
            url=data.get("url", ""),
            pages=data.get("pages", ""),
            volume=data.get("volume", ""),
            entry_type=data.get("_type", "inproceedings"),
            source_file=data.get("_source", ""),
        )
src/normalizer.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Booktitle normalizer: maps verbose venue names to standard abbreviations.
3
+
4
+ Loads rules from data/abbr.tsv (regex → abbreviation).
5
+ """
6
+ import re
7
+ import csv
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+
12
class BooktitleNormalizer:
    """Maps booktitle/journal names onto standard abbreviations.

    Rules are (compiled regex, abbreviation) pairs read from a TSV file; the
    first rule whose regex is found in the input wins.
    """

    def __init__(self, tsv_path: str = None):
        # Default rule file: <repo>/data/abbr.tsv, resolved relative to this module.
        if tsv_path is None:
            repo_root = Path(__file__).resolve().parent.parent
            tsv_path = str(repo_root / "data" / "abbr.tsv")
        self.rules: list[tuple[re.Pattern, str]] = []
        self._load_rules(tsv_path)

    def _load_rules(self, tsv_path: str):
        """Append every usable ``regex<TAB>abbreviation`` row of the file to ``self.rules``.

        A missing file is silently ignored. Rows with fewer than two columns,
        an empty pattern, a pattern starting with ``#``, or an invalid regex
        are skipped.
        """
        source = Path(tsv_path)
        if not source.exists():
            return

        with open(source, 'r', encoding='utf-8') as handle:
            for columns in csv.reader(handle, delimiter='\t'):
                if len(columns) < 2:
                    continue
                regex_text = columns[0].strip()
                abbreviation = columns[1].strip()
                # Comment rows and blank patterns carry no rule.
                if regex_text.startswith('#') or not regex_text:
                    continue
                try:
                    compiled = re.compile(regex_text, re.IGNORECASE)
                except re.error:
                    continue  # invalid regex: skip the row
                self.rules.append((compiled, abbreviation))

    def normalize(self, booktitle: str) -> Optional[str]:
        """
        Return the abbreviation of the first rule matching *booktitle*.
        Returns None for empty input or when no rule matches.
        """
        if not booktitle:
            return None
        matches = (abbr for regex, abbr in self.rules if regex.search(booktitle))
        return next(matches, None)
+ return None
src/parser.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BibTeX file parser.
3
+ """
4
+ import re
5
+ from dataclasses import dataclass, field
6
+ from typing import Optional
7
+ from pathlib import Path
8
+
9
+ import bibtexparser
10
+ from bibtexparser.bparser import BibTexParser
11
+ from bibtexparser.customization import convert_to_unicode
12
+
13
+
14
@dataclass
class BibEntry:
    """A single bibliography entry as produced by the parser.

    Only ``key`` and ``entry_type`` are required; every other field defaults
    to an empty string. ``raw_entry`` holds the dict the entry was built from.
    """
    # Citation key and BibTeX entry type (e.g. "article")
    key: str
    entry_type: str
    # Descriptive metadata
    title: str = ""
    author: str = ""
    year: str = ""
    abstract: str = ""
    # Identifiers / locators
    url: str = ""
    doi: str = ""
    arxiv_id: str = ""
    # Venue and publication details
    journal: str = ""
    booktitle: str = ""
    publisher: str = ""
    pages: str = ""
    volume: str = ""
    number: str = ""
    # Untouched source dict, kept so unknown fields survive a round trip
    raw_entry: dict = field(default_factory=dict)

    @property
    def has_arxiv(self) -> bool:
        """Whether an arXiv ID was found for this entry."""
        return True if self.arxiv_id else False

    @property
    def search_query(self) -> str:
        """Text to search for: the title, or the citation key when there is no title."""
        if self.title:
            return self.title
        return self.key
43
+
44
+
45
class BibParser:
    """Parser for .bib files.

    Besides parsing, the class can write a filtered copy of a .bib file that
    keeps the original text of the surviving entries, and can serialise a list
    of (possibly modified) entries back to disk.
    """

    # Patterns for extracting arXiv IDs (tried in order, first match wins)
    ARXIV_PATTERNS = [
        # New format: 2301.00001 or 2301.00001v1. The digit guards keep the
        # pattern from matching in the middle of a longer number such as a
        # DOI suffix ("3292500.3330701").
        r'(?<!\d)(\d{4}\.\d{4,5}(?:v\d+)?)(?!\d)',
        # Old format: hep-th/9901001 or math.GT/0309136
        r'([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)',
        # arXiv: prefix
        r'arXiv:(\d{4}\.\d{4,5}(?:v\d+)?)',
        r'arXiv:([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)',
    ]

    # URL patterns for arXiv
    ARXIV_URL_PATTERNS = [
        r'arxiv\.org/abs/(\d{4}\.\d{4,5}(?:v\d+)?)',
        r'arxiv\.org/abs/([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)',
        r'arxiv\.org/pdf/(\d{4}\.\d{4,5}(?:v\d+)?)(?:\.pdf)?',
        r'arxiv\.org/pdf/([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)(?:\.pdf)?',
    ]

    def __init__(self):
        self.entries: list["BibEntry"] = []

    def parse_file(self, filepath: str) -> list["BibEntry"]:
        """Parse a .bib file and return list of entries.

        Raises:
            FileNotFoundError: if *filepath* does not exist.
            ValueError: if the content cannot be parsed.
        """
        path = Path(filepath)
        if not path.exists():
            raise FileNotFoundError(f"Bib file not found: {filepath}")

        # Undecodable bytes are replaced rather than aborting the whole parse.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()

        return self.parse_content(content)

    def parse_content(self, content: str) -> list["BibEntry"]:
        """Parse bib content string.

        The result is also stored on ``self.entries``.

        Raises:
            ValueError: if bibtexparser fails; the original error is chained.
        """
        parser = BibTexParser(common_strings=True)
        parser.customization = convert_to_unicode

        try:
            bib_database = bibtexparser.loads(content, parser=parser)
        except Exception as e:
            raise ValueError(f"Failed to parse bib content: {e}") from e

        self.entries = [self._convert_entry(entry) for entry in bib_database.entries]
        return self.entries

    def _convert_entry(self, entry: dict) -> "BibEntry":
        """Convert a bibtexparser entry to BibEntry."""
        bib_entry = BibEntry(
            key=entry.get('ID', ''),
            entry_type=entry.get('ENTRYTYPE', ''),
            title=entry.get('title', ''),
            author=entry.get('author', ''),
            year=entry.get('year', ''),
            abstract=entry.get('abstract', ''),
            url=entry.get('url', ''),
            doi=entry.get('doi', ''),
            journal=entry.get('journal', ''),
            booktitle=entry.get('booktitle', ''),
            publisher=entry.get('publisher', ''),
            pages=entry.get('pages', ''),
            volume=entry.get('volume', ''),
            number=entry.get('number', ''),
            raw_entry=entry.copy()
        )

        # Extract arXiv ID
        bib_entry.arxiv_id = self._extract_arxiv_id(entry)

        return bib_entry

    def _extract_arxiv_id(self, entry: dict) -> str:
        """Extract arXiv ID from entry.

        Sources are consulted in this order: ``eprint``, ``arxiv``, ``url``,
        ``journal`` (only when it mentions arXiv), ``note``. Returns "" when
        none of them yields an ID.
        """
        # Dedicated fields first
        for field_name in ('eprint', 'arxiv'):
            value = entry.get(field_name, '')
            if value:
                arxiv_id = self._parse_arxiv_id(value)
                if arxiv_id:
                    return arxiv_id

        # Check URL field
        url = entry.get('url', '')
        if url:
            for pattern in self.ARXIV_URL_PATTERNS:
                match = re.search(pattern, url, re.IGNORECASE)
                if match:
                    return match.group(1)

        # Check journal field for "arXiv preprint arXiv:XXXX.XXXXX" format
        journal = entry.get('journal', '')
        if journal and 'arxiv' in journal.lower():
            arxiv_id = self._parse_arxiv_id(journal)
            if arxiv_id:
                return arxiv_id

        # Check note field
        note = entry.get('note', '')
        if note:
            arxiv_id = self._parse_arxiv_id(note)
            if arxiv_id:
                return arxiv_id

        return ""

    def _parse_arxiv_id(self, text: str) -> str:
        """Parse arXiv ID from text; returns "" when no pattern matches."""
        for pattern in self.ARXIV_PATTERNS:
            match = re.search(pattern, text)
            if match:
                return match.group(1)
        return ""

    def get_entry_by_key(self, key: str) -> Optional["BibEntry"]:
        """Get entry by citation key (first match), or None."""
        return next((entry for entry in self.entries if entry.key == key), None)

    def filter_file(self, input_path: str, output_path: str, keys_to_keep: set[str]):
        """
        Create a new bib file containing only specified keys.
        Preserves original formatting, comments, and strings.

        Bytes that are not valid UTF-8 are carried through unchanged
        (``surrogateescape`` on both read and write) instead of raising, so a
        file that ``parse_file`` accepted can always be filtered.
        """
        with open(input_path, 'r', encoding='utf-8', errors='surrogateescape') as f:
            content = f.read()

        filtered_content = self._filter_content(content, keys_to_keep)

        with open(output_path, 'w', encoding='utf-8', errors='surrogateescape') as f:
            f.write(filtered_content)

    @staticmethod
    def _find_entry_end(content: str, brace_open: int) -> int:
        """Return the index just past the brace matching the one at *brace_open*.

        Only brace depth is tracked. Double quotes are deliberately ignored:
        a quote inside a braced value is literal text, and a quoted value has
        balanced braces of its own, so counting braces alone finds the end of
        the entry in both cases. If the entry is never closed, the end of the
        content is returned.
        """
        length = len(content)
        depth = 1
        j = brace_open + 1
        while j < length and depth > 0:
            char = content[j]
            # Handle escaped characters: skip the backslash and what follows
            if char == '\\':
                j += 2
                continue
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
            j += 1
        return min(j, length)

    def _filter_content(self, content: str, keys_to_keep: set[str]) -> str:
        """Filter content string keeping only specified keys.

        ``@comment`` blocks, ``@string`` and ``@preamble`` definitions, and
        all text between entries are kept. An ``@`` that is not followed by a
        bare entry-type word and ``{`` (for example an e-mail address in a
        comment line) is treated as ordinary text.
        """
        ranges_to_remove = []
        i = 0
        length = len(content)

        while i < length:
            if content[i] != '@':
                i += 1
                continue

            start = i
            # Find opening brace
            brace_open = content.find('{', i)
            if brace_open == -1:
                i += 1
                continue

            # Get entry type; anything that is not a single word means this
            # "@" does not start an entry.
            entry_type = content[i + 1:brace_open].strip().lower()
            if not re.fullmatch(r'[a-z]\w*', entry_type):
                i += 1
                continue

            # Skip comments
            if entry_type == 'comment':
                i = brace_open + 1
                continue

            end = self._find_entry_end(content, brace_open)

            # Extract key (between { and ,)
            # Only for standard entries, not @string or @preamble
            if entry_type not in ('string', 'preamble'):
                key, comma, _ = content[brace_open + 1:end].partition(',')
                # If key is NOT in keep list, mark for removal
                if comma and key.strip() not in keys_to_keep:
                    ranges_to_remove.append((start, end))

            i = end

        # Reconstruct content
        kept_parts = []
        last_pos = 0
        for start, end in ranges_to_remove:
            kept_parts.append(content[last_pos:start])

            # Clean up whitespace after removed entry (rest of its last line)
            last_pos = end
            while last_pos < length and content[last_pos] in ' \t\r':
                last_pos += 1
            if last_pos < length and content[last_pos] == '\n':
                last_pos += 1

        kept_parts.append(content[last_pos:])
        return "".join(kept_parts)

    def save_entries(self, filepath: str, entries: list["BibEntry"]):
        """Save entries to a .bib file.

        Each entry starts from its ``raw_entry`` (so custom fields survive)
        and is then overlaid with the non-empty attributes of the BibEntry.
        """
        db = bibtexparser.bibdatabase.BibDatabase()

        overlay_fields = (
            'title', 'author', 'year', 'journal', 'booktitle',
            'publisher', 'pages', 'volume', 'number',
        )

        db_entries = []
        for entry in entries:
            # Start with raw entry to preserve custom fields
            db_entry = entry.raw_entry.copy()

            # Update with potentially modified fields
            db_entry['ID'] = entry.key
            db_entry['ENTRYTYPE'] = entry.entry_type
            for name in overlay_fields:
                value = getattr(entry, name)
                if value:
                    db_entry[name] = value
            if entry.doi:
                db_entry['doi'] = entry.doi
            elif 'doi' in db_entry:
                # DOI was removed (e.g., by DOI mismatch sanitizer)
                del db_entry['doi']
            if entry.url:
                db_entry['url'] = entry.url

            # Handle entry type consistency:
            # inproceedings should use booktitle, not journal
            entry_type = entry.entry_type.lower()
            if entry_type == 'inproceedings':
                if not entry.journal and 'journal' in db_entry:
                    del db_entry['journal']
            # article should use journal, not booktitle
            elif entry_type == 'article':
                if not entry.booktitle and 'booktitle' in db_entry:
                    del db_entry['booktitle']

            db_entries.append(db_entry)

        db.entries = db_entries

        with open(filepath, 'w', encoding='utf-8') as f:
            bibtexparser.dump(db, f)
src/sanitizer.py ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BibTeX Sanitizer: Structural and formatting checks for bib entries.
3
+
4
+ Runs as a pre-processing phase before metadata fetch-and-compare,
5
+ detecting and auto-fixing common formatting issues that crawlers
6
+ and copy-paste introduce into .bib files.
7
+ """
8
+ import re
9
+ from datetime import datetime
10
+ from dataclasses import dataclass, field
11
+ from typing import List, Optional, Any
12
+
13
+ CURRENT_YEAR = datetime.now().year
14
+
15
+ from .parser import BibEntry
16
+ from .utils import TextNormalizer
17
+
18
+
19
@dataclass
class SanitizeFix:
    """Record of one sanitization fix (or flag) raised for a bib entry."""
    # Citation key of the affected entry
    entry_key: str
    # Machine-readable kind of fix, e.g. "dblp_id", "corporate_author",
    # "entry_type", "title_case", "doi_mismatch"
    category: str
    # Name of the field that was affected
    field: str
    # Human-readable explanation of what was done
    description: str
    # Value before and after the fix; both default to an empty string
    old_value: str = ""
    new_value: str = ""
28
+
29
+
30
# Known conference name keywords for entry type detection.
# Order matters: the first keyword that matches is the one reported in the fix.
CONFERENCE_KEYWORDS = [
    "conference", "proceedings", "workshop", "symposium",
    # Top ML/AI
    "iclr", "icml", "neurips", "nips", "aaai", "ijcai",
    # NLP
    "acl", "emnlp", "naacl", "coling", "eacl",
    # Vision
    "cvpr", "iccv", "eccv",
    # Speech
    "interspeech", "icassp",
    # IR/Data
    "sigir", "kdd", "www", "wsdm",
    # Systems
    "osdi", "sosp", "nsdi",
    # General
    "international conference", "annual meeting",
]


class BibSanitizer:
    """Performs structural and formatting sanity checks on BibEntry objects."""

    # Generic venue words: matched as plain substrings so that inflected forms
    # ("workshops", "conferences") still count. Every other keyword is an
    # acronym and must not be surrounded by letters.
    _GENERIC_VENUE_TERMS = frozenset({
        "conference", "proceedings", "workshop", "symposium",
        "international conference", "annual meeting",
    })

    def sanitize_all(self, entries: List["BibEntry"]) -> dict:
        """
        Run all sanitization checks on a list of entries.
        Returns dict: {entry_key: [SanitizeFix, ...]}
        Entries are modified in-place.

        Fixes of entries that share the same key are accumulated under that
        key instead of the later entry overwriting the earlier one.
        """
        checks = (
            self._check_dblp_ids,
            self._check_corporate_authors,
            self._check_entry_type,
            self._check_title_capitalization,
            self._check_future_year,
            self._clean_entry_fields,
        )
        all_fixes: dict = {}
        for entry in entries:
            fixes = []
            for check in checks:
                fixes.extend(check(entry))
            if fixes:
                all_fixes.setdefault(entry.key, []).extend(fixes)
        return all_fixes

    # ------------------------------------------------------------------
    # Check 1: DBLP Disambiguation ID Cleanup
    # ------------------------------------------------------------------
    def _check_dblp_ids(self, entry: "BibEntry") -> List["SanitizeFix"]:
        """Strip DBLP disambiguation IDs (4-digit suffixes) from author names."""
        fixes = []
        if not entry.author:
            return fixes

        raw_authors = TextNormalizer.parse_author_list(entry.author)
        cleaned_authors = []

        for author in raw_authors:
            author = author.strip()
            if not TextNormalizer.has_dblp_disambiguation_id(author):
                cleaned_authors.append(author)
                continue
            cleaned = TextNormalizer.strip_dblp_disambiguation_id(author)
            fixes.append(SanitizeFix(
                entry_key=entry.key,
                category="dblp_id",
                field="author",
                description=f"Stripped DBLP disambiguation ID: '{author}' → '{cleaned}'",
                old_value=author,
                new_value=cleaned,
            ))
            cleaned_authors.append(cleaned)

        if fixes:
            new_author_str = " and ".join(cleaned_authors)
            entry.author = new_author_str
            # Also update raw_entry so save_entries doesn't re-introduce the IDs
            if 'author' in entry.raw_entry:
                entry.raw_entry['author'] = new_author_str

        return fixes

    # ------------------------------------------------------------------
    # Check 2: Corporate / Institutional Author Protection
    # ------------------------------------------------------------------
    def _check_corporate_authors(self, entry: "BibEntry") -> List["SanitizeFix"]:
        """
        Detect single-word author names and wrap in {{double braces}}.

        BibTeX treats single-word names as a last name, rendering e.g.
        "KimiTeam" as "K. Team". Wrapping in {{}} prevents this.

        Names that are already fully braced are left alone, and so are names
        that still contain a brace after trimming the outer ones (wrapping
        those would produce unbalanced braces).
        """
        fixes = []
        if not entry.author:
            return fixes

        raw_authors = TextNormalizer.parse_author_list(entry.author)
        new_authors = []

        for author in raw_authors:
            author = author.strip()
            # Already wrapped in single or double braces
            if author.startswith('{') and author.endswith('}'):
                new_authors.append(author)
                continue

            # Single-word author (no spaces) that starts with uppercase
            # e.g., "KimiTeam", "OpenAI", "Google"
            stripped = author.strip('{}')
            is_single_word = (
                len(stripped) > 1
                and ' ' not in stripped
                and stripped[0].isupper()
                and '{' not in stripped
                and '}' not in stripped
            )
            if not is_single_word:
                new_authors.append(author)
                continue

            wrapped = '{{' + stripped + '}}'
            fixes.append(SanitizeFix(
                entry_key=entry.key,
                category="corporate_author",
                field="author",
                description=f"Corporate author protected: '{author}' → '{wrapped}'",
                old_value=author,
                new_value=wrapped,
            ))
            new_authors.append(wrapped)

        if fixes:
            new_author_str = " and ".join(new_authors)
            entry.author = new_author_str
            if 'author' in entry.raw_entry:
                entry.raw_entry['author'] = new_author_str

        return fixes

    # ------------------------------------------------------------------
    # Check 3: Entry Type Correction (article → inproceedings)
    # ------------------------------------------------------------------
    @classmethod
    def _match_conference_keyword(cls, venue: str) -> Optional[str]:
        """Return the first CONFERENCE_KEYWORDS item found in *venue*, or None.

        Generic words are matched as substrings. Acronyms must stand alone,
        i.e. not be preceded or followed by a letter, so "acl" matches
        "ACL 2020" and "ACL2020" but not "Oracle".
        """
        venue_lower = venue.lower()
        for keyword in CONFERENCE_KEYWORDS:
            if keyword in cls._GENERIC_VENUE_TERMS:
                if keyword in venue_lower:
                    return keyword
            elif re.search(r'(?<![a-z])' + re.escape(keyword) + r'(?![a-z])', venue_lower):
                return keyword
        return None

    def _check_entry_type(self, entry: "BibEntry") -> List["SanitizeFix"]:
        """
        Detect conference papers incorrectly typed as @article.

        Heuristics:
        - Has booktitle field → should be inproceedings
        - Journal field contains conference keywords → move to booktitle
        """
        fixes = []

        if entry.entry_type.lower() != 'article':
            return fixes

        # Case 1: Has booktitle but typed as article
        if entry.booktitle:
            old_type = entry.entry_type
            entry.entry_type = 'inproceedings'
            if 'ENTRYTYPE' in entry.raw_entry:
                entry.raw_entry['ENTRYTYPE'] = 'inproceedings'
            fixes.append(SanitizeFix(
                entry_key=entry.key,
                category="entry_type",
                field="ENTRYTYPE",
                description=f"Entry has booktitle but was @{old_type} → @inproceedings",
                old_value=old_type,
                new_value='inproceedings',
            ))
            return fixes

        # Case 2: Journal field contains conference keywords
        if not entry.journal:
            return fixes
        matched_keyword = self._match_conference_keyword(entry.journal)
        if not matched_keyword:
            return fixes

        old_type = entry.entry_type
        old_journal = entry.journal

        # Move journal → booktitle
        entry.booktitle = entry.journal
        entry.journal = ""
        entry.entry_type = 'inproceedings'

        # Update raw_entry
        if 'ENTRYTYPE' in entry.raw_entry:
            entry.raw_entry['ENTRYTYPE'] = 'inproceedings'
        entry.raw_entry['booktitle'] = old_journal
        if 'journal' in entry.raw_entry:
            del entry.raw_entry['journal']

        fixes.append(SanitizeFix(
            entry_key=entry.key,
            category="entry_type",
            field="ENTRYTYPE",
            description=(
                f"@{old_type} → @inproceedings "
                f"(journal '{old_journal}' contains '{matched_keyword}', moved to booktitle)"
            ),
            old_value=old_type,
            new_value='inproceedings',
        ))

        return fixes

    # ------------------------------------------------------------------
    # Check 4: DOI-Title Cross-Validation
    # ------------------------------------------------------------------
    def check_doi_title_match(self, entry: "BibEntry", fetched_data: Any) -> List["SanitizeFix"]:
        """
        Validate that a DOI resolves to the same paper as the bib entry.

        Called during the fetch phase (requires network), not during
        the offline sanitize phase.

        If the DOI metadata title doesn't match the bib entry title
        (similarity below 0.5), flag the DOI as potentially wrong and
        remove it from both the entry and its raw_entry.
        """
        fixes = []
        if not entry.doi or not fetched_data:
            return fixes

        fetched_title = getattr(fetched_data, 'title', '')
        if not fetched_title:
            return fixes

        bib_title_norm = TextNormalizer.normalize_for_comparison(entry.title)
        doi_title_norm = TextNormalizer.normalize_for_comparison(fetched_title)

        similarity = TextNormalizer.similarity_ratio(bib_title_norm, doi_title_norm)
        # For short titles also try the Levenshtein measure and keep the better score.
        if len(bib_title_norm) < 100:
            lev_sim = TextNormalizer.levenshtein_similarity(bib_title_norm, doi_title_norm)
            similarity = max(similarity, lev_sim)

        if similarity < 0.5:
            old_doi = entry.doi
            fixes.append(SanitizeFix(
                entry_key=entry.key,
                category="doi_mismatch",
                field="doi",
                description=(
                    f"DOI '{old_doi}' resolves to a different title "
                    f"('{fetched_title[:60]}...' vs '{entry.title[:60]}...'). "
                    f"Similarity: {similarity:.0%}. DOI removed."
                ),
                old_value=old_doi,
                new_value="",
            ))
            entry.doi = ""
            if 'doi' in entry.raw_entry:
                del entry.raw_entry['doi']

        return fixes

    # ------------------------------------------------------------------
    # Check 5: Title Capitalization Protection (for IEEEtran)
    # ------------------------------------------------------------------

    # Pattern: 2+ uppercase letters (acronyms like MMAU, SALMONN, GPT, BEATs)
    _ACRONYM_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{2,}[a-z]?(?:[\.-][A-Za-z0-9]+)*)(?![A-Za-z0-9])')

    # Pattern: CamelCase words (SpeechT5, HuBERT, ChatGPT, AudioPaLM)
    _CAMELCASE_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z][a-z]+(?:[\.-]?[A-Z][a-z]*)+)(?![A-Za-z0-9])')

    # Pattern: Word with mixed case + digits, optionally with dots/hyphens (GPT-4o, Llama3, Qwen2.5-Omni)
    _MIXED_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z][A-Za-z0-9]*(?:[\.-][A-Za-z0-9]+)*\d[A-Za-z0-9]*(?:[\.-][A-Za-z0-9]+)*)(?![A-Za-z0-9])')

    def _check_title_capitalization(self, entry: "BibEntry") -> List["SanitizeFix"]:
        """
        Wrap acronyms and proper nouns in {} to protect capitalization.

        IEEEtran's .bst forces titles to sentence case.
        Without braces, "SALMONN" becomes "salmonn".
        """
        fixes = []
        if not entry.title:
            return fixes

        title = entry.title

        # Every match of the three patterns is protected: acronyms
        # (MMAU, CREMA-D), CamelCase (HuBERT, ChatGPT) and letter/digit
        # mixes (GPT4, Llama3).
        words_to_protect = set()
        for pattern in (self._ACRONYM_RE, self._CAMELCASE_RE, self._MIXED_RE):
            words_to_protect.update(m.group(1) for m in pattern.finditer(title))

        if not words_to_protect:
            return fixes

        # Apply protection: wrap each word in {} if not already braced.
        # Longest words go first so a short word never splits a longer one;
        # ties are broken alphabetically to keep the output deterministic.
        new_title = title
        protected_words = []

        for word in sorted(words_to_protect, key=lambda w: (-len(w), w)):
            # Already braced somewhere in the title ({word} also covers {{word}})
            if '{' + word + '}' in new_title:
                continue

            # Replace the bare word with {word}
            # Use word boundary to avoid partial matches
            bare_word = re.compile(r'(?<!\{)\b' + re.escape(word) + r'\b(?!\})')
            new_title, replaced = bare_word.subn(lambda _m, w=word: '{' + w + '}', new_title)
            if replaced:
                protected_words.append(word)

        if protected_words and new_title != title:
            fixes.append(SanitizeFix(
                entry_key=entry.key,
                category="title_case",
                field="title",
                description=f"Protected capitalization: {', '.join(protected_words)}",
                old_value=title,
                new_value=new_title,
            ))
            entry.title = new_title
            if 'title' in entry.raw_entry:
                entry.raw_entry['title'] = new_title

        return fixes

    # ------------------------------------------------------------------
    # Check 6: Future Year Detection
    # ------------------------------------------------------------------
    def _check_future_year(self, entry: "BibEntry") -> List["SanitizeFix"]:
        """
        Detect entries with year > current year.

        These are likely arXiv submission dates that will be wrong once
        the paper is published at a conference. Flag them for forced
        API lookup so the correct conference year can be found.

        Years before 1950 are reported as suspicious as well (under the same
        "future_year" category); they are only flagged, not changed.
        """
        fixes = []
        year_str = str(entry.year).strip()
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        if not year_str or not year_str.isdecimal():
            return fixes

        year = int(year_str)

        if year > CURRENT_YEAR:
            # Flag the entry for forced API lookup
            entry._force_api_lookup = True
            fixes.append(SanitizeFix(
                entry_key=entry.key,
                category="future_year",
                field="year",
                description=(
                    f"Future year {year} detected (current: {CURRENT_YEAR}). "
                    f"Will force API lookup to find correct year."
                ),
                old_value=year_str,
                new_value="",  # Will be resolved by API
            ))
        elif year < 1950:
            fixes.append(SanitizeFix(
                entry_key=entry.key,
                category="future_year",
                field="year",
                description=f"Suspiciously old year: {year}",
                old_value=year_str,
                new_value="",
            ))

        return fixes

    # ------------------------------------------------------------------
    # Check 7: Field Cleanup Policy
    # ------------------------------------------------------------------
    # Fields to remove per entry type
    FIELD_REMOVE_POLICY = {
        "inproceedings": [
            "address", "month", "abstract",
            "archiveprefix", "primaryclass",
            "biburl", "bibsource", "timestamp",
            "copyright", "issn", "isbn",
        ],
        "article": [
            "address", "month", "abstract",
            "archiveprefix", "primaryclass",
            "biburl", "bibsource", "timestamp",
            "copyright", "issn",
        ],
        "misc": [
            "address", "month", "abstract",
            "biburl", "bibsource", "timestamp",
            "copyright",
        ],
    }

    def _clean_entry_fields(self, entry: "BibEntry") -> List["SanitizeFix"]:
        """
        Remove junk/noise fields that crawlers often include.
        These fields add clutter and can cause formatting issues.

        Only ``entry.raw_entry`` is modified; entry types without a policy
        are left untouched.
        """
        fixes = []
        to_remove = self.FIELD_REMOVE_POLICY.get(entry.entry_type.lower(), [])

        removed_fields = []
        for field_name in to_remove:
            wanted = field_name.lower()
            # Check in raw_entry (case-insensitive); iterate a snapshot of the
            # keys because matching keys are deleted inside the loop.
            for raw_key in list(entry.raw_entry.keys()):
                if raw_key.lower() == wanted and raw_key not in ('ID', 'ENTRYTYPE'):
                    del entry.raw_entry[raw_key]
                    removed_fields.append(raw_key)

        if removed_fields:
            fixes.append(SanitizeFix(
                entry_key=entry.key,
                category="field_cleanup",
                field="multiple",
                description=f"Removed junk fields: {', '.join(removed_fields)}",
                old_value=", ".join(removed_fields),
                new_value="",
            ))

        return fixes

    # ------------------------------------------------------------------
    # Standalone: Duplicate Detection
    # ------------------------------------------------------------------
    @staticmethod
    def find_duplicates(entries: List["BibEntry"]) -> dict:
        """
        Find entries that share the same normalized title.
        Returns {normalized_title: [key1, key2, ...]} for duplicates.

        Normalization removes one level of braces, lowercases, replaces
        punctuation with spaces and collapses whitespace. Entries whose
        normalized title is empty are ignored.
        """
        from collections import defaultdict

        def _norm(t: str) -> str:
            t = re.sub(r'\{([^}]*)\}', r'\1', t)
            t = re.sub(r'[^\w\s]', ' ', t.lower())
            return re.sub(r'\s+', ' ', t).strip()

        title_map = defaultdict(list)
        for entry in entries:
            key = _norm(entry.title)
            if key:
                title_map[key].append(entry.key)

        return {t: keys for t, keys in title_map.items() if len(keys) > 1}
src/space_service.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Non-interactive RefCheck workflow for Hugging Face Spaces.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import tempfile
7
+ from dataclasses import dataclass, field
8
+ from functools import lru_cache
9
+ from pathlib import Path
10
+ from typing import Any
11
+ from concurrent.futures import ThreadPoolExecutor, as_completed
12
+
13
+ from main import (
14
+ apply_fix,
15
+ apply_local_fix,
16
+ get_default_workflow,
17
+ validate_entry,
18
+ )
19
+ from src.comparator import EntryReport, MetadataComparator
20
+ from src.fetcher import (
21
+ ArxivFetcher,
22
+ CrossRefFetcher,
23
+ DBLPFetcher,
24
+ OpenAlexFetcher,
25
+ ScholarFetcher,
26
+ SemanticScholarFetcher,
27
+ )
28
+ from src.local_db import LocalConferenceDB
29
+ from src.parser import BibEntry, BibParser
30
+ from src.sanitizer import BibSanitizer, SanitizeFix
31
+
32
+
33
@dataclass
class RefCheckOptions:
    """Switches that steer a single non-interactive RefCheck run."""

    # Drop entries flagged for removal instead of listing them for review.
    remove_unverified: bool = True
    # Value copied onto the workflow's "google_scholar" step.
    enable_google_scholar: bool = False
    # Upper bound on the thread pool used to validate entries.
    max_workers: int = 4
40
+
41
+
42
@dataclass
class RefCheckResult:
    """Counters, per-entry details and output paths of one Space run."""

    # --- entry counts -------------------------------------------------
    total_input: int = 0
    total_output: int = 0
    verified: int = 0
    issues: int = 0
    not_found: int = 0

    # --- per-entry details --------------------------------------------
    # entry key -> human-readable change descriptions
    fixed_details: dict[str, list[str]] = field(default_factory=dict)
    # (entry key, title, reason) for every dropped entry
    removed_details: list[tuple[str, str, str]] = field(default_factory=list)
    # payload dicts for entries that need a human decision
    review_details: list[dict[str, Any]] = field(default_factory=list)
    # normalized title -> keys sharing that title
    duplicate_details: dict[str, list[str]] = field(default_factory=dict)
    # entry key -> sanitizer fixes applied to it
    sanitize_fixes: dict[str, list[SanitizeFix]] = field(default_factory=dict)

    # --- local database -----------------------------------------------
    local_matches: int = 0
    local_db_loaded: bool = False

    # --- produced artifacts -------------------------------------------
    fixed_bib_path: str = ""
    report_path: str = ""
    report_markdown: str = ""
61
+
62
+
63
def _decide_action(entry: BibEntry, best_result: Any, candidates: list[Any]) -> str:
    """Map one analysis outcome to "keep", "fix", "review" or "remove"."""
    if not best_result:
        # The lookup produced nothing usable; leave the entry untouched.
        return "keep"
    if getattr(entry, "_force_api_lookup", False) and best_result.fetched_data:
        return "fix"
    if best_result.confidence > 0.85 and best_result.fetched_data:
        return "fix"
    if best_result.is_match:
        return "keep"
    return "review" if candidates else "remove"


def run_refcheck_file(file_path: str | Path, options: RefCheckOptions | None = None) -> RefCheckResult:
    """Validate and fix an uploaded BibTeX file without interactive prompts.

    Pipeline: parse -> sanitize -> duplicate scan -> local DB fixes ->
    remote validation of the remaining entries -> apply fix/review/remove
    decisions -> write the fixed ``.bib`` -> re-parse and re-validate the
    written file -> build and write the Markdown report.

    Args:
        file_path: Path of the BibTeX file to process.
        options: Run options; defaults to ``RefCheckOptions()``.

    Returns:
        A ``RefCheckResult`` with counters, details and artifact paths.
    """
    options = options or RefCheckOptions()
    source_path = Path(file_path)
    parser = BibParser()
    entries = parser.parse_file(str(source_path))
    result = RefCheckResult(total_input=len(entries))

    if not entries:
        # Still hand back a report and an (empty) .bib so callers get files.
        result.report_markdown = "## RefCheck Report\n\nNo BibTeX entries were found."
        result.report_path = _write_report(result.report_markdown)
        result.fixed_bib_path = _write_bib(parser, [], source_path.stem)
        return result

    sanitizer = BibSanitizer()
    result.sanitize_fixes = sanitizer.sanitize_all(entries)
    _record_sanitize_fixes(result.fixed_details, result.sanitize_fixes)
    result.duplicate_details = sanitizer.find_duplicates(entries)

    # Entries resolved by the local DB are fixed in place and skip the APIs.
    result.local_db_loaded, api_entries, result.local_matches = _apply_local_db(entries, result.fixed_details)

    fetchers = _build_fetchers()
    workflow = get_default_workflow()
    for step in workflow.steps:
        if step.name == "google_scholar":
            step.enabled = options.enable_google_scholar

    comparator = MetadataComparator()
    analysis = _analyze_entries(api_entries, workflow, fetchers, comparator, options.max_workers)

    # Analysis arrives in completion order, so index decisions by entry key
    # and replay them below in the original file order.
    actions: dict[str, tuple[str, Any, list[Any]]] = {
        entry.key: (_decide_action(entry, best_result, candidates), best_result, candidates)
        for entry, best_result, candidates in analysis
    }

    not_found_reason = "No matching metadata found in any source"
    updated_entries: list[BibEntry] = []

    for entry in entries:
        # Entries without a decision (e.g. local DB hits) are kept as-is.
        action, best_result, candidates = actions.get(entry.key, ("keep", None, []))

        if action == "remove" and options.remove_unverified:
            result.removed_details.append((entry.key, entry.title, not_found_reason))
            continue

        if action == "fix":
            changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates)
            if changes:
                result.fixed_details.setdefault(entry.key, []).extend(changes)
        elif action == "review":
            result.review_details.append(_review_payload(entry, best_result, candidates))
        elif action == "remove":
            # Removal disabled: keep the entry but surface it for review.
            result.review_details.append(
                {
                    "key": entry.key,
                    "title": entry.title,
                    "reason": not_found_reason,
                    "candidates": [],
                }
            )
        updated_entries.append(entry)

    result.total_output = len(updated_entries)
    fixed_path = _write_bib(parser, updated_entries, source_path.stem)
    result.fixed_bib_path = fixed_path

    # Verify what was actually written, not the in-memory objects.
    verified_entries = parser.parse_file(fixed_path)
    verification_reports = _verify_entries(
        verified_entries,
        workflow,
        fetchers,
        comparator,
        options.max_workers,
    )
    result.verified = sum(1 for r in verification_reports if r.comparison and r.comparison.is_match)
    result.issues = sum(1 for r in verification_reports if r.comparison and r.comparison.has_issues)
    result.not_found = sum(
        1
        for r in verification_reports
        if r.comparison and not r.comparison.is_match and not r.comparison.has_issues
    )

    result.report_markdown = _build_report(result, verification_reports)
    result.report_path = _write_report(result.report_markdown)
    return result
161
+
162
+
163
def _build_fetchers() -> dict[str, Any]:
    """Create one fresh fetcher instance per metadata source, keyed by name."""
    sources = (
        ("arxiv", ArxivFetcher),
        ("crossref", CrossRefFetcher),
        ("scholar", ScholarFetcher),
        ("semantic", SemanticScholarFetcher),
        ("openalex", OpenAlexFetcher),
        ("dblp", DBLPFetcher),
    )
    return {name: fetcher_cls() for name, fetcher_cls in sources}
172
+
173
+
174
+ def _analyze_entries(
175
+ entries: list[BibEntry],
176
+ workflow: Any,
177
+ fetchers: dict[str, Any],
178
+ comparator: MetadataComparator,
179
+ max_workers: int,
180
+ ) -> list[tuple[BibEntry, Any, list[Any]]]:
181
+ if not entries:
182
+ return []
183
+
184
+ analysis: list[tuple[BibEntry, Any, list[Any]]] = []
185
+ worker_count = min(max(1, max_workers), len(entries))
186
+ with ThreadPoolExecutor(max_workers=worker_count) as executor:
187
+ futures = {
188
+ executor.submit(validate_entry, entry, workflow, fetchers, comparator): entry
189
+ for entry in entries
190
+ }
191
+ for future in as_completed(futures):
192
+ entry = futures[future]
193
+ try:
194
+ best_result, candidates = future.result()
195
+ except Exception:
196
+ best_result, candidates = None, []
197
+ analysis.append((entry, best_result, candidates))
198
+ return analysis
199
+
200
+
201
def _verify_entries(
    entries: list[BibEntry],
    workflow: Any,
    fetchers: dict[str, Any],
    comparator: MetadataComparator,
    max_workers: int,
) -> list[EntryReport]:
    """Re-analyse ``entries`` and wrap each best result in an ``EntryReport``.

    Candidate lists from the analysis are discarded; only the entry and its
    best comparison are kept.
    """
    analysed = _analyze_entries(entries, workflow, fetchers, comparator, max_workers)
    return [
        EntryReport(entry=entry, comparison=best_result)
        for entry, best_result, _ in analysed
    ]
212
+
213
+
214
+ def _record_sanitize_fixes(
215
+ fixed_details: dict[str, list[str]],
216
+ sanitize_fixes: dict[str, list[SanitizeFix]],
217
+ ) -> None:
218
+ for key, fixes in sanitize_fixes.items():
219
+ fixed_details.setdefault(key, [])
220
+ fixed_details[key].extend(fix.description for fix in fixes)
221
+
222
+
223
def _apply_local_db(
    entries: list[BibEntry],
    fixed_details: dict[str, list[str]],
) -> tuple[bool, list[BibEntry], int]:
    """Fix entries from the local conference DB and collect the rest.

    Returns:
        ``(db_loaded, entries_still_needing_api_lookup, match_count)``.
        When the DB is not loaded, all ``entries`` are returned untouched
        with a match count of 0.
    """
    database = _load_local_db()
    if not database.is_loaded:
        return False, entries, 0

    unmatched: list[BibEntry] = []
    hits = 0
    for entry in entries:
        official = database.lookup(entry.title)
        if official:
            changes = apply_local_fix(entry, official)
            hits += 1
            # Record only real modifications; a match may change nothing.
            if changes:
                fixed_details.setdefault(entry.key, []).extend(changes)
        else:
            unmatched.append(entry)

    return True, unmatched, hits
245
+
246
+
247
@lru_cache(maxsize=1)
def _load_local_db() -> LocalConferenceDB:
    """Build and load the local conference DB; the instance is cached after the first call."""
    database = LocalConferenceDB()
    database.load()
    return database
252
+
253
+
254
+ def _review_payload(entry: BibEntry, best_result: Any, candidates: list[Any]) -> dict[str, Any]:
255
+ return {
256
+ "key": entry.key,
257
+ "title": entry.title,
258
+ "reason": "; ".join(best_result.issues) if best_result and best_result.issues else "Ambiguous match",
259
+ "candidates": [
260
+ {
261
+ "source": candidate.source,
262
+ "confidence": candidate.confidence,
263
+ "title": getattr(candidate.fetched_data, "title", ""),
264
+ "year": getattr(candidate.fetched_data, "year", ""),
265
+ "doi": getattr(candidate.fetched_data, "doi", ""),
266
+ }
267
+ for candidate in candidates[:5]
268
+ ],
269
+ }
270
+
271
+
272
+ def _write_bib(parser: BibParser, entries: list[BibEntry], original_stem: str) -> str:
273
+ out_dir = Path(tempfile.mkdtemp(prefix="refcheck_"))
274
+ out_path = out_dir / f"{original_stem or 'references'}_refcheck_fixed.bib"
275
+ parser.save_entries(str(out_path), entries)
276
+ return str(out_path)
277
+
278
+
279
+ def _write_report(markdown: str) -> str:
280
+ out_dir = Path(tempfile.mkdtemp(prefix="refcheck_report_"))
281
+ out_path = out_dir / "refcheck_report.md"
282
+ out_path.write_text(markdown, encoding="utf-8")
283
+ return str(out_path)
284
+
285
+
286
+ def _build_report(result: RefCheckResult, reports: list[EntryReport]) -> str:
287
+ lines = [
288
+ "## RefCheck Report",
289
+ "",
290
+ "### Summary",
291
+ "",
292
+ f"- Input entries: {result.total_input}",
293
+ f"- Output entries: {result.total_output}",
294
+ f"- Verified after fix: {result.verified}",
295
+ f"- Remaining issues: {result.issues}",
296
+ f"- Not found after fix: {result.not_found}",
297
+ f"- Local DB loaded: {'yes' if result.local_db_loaded else 'no'}",
298
+ f"- Local DB matches: {result.local_matches}",
299
+ "",
300
+ ]
301
+
302
+ if result.removed_details:
303
+ lines.extend(["### Removed", ""])
304
+ for key, title, reason in result.removed_details:
305
+ lines.append(f"- `{key}`: {title} ({reason})")
306
+ lines.append("")
307
+
308
+ if result.fixed_details:
309
+ lines.extend(["### Fixed", ""])
310
+ for key, changes in sorted(result.fixed_details.items()):
311
+ lines.append(f"- `{key}`")
312
+ for change in changes:
313
+ lines.append(f" - {change}")
314
+ lines.append("")
315
+
316
+ if result.duplicate_details:
317
+ lines.extend(["### Duplicate Titles", ""])
318
+ for title, keys in result.duplicate_details.items():
319
+ lines.append(f"- `{', '.join(keys)}`: {title}")
320
+ lines.append("")
321
+
322
+ if result.review_details:
323
+ lines.extend(["### Needs Review", ""])
324
+ for item in result.review_details:
325
+ lines.append(f"- `{item['key']}`: {item['title']}")
326
+ lines.append(f" - Reason: {item['reason']}")
327
+ for candidate in item["candidates"]:
328
+ lines.append(
329
+ " - Candidate: "
330
+ f"{candidate['source']} "
331
+ f"(confidence {candidate['confidence']:.2f}) "
332
+ f"{candidate['title']} "
333
+ f"{candidate['year']} "
334
+ f"{candidate['doi']}".strip()
335
+ )
336
+ lines.append("")
337
+
338
+ remaining = [
339
+ report
340
+ for report in reports
341
+ if report.comparison and not report.comparison.is_match
342
+ ]
343
+ if remaining:
344
+ lines.extend(["### Verification Issues", ""])
345
+ for report in remaining:
346
+ comparison = report.comparison
347
+ issues = "; ".join(comparison.issues) if comparison.issues else "Not matched"
348
+ lines.append(
349
+ f"- `{report.entry.key}` via {comparison.source} "
350
+ f"(confidence {comparison.confidence:.2f}): {issues}"
351
+ )
352
+ lines.append("")
353
+
354
+ return "\n".join(lines).strip() + "\n"
src/ui.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rich.console import Console
2
+ from rich.table import Table
3
+ from rich.panel import Panel
4
+ from rich.tree import Tree
5
+ import copy
6
+
7
class BibUI:
    """Handles all terminal UI interactions for BibGuard."""

    def __init__(self):
        self.console = Console()

    @staticmethod
    def _clip(text, limit=60):
        """Cut ``text`` to ``limit`` characters, appending '...' when shortened."""
        return text[:limit] + '...' if len(text) > limit else text

    def show_analysis_report(self, ok_entries, to_fix, to_review, to_remove):
        """Print the category/count summary table of the initial analysis."""
        summary = Table(title="📊 Analysis Report", show_header=True, header_style="bold magenta")
        summary.add_column("Category", style="cyan")
        summary.add_column("Count", justify="right")
        summary.add_column("Description")

        rows = (
            ("✅ Correct", ok_entries, "Entries match valid metadata"),
            ("🛠️ To Fix", to_fix, "[green]High confidence auto-fixes[/green]"),
            ("🔍 Review", to_review, "[yellow]Ambiguous or low confidence[/yellow]"),
            ("🗑️ Remove", to_remove, "[red]No metadata found (Hallucinations)[/red]"),
        )
        for label, bucket, description in rows:
            summary.add_row(label, str(len(bucket)), description)

        self.console.print(summary)

        nothing_to_do = not (to_fix or to_review or to_remove)
        if nothing_to_do:
            self.console.print(Panel("[green]✓ No issues found. All entries are valid.[/green]", title="Status"))

    def show_manual_review(self, entry, best_res, candidates, apply_fix_func):
        """Print one entry followed by a table of its candidate matches.

        Each candidate's proposed changes are obtained by running
        ``apply_fix_func`` on a deep copy, so ``entry`` itself is not modified.
        """
        out = self.console
        out.print(f"\n[bold]Entry: {entry.key}[/bold]")
        out.print(f"Title: {entry.title}")
        out.print(f"Year: {entry.year}")
        out.print(f"Auth: {entry.author}")

        grid = Table(show_header=True, header_style="bold blue")
        grid.add_column("#", style="dim", width=4)
        grid.add_column("Source", style="cyan", width=12)
        grid.add_column("Conf", justify="right")
        grid.add_column("Candidate Metadata (Fetched)", style="white")
        grid.add_column("Proposed Changes", style="green")

        for number, cand in enumerate(candidates, 1):
            # Dry-run the fix on a throwaway copy to learn what it would change.
            scratch = copy.deepcopy(entry)
            proposed = apply_fix_func(scratch, cand.fetched_data)
            change_desc = "\n".join(proposed) if proposed else "[dim]No changes[/dim]"

            if cand.confidence > 0.7:
                conf_style = "green"
            elif cand.confidence > 0.4:
                conf_style = "yellow"
            else:
                conf_style = "red"

            # Summarise whatever metadata the candidate actually carries.
            fetched = cand.fetched_data
            details = []
            if getattr(fetched, 'title', None):
                details.append(f"[bold]Title:[/bold] {self._clip(fetched.title)}")
            if getattr(fetched, 'authors', None):
                joined_authors = " and ".join(fetched.authors)
                details.append(f"[bold]Authors:[/bold] {self._clip(joined_authors)}")
            if getattr(fetched, 'year', None):
                details.append(f"[bold]Year:[/bold] {fetched.year}")
            if getattr(fetched, 'doi', None):
                details.append(f"[bold]DOI:[/bold] {fetched.doi}")
            meta_desc = "\n".join(details) if details else "[dim]No metadata details[/dim]"

            grid.add_row(
                str(number),
                cand.source,
                f"[{conf_style}]{cand.confidence:.2f}[/{conf_style}]",
                meta_desc,
                change_desc,
            )

        out.print(grid)

    def show_final_report(self, total, verified, issues, not_found, reports, fixed_count, fixed_details, removed_details):
        """Print final counters, remaining issues and a tree of modifications.

        ``removed_details`` is iterated as ``(entry, reason)`` pairs and
        ``fixed_details`` as ``key -> list of change descriptions``.
        """
        out = self.console

        counters = Table(box=None, padding=(0, 2))
        counters.add_column("Metric", style="bold")
        counters.add_column("Value", justify="right")
        counters.add_row("Total Entries", str(total))
        counters.add_row("Verified", f"[green]{verified}[/green]")
        counters.add_row("Issues", f"[red]{issues}[/red]" if issues > 0 else "0")
        counters.add_row("Not Found", f"[yellow]{not_found}[/yellow]" if not_found > 0 else "0")

        out.print(Panel(counters, title="📊 Final Status", expand=False))

        if issues > 0:
            out.print("\n[bold red]⚠ Remaining Issues (Not Auto-Fixed):[/bold red]")
            flagged = (r for r in reports if r.comparison and r.comparison.has_issues)
            for r in flagged:
                out.print(f" - [bold]{r.entry.key}[/bold] (Conf: {r.comparison.confidence:.2f}): {', '.join(r.comparison.issues)}")

        if not (fixed_count > 0 or removed_details):
            out.print("\n[green]✓ No changes were needed.[/green]")
            return

        tree = Tree("✏️ Modifications Report")

        if removed_details:
            removed_branch = tree.add(f"[red]Removed {len(removed_details)} entries[/red]")
            for entry, reason in removed_details:
                removed_branch.add(f"[bold]{entry.key}[/bold]: \"{entry.title}\" ([italic]{reason}[/italic])")

        if fixed_count > 0:
            fixed_branch = tree.add(f"[green]Fixed {fixed_count} entries[/green]")
            for key, changes in fixed_details.items():
                entry_branch = fixed_branch.add(f"[bold]{key}[/bold]")
                for change in changes:
                    entry_branch.add(change)

        out.print(tree)
        out.print("\n[green]✓ Changes applied and saved to file.[/green]")

    def show_sanitize_report(self, sanitize_fixes: dict):
        """Print sanitizer fixes as a tree grouped by fix category."""
        if not sanitize_fixes:
            self.console.print("[green]✓ No formatting issues found.[/green]\n")
            return

        # category -> (icon, label, colour); unknown categories get a fallback.
        category_info = {
            "dblp_id": ("🔢", "DBLP Disambiguation ID Cleanup", "red"),
            "corporate_author": ("🏢", "Corporate Author Protection", "yellow"),
            "entry_type": ("📋", "Entry Type Correction", "cyan"),
            "title_case": ("🔤", "Title Capitalization Protection", "blue"),
            "doi_mismatch": ("🔗", "DOI Mismatch", "red"),
            "future_year": ("📅", "Future Year Detection", "magenta"),
            "field_cleanup": ("🧹", "Junk Field Removal", "dim"),
        }

        total_fixes = sum(len(fixes) for fixes in sanitize_fixes.values())
        tree = Tree(f"🧹 Sanitization Report ({total_fixes} fixes in {len(sanitize_fixes)} entries)")

        # Regroup the per-entry fixes by category, keeping first-seen order.
        by_category = {}
        for fixes in sanitize_fixes.values():
            for fix in fixes:
                by_category.setdefault(fix.category, []).append(fix)

        for cat, fixes in by_category.items():
            icon, label, color = category_info.get(cat, ("❓", cat, "white"))
            branch = tree.add(f"{icon} [{color}]{label} ({len(fixes)})[/{color}]")
            for fix in fixes:
                branch.add(f"[bold]{fix.entry_key}[/bold]: {fix.description}")

        self.console.print(tree)
        self.console.print("")
153
+
src/utils.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utilities for BibGuard: Normalization and Progress Display.
3
+ """
4
+ import re
5
+ import unicodedata
6
+ import time
7
+ from contextlib import contextmanager
8
+ from dataclasses import dataclass
9
+ from typing import Optional, List
10
+ from unidecode import unidecode
11
+ from rich.console import Console
12
+ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeElapsedColumn
13
+
14
+
15
class TextNormalizer:
    """Utility class for normalizing text for comparison."""

    # DBLP disambiguation ID pattern: 4-digit number at end of author name
    # e.g. "Tian Tan 0019", "Wei Li 0119", "Zejun Ma 0001"
    DBLP_DISAMBIG_PATTERN = re.compile(r'\s+\d{4}\s*$')

    # LaTeX formatting commands: (pattern, replacement) keeps the argument.
    LATEX_COMMANDS = [
        (r'\\textbf\{([^}]*)\}', r'\1'),
        (r'\\textit\{([^}]*)\}', r'\1'),
        (r'\\emph\{([^}]*)\}', r'\1'),
        (r'\\textrm\{([^}]*)\}', r'\1'),
        (r'\\texttt\{([^}]*)\}', r'\1'),
        (r'\\textsf\{([^}]*)\}', r'\1'),
        (r'\\textsc\{([^}]*)\}', r'\1'),
        (r'\\text\{([^}]*)\}', r'\1'),
        (r'\\mathrm\{([^}]*)\}', r'\1'),
        (r'\\mathbf\{([^}]*)\}', r'\1'),
        (r'\\mathit\{([^}]*)\}', r'\1'),
        (r'\\url\{([^}]*)\}', r'\1'),
        (r'\\href\{[^}]*\}\{([^}]*)\}', r'\1'),
    ]

    # LaTeX special character mappings, applied in insertion order with
    # str.replace. Longer sequences must come before their prefixes
    # ('---' before '--', "``" before "`") or they can never match.
    LATEX_CHARS = {
        r'\&': '&',
        r'\%': '%',
        r'\$': '$',
        r'\#': '#',
        r'\_': '_',
        r'\{': '{',
        r'\}': '}',
        r'\~': '~',
        r'\^': '^',
        r'``': '"',
        r"''": '"',
        r'`': "'",
        r"'": "'",
        r'---': '—',
        r'--': '–',
    }

    # LaTeX accent commands; the accent is dropped and the base letter kept.
    # The caret is escaped (\^) because a bare ^ is a regex anchor.
    LATEX_ACCENTS = [
        (r"\\'([aeiouAEIOU])", r'\1'),      # acute
        (r'\\`([aeiouAEIOU])', r'\1'),      # grave
        (r'\\\^([aeiouAEIOU])', r'\1'),     # circumflex
        (r'\\"([aeiouAEIOU])', r'\1'),      # umlaut
        (r'\\~([nNaAoO])', r'\1'),          # tilde
        (r'\\c\{([cC])\}', r'\1'),          # cedilla
        (r"\\'\{([aeiouAEIOU])\}", r'\1'),
        (r'\\`\{([aeiouAEIOU])\}', r'\1'),
        (r'\\\^\{([aeiouAEIOU])\}', r'\1'),
        (r'\\"\{([aeiouAEIOU])\}', r'\1'),
        (r'\\~\{([nNaAoO])\}', r'\1'),
    ]

    @classmethod
    def normalize_latex(cls, text: str) -> str:
        """Remove LaTeX formatting: commands, accents, escapes, then braces."""
        if not text:
            return ""
        result = text
        for pattern, replacement in cls.LATEX_COMMANDS:
            result = re.sub(pattern, replacement, result)
        # Accents run before LATEX_CHARS so "\^e" is not first turned into "^e".
        for pattern, replacement in cls.LATEX_ACCENTS:
            result = re.sub(pattern, replacement, result)
        for latex_char, normal_char in cls.LATEX_CHARS.items():
            result = result.replace(latex_char, normal_char)
        return re.sub(r'[{}]', '', result)

    @classmethod
    def normalize_unicode(cls, text: str) -> str:
        """Normalize Unicode characters to ASCII (NFKD, then ``unidecode``)."""
        if not text:
            return ""
        text = unicodedata.normalize('NFKD', text)
        return unidecode(text)

    @classmethod
    def normalize_for_comparison(cls, text: str) -> str:
        """Full normalization pipeline for text comparison.

        LaTeX and Unicode are normalized, the text is lower-cased,
        punctuation is deleted and whitespace is collapsed last, so deleting
        stand-alone punctuation ("a - b") cannot leave double spaces behind.
        """
        if not text:
            return ""
        text = cls.normalize_latex(text)
        text = cls.normalize_unicode(text)
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        return re.sub(r'\s+', ' ', text).strip()

    @classmethod
    def strip_dblp_disambiguation_id(cls, name: str) -> str:
        """Strip DBLP disambiguation suffix (4-digit number) from author name.

        DBLP appends codes like '0001', '0019' to disambiguate homonymous authors.
        e.g. 'Tian Tan 0019' -> 'Tian Tan'
             'Wei Li 0119'   -> 'Wei Li'
        """
        if not name:
            return name
        return cls.DBLP_DISAMBIG_PATTERN.sub('', name).strip()

    @classmethod
    def has_dblp_disambiguation_id(cls, name: str) -> bool:
        """Check if an author name ends with a DBLP disambiguation ID."""
        if not name:
            return False
        return bool(cls.DBLP_DISAMBIG_PATTERN.search(name))

    @classmethod
    def normalize_author_name(cls, name: str) -> str:
        """Normalize one author name to lower-case "first last" without punctuation."""
        if not name:
            return ""
        name = cls.normalize_latex(name)
        name = cls.normalize_unicode(name)
        # Strip DBLP disambiguation IDs before further processing
        name = cls.strip_dblp_disambiguation_id(name)
        name = re.sub(r'\s+', ' ', name).strip()
        if ',' in name:
            # "Last, First" -> "First Last"
            last, first = name.split(',', 1)
            name = f"{first.strip()} {last.strip()}"
        name = name.lower()
        return re.sub(r'[^\w\s]', '', name)

    @classmethod
    def parse_author_list(cls, authors: str) -> list[str]:
        """Split an author string on ' and ' (any case); names are returned unmodified."""
        if not authors:
            return []
        return re.split(r'\s+and\s+', authors, flags=re.IGNORECASE)

    @classmethod
    def normalize_author_list(cls, authors: str) -> list[str]:
        """Parse an author string and normalize each name, dropping empty results."""
        if not authors:
            return []
        normalized = (
            cls.normalize_author_name(author.strip())
            for author in cls.parse_author_list(authors)
        )
        return [name for name in normalized if name]

    @classmethod
    def similarity_ratio(cls, text1: str, text2: str) -> float:
        """Calculate Jaccard similarity between the word sets of two strings."""
        if not text1 or not text2:
            return 0.0
        words1, words2 = set(text1.split()), set(text2.split())
        # Whitespace-only inputs are truthy but yield empty word sets.
        if not words1 and not words2:
            return 1.0
        if not words1 or not words2:
            return 0.0
        return len(words1 & words2) / len(words1 | words2)

    @classmethod
    def levenshtein_similarity(cls, s1: str, s2: str) -> float:
        """Calculate normalized Levenshtein similarity.

        Returns ``1 - distance / max(len(s1), len(s2))``; two empty strings
        give 1.0 and exactly one empty string gives 0.0.
        """
        if not s1 and not s2:
            return 1.0
        if not s1 or not s2:
            return 0.0
        m, n = len(s1), len(s2)
        # Only the previous row of the edit-distance table is ever needed.
        previous = list(range(n + 1))
        for i, char1 in enumerate(s1, 1):
            current = [i]
            for j, char2 in enumerate(s2, 1):
                if char1 == char2:
                    current.append(previous[j - 1])
                else:
                    current.append(min(previous[j], current[j - 1], previous[j - 1]) + 1)
            previous = current
        return 1.0 - (previous[n] / max(m, n))
176
+
177
+
178
@dataclass
class ProgressStats:
    """Running counters shown by the progress display; all start at zero."""

    total: int = 0      # number of items expected
    processed: int = 0  # items advanced so far
    success: int = 0
    warnings: int = 0
    errors: int = 0
186
+
187
+
188
class ProgressDisplay:
    """Rich terminal progress display."""

    def __init__(self):
        self.console = Console()
        self.stats = ProgressStats()
        # Both are set only while progress_context() is active.
        self._progress: Optional[Progress] = None
        self._task = None

    @contextmanager
    def progress_context(self, total: int, description: str = "Processing"):
        """Show a progress bar for ``total`` items while the ``with`` body runs.

        Yields this display so the body can call ``update``; the bar and
        task handles are cleared again on exit, even after an exception.
        """
        self.stats.total = total
        columns = (
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(bar_width=40),
            TaskProgressColumn(),
            TimeElapsedColumn(),
        )
        with Progress(*columns, console=self.console, transient=False) as bar:
            self._progress = bar
            self._task = bar.add_task(description, total=total)
            try:
                yield self
            finally:
                self._progress = None
                self._task = None

    def update(self, entry_key: str = "", task: str = "", advance: int = 0):
        """Set the bar's description and advance it; a no-op outside ``progress_context``."""
        if not self._progress or self._task is None:
            return
        if entry_key:
            label = f"[cyan]{entry_key}[/cyan] - {task}"
        else:
            label = task
        self._progress.update(self._task, description=label, advance=advance)
        self.stats.processed += advance

    def mark_success(self):
        """Count one successful item."""
        self.stats.success += 1

    def mark_warning(self):
        """Count one item that produced a warning."""
        self.stats.warnings += 1

    def mark_error(self):
        """Count one failed item."""
        self.stats.errors += 1

    def print_error(self, message: str):
        """Print ``message`` to the console behind a red cross marker."""
        self.console.print(f" [red]✗[/red] {message}")
229
+ self.console.print(f" [red]✗[/red] {message}")