Add RefCheck Gradio Space
Browse files- .DS_Store +0 -0
- .gitattributes +4 -0
- .gitignore +24 -0
- README.md +213 -6
- app.py +87 -0
- data/abbr.tsv +35 -0
- data/index_shards/index_00.json +3 -0
- data/index_shards/index_01.json +3 -0
- data/index_shards/index_02.json +3 -0
- data/index_shards/index_03.json +3 -0
- main.py +561 -0
- requirements.txt +7 -0
- scripts/build_index.py +143 -0
- scripts/refresh_db.sh +19 -0
- scripts/update_db.py +131 -0
- src/__init__.py +1 -0
- src/comparator.py +326 -0
- src/fetcher.py +254 -0
- src/local_db.py +108 -0
- src/normalizer.py +51 -0
- src/parser.py +316 -0
- src/sanitizer.py +493 -0
- src/space_service.py +354 -0
- src/ui.py +153 -0
- src/utils.py +229 -0
.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/index_shards/index_00.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
data/index_shards/index_01.json filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
data/index_shards/index_02.json filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
data/index_shards/index_03.json filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
.eggs/
|
| 6 |
+
*.egg-info/
|
| 7 |
+
dist/
|
| 8 |
+
build/
|
| 9 |
+
|
| 10 |
+
# Environment
|
| 11 |
+
.env
|
| 12 |
+
.venv/
|
| 13 |
+
venv/
|
| 14 |
+
|
| 15 |
+
# IDE
|
| 16 |
+
.idea/
|
| 17 |
+
.vscode/
|
| 18 |
+
*.swp
|
| 19 |
+
|
| 20 |
+
# DBLP raw data (regenerate with: python scripts/update_db.py)
|
| 21 |
+
data/raw/
|
| 22 |
+
|
| 23 |
+
# Legacy single-file index (replaced by sharded index)
|
| 24 |
+
data/conference_index.json
|
README.md
CHANGED
|
@@ -1,13 +1,220 @@
|
|
| 1 |
---
|
| 2 |
title: RefCheck
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 6.16.0
|
| 8 |
-
python_version: '3.13'
|
| 9 |
app_file: app.py
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: RefCheck
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
colorFrom: blue
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: gradio
|
|
|
|
|
|
|
| 7 |
app_file: app.py
|
| 8 |
+
python_version: 3.11
|
| 9 |
+
suggested_hardware: cpu-basic
|
| 10 |
+
fullWidth: true
|
| 11 |
+
short_description: Upload BibTeX, validate citations, download fixes.
|
| 12 |
+
tags:
|
| 13 |
+
- bibtex
|
| 14 |
+
- citations
|
| 15 |
+
- academic
|
| 16 |
+
- bibliography
|
| 17 |
---
|
| 18 |
|
| 19 |
+
# RefCheck 🔍
|
| 20 |
+
|
| 21 |
+
> **A Citation Hallucination Detector & Auto-Fixer**
|
| 22 |
+
> Validate and automatically correct your BibTeX bibliography against multiple academic databases.
|
| 23 |
+
|
| 24 |
+
[Python 3.9+](https://www.python.org/downloads/)
|
| 25 |
+
[License: MIT](LICENSE)
|
| 26 |
+
|
| 27 |
+
---
|
| 28 |
+
|
| 29 |
+
## Why RefCheck?
|
| 30 |
+
|
| 31 |
+
Academic papers often contain citation errors — wrong titles, incorrect authors, mismatched years, or even completely fabricated references (hallucinations from AI tools). **RefCheck** automatically:
|
| 32 |
+
|
| 33 |
+
- ✅ **Validates** each citation against 6 academic databases
|
| 34 |
+
- 🔧 **Auto-fixes** metadata mismatches (title, authors, year, DOI)
|
| 35 |
+
- 🗑️ **Removes** unverifiable/hallucinated entries
|
| 36 |
+
- 📊 **Reports** a clear verification summary
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
|
| 40 |
+
## Features
|
| 41 |
+
|
| 42 |
+
### Multi-Source Verification
|
| 43 |
+
|
| 44 |
+
RefCheck cross-references your citations against:
|
| 45 |
+
|
| 46 |
+
| Source | Lookup Methods |
|
| 47 |
+
|--------|----------------|
|
| 48 |
+
| **arXiv** | arXiv ID, Title search |
|
| 49 |
+
| **CrossRef** | DOI, Title search |
|
| 50 |
+
| **DBLP** | Title search |
|
| 51 |
+
| **Semantic Scholar** | DOI, Title search |
|
| 52 |
+
| **OpenAlex** | DOI, Title search |
|
| 53 |
+
| **Google Scholar** | Title search (disabled by default) |
|
| 54 |
+
|
| 55 |
+
### Two-Pass Workflow
|
| 56 |
+
|
| 57 |
+
1. **Pass 1 — Validate & Fix**: Checks each entry, auto-corrects metadata, removes invalid citations
|
| 58 |
+
2. **Pass 2 — Verify**: Re-validates the cleaned file to confirm all entries are correct
|
| 59 |
+
|
| 60 |
+
---
|
| 61 |
+
|
| 62 |
+
## Installation
|
| 63 |
+
|
| 64 |
+
```bash
|
| 65 |
+
# Clone the repository
|
| 66 |
+
git clone https://github.com/voidful/RefCheck.git
|
| 67 |
+
cd RefCheck
|
| 68 |
+
|
| 69 |
+
# Install dependencies
|
| 70 |
+
pip install -r requirements.txt
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
### Requirements
|
| 74 |
+
|
| 75 |
+
- Python 3.9+
|
| 76 |
+
- Dependencies: `bibtexparser`, `requests`, `beautifulsoup4`, `rich`, `Unidecode`, `lxml`
|
| 77 |
+
|
| 78 |
+
---
|
| 79 |
+
|
| 80 |
+
## Usage
|
| 81 |
+
|
| 82 |
+
### Hugging Face Space
|
| 83 |
+
|
| 84 |
+
This repository is ready to run as a Gradio Space. Create a Hugging Face Space with the Gradio SDK, push these files, and the Space will launch `app.py`.
|
| 85 |
+
|
| 86 |
+
The Space UI accepts a `.bib` upload and returns:
|
| 87 |
+
|
| 88 |
+
- a corrected BibTeX file
|
| 89 |
+
- a Markdown validation report
|
| 90 |
+
- a list of entries that still need manual review
|
| 91 |
+
|
| 92 |
+
### Basic Usage
|
| 93 |
+
|
| 94 |
+
```bash
|
| 95 |
+
# Validate and auto-fix a bib file
|
| 96 |
+
python main.py --bib references.bib
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
### Command-Line Options
|
| 100 |
+
|
| 101 |
+
| Option | Short | Description |
|
| 102 |
+
|--------|-------|-------------|
|
| 103 |
+
| `--bib` | `-b` | Path to your `.bib` file (required) |
|
| 104 |
+
| `--output` | `-o` | Output report path (optional) |
|
| 105 |
+
|
| 106 |
+
### Example
|
| 107 |
+
|
| 108 |
+
```bash
|
| 109 |
+
# Process your bibliography
|
| 110 |
+
python main.py --bib paper/references.bib
|
| 111 |
+
|
| 112 |
+
# With custom output path
|
| 113 |
+
python main.py --bib refs.bib --output validation_report.md
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
---
|
| 117 |
+
|
| 118 |
+
## How It Works
|
| 119 |
+
|
| 120 |
+
```
|
| 121 |
+
┌─────────────────┐
|
| 122 |
+
│ Load .bib file │
|
| 123 |
+
└────────┬────────┘
|
| 124 |
+
▼
|
| 125 |
+
┌─────────────────────────────────────────┐
|
| 126 |
+
│ For each entry: │
|
| 127 |
+
│ 1. Query academic databases │
|
| 128 |
+
│ 2. Compare metadata (title, author, yr)│
|
| 129 |
+
│ 3. Calculate confidence score │
|
| 130 |
+
└────────┬────────────────────────────────┘
|
| 131 |
+
▼
|
| 132 |
+
┌─────────────────────────────────────────┐
|
| 133 |
+
│ Decision: │
|
| 134 |
+
│ • confidence > 85% → Auto-fix metadata │
|
| 135 |
+
│ • Match found → Keep as-is │
|
| 136 |
+
│ • No match → Remove entry │
|
| 137 |
+
└────────┬────────────────────────────────┘
|
| 138 |
+
▼
|
| 139 |
+
┌─────────────────────────────────────────┐
|
| 140 |
+
│ Save updated .bib file │
|
| 141 |
+
│ Run verification pass │
|
| 142 |
+
└─────────────────────────────────────────┘
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
---
|
| 146 |
+
|
| 147 |
+
## Output
|
| 148 |
+
|
| 149 |
+
RefCheck displays real-time progress and a final summary:
|
| 150 |
+
|
| 151 |
+
```
|
| 152 |
+
📚 BibGuard - Auto-Fix & Verify
|
| 153 |
+
Target: references.bib
|
| 154 |
+
|
| 155 |
+
Found 42 entries. Running validation and auto-fix...
|
| 156 |
+
|
| 157 |
+
Validating & Fixing ━━━━━━━━━━━━━━━━━ 100% 42/42 ✓ 38 ⚠ 2 ✗ 2
|
| 158 |
+
|
| 159 |
+
✏️ Updates:
|
| 160 |
+
- Fixed 2 entries (metadata updated)
|
| 161 |
+
- Removed 2 invalid/hallucinated entries
|
| 162 |
+
✓ File saved.
|
| 163 |
+
|
| 164 |
+
🔄 Double checking (Re-validation)...
|
| 165 |
+
|
| 166 |
+
Verifying ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 40/40 ✓ 40
|
| 167 |
+
|
| 168 |
+
==================================================
|
| 169 |
+
📊 Final Status
|
| 170 |
+
==================================================
|
| 171 |
+
Total: 40
|
| 172 |
+
✓ Verified: 40
|
| 173 |
+
⚠ Issues: 0
|
| 174 |
+
✗ Not found: 0
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
### Status Meanings
|
| 178 |
+
|
| 179 |
+
| Symbol | Meaning |
|
| 180 |
+
|--------|---------|
|
| 181 |
+
| ✅ Verified | Entry matches a known publication |
|
| 182 |
+
| ⚠️ Fixed | Metadata was auto-corrected |
|
| 183 |
+
| ❌ Removed | Entry could not be verified (likely hallucination) |
|
| 184 |
+
|
| 185 |
+
---
|
| 186 |
+
|
| 187 |
+
## Project Structure
|
| 188 |
+
|
| 189 |
+
```
|
| 190 |
+
RefCheck/
|
| 191 |
+
├── main.py # Entry point & workflow orchestration
|
| 192 |
+
├── requirements.txt # Python dependencies
|
| 193 |
+
├── README.md
|
| 194 |
+
└── src/
|
| 195 |
+
├── fetcher.py # API clients for academic databases
|
| 196 |
+
├── comparator.py # Metadata comparison & scoring
|
| 197 |
+
├── parser.py # BibTeX parsing & saving
|
| 198 |
+
└── utils.py # Progress display & text utilities
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
---
|
| 202 |
+
|
| 203 |
+
## License
|
| 204 |
+
|
| 205 |
+
MIT License — see [LICENSE](LICENSE) for details.
|
| 206 |
+
|
| 207 |
+
---
|
| 208 |
+
|
| 209 |
+
## Contributing
|
| 210 |
+
|
| 211 |
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
| 212 |
+
|
| 213 |
+
---
|
| 214 |
+
|
| 215 |
+
## Acknowledgments
|
| 216 |
+
|
| 217 |
+
Built with:
|
| 218 |
+
- [bibtexparser](https://github.com/sciunto-org/python-bibtexparser) for BibTeX handling
|
| 219 |
+
- [Rich](https://github.com/Textualize/rich) for beautiful terminal output
|
| 220 |
+
- APIs from arXiv, CrossRef, DBLP, Semantic Scholar, and OpenAlex
|
app.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
|
| 8 |
+
from src.space_service import RefCheckOptions, run_refcheck_file
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _uploaded_path(uploaded: Any) -> str | None:
|
| 12 |
+
if not uploaded:
|
| 13 |
+
return None
|
| 14 |
+
if isinstance(uploaded, str):
|
| 15 |
+
return uploaded
|
| 16 |
+
if isinstance(uploaded, dict):
|
| 17 |
+
return uploaded.get("path") or uploaded.get("name")
|
| 18 |
+
name = getattr(uploaded, "name", None)
|
| 19 |
+
if name:
|
| 20 |
+
return str(name)
|
| 21 |
+
return None
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def process_bib(
    uploaded: Any,
    remove_unverified: bool,
    enable_google_scholar: bool,
    max_workers: int,
) -> tuple[str, str | None, str | None]:
    """Run RefCheck on an uploaded BibTeX file for the Gradio callback.

    Args:
        uploaded: Upload value from the file component.
        remove_unverified: Forwarded to ``RefCheckOptions``.
        enable_google_scholar: Forwarded to ``RefCheckOptions``.
        max_workers: Forwarded to ``RefCheckOptions`` after ``int()`` coercion.

    Returns:
        ``(report_markdown, fixed_bib_path, report_path)``. When no file was
        uploaded or processing raised, the two paths are ``None`` and the
        markdown carries the explanation.
    """
    bib_location = _uploaded_path(uploaded)
    if not bib_location:
        return "## RefCheck Report\n\nNo BibTeX file was uploaded.", None, None

    # UI boundary: any failure is rendered into the report instead of raised.
    try:
        run_options = RefCheckOptions(
            remove_unverified=remove_unverified,
            enable_google_scholar=enable_google_scholar,
            max_workers=int(max_workers),
        )
        outcome = run_refcheck_file(Path(bib_location), run_options)
        return outcome.report_markdown, outcome.fixed_bib_path, outcome.report_path
    except Exception as exc:
        return f"## RefCheck Report\n\nProcessing failed: `{exc}`", None, None
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# Space layout: inputs in the narrow left column, results in the wide right one.
with gr.Blocks(title="RefCheck") as demo:
    gr.Markdown("# RefCheck")

    with gr.Row():
        # Left column: upload plus run options.
        with gr.Column(scale=1):
            bib_file = gr.File(label="BibTeX file", file_types=[".bib", ".txt"], type="filepath")
            remove_unverified = gr.Checkbox(label="Remove unverifiable entries", value=True)
            enable_google_scholar = gr.Checkbox(label="Google Scholar fallback", value=False)
            max_workers = gr.Slider(label="Parallel lookups", minimum=1, maximum=8, step=1, value=4)
            run_button = gr.Button("Run RefCheck", variant="primary")

        # Right column: rendered report and the two downloadable files.
        with gr.Column(scale=2):
            report = gr.Markdown(label="Report")
            fixed_bib = gr.File(label="Fixed BibTeX")
            report_file = gr.File(label="Markdown report")

    # Input/output order must match process_bib's parameters and return tuple.
    run_button.click(
        fn=process_bib,
        inputs=[bib_file, remove_unverified, enable_google_scholar, max_workers],
        outputs=[report, fixed_bib, report_file],
        api_name="refcheck",
    )


if __name__ == "__main__":
    queued_demo = demo.queue(default_concurrency_limit=2)
    queued_demo.launch()
|
data/abbr.tsv
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pattern (regex, case-insensitive) Abbreviation
|
| 2 |
+
# Speech & Audio
|
| 3 |
+
.*Interspeech.* Interspeech
|
| 4 |
+
.*IEEE.*International Conference.*Acoustics.*Speech.*Signal Processing.* ICASSP
|
| 5 |
+
.*IEEE.*Automatic Speech Recognition.*Understanding.* ASRU
|
| 6 |
+
.*IEEE Spoken Language Technology.* SLT
|
| 7 |
+
.*IEEE/ACM Transactions on Audio.*Speech.*Language.* IEEE/ACM Trans. Audio Speech Lang. Process.
|
| 8 |
+
|
| 9 |
+
# ML
|
| 10 |
+
.*International Conference on Machine Learning.* ICML
|
| 11 |
+
.*Advances in Neural Information Processing.* NeurIPS
|
| 12 |
+
.*Conference on Neural Information Processing.* NeurIPS
|
| 13 |
+
.*International Conference on Learning Representations.* ICLR
|
| 14 |
+
.*AAAI Conference on Artificial Intelligence.* AAAI
|
| 15 |
+
.*International Joint Conference on Artificial Intelligence.* IJCAI
|
| 16 |
+
.*IEEE.*Conference on Computer Vision and Pattern Recognition.* CVPR
|
| 17 |
+
.*European Conference on Computer Vision.* ECCV
|
| 18 |
+
.*IEEE International Conference on Computer Vision[^a].* ICCV
|
| 19 |
+
|
| 20 |
+
# NLP
|
| 21 |
+
.*Annual Meeting.*Association for Computational Linguistics.* ACL
|
| 22 |
+
.*Empirical Methods in Natural Language Processing.* EMNLP
|
| 23 |
+
.*North American Chapter.*Association for Computational Linguistics.* NAACL
|
| 24 |
+
.*European Chapter.*Association for Computational Linguistics.* EACL
|
| 25 |
+
.*Findings.*EMNLP.* Findings of EMNLP
|
| 26 |
+
.*Findings.*ACL.* Findings of ACL
|
| 27 |
+
.*International Conference on Computational Linguistics.* COLING
|
| 28 |
+
.*Conference.*Machine Translation.* WMT
|
| 29 |
+
.*Language Resources and Evaluation.* LREC
|
| 30 |
+
|
| 31 |
+
# IR / Web / Data
|
| 32 |
+
.*ACM.*Information Retrieval.* SIGIR
|
| 33 |
+
.*Knowledge Discovery.*Data Mining.* KDD
|
| 34 |
+
.*World Wide Web.* WWW
|
| 35 |
+
.*Web Search and Data Mining.* WSDM
|
data/index_shards/index_00.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2522c57f135bb5c3d581c824cb8538e9f84b786a01a3d0535b52457ef91b227
|
| 3 |
+
size 26214218
|
data/index_shards/index_01.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3bd3dbc8999aed2796171312da9521a84548b105f32ecb7621e09c97a8c298c7
|
| 3 |
+
size 26214151
|
data/index_shards/index_02.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:519a53fc56bbeb76feaa0be489bd4cc3727a5261b270846ffd6d1a97d42551b9
|
| 3 |
+
size 26214343
|
data/index_shards/index_03.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c32e3cc3dd2809c115e33f37030166e2b66eeb0dab7dd0b81010647f799ec93
|
| 3 |
+
size 25401874
|
main.py
ADDED
|
@@ -0,0 +1,561 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
BibGuard - Citation Hallucination Detector
|
| 4 |
+
|
| 5 |
+
Validates bibliography entries against multiple academic data sources:
|
| 6 |
+
arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python main.py --bib references.bib
|
| 10 |
+
python main.py --bib references.bib --output report.md
|
| 11 |
+
"""
|
| 12 |
+
import argparse
|
| 13 |
+
import sys
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
from dataclasses import dataclass, field
|
| 17 |
+
from typing import List, Optional
|
| 18 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 19 |
+
import threading
|
| 20 |
+
import copy
|
| 21 |
+
|
| 22 |
+
from src.parser import BibParser
|
| 23 |
+
from src.fetcher import (
|
| 24 |
+
ArxivFetcher, CrossRefFetcher, DBLPFetcher,
|
| 25 |
+
SemanticScholarFetcher, OpenAlexFetcher, ScholarFetcher
|
| 26 |
+
)
|
| 27 |
+
from src.comparator import MetadataComparator, EntryReport, resolve_year, CURRENT_YEAR
|
| 28 |
+
from src.sanitizer import BibSanitizer
|
| 29 |
+
from src.local_db import LocalConferenceDB
|
| 30 |
+
from src.ui import BibUI
|
| 31 |
+
from src.utils import ProgressDisplay, TextNormalizer
|
| 32 |
+
@dataclass
class WorkflowStep:
    """A single named stage of the validation workflow."""

    # Identifier of the step (e.g. "arxiv_id").
    name: str
    # Disabled steps are filtered out by WorkflowConfig.get_enabled_steps().
    enabled: bool = True
    # Human-readable label for the step.
    display_name: str = ""
    # Sort key used by WorkflowConfig.get_enabled_steps() (ascending).
    priority: int = 0
|
| 38 |
+
|
| 39 |
+
@dataclass
class WorkflowConfig:
    """Container for the workflow's steps."""

    # All configured steps, enabled or not, in declaration order.
    steps: List[WorkflowStep] = field(default_factory=list)

    def get_enabled_steps(self) -> List[WorkflowStep]:
        """Return the enabled steps ordered by ascending ``priority``.

        The sort is stable, so steps with equal priority keep their
        declaration order.
        """
        active = [step for step in self.steps if step.enabled]
        active.sort(key=lambda step: step.priority)
        return active
|
| 44 |
+
|
| 45 |
+
def get_default_workflow() -> WorkflowConfig:
    """Build the default lookup workflow.

    Steps receive priorities 0-7 in the order listed below; all are enabled
    except Google Scholar.
    """
    # (step name, enabled, display name) -- list position becomes the priority.
    catalogue = (
        ("arxiv_id", True, "arXiv by ID"),
        ("crossref_doi", True, "CrossRef by DOI"),
        ("semantic_scholar", True, "Semantic Scholar"),
        ("dblp", True, "DBLP"),
        ("openalex", True, "OpenAlex"),
        ("arxiv_title", True, "arXiv by Title"),
        ("crossref_title", True, "CrossRef by Title"),
        ("google_scholar", False, "Google Scholar"),
    )
    default_steps = [
        WorkflowStep(step_name, is_enabled, label, rank)
        for rank, (step_name, is_enabled, label) in enumerate(catalogue)
    ]
    return WorkflowConfig(steps=default_steps)
|
| 56 |
+
|
| 57 |
+
def main():
    """Command-line entry point: parse arguments and run the fix/verify workflow.

    Exits with status 1 when the given .bib file does not exist and with
    status 130 when interrupted from the keyboard.
    """
    cli = argparse.ArgumentParser(
        description="BibGuard: Citation Fixer & Validator",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument("--bib", "-b", required=True, help="Path to .bib file")
    # NOTE(review): --output is parsed but never read in this function.
    cli.add_argument("--output", "-o", help="Output report path (optional)")
    options = cli.parse_args()

    target = Path(options.bib)
    if not target.exists():
        print(f"Error: Bib file not found: {options.bib}")
        sys.exit(1)

    default_workflow = get_default_workflow()

    try:
        run_fix_and_verify(target, default_workflow)
    except KeyboardInterrupt:
        print("\nCancelled")
        sys.exit(130)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def run_fix_and_verify(bib_path: Path, workflow):
    """Validate a .bib file, apply fixes in place, then re-validate it.

    Stages, in order:
      0.    Offline sanitization (saved immediately if it changed anything)
            and duplicate-title reporting.
      0.5   Local conference DB lookup, when the DB loads. Matched entries are
            corrected with ``apply_local_fix`` and skip the API stage.
      1.    Concurrent ``validate_entry`` calls for the remaining entries.
      2.    Categorisation into OK / auto-fix / manual review / remove.
      3.    Applying fixes, interactive manual review (reads ``input()``),
            and saving the surviving entries.
      Pass 2 re-parses the saved file and validates every entry again.

    Args:
        bib_path: Path of the BibTeX file; it is overwritten in place.
        workflow: Workflow configuration forwarded to ``validate_entry``.

    Returns:
        None. Results are reported through ``BibUI`` and ``print``.
    """
    progress = ProgressDisplay()
    bib_parser = BibParser()
    ui = BibUI()

    print("📚 BibGuard - Auto-Fix & Verify")
    print(f" Target: {bib_path}\n")

    # --- Pass 1: Validate & Fix ---
    entries = bib_parser.parse_file(str(bib_path))
    if not entries:
        print("No entries found")
        return

    print(f"Found {len(entries)} entries. Running validation and auto-fix...\n")

    # Initialize components
    fetchers = {
        'arxiv': ArxivFetcher(),
        'crossref': CrossRefFetcher(),
        'scholar': ScholarFetcher(),
        'semantic': SemanticScholarFetcher(),
        'openalex': OpenAlexFetcher(),
        'dblp': DBLPFetcher(),
    }
    comparator = MetadataComparator()
    sanitizer = BibSanitizer()

    fixed_count = 0
    fixed_details = {}  # Key: entry_key, Value: list of change descriptions
    removed_details = []  # List of (entry, reason)
    manual_review_queue = []  # List of (entry, best_result, candidates)

    # --- Phase 0: Sanitize (Offline Checks) ---
    print("🧹 Running formatting sanity checks...")
    sanitize_fixes = sanitizer.sanitize_all(entries)
    ui.show_sanitize_report(sanitize_fixes)

    # If sanitization made changes, save immediately so Phase 1 works on clean data
    if sanitize_fixes:
        bib_parser.save_entries(str(bib_path), entries)
        # Merge sanitize fixes into fixed_details for the final report
        for key, fixes in sanitize_fixes.items():
            fixed_details.setdefault(key, []).extend(fix.description for fix in fixes)
            fixed_count += 1

    # Duplicate detection (report only; nothing is removed here)
    dupes = sanitizer.find_duplicates(entries)
    if dupes:
        print(f"\n⚠ Found {len(dupes)} duplicate title(s):")
        for keys in dupes.values():
            print(f" {' / '.join(keys)}")
        print()

    # --- Phase 0.5: Local DB Lookup ---
    local_db = LocalConferenceDB()
    local_db_loaded = local_db.load()

    api_needed_entries = entries  # Default: all entries need API
    # Entries matched by the local DB bypass the API phase. They are tracked
    # here so the Phase 3 save does not drop them from the file.
    locally_verified = []
    if local_db_loaded:
        api_needed_entries = []
        local_matched_count = 0
        for entry in entries:
            official = local_db.lookup(entry.title)
            if official:
                locally_verified.append(entry)
                changes = apply_local_fix(entry, official)
                if changes:
                    local_matched_count += 1
                    fixed_details.setdefault(entry.key, []).extend(changes)
                    fixed_count += 1
            else:
                api_needed_entries.append(entry)

        if local_matched_count > 0:
            print(f" 📚 Local DB matched: {local_matched_count}, API needed: {len(api_needed_entries)}")
            bib_parser.save_entries(str(bib_path), entries)

    # --- Phase 1: Analysis (API Fetch) ---
    analysis_results = []

    with progress.progress_context(len(api_needed_entries), "Analyzing Entries") as prog:
        # max(1, ...) because ThreadPoolExecutor rejects max_workers <= 0.
        with ThreadPoolExecutor(max_workers=min(10, max(1, len(api_needed_entries)))) as executor:
            futures = {
                executor.submit(validate_entry, e, workflow, fetchers, comparator): e
                for e in api_needed_entries
            }

            for future in as_completed(futures):
                entry = futures[future]
                try:
                    best_result, candidates = future.result()
                    analysis_results.append((entry, best_result, candidates))
                    prog.update(entry.key, "Analyzed", 1)
                except Exception:
                    prog.mark_error()
                    prog.update(entry.key, "Failed", 1)
                    # Keep the entry even if the fetch failed
                    analysis_results.append((entry, None, []))

    # --- Phase 2: Categorize results ---
    to_fix = []
    to_review = []
    to_remove = []
    ok_entries = []

    for entry, best_result, candidates in analysis_results:
        if not best_result:
            # No result (e.g. the lookup raised): keep the entry untouched.
            ok_entries.append(entry)
            continue

        # Entries flagged for forced API lookup (e.g., future year) always go to to_fix
        if getattr(entry, '_force_api_lookup', False) and best_result.fetched_data:
            to_fix.append((entry, best_result, candidates))
        elif best_result.confidence > 0.85 and best_result.fetched_data:
            to_fix.append((entry, best_result, candidates))
        elif best_result.is_match:
            ok_entries.append(entry)
        elif candidates:
            to_review.append((entry, best_result, candidates))
        else:
            to_remove.append(entry)

    # Visualize Analysis Report
    ui.show_analysis_report(ok_entries, to_fix, to_review, to_remove)

    if not (to_fix or to_review or to_remove):
        return

    # --- Phase 3: Apply Fixes ---
    print("\n🚀 Applying fixes...")

    # Start from everything that is kept as-is: local-DB matches plus OK entries.
    updated_entries = list(locally_verified)
    updated_entries.extend(ok_entries)

    # Process Fixes
    for entry, best_result, candidates in to_fix:
        changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates)
        if changes:
            fixed_count += 1
            # Extend rather than assign so earlier sanitize notes for the same
            # key are not lost from the final report.
            fixed_details.setdefault(entry.key, []).extend(changes)
        updated_entries.append(entry)

    # Process Removals (these entries are deliberately NOT added to updated_entries)
    for entry in to_remove:
        removed_details.append((entry, "No matching metadata found in any source"))

    # Process Reviews (Add to queue)
    for item in to_review:
        manual_review_queue.append(item)
        updated_entries.append(item[0])  # Add tentatively, filter later if removed

    # --- Interactive Manual Review ---
    if manual_review_queue:
        print(f"\n\n🔍 Manual Review Required for {len(manual_review_queue)} entries:")

        # Sort by key for consistent order
        manual_review_queue.sort(key=lambda x: x[0].key)

        entries_to_remove = set()

        for entry, best_res, candidates in manual_review_queue:
            ui.show_manual_review(entry, best_res, candidates, apply_fix)

            while True:
                choice = input(f"\nSelect [1-{len(candidates)}], (s)kip, (r)emove, or (q)uit: ").strip().lower()

                if choice == 'q':
                    print("Exiting manual review.")
                    # Keep remaining in queue as is (already in updated_entries)
                    break
                elif choice == 's':
                    print("Skipped.")
                    break
                elif choice == 'r':
                    print("Marked for removal.")
                    entries_to_remove.add(entry.key)
                    removed_details.append((entry, "Removed by user during manual review"))
                    break
                elif choice.isdigit():
                    idx = int(choice) - 1
                    if 0 <= idx < len(candidates):
                        selected = candidates[idx]
                        changes = apply_fix(entry, selected.fetched_data)
                        if changes:
                            fixed_count += 1
                            fixed_details.setdefault(entry.key, []).extend(changes)
                            print(f"Applied: {', '.join(changes)}")
                        else:
                            print("No changes needed for selected source.")
                        break
                    else:
                        print("Invalid selection.")
                else:
                    print("Invalid input.")

            # 'q' leaves the whole review, not just the current entry.
            if choice == 'q':
                break

        # Filter out removed entries
        if entries_to_remove:
            updated_entries = [e for e in updated_entries if e.key not in entries_to_remove]

    # Overwrite file if changes made (beyond Phase 0 sanitization)
    has_phase1_changes = any(k not in sanitize_fixes for k in fixed_details) or removed_details
    if has_phase1_changes or fixed_count > len(sanitize_fixes):
        bib_parser.save_entries(str(bib_path), updated_entries)

    # --- Pass 2: Double Check ---
    print("\n🔄 Double checking (Re-validation)...")

    # The file may now be empty (every entry removed); treat that as "no entries".
    entries = bib_parser.parse_file(str(bib_path)) or []
    reports = []

    with progress.progress_context(len(entries), "Verifying") as prog:
        # max(1, ...) as in Pass 1: ThreadPoolExecutor raises ValueError for 0 workers.
        with ThreadPoolExecutor(max_workers=min(10, max(1, len(entries)))) as executor:
            # validate_entry returns a (best_result, candidates) tuple
            futures = {
                executor.submit(validate_entry, e, workflow, fetchers, comparator): e
                for e in entries
            }

            for future in as_completed(futures):
                entry = futures[future]
                try:
                    best_result, _ = future.result()  # Ignore candidates in verify pass
                    reports.append(EntryReport(entry=entry, comparison=best_result))

                    if best_result.is_match:
                        prog.mark_success()
                    else:
                        prog.mark_error()
                    prog.update(entry.key, "Verified", 1)
                except Exception:
                    prog.mark_error()
                    prog.update(entry.key, "Failed", 1)

    # Summary
    total = len(entries)
    verified = sum(1 for r in reports if r.comparison and r.comparison.is_match)
    issues = sum(1 for r in reports if r.comparison and r.comparison.has_issues)
    not_found = sum(
        1 for r in reports
        if r.comparison and not r.comparison.is_match and not r.comparison.has_issues
    )

    # Visual Final Status
    ui.show_final_report(total, verified, issues, not_found, reports, fixed_count, fixed_details, removed_details)
    print("")
|
| 337 |
+
|
| 338 |
+
def apply_local_fix(entry, official) -> list:
    """
    Apply fixes from the local conference DB (treated as ground truth).

    Mutates ``entry`` in place. The year, entry type, booktitle and DOI may
    be updated; authors and title are deliberately left alone, since DBLP
    data for those may have different formatting conventions.

    Args:
        entry: Bib entry to fix. Reads/writes ``year``, ``entry_type``,
            ``journal``, ``booktitle``, ``doi`` and the ``raw_entry`` mapping.
        official: Matching local-DB record; ``year``, ``booktitle`` and
            ``doi`` are read.

    Returns:
        Human-readable descriptions of the changes applied (empty if none).
    """
    changes = []

    # Year: conference year is ground truth, but only a plausible value is
    # accepted (a non-numeric year maps to 0 and fails the range check).
    if official.year and official.year != entry.year:
        year_int = int(official.year) if official.year.isdigit() else 0
        if 1950 <= year_int <= CURRENT_YEAR:
            changes.append(f"Year: {entry.year} -> {official.year} [local_db]")
            entry.year = official.year

    # Entry type upgrade: misc/article -> inproceedings if booktitle exists
    if official.booktitle and entry.entry_type.lower() in ('misc', 'article'):
        old_type = entry.entry_type
        entry.entry_type = 'inproceedings'
        if 'ENTRYTYPE' in entry.raw_entry:
            entry.raw_entry['ENTRYTYPE'] = 'inproceedings'
        # Clear journal if it was arXiv
        if entry.journal and 'arxiv' in entry.journal.lower():
            entry.journal = ""
            entry.raw_entry.pop('journal', None)
        changes.append(f"Type: @{old_type} → @inproceedings [local_db]")

    # Booktitle: adopt from DB only when the entry has none
    if official.booktitle and not entry.booktitle:
        entry.booktitle = official.booktitle
        entry.raw_entry['booktitle'] = official.booktitle
        # Show an ellipsis only when the title was actually shortened.
        shown = official.booktitle
        if len(shown) > 50:
            shown = shown[:50] + '...'
        changes.append(f"Booktitle: [Added] {shown} [local_db]")

    # DOI: adopt if missing
    if official.doi and not entry.doi:
        entry.doi = official.doi
        entry.raw_entry['doi'] = official.doi
        changes.append(f"DOI: [Added] {official.doi} [local_db]")

    return changes
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def apply_fix(entry, data, all_candidates=None) -> list:
    """Update entry metadata in place from fetched data.

    Title, year, authors and DOI are considered, in that order.

    Args:
        entry: Bib entry to mutate (``title``, ``year``, ``author``, ``doi``).
        data: Fetched metadata for the best match. ``title`` is required;
            ``year``, ``authors`` and ``doi`` are optional.
        all_candidates: Optional list of comparison results from every source.
            When given, the year is chosen via ``resolve_year`` and authors
            are left untouched if any candidate flags an initial conflict.

    Returns:
        List of human-readable change/warning strings (empty if nothing
        changed).
    """
    changes = []

    # Helper to clean string
    def clean(s):
        return str(s).strip() if s else ""

    # Title
    new_title = clean(data.title)
    if new_title and new_title.lower() != entry.title.lower():
        changes.append(f"Title: {entry.title} -> {new_title}")
        entry.title = new_title

    # Year: Use resolve_year() if we have multiple candidates
    if all_candidates:
        best_year, year_src = resolve_year(all_candidates, bib_year=entry.year)
        if best_year and best_year != entry.year:
            if int(best_year) > CURRENT_YEAR:
                changes.append(f"⚠ Skip suspicious future year {best_year} from {year_src}")
            else:
                changes.append(f"Year: {entry.year} -> {best_year} [{year_src}]")
                entry.year = best_year
    else:
        # Single candidate fallback
        new_year = clean(getattr(data, 'year', ''))
        if new_year and new_year != entry.year:
            if new_year.isdigit() and int(new_year) > CURRENT_YEAR:
                changes.append(f"⚠ Skip suspicious future year {new_year}")
            else:
                changes.append(f"Year: {entry.year} -> {new_year}")
                entry.year = new_year

    # Author: Smart Merge Strategy
    # Check for author initial conflict first
    has_initial_conflict = any(
        getattr(cand, 'author_initial_conflict', False)
        for cand in (all_candidates or [])
    )

    if has_initial_conflict:
        # Don't overwrite authors when initials conflict
        changes.append("⚠ Author initial conflict detected — preserving bib authors")
    else:
        # Normal author merge logic
        current_authors_raw = TextNormalizer.parse_author_list(entry.author)
        current_authors_norm = [TextNormalizer.normalize_author_name(a) for a in current_authors_raw]

        # A source may expose no authors at all (missing attribute, None, or
        # an empty value); treat all of those as "nothing to merge".
        new_authors_list = getattr(data, 'authors', None) or []
        if isinstance(new_authors_list, str):
            new_authors_list = TextNormalizer.parse_author_list(new_authors_list)

        # Strip DBLP disambiguation IDs from new authors
        new_authors_list = [TextNormalizer.strip_dblp_disambiguation_id(str(a)) for a in new_authors_list]

        # Also check if the EXISTING bib authors have DBLP disambiguation IDs baked in
        for raw_auth in current_authors_raw:
            if TextNormalizer.has_dblp_disambiguation_id(raw_auth.strip()):
                changes.append(f"⚠ DBLP disambiguation ID detected in author: '{raw_auth.strip()}'")

        final_authors = []
        for new_auth in new_authors_list:
            new_auth_str = str(new_auth).strip()
            if not new_auth_str:
                # Blank names carry no information; never emit them.
                continue
            new_auth_norm = TextNormalizer.normalize_author_name(new_auth_str)

            # Prefer the bib file's own spelling of an author we already have
            # (first existing author whose normalized form matches).
            existing = next(
                (
                    raw for raw, norm in zip(current_authors_raw, current_authors_norm)
                    if norm == new_auth_norm
                ),
                None,
            )
            final_authors.append(existing.strip() if existing is not None else new_auth_str)

        # Only touch the author field when the source actually supplied
        # authors; otherwise an empty fetched list would erase the bib authors.
        if final_authors:
            # Reconstruct the string
            new_author_str = " and ".join(final_authors)

            # Check if the result is effectively different from the original full string
            def simple_norm(s):
                return s.lower().replace(" ", "").strip()

            if simple_norm(new_author_str) != simple_norm(entry.author):
                old_auth = (entry.author[:50] + '...') if len(entry.author) > 50 else entry.author
                new_auth_disp = (new_author_str[:50] + '...') if len(new_author_str) > 50 else new_author_str
                changes.append(f"Author: {old_auth} -> {new_auth_disp}")
                entry.author = new_author_str

    # Optional fields (doi, journal, etc.)
    if hasattr(data, 'doi') and data.doi and not entry.doi:
        changes.append(f"DOI: [Added] {data.doi}")
        entry.doi = data.doi

    return changes
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
def validate_entry(entry, workflow, fetchers, comparator):
    """Check one bib entry against every enabled workflow step.

    Each step queries one source from ``fetchers`` and, when metadata comes
    back, compares it with the entry through ``comparator``.

    Returns:
        ``(best_result, all_results)`` where ``best_result`` has the highest
        ``confidence``. If no source produced a result, the comparator's
        "unable" result is returned together with an empty list.
    """
    from src.utils import TextNormalizer

    def doi_then_title(fetcher):
        # Prefer an exact DOI lookup; fall back to a title search.
        found = fetcher.fetch_by_doi(entry.doi) if entry.doi else None
        return found if found else fetcher.search_by_title(entry.title)

    collected = []

    for step in workflow.get_enabled_steps():
        outcome = None

        if step.name == "arxiv_id" and entry.has_arxiv:
            fetched = fetchers['arxiv'].fetch_by_id(entry.arxiv_id)
            if fetched:
                outcome = comparator.compare(entry, fetched, "arxiv")

        elif step.name == "crossref_doi" and entry.doi:
            fetched = fetchers['crossref'].search_by_doi(entry.doi)
            if fetched:
                # DOI cross-validation: check if the DOI actually resolves to this paper
                from src.sanitizer import BibSanitizer
                mismatch_fixes = BibSanitizer().check_doi_title_match(entry, fetched)
                # A non-empty fix list means the DOI points to a different
                # work, so this source contributes no result.
                if not mismatch_fixes:
                    outcome = comparator.compare(entry, fetched, "crossref")

        elif step.name == "semantic_scholar" and entry.title:
            fetched = doi_then_title(fetchers['semantic'])
            if fetched:
                outcome = comparator.compare(entry, fetched, "semantic_scholar")

        elif step.name == "dblp" and entry.title:
            fetched = fetchers['dblp'].search_by_title(entry.title)
            if fetched:
                outcome = comparator.compare(entry, fetched, "dblp")

        elif step.name == "openalex" and entry.title:
            fetched = doi_then_title(fetchers['openalex'])
            if fetched:
                outcome = comparator.compare(entry, fetched, "openalex")

        elif step.name == "arxiv_title" and entry.title:
            hits = fetchers['arxiv'].search_by_title(entry.title)
            if hits:
                wanted = TextNormalizer.normalize_for_comparison(entry.title)
                scored = [
                    (
                        TextNormalizer.similarity_ratio(
                            wanted, TextNormalizer.normalize_for_comparison(hit.title)
                        ),
                        hit,
                    )
                    for hit in hits
                ]
                # max() keeps the first of equally scored hits.
                top_score, top_hit = max(scored, key=lambda pair: pair[0])
                if top_hit and top_score > 0.5:
                    outcome = comparator.compare(entry, top_hit, "arxiv")

        elif step.name == "crossref_title" and entry.title:
            fetched = fetchers['crossref'].search_by_title(entry.title)
            if fetched:
                outcome = comparator.compare(entry, fetched, "crossref")

        elif step.name == "google_scholar" and entry.title:
            fetched = fetchers['scholar'].search_by_title(entry.title)
            if fetched:
                outcome = comparator.compare(entry, fetched, "scholar")

        if outcome:
            collected.append(outcome)

    if not collected:
        # No results
        return comparator.create_unable_result(entry, "Not found in any data source"), []

    return max(collected, key=lambda r: r.confidence), collected
|
| 555 |
+
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
|
| 560 |
+
# Run the command-line entry point only when this file is executed directly.
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
bibtexparser>=1.4.0
|
| 2 |
+
requests>=2.31.0
|
| 3 |
+
beautifulsoup4>=4.12.0
|
| 4 |
+
rich>=13.7.0
|
| 5 |
+
Unidecode>=1.3.0
|
| 6 |
+
lxml>=5.0.0
|
| 7 |
+
gradio>=4.44.0
|
scripts/build_index.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Build a title-based index from downloaded DBLP bib files.
|
| 4 |
+
|
| 5 |
+
Reads all .bib files in data/raw/ and produces sharded JSON files
|
| 6 |
+
under data/index_shards/ (~25MB each) for GitHub-friendly storage.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python scripts/build_index.py
|
| 10 |
+
"""
|
| 11 |
+
import json
|
| 12 |
+
import os
|
| 13 |
+
import re
|
| 14 |
+
import shutil
|
| 15 |
+
import sys
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
import bibtexparser
|
| 20 |
+
from bibtexparser.bparser import BibTexParser
|
| 21 |
+
from bibtexparser.customization import convert_to_unicode
|
| 22 |
+
except ImportError:
|
| 23 |
+
print("Error: bibtexparser required. Install: pip install bibtexparser")
|
| 24 |
+
sys.exit(1)
|
| 25 |
+
|
| 26 |
+
MAX_SHARD_MB = 25 # Target shard size in MB
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def normalize_title(title: str) -> str:
    """Return the canonical lookup key for a title.

    One level of BibTeX braces is unwrapped, the text is lower-cased, every
    character that is neither a word character nor whitespace becomes a
    space, and whitespace runs are collapsed to single spaces.
    """
    unbraced = re.sub(r'\{([^}]*)\}', r'\1', title)
    lowered = unbraced.lower()
    depunctuated = re.sub(r'[^\w\s]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', depunctuated)
    return collapsed.strip()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def write_shards(index: dict, shard_dir: Path, max_shard_mb=None) -> int:
    """Split ``index`` into size-bounded JSON shard files.

    Any existing ``shard_dir`` is removed and recreated, then entries are
    written in iteration order to ``index_00.json``, ``index_01.json``, ...
    A new shard is started whenever adding the next entry would push the
    current one past the size limit.

    Args:
        index: Mapping of normalized title to its JSON-serializable record.
        shard_dir: Output directory (wiped first if it already exists).
        max_shard_mb: Target shard size in MB; defaults to ``MAX_SHARD_MB``.

    Returns:
        Number of shard files written.
    """
    if max_shard_mb is None:
        max_shard_mb = MAX_SHARD_MB
    max_bytes = max_shard_mb * 1024 * 1024

    if shard_dir.exists():
        shutil.rmtree(shard_dir)
    shard_dir.mkdir(parents=True)

    def flush(number, items):
        # Write one shard and report its entry count and on-disk size.
        path = shard_dir / f"index_{number:02d}.json"
        path.write_text(
            json.dumps(dict(items), ensure_ascii=False),
            encoding="utf-8"
        )
        mb = path.stat().st_size / 1024 / 1024
        print(f" ✓ index_{number:02d}.json: {len(items):,} entries ({mb:.1f} MB)")

    shard_num = 0
    shard_items = []
    shard_size = 0

    for key, val in index.items():
        # Size is estimated from the entry serialized on its own.
        entry_size = len(json.dumps({key: val}, ensure_ascii=False).encode('utf-8'))

        # Never flush an empty shard: an oversized single entry still gets
        # a shard of its own.
        if shard_items and shard_size + entry_size > max_bytes:
            flush(shard_num, shard_items)
            shard_num += 1
            shard_items = []
            shard_size = 0

        shard_items.append((key, val))
        shard_size += entry_size

    # Last shard
    if shard_items:
        flush(shard_num, shard_items)
        shard_num += 1

    return shard_num
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def main():
    """Build the sharded title index from every ``.bib`` file in data/raw.

    Files that fail to parse are skipped and counted. When several entries
    normalize to the same title key, the first one encountered (files are
    processed in sorted order) is kept.
    """
    data_dir = Path(__file__).resolve().parent.parent / "data"
    raw_dir = data_dir / "raw"
    shard_dir = data_dir / "index_shards"

    if not raw_dir.exists():
        print(f"Error: {raw_dir} not found. Run: python scripts/update_db.py first")
        sys.exit(1)

    bib_files = sorted(raw_dir.glob("*.bib"))
    if not bib_files:
        print(f"No .bib files found in {raw_dir}")
        sys.exit(1)

    print(f"📦 Building index from {len(bib_files)} bib files...")

    # Fields copied verbatim from the parsed entry (empty string if absent).
    copied_fields = ("author", "year", "booktitle", "journal", "doi", "url", "pages", "volume")

    index = {}
    skipped_files = 0

    for bib_file in bib_files:
        try:
            parser = BibTexParser(common_strings=True)
            parser.customization = convert_to_unicode
            with open(bib_file, encoding="utf-8", errors="replace") as f:
                db = bibtexparser.load(f, parser=parser)
        except Exception as e:
            print(f" ⚠ Skip {bib_file.name}: {e}")
            skipped_files += 1
            continue

        for entry in db.entries:
            title = entry.get("title", "")
            key = normalize_title(title) if title else ""
            if not key or key in index:
                continue

            record = {"title": title.rstrip('.')}
            record.update((name, entry.get(name, "")) for name in copied_fields)
            record["_type"] = entry.get("ENTRYTYPE", "inproceedings")
            record["_source"] = bib_file.stem
            index[key] = record

    print("\n📂 Writing sharded index...")
    n_shards = write_shards(index, shard_dir)

    total_bytes = sum(f.stat().st_size for f in shard_dir.glob("*.json"))
    total_mb = total_bytes / 1024 / 1024
    print(f"\n✅ Index: {len(index):,} unique entries → {n_shards} shards ({total_mb:.1f} MB total)")
    print(f" Saved to: {shard_dir}/")
    if skipped_files:
        print(f" ⚠ {skipped_files} file(s) skipped due to parse errors")
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
# Build the index only when this script is executed directly.
if __name__ == "__main__":
    main()
|
scripts/refresh_db.sh
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Refresh the local DBLP conference database.
# Run this before paper submission to ensure the DB is up to date.
#
# The two helper scripts are invoked through SCRIPT_DIR, so they are found
# regardless of the directory this wrapper is started from.
set -e  # stop at the first failing step so a failed download is not indexed

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

echo "🔄 Refreshing conference database..."

# 1. Download new bib files (only those not yet downloaded)
python "$SCRIPT_DIR/update_db.py"

# 2. Rebuild the index
python "$SCRIPT_DIR/build_index.py"

echo ""
echo "✅ DB refreshed."
echo " Run: python main.py --bib your_paper.bib"
|
scripts/update_db.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Download conference/journal proceedings from DBLP as BibTeX files.
|
| 4 |
+
|
| 5 |
+
Uses the DBLP venue-based search API which is more reliable than
|
| 6 |
+
the TOC-based .bht queries (which often return 404 or single entries).
|
| 7 |
+
|
| 8 |
+
API format:
|
| 9 |
+
https://dblp.org/search/publ/api
|
| 10 |
+
?q=venue:{VenueName}: year:{year}:
|
| 11 |
+
&h=1000 # max results per batch
|
| 12 |
+
&f={offset} # pagination offset
|
| 13 |
+
&format=bib1 # BibTeX format
|
| 14 |
+
|
| 15 |
+
Usage:
|
| 16 |
+
python scripts/update_db.py
|
| 17 |
+
"""
|
| 18 |
+
import requests
|
| 19 |
+
import time
|
| 20 |
+
import sys
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
|
| 23 |
+
DBLP_API = "https://dblp.org/search/publ/api"
|
| 24 |
+
|
| 25 |
+
# (dblp_venue_name, output_prefix, years)
|
| 26 |
+
# dblp_venue_name: exact venue string used in DBLP's venue: filter
|
| 27 |
+
# output_prefix: filename prefix for saved .bib files
|
| 28 |
+
CONFERENCES = [
|
| 29 |
+
# ── Speech & Audio ──────────────────────────────────────────
|
| 30 |
+
("INTERSPEECH", "interspeech", range(2018, 2027)),
|
| 31 |
+
("ICASSP", "icassp", range(2018, 2027)),
|
| 32 |
+
("ASRU", "asru", [2019, 2021, 2023, 2025]),
|
| 33 |
+
("SLT", "slt", [2018, 2021, 2022, 2024]),
|
| 34 |
+
|
| 35 |
+
# ── ML / AI ─────────────────────────────────────────────────
|
| 36 |
+
("ICML", "icml", range(2018, 2027)),
|
| 37 |
+
("NeurIPS", "neurips", range(2017, 2027)),
|
| 38 |
+
("ICLR", "iclr", range(2018, 2027)),
|
| 39 |
+
("AAAI", "aaai", range(2018, 2027)),
|
| 40 |
+
("IJCAI", "ijcai", range(2018, 2027)),
|
| 41 |
+
("CVPR", "cvpr", range(2018, 2027)),
|
| 42 |
+
("ECCV", "eccv", [2018, 2020, 2022, 2024]),
|
| 43 |
+
("ICCV", "iccv", [2019, 2021, 2023, 2025]),
|
| 44 |
+
|
| 45 |
+
# ── NLP ─────────────────────────────────────────────────────
|
| 46 |
+
("ACL", "acl", range(2018, 2027)), # includes Findings
|
| 47 |
+
("EMNLP", "emnlp", range(2018, 2027)), # includes Findings
|
| 48 |
+
("NAACL", "naacl", range(2018, 2027)),
|
| 49 |
+
("EACL", "eacl", range(2018, 2027)),
|
| 50 |
+
("LREC/COLING", "coling", [2024, 2025]),
|
| 51 |
+
# Older COLING uses different venue
|
| 52 |
+
# ("COLING", "coling", [2018, 2020, 2022]),
|
| 53 |
+
|
| 54 |
+
# ── IR / Web / Data ─────────────────────────────────────────
|
| 55 |
+
("SIGIR", "sigir", range(2018, 2027)),
|
| 56 |
+
("KDD", "kdd", range(2018, 2027)),
|
| 57 |
+
("WWW", "www", range(2018, 2027)),
|
| 58 |
+
("WSDM", "wsdm", range(2018, 2027)),
|
| 59 |
+
]
|
| 60 |
+
|
| 61 |
+
# Journals use venue search too
|
| 62 |
+
JOURNALS = [
|
| 63 |
+
("IEEE ACM Trans Audio Speech Lang Process", "taslp", range(2018, 2027)),
|
| 64 |
+
("Trans. Assoc. Comput. Linguistics", "tacl", range(2018, 2027)),
|
| 65 |
+
]
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def download_venue(venue_name: str, prefix: str, year: int, out_dir: Path):
    """Download one venue/year from DBLP using venue search and save it as BibTeX.

    Results are fetched in pages of 1000 and written to
    ``out_dir / f"{prefix}{year}.bib"``. Nothing is downloaded if that file
    already exists. On a network or HTTP error nothing is written, so the
    venue is retried on the next run instead of leaving a truncated file.

    Args:
        venue_name: Venue string used in DBLP's ``venue:`` filter.
        prefix: Filename prefix for the saved ``.bib`` file.
        year: Publication year to query.
        out_dir: Directory the ``.bib`` file is written to.

    Returns:
        None. Progress is reported on stdout.
    """
    out_file = out_dir / f"{prefix}{year}.bib"
    if out_file.exists():
        return  # Skip if already downloaded

    query = f"venue:{venue_name}: year:{year}:"
    page_size = 1000
    all_bib = []
    offset = 0

    while True:
        try:
            r = requests.get(DBLP_API, params={
                "q": query, "h": page_size, "f": offset,
                "format": "bib1",
            }, timeout=30, headers={"User-Agent": "BibGuard/1.0"})
            # Treat 4xx/5xx (e.g. rate limiting) as a failure rather than as
            # "no more results"; otherwise a partial download could be saved
            # and then skipped forever by the exists() check above.
            r.raise_for_status()
            text = r.text.strip()
        except Exception as e:
            # Deliberately broad: one failing venue must not abort the batch.
            print(f" ✗ {prefix}{year}: network error ({e})")
            return

        # Check for HTML error pages
        if not text or "<!DOCTYPE" in text[:100] or "@" not in text:
            break

        all_bib.append(text)
        # Rough entry count: every BibTeX entry starts with "@".
        n_entries = text.count("@")
        if n_entries < page_size:
            break
        offset += page_size
        time.sleep(1)

    if all_bib:
        total = sum(b.count("@") for b in all_bib)
        out_file.write_text("\n\n".join(all_bib), encoding="utf-8")
        print(f" ✓ {prefix}{year}: {total} entries")
    else:
        print(f" ✗ {prefix}{year}: not on DBLP yet")
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def main():
    """Download every configured conference and journal year into data/raw.

    The output directory is created if needed. Each venue/year is handed to
    ``download_venue``, with a short pause between calls.
    """
    out = Path(__file__).resolve().parent.parent / "data" / "raw"
    out.mkdir(parents=True, exist_ok=True)

    batches = (
        ("📥 Downloading conference proceedings from DBLP...", CONFERENCES),
        ("\n📥 Downloading journal volumes from DBLP...", JOURNALS),
    )
    for banner, venues in batches:
        print(banner)
        for venue, prefix, years in venues:
            for y in years:
                download_venue(venue, prefix, y, out)
                time.sleep(0.5)

    print("\n✅ Done. Run: python scripts/build_index.py")
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# Start the download only when this script is executed directly.
if __name__ == "__main__":
    main()
|
src/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Bibliography Checker Package"""
|
src/comparator.py
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Metadata comparison between bib entries and fetched metadata.
|
| 3 |
+
"""
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import Optional, List, Union, Any, Tuple
|
| 7 |
+
|
| 8 |
+
from .parser import BibEntry
|
| 9 |
+
from .utils import TextNormalizer
|
| 10 |
+
|
| 11 |
+
CURRENT_YEAR = datetime.now().year
|
| 12 |
+
|
| 13 |
+
# Year source priority: lower number = more trustworthy
|
| 14 |
+
YEAR_SOURCE_PRIORITY = {
|
| 15 |
+
"crossref": 0, # DOI-verified, most accurate
|
| 16 |
+
"dblp": 1, # Conference proceedings
|
| 17 |
+
"openalex": 2,
|
| 18 |
+
"semantic_scholar": 3,
|
| 19 |
+
"arxiv_journal_ref": 4, # arXiv's journal_ref field
|
| 20 |
+
"scholar": 5,
|
| 21 |
+
"arxiv": 99, # arXiv submission date — last resort
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def resolve_year(candidates: list, bib_year: str = "") -> Tuple[Optional[str], Optional[str]]:
    """
    Choose the most trustworthy publication year among candidate results.

    Every candidate with an all-digit fetched year contributes that year at
    its source's priority from YEAR_SOURCE_PRIORITY (unknown sources rank
    50). An arXiv candidate that also carries an all-digit
    ``conference_year`` additionally contributes it as "arxiv_journal_ref".
    The lowest-ranked year that is not in the future is selected.

    Args:
        candidates: list of ComparisonResult objects (falsy items are ignored)
        bib_year: the current bib entry year; accepted but not consulted by
            this function
    Returns:
        (best_year, best_source) or (None, None) when no usable year exists
    """
    ranked = []
    for cand in candidates:
        if not cand or not cand.fetched_data:
            continue
        origin = cand.source
        fetched = cand.fetched_data

        year_text = str(getattr(fetched, 'year', '') or '').strip()
        if not (year_text and year_text.isdigit()):
            continue

        # arXiv's journal_ref may name the venue year, which is more
        # trustworthy than the submission date.
        venue_year = str(getattr(fetched, 'conference_year', '') or '').strip()
        if origin == "arxiv" and venue_year and venue_year.isdigit():
            ranked.append(
                (YEAR_SOURCE_PRIORITY.get("arxiv_journal_ref", 4), venue_year, "arxiv_journal_ref")
            )

        ranked.append((YEAR_SOURCE_PRIORITY.get(origin, 50), year_text, origin))

    # Walk from most to least trusted and take the first non-future year.
    for _, year, origin in sorted(ranked):
        if int(year) <= CURRENT_YEAR:
            return year, origin

    # Nothing usable, or every year lies in the future.
    return None, None
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@dataclass
class ComparisonResult:
    """Outcome of checking one bib entry against one source's metadata."""

    entry_key: str

    # --- title ---
    title_match: bool
    title_similarity: float
    bib_title: str
    fetched_title: str

    # --- authors ---
    author_match: bool
    author_similarity: float
    bib_authors: list[str]
    fetched_authors: list[str]

    # --- year ---
    year_match: bool
    bib_year: str
    fetched_year: str

    # --- overall verdict ---
    is_match: bool
    confidence: float
    issues: list[str]
    source: str

    # Raw fetched metadata, kept so fixes can be applied later
    fetched_data: Any = None

    # Set when author initials disagree between bib and source
    author_initial_conflict: bool = False

    @property
    def has_issues(self) -> bool:
        """True when at least one issue was recorded."""
        return len(self.issues) != 0
|
| 106 |
+
|
| 107 |
+
@dataclass
class EntryReport:
    """Everything reported about one bib entry: the entry and its comparison."""

    entry: BibEntry
    comparison: Optional[ComparisonResult]
    evaluations: list = None

    def __post_init__(self):
        # None stands in for "not provided" so every report gets its own list.
        self.evaluations = [] if self.evaluations is None else self.evaluations
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class MetadataComparator:
    """Compares bibliography entries with fetched metadata."""

    # Minimum similarity for a title / author list to count as matching.
    TITLE_THRESHOLD = 0.8
    AUTHOR_THRESHOLD = 0.6

    def __init__(self):
        # The class object itself is stored; its helpers are called on the class.
        self.normalizer = TextNormalizer

    def compare(self, bib_entry: "BibEntry", fetched_data: Any, source_name: str) -> "ComparisonResult":
        """
        Generic comparison method for any data source.

        Args:
            bib_entry: Parsed bibliography entry (uses key, title, author, year).
            fetched_data: Object exposing 'title', 'year' and 'authors'
                attributes; 'authors' may be a list or a comma-separated string.
                Missing or None values are treated as empty.
            source_name: Label of the metadata source, used in issue messages.

        Returns:
            A ComparisonResult with per-field matches, a weighted confidence
            and the list of detected issues.
        """
        issues = []

        # --- Title Comparison ---
        # Some sources report a null title; compare against "" instead of crashing.
        fetched_title = getattr(fetched_data, 'title', '') or ''
        bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title)
        fetched_title_norm = self.normalizer.normalize_for_comparison(fetched_title)

        title_similarity = self.normalizer.similarity_ratio(bib_title_norm, fetched_title_norm)
        if len(bib_title_norm) < 100:
            # Levenshtein is only tried for shorter titles; keep the better score.
            lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, fetched_title_norm)
            title_similarity = max(title_similarity, lev_sim)

        title_match = title_similarity >= self.TITLE_THRESHOLD
        if not title_match:
            issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")

        # --- Author Comparison ---
        bib_authors = self.normalizer.normalize_author_list(bib_entry.author)

        # Parsed once and reused below for the initial-conflict check.
        raw_author_list = list(self.normalizer.parse_author_list(bib_entry.author))
        for raw_auth in raw_author_list:
            stripped = raw_auth.strip()
            if self.normalizer.has_dblp_disambiguation_id(stripped):
                issues.append(f"DBLP disambiguation ID in author: '{stripped}'")

        # Handle different author formats (list vs string); None means "no authors".
        fetched_authors_raw = getattr(fetched_data, 'authors', []) or []
        if isinstance(fetched_authors_raw, str):
            # Scholar style: "Author1, Author2". Empty fragments are dropped so
            # they do not turn into phantom authors.
            fetched_authors_raw = [a.strip() for a in fetched_authors_raw.split(',') if a.strip()]

        fetched_authors = [
            self.normalizer.normalize_author_name(str(a))
            for a in fetched_authors_raw
        ]

        author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
        author_match = author_similarity >= self.AUTHOR_THRESHOLD

        if not author_match:
            issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")

        # --- Year Comparison ---
        # None must become "" rather than the literal string "None", otherwise
        # a missing year would be reported as a mismatch.
        bib_year = str(bib_entry.year or '').strip()
        fetched_year_raw = getattr(fetched_data, 'year', '')
        fetched_year = '' if fetched_year_raw is None else str(fetched_year_raw).strip()
        year_match = bib_year == fetched_year

        if not year_match and bib_year and fetched_year:
            issues.append(f"Year mismatch: bib={bib_year}, {source_name}={fetched_year}")

        # --- Overall Assessment ---
        is_match = title_match and author_match
        # Simple weighted confidence score
        confidence = (
            title_similarity * 0.5 +
            author_similarity * 0.3 +
            (1.0 if year_match else 0.5) * 0.2
        )

        # --- Author Initial Conflict Detection ---
        author_initial_conflict = self._check_author_initial_conflict(
            bib_authors, fetched_authors,
            raw_author_list,
            fetched_authors_raw
        )
        if author_initial_conflict:
            issues.append("Author initial conflict detected (e.g., first-name initials differ)")
            # Cap confidence — don't auto-adopt these authors
            confidence = min(confidence, 0.7)

        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=title_match,
            title_similarity=title_similarity,
            bib_title=bib_entry.title,
            fetched_title=fetched_title,
            author_match=author_match,
            author_similarity=author_similarity,
            bib_authors=bib_authors,
            fetched_authors=fetched_authors,
            year_match=year_match,
            bib_year=bib_year,
            fetched_year=fetched_year,
            is_match=is_match,
            confidence=confidence,
            issues=issues,
            source=source_name,
            fetched_data=fetched_data,
            author_initial_conflict=author_initial_conflict
        )

    def create_unable_result(self, bib_entry: "BibEntry", reason: str = "Unable to fetch metadata") -> "ComparisonResult":
        """Create result when metadata couldn't be fetched.

        All match flags are False, similarities and confidence are 0.0, and
        ``reason`` is recorded as the only issue.
        """
        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=False, title_similarity=0.0,
            bib_title=bib_entry.title, fetched_title="",
            author_match=False, author_similarity=0.0,
            bib_authors=self.normalizer.normalize_author_list(bib_entry.author), fetched_authors=[],
            # Normalised the same way as in compare() so bib_year is always a str.
            year_match=False, bib_year=str(bib_entry.year or '').strip(), fetched_year="",
            is_match=False, confidence=0.0,
            issues=[reason], source="unable",
            fetched_data=None
        )

    def _compare_author_lists(self, list1: list[str], list2: list[str]) -> float:
        """Return the mean best-match score of each name in list1 against list2.

        Two empty lists score 1.0; exactly one empty list scores 0.0. The
        score is averaged over list1 only, so extra names in list2 are not
        penalised.
        """
        if not list1 and not list2:
            return 1.0
        if not list1 or not list2:
            return 0.0

        total_similarity = 0.0
        for author1 in list1:
            best_match = 0.0
            for author2 in list2:
                if self._names_match(author1, author2):
                    # A structural name match counts as a perfect score.
                    best_match = 1.0
                    break
                sim = self.normalizer.similarity_ratio(author1, author2)
                best_match = max(best_match, sim)
            total_similarity += best_match

        return total_similarity / len(list1)

    @staticmethod
    def _split_name(name: str) -> list[str]:
        """Lower-case a name, drop dots and split it into whitespace-separated words."""
        return name.lower().replace('.', '').split()

    def _names_match(self, name1: str, name2: str) -> bool:
        """Check if two names match (handles abbreviated names).

        The last word is taken as the last name and must be equal. When both
        names have more than one word, the first words must be equal, or share
        the same initial if either one is a single letter.
        """
        words1 = self._split_name(name1)
        words2 = self._split_name(name2)
        if not words1 or not words2:
            return False

        # Last name must match (assuming last word is last name)
        if words1[-1] != words2[-1]:
            return False

        # First name check:
        if len(words1) > 1 and len(words2) > 1:
            f1 = words1[0]
            f2 = words2[0]

            if len(f1) == 1 or len(f2) == 1:
                # One side is just an initial: compare initials only.
                if f1[0] != f2[0]:
                    return False
            elif f1 != f2:
                # Both full names - must match
                return False

        return True

    def _check_author_initial_conflict(
        self,
        bib_authors_norm: list[str],
        fetched_authors_norm: list[str],
        bib_authors_raw: list[str],
        fetched_authors_raw: list,
    ) -> bool:
        """
        Detect when first-name initials clearly conflict between
        bib entry and fetched data.

        e.g., "Y. Zhou" (bib) vs "Henry Zhou" (fetched) → True (Y ≠ H)
        This prevents blindly overwriting authors with wrong names.

        Authors are compared position by position. Names are lower-cased and
        stripped of dots first (same rule as _names_match), so a difference in
        letter case or punctuation alone is never reported as a conflict.
        The two *_raw parameters are accepted for interface compatibility and
        are not used.
        """
        # zip() stops at the shorter list, i.e. only aligned authors are compared.
        for bib_name, fetched_name in zip(bib_authors_norm, fetched_authors_norm):
            bib_parts = self._split_name(bib_name)
            fetched_parts = self._split_name(fetched_name)

            # Need at least a first and a last name on both sides.
            if len(bib_parts) < 2 or len(fetched_parts) < 2:
                continue

            # Last name must match to consider this a potential conflict
            if bib_parts[-1] != fetched_parts[-1]:
                continue

            # If initials differ, it's a conflict
            if bib_parts[0][0] != fetched_parts[0][0]:
                return True

        return False
|
src/fetcher.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unified metadata fetchers for BibGuard.
|
| 3 |
+
"""
|
| 4 |
+
import re
|
| 5 |
+
import time
|
| 6 |
+
import random
|
| 7 |
+
import requests
|
| 8 |
+
import xml.etree.ElementTree as ET
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from typing import Optional, Any
|
| 11 |
+
from urllib.parse import quote
|
| 12 |
+
from bs4 import BeautifulSoup
|
| 13 |
+
|
| 14 |
+
@dataclass
|
| 15 |
+
class FetchResult:
|
| 16 |
+
"""Unified fetch result."""
|
| 17 |
+
title: str = ""
|
| 18 |
+
authors: list[str] | str = ""
|
| 19 |
+
year: str = ""
|
| 20 |
+
doi: str = ""
|
| 21 |
+
url: str = ""
|
| 22 |
+
source: str = ""
|
| 23 |
+
conference_year: str = "" # Year from journal_ref / conference proceedings
|
| 24 |
+
year_source: str = "" # Where the year came from
|
| 25 |
+
|
| 26 |
+
def __post_init__(self):
|
| 27 |
+
if self.authors is None: self.authors = []
|
| 28 |
+
if isinstance(self.authors, str) and self.authors:
|
| 29 |
+
# Simple split if string provided
|
| 30 |
+
self.authors = [a.strip() for a in re.split(r',| and ', self.authors) if a.strip()]
|
| 31 |
+
|
| 32 |
+
class BaseFetcher:
    """Shared base for the metadata fetchers; provides request throttling."""

    def _rate_limit(self, delay: float, last_time: float) -> float:
        """Wait until ``delay`` seconds have passed since ``last_time``.

        Returns the current ``time.time()`` value after any wait, which the
        caller stores as the new ``last_time``.
        """
        remaining = delay - (time.time() - last_time)
        if remaining > 0:
            time.sleep(remaining)
        return time.time()
|
| 39 |
+
|
| 40 |
+
class ArxivFetcher(BaseFetcher):
    """Fetches metadata from arXiv API."""
    API_BASE = "http://export.arxiv.org/api/query"

    # XML namespaces used when querying the Atom feed.
    _NS = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}

    def __init__(self):
        self._last_req = 0.0

    def fetch_by_id(self, arxiv_id: str) -> Optional[FetchResult]:
        """Return metadata for one arXiv ID (an ``arXiv:`` prefix is accepted), or None."""
        self._last_req = self._rate_limit(3.0, self._last_req)
        clean_id = re.sub(r'^arXiv:', '', arxiv_id, flags=re.IGNORECASE).strip()
        try:
            resp = requests.get(self.API_BASE, params={'id_list': clean_id, 'max_results': 1}, timeout=30)
            # Treat HTTP errors like any other failed lookup instead of parsing an error page.
            resp.raise_for_status()
            return self._parse(resp.text)
        except Exception:
            # Best effort: any network/HTTP failure means "no result".
            return None

    def search_by_title(self, title: str) -> list[FetchResult]:
        """Return up to three arXiv entries whose title matches ``title``."""
        self._last_req = self._rate_limit(3.0, self._last_req)
        # Punctuation is replaced by spaces before building the quoted title query.
        clean = re.sub(r'[^\w\s]', ' ', title).strip()
        try:
            resp = requests.get(self.API_BASE, params={'search_query': f'ti:"{clean}"', 'max_results': 3}, timeout=30)
            resp.raise_for_status()
            return self._parse(resp.text, multiple=True)
        except Exception:
            return []

    def _parse(self, xml: str, multiple=False) -> Optional[FetchResult] | list[FetchResult]:
        """Parse an Atom response.

        Returns a list when ``multiple`` is true, otherwise the first result
        or None. Unparseable XML yields the empty value; a single malformed
        entry is skipped without discarding the other entries.
        """
        empty = [] if multiple else None
        try:
            root = ET.fromstring(xml)
        except ET.ParseError:
            return empty

        results = []
        for entry in root.findall('atom:entry', self._NS):
            try:
                result = self._parse_entry(entry)
            except Exception:
                # One bad entry must not hide the remaining ones.
                continue
            if result is not None:
                results.append(result)

        if multiple:
            return results
        return results[0] if results else None

    def _parse_entry(self, entry) -> Optional[FetchResult]:
        """Convert one Atom <entry> element into a FetchResult (None if it has no title)."""
        ns = self._NS
        # Long titles are line-wrapped in the feed; collapse all whitespace runs.
        title = ' '.join(entry.findtext('atom:title', default='', namespaces=ns).split())
        if not title:
            return None

        id_txt = entry.findtext('atom:id', default='', namespaces=ns).strip()
        authors = []
        for author in entry.findall('atom:author', ns):
            name = ' '.join(author.findtext('atom:name', default='', namespaces=ns).split())
            if name:
                authors.append(name)

        # Submission year = first four characters of the published timestamp.
        pub = entry.findtext('atom:published', default='', namespaces=ns).strip()
        year = pub[:4] if pub else ""
        doi = entry.findtext('arxiv:doi', default='', namespaces=ns).strip()

        # Extract conference year from journal_ref if available
        conference_year = ""
        jr_text = entry.findtext('arxiv:journal_ref', default='', namespaces=ns).strip()
        if jr_text:
            year_match = re.search(r'\b(19|20)\d{2}\b', jr_text)
            if year_match:
                conference_year = year_match.group(0)

        return FetchResult(
            title=title,
            authors=authors,
            year=year,
            doi=doi,
            url=id_txt,
            source="arxiv",
            conference_year=conference_year,
            year_source="arxiv_journal_ref" if conference_year else "arxiv_submission",
        )
|
| 103 |
+
|
| 104 |
+
class CrossRefFetcher(BaseFetcher):
    """Fetches from CrossRef API."""
    API_BASE = "https://api.crossref.org/works"

    # Date fields tried in order when extracting the year; the print date
    # keeps priority, the others cover records without a print date.
    _DATE_FIELDS = ('published-print', 'published-online', 'published', 'issued')

    def __init__(self, email=None):
        self._last_req = 0.0
        # The contact address is sent in the User-Agent header.
        self.headers = {'User-Agent': f'BibGuard/1.0 (mailto:{email or "user@example.com"})'}

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top bibliographic search hit for ``title``, or None."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(self.API_BASE, params={'query.bibliographic': title, 'rows': 1}, headers=self.headers, timeout=10)
            data = resp.json()['message']['items']
            if data:
                return self._parse(data[0])
        except Exception:
            # Best effort: network, JSON or schema problems all mean "no result".
            pass
        return None

    def search_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the record registered for ``doi``, or None on any failure."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(f"{self.API_BASE}/{quote(doi)}", headers=self.headers, timeout=10)
            return self._parse(resp.json()['message'])
        except Exception:
            return None

    def _parse(self, item: dict) -> FetchResult:
        """Build a FetchResult from one CrossRef work item."""
        # 'title' can be absent or an empty list; both become "".
        titles = item.get('title') or ['']
        title = titles[0]

        authors = []
        for a in item.get('author', []):
            full_name = f"{a.get('given','')} {a.get('family','')}".strip()
            # Authors without given/family parts (e.g. organisations) may only carry 'name'.
            full_name = full_name or (a.get('name') or '').strip()
            if full_name:
                authors.append(full_name)

        year = ""
        for key in self._DATE_FIELDS:
            date_parts = (item.get(key) or {}).get('date-parts') or [[None]]
            first = date_parts[0]
            if first and first[0]:
                year = str(first[0])
                break

        return FetchResult(title, authors, year, item.get('DOI', ''), item.get('URL', ''), "crossref")
|
| 133 |
+
|
| 134 |
+
class DBLPFetcher(BaseFetcher):
    """Looks up publications through the DBLP publication search API."""
    API_BASE = "https://dblp.org/search/publ/api"

    # Trailing 4-digit disambiguation suffix on author names,
    # e.g. "Tian Tan 0019", "Wei Li 0119".
    _DISAMBIG_RE = re.compile(r'\s+\d{4}\s*$')

    def __init__(self):
        self._last_req = 0.0

    @staticmethod
    def _strip_disambig(name: str) -> str:
        """Return ``name`` without a trailing DBLP disambiguation suffix."""
        without_suffix = DBLPFetcher._DISAMBIG_RE.sub('', name)
        return without_suffix.strip()

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the first DBLP hit for ``title``; None when nothing is found or on error."""
        self._last_req = self._rate_limit(1.0, self._last_req)
        try:
            query = {'q': title, 'format': 'json', 'h': 1}
            resp = requests.get(self.API_BASE, params=query, timeout=10)
            hits = resp.json().get('result', {}).get('hits', {}).get('hit', [])
            if not hits:
                return None

            info = hits[0]['info']
            # 'author' is handled both as a single dict and as a list of dicts.
            authors = info.get('authors', {}).get('author', [])
            if isinstance(authors, dict):
                authors = [self._strip_disambig(authors.get('text', ''))]
            elif isinstance(authors, list):
                authors = [self._strip_disambig(entry.get('text', '')) for entry in authors]

            return FetchResult(
                info.get('title', '').rstrip('.'),
                authors,
                info.get('year', ''),
                info.get('doi', ''),
                info.get('url', ''),
                "dblp",
            )
        except Exception:
            return None
|
| 163 |
+
|
| 164 |
+
class SemanticScholarFetcher(BaseFetcher):
    """Fetches from Semantic Scholar."""
    API_BASE = "https://api.semanticscholar.org/graph/v1/paper"

    # The DOI is requested through 'externalIds'.
    # NOTE(review): the original asked for a 'doi' field, which does not appear
    # to be a valid Graph API paper field — confirm against the API docs.
    _FIELDS = 'title,authors,year,externalIds,url'

    def __init__(self):
        self._last_req = 0.0

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top search hit for ``title``, or None."""
        return self._fetch(f"{self.API_BASE}/search", {'query': title, 'limit': 1, 'fields': self._FIELDS})

    def fetch_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the paper registered under ``doi``, or None."""
        return self._fetch(f"{self.API_BASE}/DOI:{doi}", {'fields': self._FIELDS})

    def _fetch(self, url, params) -> Optional[FetchResult]:
        """Run one request and convert the paper it returns into a FetchResult.

        Returns None on HTTP errors, error payloads, empty search results or
        any exception raised while fetching/parsing.
        """
        self._last_req = self._rate_limit(2.0, self._last_req)
        try:
            resp = requests.get(url, params=params, timeout=10)
            if resp.status_code != 200:
                return None
            data = resp.json()
            if 'data' in data:
                # Search responses wrap hits in 'data'; an empty list means no match.
                if not data['data']:
                    return None
                data = data['data'][0]
            if 'error' in data:
                return None

            authors = [a['name'] for a in data.get('authors') or [] if a.get('name')]
            doi = (data.get('externalIds') or {}).get('DOI') or ''
            # A null year must become "" and not the string "None".
            year = data.get('year')
            year = '' if year is None else str(year)
            return FetchResult(data.get('title') or '', authors, year, doi, data.get('url') or '', "semantic_scholar")
        except Exception:
            # Best effort: any failure means "no result".
            return None
|
| 188 |
+
|
| 189 |
+
class OpenAlexFetcher(BaseFetcher):
    """Fetches from OpenAlex."""
    API_BASE = "https://api.openalex.org/works"

    def __init__(self):
        self._last_req = 0.0

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top search hit for ``title``, or None."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(self.API_BASE, params={'search': title, 'per-page': 1}, timeout=10)
            data = resp.json().get('results', [])
            if data:
                return self._parse(data[0])
        except Exception:
            # Best effort: any failure means "no result".
            pass
        return None

    def fetch_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the work registered under ``doi``, or None."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(f"{self.API_BASE}/https://doi.org/{doi}", timeout=10)
            # Do not turn an error response into an empty-looking result.
            if resp.status_code != 200:
                return None
            return self._parse(resp.json())
        except Exception:
            return None

    def _parse(self, data: dict) -> FetchResult:
        """Build a FetchResult from one work object, tolerating null fields."""
        authors = []
        for authorship in data.get('authorships') or []:
            name = (authorship.get('author') or {}).get('display_name')
            if name:
                authors.append(name)

        # 'doi' may be present but null; .replace() on None would drop the whole record.
        doi = (data.get('doi') or '').replace('https://doi.org/', '')
        year = data.get('publication_year')
        year = '' if year is None else str(year)
        return FetchResult(data.get('title') or '', authors, year, doi, data.get('id') or '', "openalex")
|
| 216 |
+
|
| 217 |
+
class ScholarFetcher(BaseFetcher):
    """Google Scholar Scraper (Fallback)."""
    SEARCH_URL = "https://scholar.google.com/scholar"

    def __init__(self):
        self._last_req = 0.0
        self._session = requests.Session()
        # Set once a block is detected; all later searches return None immediately.
        self._blocked = False

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Search for the exact (quoted) title and return the first hit, or None."""
        if self._blocked:
            return None
        self._last_req = self._rate_limit(5.0 + random.random() * 3, self._last_req)  # Polite delay
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
            resp = self._session.get(self.SEARCH_URL, params={'q': f'"{title}"', 'hl': 'en', 'num': 1}, headers=headers, timeout=30)
            if resp.status_code == 429 or 'unusual traffic' in resp.text:
                self._blocked = True
                return None
            return self._parse(resp.text)
        except Exception:
            # Best effort: network or parsing failures mean "no result".
            return None

    def _parse(self, html: str) -> Optional[FetchResult]:
        """Extract title, authors, year and URL from the first result block.

        Returns None when the page has no result block or the block lacks a
        title heading.
        """
        soup = BeautifulSoup(html, 'lxml')
        entry = soup.find('div', class_='gs_ri')
        if not entry:
            return None

        title_tag = entry.find('h3', class_='gs_rt')
        if title_tag is None:
            return None
        title = title_tag.get_text(strip=True).replace('[PDF]', '').replace('[HTML]', '').strip()
        link = title_tag.find('a')
        url = link['href'] if link else ""

        meta_tag = entry.find('div', class_='gs_a')
        # NOTE(review): the byline is assumed to sometimes use non-breaking
        # spaces around the " - " separators; normalising them keeps the
        # author split below working either way — confirm against live HTML.
        meta = meta_tag.get_text(strip=True).replace('\xa0', ' ') if meta_tag is not None else ""

        # Attempt to extract year
        year_match = re.search(r'\b(19|20)\d{2}\b', meta)
        year = year_match.group(0) if year_match else ""
        # Attempt to extract authors (before " - ")
        authors = meta.split(' - ')[0]

        return FetchResult(title, authors, year, "", url, "scholar")
|
src/local_db.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Local Conference Database: fast, offline title lookup against DBLP index.
|
| 3 |
+
|
| 4 |
+
This module provides a local database of conference/journal proceedings
|
| 5 |
+
downloaded from DBLP. It serves as a "ground truth" source that eliminates
|
| 6 |
+
the need for network API calls for entries that match known publications.
|
| 7 |
+
"""
|
| 8 |
+
import json
|
| 9 |
+
import re
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Optional
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _normalize(title: str) -> str:
|
| 16 |
+
"""Normalize a title for index lookup (must match build_index.py)."""
|
| 17 |
+
title = re.sub(r'\{([^}]*)\}', r'\1', title)
|
| 18 |
+
title = re.sub(r'[^\w\s]', ' ', title.lower())
|
| 19 |
+
return re.sub(r'\s+', ' ', title).strip()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class LocalMatch:
    """One record returned by a local index lookup.

    All fields are plain strings and all are required.
    """
    # Bibliographic fields.
    title: str
    author: str
    year: str
    booktitle: str
    journal: str
    doi: str
    url: str
    pages: str
    volume: str
    # Bookkeeping fields.
    entry_type: str
    source_file: str
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class LocalConferenceDB:
|
| 39 |
+
"""Title-based lookup against locally cached DBLP proceedings."""
|
| 40 |
+
|
| 41 |
+
def __init__(self, index_dir: str = None):
|
| 42 |
+
if index_dir is None:
|
| 43 |
+
base = Path(__file__).resolve().parent.parent / "data"
|
| 44 |
+
self._shard_dir = base / "index_shards"
|
| 45 |
+
self._legacy_path = base / "conference_index.json"
|
| 46 |
+
else:
|
| 47 |
+
self._shard_dir = Path(index_dir)
|
| 48 |
+
self._legacy_path = Path(index_dir).parent / "conference_index.json"
|
| 49 |
+
self._idx: dict = {}
|
| 50 |
+
self._loaded = False
|
| 51 |
+
|
| 52 |
+
def load(self) -> bool:
|
| 53 |
+
"""Load index from shards or legacy single file. Returns True if successful."""
|
| 54 |
+
try:
|
| 55 |
+
# Try sharded index first
|
| 56 |
+
if self._shard_dir.exists():
|
| 57 |
+
shard_files = sorted(self._shard_dir.glob("index_*.json"))
|
| 58 |
+
if shard_files:
|
| 59 |
+
for shard_path in shard_files:
|
| 60 |
+
shard_data = json.loads(shard_path.read_text(encoding="utf-8"))
|
| 61 |
+
self._idx.update(shard_data)
|
| 62 |
+
self._loaded = True
|
| 63 |
+
print(f" 📚 Local DB: {len(self._idx):,} entries loaded ({len(shard_files)} shards).")
|
| 64 |
+
return True
|
| 65 |
+
|
| 66 |
+
# Fallback: legacy single file
|
| 67 |
+
if self._legacy_path.exists():
|
| 68 |
+
self._idx = json.loads(self._legacy_path.read_text(encoding="utf-8"))
|
| 69 |
+
self._loaded = True
|
| 70 |
+
print(f" 📚 Local DB: {len(self._idx):,} entries loaded.")
|
| 71 |
+
return True
|
| 72 |
+
|
| 73 |
+
print(" ⚠ Local DB not found. Run: python scripts/update_db.py && python scripts/build_index.py")
|
| 74 |
+
return False
|
| 75 |
+
except Exception as e:
|
| 76 |
+
print(f" ⚠ Failed to load local DB: {e}")
|
| 77 |
+
return False
|
| 78 |
+
|
| 79 |
+
@property
|
| 80 |
+
def is_loaded(self) -> bool:
|
| 81 |
+
return self._loaded and len(self._idx) > 0
|
| 82 |
+
|
| 83 |
+
def lookup(self, title: str) -> Optional[LocalMatch]:
|
| 84 |
+
"""
|
| 85 |
+
Look up an entry by title.
|
| 86 |
+
Returns LocalMatch if found, None otherwise.
|
| 87 |
+
"""
|
| 88 |
+
if not self._loaded:
|
| 89 |
+
return None
|
| 90 |
+
|
| 91 |
+
key = _normalize(title)
|
| 92 |
+
data = self._idx.get(key)
|
| 93 |
+
if not data:
|
| 94 |
+
return None
|
| 95 |
+
|
| 96 |
+
return LocalMatch(
|
| 97 |
+
title=data.get("title", ""),
|
| 98 |
+
author=data.get("author", ""),
|
| 99 |
+
year=data.get("year", ""),
|
| 100 |
+
booktitle=data.get("booktitle", ""),
|
| 101 |
+
journal=data.get("journal", ""),
|
| 102 |
+
doi=data.get("doi", ""),
|
| 103 |
+
url=data.get("url", ""),
|
| 104 |
+
pages=data.get("pages", ""),
|
| 105 |
+
volume=data.get("volume", ""),
|
| 106 |
+
entry_type=data.get("_type", "inproceedings"),
|
| 107 |
+
source_file=data.get("_source", ""),
|
| 108 |
+
)
|
src/normalizer.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Booktitle normalizer: maps verbose venue names to standard abbreviations.
|
| 3 |
+
|
| 4 |
+
Loads rules from data/abbr.tsv (regex → abbreviation).
|
| 5 |
+
"""
|
| 6 |
+
import re
|
| 7 |
+
import csv
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class BooktitleNormalizer:
    """Maps booktitle/journal names onto standard abbreviations via regex rules."""

    def __init__(self, tsv_path: str = None):
        """Load rules from ``tsv_path``, defaulting to ``data/abbr.tsv`` beside the package."""
        if tsv_path is None:
            default_file = Path(__file__).resolve().parent.parent / "data" / "abbr.tsv"
            tsv_path = str(default_file)
        self.rules: list[tuple[re.Pattern, str]] = []
        self._load_rules(tsv_path)

    def _load_rules(self, tsv_path: str):
        """Append one (compiled pattern, abbreviation) rule per usable TSV row.

        A missing file loads nothing. Rows with fewer than two columns, an
        empty pattern, a pattern starting with '#', or an invalid regex are
        skipped.
        """
        source = Path(tsv_path)
        if not source.exists():
            return

        with open(source, 'r', encoding='utf-8') as handle:
            for row in csv.reader(handle, delimiter='\t'):
                if len(row) < 2:
                    continue
                pattern_str, abbr = row[0].strip(), row[1].strip()
                # Skip comments and empty lines
                if not pattern_str or pattern_str.startswith('#'):
                    continue
                try:
                    compiled = re.compile(pattern_str, re.IGNORECASE)
                except re.error:
                    continue  # Skip invalid regex
                self.rules.append((compiled, abbr))

    def normalize(self, booktitle: str) -> Optional[str]:
        """
        Return the abbreviation of the first rule whose pattern is found in
        ``booktitle``; None for empty input or when no rule matches.
        """
        if not booktitle:
            return None
        matches = (abbr for pattern, abbr in self.rules if pattern.search(booktitle))
        return next(matches, None)
|
src/parser.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BibTeX file parser.
|
| 3 |
+
"""
|
| 4 |
+
import re
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Optional
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
import bibtexparser
|
| 10 |
+
from bibtexparser.bparser import BibTexParser
|
| 11 |
+
from bibtexparser.customization import convert_to_unicode
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class BibEntry:
    """A single parsed bibliography entry.

    Only ``key`` and ``entry_type`` are required; every other field defaults
    to an empty string and ``raw_entry`` to a fresh empty dict.
    """
    key: str
    entry_type: str

    # Core bibliographic fields.
    title: str = ""
    author: str = ""
    year: str = ""
    abstract: str = ""

    # Identifiers and links.
    url: str = ""
    doi: str = ""
    arxiv_id: str = ""

    # Venue and publication details.
    journal: str = ""
    booktitle: str = ""
    publisher: str = ""
    pages: str = ""
    volume: str = ""
    number: str = ""

    raw_entry: dict = field(default_factory=dict)

    @property
    def has_arxiv(self) -> bool:
        """True when an arXiv ID is set on this entry."""
        return True if self.arxiv_id else False

    @property
    def search_query(self) -> str:
        """Text to search with: the title, or the key when the title is empty."""
        if self.title:
            return self.title
        return self.key
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class BibParser:
    """Parser for .bib files.

    Uses ``bibtexparser`` to turn BibTeX text into ``BibEntry`` objects, and
    additionally offers a text-level filter that drops entries by citation
    key while leaving the rest of the file untouched.
    """

    # Patterns for extracting arXiv IDs (tried in order; group 1 is the ID)
    ARXIV_PATTERNS = [
        # New format: 2301.00001 or 2301.00001v1
        r'(\d{4}\.\d{4,5}(?:v\d+)?)',
        # Old format: hep-th/9901001 or math.GT/0309136
        r'([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)',
        # arXiv: prefix
        r'arXiv:(\d{4}\.\d{4,5}(?:v\d+)?)',
        r'arXiv:([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)',
    ]

    # URL patterns for arXiv
    ARXIV_URL_PATTERNS = [
        r'arxiv\.org/abs/(\d{4}\.\d{4,5}(?:v\d+)?)',
        r'arxiv\.org/abs/([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)',
        r'arxiv\.org/pdf/(\d{4}\.\d{4,5}(?:v\d+)?)(?:\.pdf)?',
        r'arxiv\.org/pdf/([a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)(?:\.pdf)?',
    ]

    # Text between '@' and '{' must look like an entry type (already
    # lower-cased) before the '@' is treated as the start of an entry.
    _ENTRY_TYPE_RE = re.compile(r'[a-z][a-z0-9_\-]*')

    def __init__(self):
        # Entries produced by the most recent parse_content() call.
        self.entries: list[BibEntry] = []

    def parse_file(self, filepath: str) -> list[BibEntry]:
        """Parse a .bib file and return list of entries.

        Raises:
            FileNotFoundError: if ``filepath`` does not exist.
            ValueError: if the content cannot be parsed (see parse_content).
        """
        path = Path(filepath)
        if not path.exists():
            raise FileNotFoundError(f"Bib file not found: {filepath}")

        # Undecodable bytes are replaced rather than aborting the parse.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()

        return self.parse_content(content)

    def parse_content(self, content: str) -> list[BibEntry]:
        """Parse bib content string.

        Replaces ``self.entries`` with the freshly parsed entries and
        returns that list.

        Raises:
            ValueError: if bibtexparser fails; the original exception is
                chained as the cause.
        """
        parser = BibTexParser(common_strings=True)
        parser.customization = convert_to_unicode

        try:
            bib_database = bibtexparser.loads(content, parser=parser)
        except Exception as e:
            raise ValueError(f"Failed to parse bib content: {e}") from e

        self.entries = [self._convert_entry(entry) for entry in bib_database.entries]
        return self.entries

    def _convert_entry(self, entry: dict) -> BibEntry:
        """Convert a bibtexparser entry to BibEntry."""
        # Extract basic fields; missing fields become empty strings.
        bib_entry = BibEntry(
            key=entry.get('ID', ''),
            entry_type=entry.get('ENTRYTYPE', ''),
            title=entry.get('title', ''),
            author=entry.get('author', ''),
            year=entry.get('year', ''),
            abstract=entry.get('abstract', ''),
            url=entry.get('url', ''),
            doi=entry.get('doi', ''),
            journal=entry.get('journal', ''),
            booktitle=entry.get('booktitle', ''),
            publisher=entry.get('publisher', ''),
            pages=entry.get('pages', ''),
            volume=entry.get('volume', ''),
            number=entry.get('number', ''),
            raw_entry=entry.copy()
        )

        # Extract arXiv ID
        bib_entry.arxiv_id = self._extract_arxiv_id(entry)

        return bib_entry

    def _extract_arxiv_id(self, entry: dict) -> str:
        """Extract arXiv ID from entry.

        Fields are consulted in this order: ``eprint``, ``arxiv``, ``url``,
        ``journal`` (only when it mentions "arxiv"), ``note``. Returns the
        first ID found, or "" when none of them yields one.
        """
        # Check eprint field first, then the arxiv field
        for field_name in ('eprint', 'arxiv'):
            value = entry.get(field_name, '')
            if value:
                arxiv_id = self._parse_arxiv_id(value)
                if arxiv_id:
                    return arxiv_id

        # Check URL field
        url = entry.get('url', '')
        if url:
            for pattern in self.ARXIV_URL_PATTERNS:
                match = re.search(pattern, url, re.IGNORECASE)
                if match:
                    return match.group(1)

        # Check journal field for "arXiv preprint arXiv:XXXX.XXXXX" format
        journal = entry.get('journal', '')
        if journal and 'arxiv' in journal.lower():
            arxiv_id = self._parse_arxiv_id(journal)
            if arxiv_id:
                return arxiv_id

        # Check note field
        note = entry.get('note', '')
        if note:
            arxiv_id = self._parse_arxiv_id(note)
            if arxiv_id:
                return arxiv_id

        return ""

    def _parse_arxiv_id(self, text: str) -> str:
        """Parse arXiv ID from text; return "" when no pattern matches."""
        for pattern in self.ARXIV_PATTERNS:
            match = re.search(pattern, text)
            if match:
                return match.group(1)
        return ""

    def get_entry_by_key(self, key: str) -> Optional[BibEntry]:
        """Get entry by citation key (first match), or None if absent."""
        for entry in self.entries:
            if entry.key == key:
                return entry
        return None

    def filter_file(self, input_path: str, output_path: str, keys_to_keep: set[str]):
        """
        Create a new bib file containing only specified keys.
        Preserves original formatting, comments, and strings.
        """
        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read()

        filtered_content = self._filter_content(content, keys_to_keep)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(filtered_content)

    @staticmethod
    def _find_entry_end(content: str, brace_open: int) -> int:
        """Return the index just past the brace closing the entry opened at ``brace_open``.

        Returns ``len(content)`` when the entry is never closed.
        """
        length = len(content)
        depth = 1
        in_quote = False
        j = brace_open + 1

        while j < length and depth > 0:
            char = content[j]

            # Handle escaped characters: skip the backslash and what follows.
            if char == '\\':
                j += 2
                continue

            if char == '"':
                # A quote only delimits a value at field level (depth 1).
                # Nested inside braces (e.g. {A 5" disk}) it is literal text
                # and must not change the quote state.
                if depth == 1:
                    in_quote = not in_quote
            elif char == '{':
                depth += 1
            elif char == '}':
                # A brace may close a nested group anywhere, but it can only
                # close the entry itself when we are not inside a quoted value.
                if depth > 1 or not in_quote:
                    depth -= 1
            j += 1

        return min(j, length)

    def _filter_content(self, content: str, keys_to_keep: set[str]) -> str:
        """Filter content string keeping only specified keys.

        Entries whose key is not in ``keys_to_keep`` are cut out together
        with trailing blanks and one following newline. ``@comment``,
        ``@string`` and ``@preamble`` blocks and all text between entries
        are kept. Entries without a comma after the key are always kept.
        """
        ranges_to_remove = []
        length = len(content)
        i = 0

        while i < length:
            if content[i] != '@':
                i += 1
                continue

            start = i
            # Find opening brace
            brace_open = content.find('{', i)
            if brace_open == -1:
                # No brace anywhere ahead, so no further entry can start.
                break

            # Get entry type
            entry_type = content[i + 1:brace_open].strip().lower()

            # A stray '@' in free text (e.g. an e-mail address in a comment
            # line) is followed by something that is not an entry type;
            # skip it instead of attributing the next entry's braces to it.
            if not self._ENTRY_TYPE_RE.fullmatch(entry_type):
                i += 1
                continue

            # Skip comments
            if entry_type == 'comment':
                i = brace_open + 1
                continue

            # Find matching closing brace to determine entry end
            end = self._find_entry_end(content, brace_open)

            # Extract key (between { and ,)
            # Only for standard entries, not @string or @preamble
            if entry_type not in ('string', 'preamble'):
                key_part = content[brace_open + 1:end]
                comma_pos = key_part.find(',')
                if comma_pos != -1:
                    key = key_part[:comma_pos].strip()
                    # If key is NOT in keep list, mark for removal
                    if key not in keys_to_keep:
                        ranges_to_remove.append((start, end))

            i = end

        # Reconstruct content from the spans that were not removed
        pieces = []
        last_pos = 0
        for start, end in ranges_to_remove:
            pieces.append(content[last_pos:start])

            # Clean up whitespace after removed entry
            last_pos = end
            while last_pos < length and content[last_pos] in ' \t\r':
                last_pos += 1
            if last_pos < length and content[last_pos] == '\n':
                last_pos += 1

        pieces.append(content[last_pos:])
        return "".join(pieces)

    def save_entries(self, filepath: str, entries: list[BibEntry]):
        """Save entries to a .bib file.

        Each output record starts from the entry's ``raw_entry`` (so custom
        fields survive) and is then overlaid with the non-empty attributes
        of the ``BibEntry``.
        """
        db = bibtexparser.bibdatabase.BibDatabase()

        db_entries = []
        for entry in entries:
            # Start with raw entry to preserve custom fields
            db_entry = entry.raw_entry.copy()

            # Update with potentially modified fields
            db_entry['ID'] = entry.key
            db_entry['ENTRYTYPE'] = entry.entry_type
            for name in ('title', 'author', 'year', 'journal', 'booktitle',
                         'publisher', 'pages', 'volume', 'number'):
                value = getattr(entry, name)
                if value:
                    db_entry[name] = value

            if entry.doi:
                db_entry['doi'] = entry.doi
            else:
                # DOI was removed (e.g., by DOI mismatch sanitizer)
                db_entry.pop('doi', None)
            if entry.url:
                db_entry['url'] = entry.url

            # Handle entry type consistency:
            # inproceedings should use booktitle, not journal
            entry_type = entry.entry_type.lower()
            if entry_type == 'inproceedings':
                if not entry.journal:
                    db_entry.pop('journal', None)
            # article should use journal, not booktitle
            elif entry_type == 'article':
                if not entry.booktitle:
                    db_entry.pop('booktitle', None)

            db_entries.append(db_entry)

        db.entries = db_entries

        with open(filepath, 'w', encoding='utf-8') as f:
            bibtexparser.dump(db, f)
|
src/sanitizer.py
ADDED
|
@@ -0,0 +1,493 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BibTeX Sanitizer: Structural and formatting checks for bib entries.
|
| 3 |
+
|
| 4 |
+
Runs as a pre-processing phase before metadata fetch-and-compare,
|
| 5 |
+
detecting and auto-fixing common formatting issues that crawlers
|
| 6 |
+
and copy-paste introduce into .bib files.
|
| 7 |
+
"""
|
| 8 |
+
import re
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from dataclasses import dataclass, field
|
| 11 |
+
from typing import List, Optional, Any
|
| 12 |
+
|
| 13 |
+
CURRENT_YEAR = datetime.now().year
|
| 14 |
+
|
| 15 |
+
from .parser import BibEntry
|
| 16 |
+
from .utils import TextNormalizer
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
class SanitizeFix:
    """Record of one sanitization change or finding for a bib entry."""

    # Citation key of the entry the fix belongs to.
    entry_key: str
    # Kind of fix, e.g., "dblp_id", "corporate_author", "entry_type",
    # "title_case", "doi_mismatch".
    category: str
    # Name of the affected field.
    field: str
    # Human-readable explanation of what was done.
    description: str
    # Value before / after the fix; both default to the empty string.
    old_value: str = ""
    new_value: str = ""
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Lower-case keywords whose presence in a venue name suggests a conference
# rather than a journal; used for entry type detection.
CONFERENCE_KEYWORDS = [
    "conference",
    "proceedings",
    "workshop",
    "symposium",
    # Top ML/AI
    "iclr",
    "icml",
    "neurips",
    "nips",
    "aaai",
    "ijcai",
    # NLP
    "acl",
    "emnlp",
    "naacl",
    "coling",
    "eacl",
    # Vision
    "cvpr",
    "iccv",
    "eccv",
    # Speech
    "interspeech",
    "icassp",
    # IR/Data
    "sigir",
    "kdd",
    "www",
    "wsdm",
    # Systems
    "osdi",
    "sosp",
    "nsdi",
    # General
    "international conference",
    "annual meeting",
]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class BibSanitizer:
|
| 51 |
+
"""Performs structural and formatting sanity checks on BibEntry objects."""
|
| 52 |
+
|
| 53 |
+
def sanitize_all(self, entries: List[BibEntry]) -> dict:
    """
    Apply every offline sanitization check to each entry.

    Checks run per entry in a fixed order: DBLP IDs, corporate authors,
    entry type, title capitalization, year range, field cleanup.
    The entries themselves are modified in-place.

    Returns dict: {entry_key: [SanitizeFix, ...]}, containing only the
    entries for which at least one fix was reported.
    """
    fixes_by_key = {}
    for entry in entries:
        collected = [
            fix
            for check in (
                self._check_dblp_ids,
                self._check_corporate_authors,
                self._check_entry_type,
                self._check_title_capitalization,
                self._check_future_year,
                self._clean_entry_fields,
            )
            for fix in check(entry)
        ]
        if collected:
            fixes_by_key[entry.key] = collected
    return fixes_by_key
|
| 71 |
+
|
| 72 |
+
# ------------------------------------------------------------------
|
| 73 |
+
# Check 1: DBLP Disambiguation ID Cleanup
|
| 74 |
+
# ------------------------------------------------------------------
|
| 75 |
+
def _check_dblp_ids(self, entry: BibEntry) -> List[SanitizeFix]:
    """Remove DBLP disambiguation IDs from the names in ``entry.author``.

    The author string is split with ``TextNormalizer.parse_author_list``;
    each name flagged by ``has_dblp_disambiguation_id`` is replaced by the
    result of ``strip_dblp_disambiguation_id``. When anything changed, the
    names are re-joined with " and " and written back to the entry.

    Returns one SanitizeFix per cleaned name (empty list if none).
    """
    if not entry.author:
        return []

    fixes = []
    rebuilt = []
    for raw_name in TextNormalizer.parse_author_list(entry.author):
        name = raw_name.strip()
        if not TextNormalizer.has_dblp_disambiguation_id(name):
            rebuilt.append(name)
            continue

        without_id = TextNormalizer.strip_dblp_disambiguation_id(name)
        rebuilt.append(without_id)
        fixes.append(SanitizeFix(
            entry_key=entry.key,
            category="dblp_id",
            field="author",
            description=f"Stripped DBLP disambiguation ID: '{name}' → '{without_id}'",
            old_value=name,
            new_value=without_id,
        ))

    # A fix is recorded exactly when a name was changed.
    if fixes:
        joined = " and ".join(rebuilt)
        entry.author = joined
        # Keep raw_entry in sync so save_entries doesn't re-introduce the IDs
        if 'author' in entry.raw_entry:
            entry.raw_entry['author'] = joined

    return fixes
|
| 110 |
+
|
| 111 |
+
# ------------------------------------------------------------------
|
| 112 |
+
# Check 2: Corporate / Institutional Author Protection
|
| 113 |
+
# ------------------------------------------------------------------
|
| 114 |
+
def _check_corporate_authors(self, entry: BibEntry) -> List[SanitizeFix]:
    """
    Detect single-word author names and wrap in {{double braces}}.

    BibTeX treats single-word names as a last name, rendering e.g.
    "KimiTeam" as "K. Team". Wrapping in {{}} prevents this.

    A name counts as single-word only if, after trimming outer braces, it
    is longer than one character, starts with an uppercase letter and
    contains no whitespace of any kind, no comma and no brace. Names
    already enclosed in braces are left untouched.

    Returns one SanitizeFix per wrapped name; when any name was wrapped the
    author list is re-joined with " and " and written back to the entry.
    """
    fixes = []
    if not entry.author:
        return fixes

    new_authors = []
    for author in TextNormalizer.parse_author_list(entry.author):
        author = author.strip()

        # Already brace-wrapped (single or double braces): leave as is.
        if author.startswith('{') and author.endswith('}'):
            new_authors.append(author)
            continue

        # Single-word author that starts with uppercase,
        # e.g., "KimiTeam", "OpenAI", "Google"
        stripped = author.strip('{}')
        is_single_word = (
            len(stripped) > 1
            and stripped[0].isupper()
            # Tabs/newlines separate name parts just like spaces do.
            and not any(ch.isspace() for ch in stripped)
            # "Last,First" is a personal name; inner braces (e.g.
            # "{Open}AI") would yield unbalanced braces once wrapped.
            and not any(ch in '{},' for ch in stripped)
        )
        if not is_single_word:
            new_authors.append(author)
            continue

        wrapped = '{{' + stripped + '}}'
        fixes.append(SanitizeFix(
            entry_key=entry.key,
            category="corporate_author",
            field="author",
            description=f"Corporate author protected: '{author}' → '{wrapped}'",
            old_value=author,
            new_value=wrapped,
        ))
        new_authors.append(wrapped)

    if fixes:
        new_author_str = " and ".join(new_authors)
        entry.author = new_author_str
        if 'author' in entry.raw_entry:
            entry.raw_entry['author'] = new_author_str

    return fixes
|
| 165 |
+
|
| 166 |
+
# ------------------------------------------------------------------
|
| 167 |
+
# Check 3: Entry Type Correction (article → inproceedings)
|
| 168 |
+
# ------------------------------------------------------------------
|
| 169 |
+
def _check_entry_type(self, entry: BibEntry) -> List[SanitizeFix]:
    """
    Detect conference papers incorrectly typed as @article.

    Heuristics:
    - Has booktitle field → should be inproceedings
    - Journal field contains a conference keyword → move to booktitle

    Keywords are matched as whole words (an optional plural "s" is
    allowed, and adjacent digits are tolerated so "ICML2020" still
    matches). A plain substring test would treat the journal "TACL" as
    the conference "acl".

    Only entries whose type is "article" are examined. Returns a list with
    at most one SanitizeFix; the entry and its raw_entry are updated
    in-place when a correction is made.
    """
    fixes = []

    if entry.entry_type.lower() != 'article':
        return fixes

    # Case 1: Has booktitle but typed as article
    if entry.booktitle:
        old_type = entry.entry_type
        entry.entry_type = 'inproceedings'
        if 'ENTRYTYPE' in entry.raw_entry:
            entry.raw_entry['ENTRYTYPE'] = 'inproceedings'
        fixes.append(SanitizeFix(
            entry_key=entry.key,
            category="entry_type",
            field="ENTRYTYPE",
            description=f"Entry has booktitle but was @{old_type} → @inproceedings",
            old_value=old_type,
            new_value='inproceedings',
        ))
        return fixes

    # Case 2: Journal field contains conference keywords
    if not entry.journal:
        return fixes

    journal_lower = entry.journal.lower()
    matched_keyword = None
    for keyword in CONFERENCE_KEYWORDS:
        # No letter may touch the keyword on either side.
        whole_word = r'(?<![a-z])' + re.escape(keyword) + r's?(?![a-z])'
        if re.search(whole_word, journal_lower):
            matched_keyword = keyword
            break

    if matched_keyword:
        old_type = entry.entry_type
        old_journal = entry.journal

        # Move journal → booktitle
        entry.booktitle = entry.journal
        entry.journal = ""
        entry.entry_type = 'inproceedings'

        # Update raw_entry
        if 'ENTRYTYPE' in entry.raw_entry:
            entry.raw_entry['ENTRYTYPE'] = 'inproceedings'
        entry.raw_entry['booktitle'] = old_journal
        if 'journal' in entry.raw_entry:
            del entry.raw_entry['journal']

        fixes.append(SanitizeFix(
            entry_key=entry.key,
            category="entry_type",
            field="ENTRYTYPE",
            description=(
                f"@{old_type} → @inproceedings "
                f"(journal '{old_journal}' contains '{matched_keyword}', moved to booktitle)"
            ),
            old_value=old_type,
            new_value='inproceedings',
        ))

    return fixes
|
| 236 |
+
|
| 237 |
+
# ------------------------------------------------------------------
|
| 238 |
+
# Check 4: DOI-Title Cross-Validation
|
| 239 |
+
# ------------------------------------------------------------------
|
| 240 |
+
def check_doi_title_match(self, entry: BibEntry, fetched_data: Any) -> List[SanitizeFix]:
    """
    Validate that a DOI resolves to the same paper as the bib entry.

    Called during the fetch phase (requires network), not during
    the offline sanitize phase.

    If the DOI metadata title doesn't match the bib entry title
    (similarity below 0.5), flag the DOI as potentially wrong and
    remove it from the entry and its raw_entry.

    Nothing is done when there is no basis for a comparison: no DOI, no
    fetched data, or an empty title on either side. Without this, an
    entry that simply lacks a title would have a possibly correct DOI
    deleted as a "mismatch".

    Args:
        entry: the bib entry to validate (modified in-place on mismatch).
        fetched_data: object whose ``title`` attribute holds the title the
            DOI resolved to; a missing attribute is treated as no title.

    Returns:
        A list with one "doi_mismatch" SanitizeFix, or an empty list.
    """
    fixes = []
    if not entry.doi or not fetched_data:
        return fixes

    fetched_title = getattr(fetched_data, 'title', '')
    if not fetched_title:
        return fixes

    # No bib title to compare against, so a mismatch cannot be established.
    if not (entry.title or '').strip():
        return fixes

    bib_title_norm = TextNormalizer.normalize_for_comparison(entry.title)
    doi_title_norm = TextNormalizer.normalize_for_comparison(fetched_title)

    # Same reasoning if normalization left nothing to compare.
    if not bib_title_norm or not doi_title_norm:
        return fixes

    similarity = TextNormalizer.similarity_ratio(bib_title_norm, doi_title_norm)
    # For titles under 100 characters, also try the Levenshtein-based
    # score and keep the better of the two.
    if len(bib_title_norm) < 100:
        lev_sim = TextNormalizer.levenshtein_similarity(bib_title_norm, doi_title_norm)
        similarity = max(similarity, lev_sim)

    if similarity < 0.5:
        old_doi = entry.doi
        fixes.append(SanitizeFix(
            entry_key=entry.key,
            category="doi_mismatch",
            field="doi",
            description=(
                f"DOI '{old_doi}' resolves to a different title "
                f"('{fetched_title[:60]}...' vs '{entry.title[:60]}...'). "
                f"Similarity: {similarity:.0%}. DOI removed."
            ),
            old_value=old_doi,
            new_value="",
        ))
        entry.doi = ""
        if 'doi' in entry.raw_entry:
            del entry.raw_entry['doi']

    return fixes
|
| 285 |
+
|
| 286 |
+
# ------------------------------------------------------------------
|
| 287 |
+
# Check 5: Title Capitalization Protection (for IEEEtran)
|
| 288 |
+
# ------------------------------------------------------------------
|
| 289 |
+
|
| 290 |
+
# Pattern: 2+ uppercase letters (acronyms like MMAU, SALMONN, GPT, BEATs)
_ACRONYM_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{2,}[a-z]?(?:[\.-][A-Za-z0-9]+)*)(?![A-Za-z0-9])')

# Pattern: CamelCase words (SpeechT5, HuBERT, ChatGPT, AudioPaLM)
_CAMELCASE_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z][a-z]+(?:[\.-]?[A-Z][a-z]*)+)(?![A-Za-z0-9])')

# Pattern: Word with mixed case + digits, optionally with dots/hyphens (GPT-4o, Llama3, Qwen2.5-Omni)
_MIXED_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z][A-Za-z0-9]*(?:[\.-][A-Za-z0-9]+)*\d[A-Za-z0-9]*(?:[\.-][A-Za-z0-9]+)*)(?![A-Za-z0-9])')

def _check_title_capitalization(self, entry: BibEntry) -> List[SanitizeFix]:
    """
    Brace-protect acronyms and proper nouns found in the title.

    IEEEtran's .bst forces titles to sentence case.
    Without braces, "SALMONN" becomes "salmonn".

    Candidate words are everything matched by the acronym, CamelCase and
    mixed-case+digit patterns. Each candidate that is not already present
    as ``{word}`` is wrapped in a single pair of braces wherever it occurs
    as a whole word not directly adjacent to a brace.

    Returns a list with one "title_case" SanitizeFix when the title
    changed (entry.title and raw_entry['title'] are updated), else [].
    """
    if not entry.title:
        return []

    original_title = entry.title

    # Collect candidates pattern by pattern: acronyms (e.g., MMAU, CREMA-D),
    # CamelCase (e.g., SpeechT5, HuBERT), mixed-case+digits (e.g., GPT4).
    words_to_protect = set()
    for matcher in (self._ACRONYM_RE, self._CAMELCASE_RE, self._MIXED_RE):
        for found in matcher.finditer(original_title):
            words_to_protect.add(found.group(1))

    if not words_to_protect:
        return []

    # Longest words first, so a short candidate that is also the start of
    # a longer one is looked at only after the longer one has been braced.
    working = original_title
    protected_words = []
    for word in sorted(words_to_protect, key=len, reverse=True):
        # Already braced somewhere in the title ('{{word}}' contains
        # '{word}', so this one test covers double braces too).
        if '{' + word + '}' in working:
            continue

        # Whole-word occurrences not directly touching a brace.
        bare_word = re.compile(r'(?<!\{)\b' + re.escape(word) + r'\b(?!\})')
        replaced, hits = bare_word.subn('{' + word + '}', working)
        if hits:
            working = replaced
            protected_words.append(word)

    fixes = []
    if protected_words and working != original_title:
        fixes.append(SanitizeFix(
            entry_key=entry.key,
            category="title_case",
            field="title",
            description=f"Protected capitalization: {', '.join(protected_words)}",
            old_value=original_title,
            new_value=working,
        ))
        entry.title = working
        if 'title' in entry.raw_entry:
            entry.raw_entry['title'] = working

    return fixes
|
| 372 |
+
|
| 373 |
+
# ------------------------------------------------------------------
|
| 374 |
+
# Check 6: Future Year Detection
|
| 375 |
+
# ------------------------------------------------------------------
|
| 376 |
+
def _check_future_year(self, entry: BibEntry) -> List[SanitizeFix]:
    """
    Detect entries with year > current year (or suspiciously old years).

    Future years are likely arXiv submission dates that will be wrong once
    the paper is published at a conference. Such entries get
    ``entry._force_api_lookup = True`` so the correct conference year
    can be found by an API lookup.

    Years before 1950 are reported as suspicious but the entry is not
    flagged. Both findings use the category "future_year". The year
    itself is never modified here.

    The current year is read at call time, so a long-running process
    does not keep using the year in which the module was imported.
    Non-decimal year strings are ignored.
    """
    fixes = []
    year_str = str(entry.year).strip()
    # isdecimal() rather than isdigit(): the latter also accepts characters
    # such as superscript digits, which int() cannot convert.
    if not year_str or not year_str.isdecimal():
        return fixes

    year = int(year_str)
    current_year = datetime.now().year

    if year > current_year:
        # Flag the entry for forced API lookup
        entry._force_api_lookup = True
        fixes.append(SanitizeFix(
            entry_key=entry.key,
            category="future_year",
            field="year",
            description=(
                f"Future year {year} detected (current: {current_year}). "
                f"Will force API lookup to find correct year."
            ),
            old_value=year_str,
            new_value="",  # Will be resolved by API
        ))
    elif year < 1950:
        fixes.append(SanitizeFix(
            entry_key=entry.key,
            category="future_year",
            field="year",
            description=f"Suspiciously old year: {year}",
            old_value=year_str,
            new_value="",
        ))

    return fixes
|
| 416 |
+
|
| 417 |
+
# ------------------------------------------------------------------
|
| 418 |
+
# Check 7: Field Cleanup Policy
|
| 419 |
+
# ------------------------------------------------------------------
|
| 420 |
+
# Fields to remove, keyed by lower-case entry type
FIELD_REMOVE_POLICY = {
    "inproceedings": [
        "address", "month", "abstract",
        "archiveprefix", "primaryclass",
        "biburl", "bibsource", "timestamp",
        "copyright", "issn", "isbn",
    ],
    "article": [
        "address", "month", "abstract",
        "archiveprefix", "primaryclass",
        "biburl", "bibsource", "timestamp",
        "copyright", "issn",
    ],
    "misc": [
        "address", "month", "abstract",
        "biburl", "bibsource", "timestamp",
        "copyright",
    ],
}

def _clean_entry_fields(self, entry: BibEntry) -> List[SanitizeFix]:
    """
    Drop junk/noise fields that crawlers often include.
    These fields add clutter and can cause formatting issues.

    The fields to drop are looked up in FIELD_REMOVE_POLICY by the
    lower-cased entry type; types without a policy are left alone.
    Matching against raw_entry keys is case-insensitive, and only
    raw_entry is modified.

    Returns a list with a single "field_cleanup" SanitizeFix naming all
    removed keys, or an empty list when nothing was removed.
    """
    unwanted = self.FIELD_REMOVE_POLICY.get(entry.entry_type.lower(), [])

    removed_fields = []
    for field_name in unwanted:
        target = field_name.lower()
        # Snapshot the matching keys first so deletion does not disturb
        # iteration; 'ID' and 'ENTRYTYPE' are never removed.
        doomed = [
            raw_key
            for raw_key in list(entry.raw_entry.keys())
            if raw_key.lower() == target and raw_key not in ('ID', 'ENTRYTYPE')
        ]
        for raw_key in doomed:
            del entry.raw_entry[raw_key]
            removed_fields.append(raw_key)

    if not removed_fields:
        return []

    return [SanitizeFix(
        entry_key=entry.key,
        category="field_cleanup",
        field="multiple",
        description=f"Removed junk fields: {', '.join(removed_fields)}",
        old_value=", ".join(removed_fields),
        new_value="",
    )]
|
| 469 |
+
|
| 470 |
+
# ------------------------------------------------------------------
|
| 471 |
+
# Standalone: Duplicate Detection
|
| 472 |
+
# ------------------------------------------------------------------
|
| 473 |
+
@staticmethod
|
| 474 |
+
def find_duplicates(entries: List[BibEntry]) -> dict:
|
| 475 |
+
"""
|
| 476 |
+
Find entries that share the same normalized title.
|
| 477 |
+
Returns {normalized_title: [key1, key2, ...]} for duplicates.
|
| 478 |
+
"""
|
| 479 |
+
import re as _re
|
| 480 |
+
from collections import defaultdict
|
| 481 |
+
|
| 482 |
+
def _norm(t: str) -> str:
|
| 483 |
+
t = _re.sub(r'\{([^}]*)\}', r'\1', t)
|
| 484 |
+
t = _re.sub(r'[^\w\s]', ' ', t.lower())
|
| 485 |
+
return _re.sub(r'\s+', ' ', t).strip()
|
| 486 |
+
|
| 487 |
+
title_map = defaultdict(list)
|
| 488 |
+
for entry in entries:
|
| 489 |
+
key = _norm(entry.title)
|
| 490 |
+
if key:
|
| 491 |
+
title_map[key].append(entry.key)
|
| 492 |
+
|
| 493 |
+
return {t: keys for t, keys in title_map.items() if len(keys) > 1}
|
src/space_service.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Non-interactive RefCheck workflow for Hugging Face Spaces.
|
| 3 |
+
"""
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import tempfile
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from functools import lru_cache
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any
|
| 11 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 12 |
+
|
| 13 |
+
from main import (
|
| 14 |
+
apply_fix,
|
| 15 |
+
apply_local_fix,
|
| 16 |
+
get_default_workflow,
|
| 17 |
+
validate_entry,
|
| 18 |
+
)
|
| 19 |
+
from src.comparator import EntryReport, MetadataComparator
|
| 20 |
+
from src.fetcher import (
|
| 21 |
+
ArxivFetcher,
|
| 22 |
+
CrossRefFetcher,
|
| 23 |
+
DBLPFetcher,
|
| 24 |
+
OpenAlexFetcher,
|
| 25 |
+
ScholarFetcher,
|
| 26 |
+
SemanticScholarFetcher,
|
| 27 |
+
)
|
| 28 |
+
from src.local_db import LocalConferenceDB
|
| 29 |
+
from src.parser import BibEntry, BibParser
|
| 30 |
+
from src.sanitizer import BibSanitizer, SanitizeFix
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass
class RefCheckOptions:
    """Options for a non-interactive RefCheck run."""

    # When True, entries for which no metadata was found are dropped from the
    # output and listed as removed; when False they are kept and listed for review.
    remove_unverified: bool = True
    # Copied onto the `enabled` flag of the workflow step named "google_scholar".
    enable_google_scholar: bool = False
    # Upper bound on validation worker threads (clamped to at least 1 and at
    # most the number of entries when the pool is created).
    max_workers: int = 4
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@dataclass
class RefCheckResult:
    """Artifacts and summary produced by a Space run."""

    # Number of entries parsed from the uploaded file.
    total_input: int = 0
    # Number of entries written to the fixed .bib file.
    total_output: int = 0
    # Counts taken from re-validating the fixed file.
    verified: int = 0
    issues: int = 0
    not_found: int = 0
    # entry key -> human-readable descriptions of changes applied to that entry.
    fixed_details: dict[str, list[str]] = field(default_factory=dict)
    # (entry key, title, reason) for each entry dropped from the output.
    removed_details: list[tuple[str, str, str]] = field(default_factory=list)
    # One dict per entry needing manual review: key, title, reason, candidates.
    review_details: list[dict[str, Any]] = field(default_factory=list)
    # normalized title -> keys of the entries sharing that title.
    duplicate_details: dict[str, list[str]] = field(default_factory=dict)
    # entry key -> sanitizer fixes reported for that entry.
    sanitize_fixes: dict[str, list[SanitizeFix]] = field(default_factory=dict)
    # Number of entries matched in the local conference DB.
    local_matches: int = 0
    # Whether the local conference DB was loaded for this run.
    local_db_loaded: bool = False
    # Filesystem paths of the generated artifacts, plus the report text itself.
    fixed_bib_path: str = ""
    report_path: str = ""
    report_markdown: str = ""
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def run_refcheck_file(file_path: str | Path, options: RefCheckOptions | None = None) -> RefCheckResult:
    """Validate and fix an uploaded BibTeX file without interactive prompts.

    Pipeline: parse -> sanitize -> duplicate detection -> local DB fixes ->
    API validation of the remaining entries -> per-entry action
    (fix / keep / review / remove) -> write the fixed .bib -> re-parse and
    re-validate the written file -> build and write the Markdown report.

    Args:
        file_path: Path of the BibTeX file to process.
        options: Run options; defaults to ``RefCheckOptions()`` when omitted.

    Returns:
        A ``RefCheckResult`` carrying the counts, per-entry details and the
        paths of the generated .bib and report files.
    """
    options = options or RefCheckOptions()
    source_path = Path(file_path)
    parser = BibParser()
    entries = parser.parse_file(str(source_path))
    result = RefCheckResult(total_input=len(entries))

    # Nothing parsed: still emit a report and an (empty) .bib so the caller
    # always receives both artifact paths.
    if not entries:
        result.report_markdown = "## RefCheck Report\n\nNo BibTeX entries were found."
        result.report_path = _write_report(result.report_markdown)
        result.fixed_bib_path = _write_bib(parser, [], source_path.stem)
        return result

    sanitizer = BibSanitizer()
    result.sanitize_fixes = sanitizer.sanitize_all(entries)
    _record_sanitize_fixes(result.fixed_details, result.sanitize_fixes)
    result.duplicate_details = sanitizer.find_duplicates(entries)

    # api_entries are the entries the local DB could not resolve (all of them
    # when the DB is not loaded); only those go through the API workflow.
    result.local_db_loaded, api_entries, result.local_matches = _apply_local_db(entries, result.fixed_details)

    fetchers = _build_fetchers()
    workflow = get_default_workflow()
    for step in workflow.steps:
        if step.name == "google_scholar":
            step.enabled = options.enable_google_scholar

    comparator = MetadataComparator()
    analysis = _analyze_entries(api_entries, workflow, fetchers, comparator, options.max_workers)

    # entry key -> (action, best comparison result, candidate list).
    # NOTE(review): keyed by entry.key, so entries sharing a key share one
    # action (the last one analysed wins) — confirm keys are unique upstream.
    actions: dict[str, tuple[str, Any, list[Any]]] = {}

    # Decision ladder, first match wins:
    #   no result at all                         -> keep unchanged
    #   flagged _force_api_lookup + fetched data -> fix
    #   confidence > 0.85 + fetched data         -> fix
    #   comparator reports a match               -> keep
    #   some candidates exist                    -> review
    #   otherwise                                -> remove
    for entry, best_result, candidates in analysis:
        if not best_result:
            actions[entry.key] = ("keep", None, [])
        elif getattr(entry, "_force_api_lookup", False) and best_result.fetched_data:
            actions[entry.key] = ("fix", best_result, candidates)
        elif best_result.confidence > 0.85 and best_result.fetched_data:
            actions[entry.key] = ("fix", best_result, candidates)
        elif best_result.is_match:
            actions[entry.key] = ("keep", best_result, candidates)
        elif candidates:
            actions[entry.key] = ("review", best_result, candidates)
        else:
            actions[entry.key] = ("remove", best_result, candidates)

    updated_entries: list[BibEntry] = []

    # Walk the original list so output order follows input order; entries
    # without a recorded action (e.g. resolved by the local DB) default to "keep".
    for entry in entries:
        action, best_result, candidates = actions.get(entry.key, ("keep", None, []))

        if action == "fix":
            changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates)
            if changes:
                result.fixed_details.setdefault(entry.key, []).extend(changes)
            updated_entries.append(entry)
        elif action == "review":
            result.review_details.append(_review_payload(entry, best_result, candidates))
            updated_entries.append(entry)
        elif action == "remove":
            if options.remove_unverified:
                # Dropped from the output: recorded but not appended.
                result.removed_details.append((entry.key, entry.title, "No matching metadata found in any source"))
            else:
                result.review_details.append(
                    {
                        "key": entry.key,
                        "title": entry.title,
                        "reason": "No matching metadata found in any source",
                        "candidates": [],
                    }
                )
                updated_entries.append(entry)
        else:
            updated_entries.append(entry)

    result.total_output = len(updated_entries)
    fixed_path = _write_bib(parser, updated_entries, source_path.stem)
    result.fixed_bib_path = fixed_path

    # Second pass: re-parse what was actually written and validate it again so
    # the summary counts describe the output file rather than the input.
    verified_entries = parser.parse_file(fixed_path)
    verification_reports = _verify_entries(
        verified_entries,
        workflow,
        fetchers,
        comparator,
        options.max_workers,
    )
    # NOTE(review): all three counters require a truthy comparison, so entries
    # whose validation produced no result are counted in none of them.
    result.verified = sum(1 for r in verification_reports if r.comparison and r.comparison.is_match)
    result.issues = sum(1 for r in verification_reports if r.comparison and r.comparison.has_issues)
    result.not_found = sum(
        1
        for r in verification_reports
        if r.comparison and not r.comparison.is_match and not r.comparison.has_issues
    )

    result.report_markdown = _build_report(result, verification_reports)
    result.report_path = _write_report(result.report_markdown)
    return result
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def _build_fetchers() -> dict[str, Any]:
    """Create one fresh fetcher instance per metadata source, keyed by source name."""
    fetchers: dict[str, Any] = {}
    fetchers["arxiv"] = ArxivFetcher()
    fetchers["crossref"] = CrossRefFetcher()
    fetchers["scholar"] = ScholarFetcher()
    fetchers["semantic"] = SemanticScholarFetcher()
    fetchers["openalex"] = OpenAlexFetcher()
    fetchers["dblp"] = DBLPFetcher()
    return fetchers
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def _analyze_entries(
|
| 175 |
+
entries: list[BibEntry],
|
| 176 |
+
workflow: Any,
|
| 177 |
+
fetchers: dict[str, Any],
|
| 178 |
+
comparator: MetadataComparator,
|
| 179 |
+
max_workers: int,
|
| 180 |
+
) -> list[tuple[BibEntry, Any, list[Any]]]:
|
| 181 |
+
if not entries:
|
| 182 |
+
return []
|
| 183 |
+
|
| 184 |
+
analysis: list[tuple[BibEntry, Any, list[Any]]] = []
|
| 185 |
+
worker_count = min(max(1, max_workers), len(entries))
|
| 186 |
+
with ThreadPoolExecutor(max_workers=worker_count) as executor:
|
| 187 |
+
futures = {
|
| 188 |
+
executor.submit(validate_entry, entry, workflow, fetchers, comparator): entry
|
| 189 |
+
for entry in entries
|
| 190 |
+
}
|
| 191 |
+
for future in as_completed(futures):
|
| 192 |
+
entry = futures[future]
|
| 193 |
+
try:
|
| 194 |
+
best_result, candidates = future.result()
|
| 195 |
+
except Exception:
|
| 196 |
+
best_result, candidates = None, []
|
| 197 |
+
analysis.append((entry, best_result, candidates))
|
| 198 |
+
return analysis
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def _verify_entries(
    entries: list[BibEntry],
    workflow: Any,
    fetchers: dict[str, Any],
    comparator: MetadataComparator,
    max_workers: int,
) -> list[EntryReport]:
    """Run the analysis pass and wrap each outcome in an ``EntryReport``.

    Only the best comparison result is kept; the candidate list is discarded.
    """
    outcomes = _analyze_entries(entries, workflow, fetchers, comparator, max_workers)
    return [
        EntryReport(entry=entry, comparison=best_result)
        for entry, best_result, _candidates in outcomes
    ]
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def _record_sanitize_fixes(
|
| 215 |
+
fixed_details: dict[str, list[str]],
|
| 216 |
+
sanitize_fixes: dict[str, list[SanitizeFix]],
|
| 217 |
+
) -> None:
|
| 218 |
+
for key, fixes in sanitize_fixes.items():
|
| 219 |
+
fixed_details.setdefault(key, [])
|
| 220 |
+
fixed_details[key].extend(fix.description for fix in fixes)
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def _apply_local_db(
    entries: list[BibEntry],
    fixed_details: dict[str, list[str]],
) -> tuple[bool, list[BibEntry], int]:
    """Fix entries from the local conference DB and report what is left.

    Returns ``(db_loaded, remaining_entries, match_count)``. When the DB is
    not loaded the input list is returned untouched with a count of 0.
    Otherwise every entry whose title is found gets ``apply_local_fix``
    applied (changes are appended to ``fixed_details``), and only the
    unmatched entries are returned for API lookup.
    """
    db = _load_local_db()
    if not db.is_loaded:
        return False, entries, 0

    unmatched = []
    hits = 0
    for entry in entries:
        official = db.lookup(entry.title)
        if official:
            changes = apply_local_fix(entry, official)
            hits += 1
            if changes:
                fixed_details.setdefault(entry.key, []).extend(changes)
        else:
            unmatched.append(entry)

    return True, unmatched, hits
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
@lru_cache(maxsize=1)
def _load_local_db() -> LocalConferenceDB:
    """Create and load the local conference DB; the instance is cached and reused by later calls."""
    db = LocalConferenceDB()
    db.load()
    return db
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def _review_payload(entry: BibEntry, best_result: Any, candidates: list[Any]) -> dict[str, Any]:
|
| 255 |
+
return {
|
| 256 |
+
"key": entry.key,
|
| 257 |
+
"title": entry.title,
|
| 258 |
+
"reason": "; ".join(best_result.issues) if best_result and best_result.issues else "Ambiguous match",
|
| 259 |
+
"candidates": [
|
| 260 |
+
{
|
| 261 |
+
"source": candidate.source,
|
| 262 |
+
"confidence": candidate.confidence,
|
| 263 |
+
"title": getattr(candidate.fetched_data, "title", ""),
|
| 264 |
+
"year": getattr(candidate.fetched_data, "year", ""),
|
| 265 |
+
"doi": getattr(candidate.fetched_data, "doi", ""),
|
| 266 |
+
}
|
| 267 |
+
for candidate in candidates[:5]
|
| 268 |
+
],
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def _write_bib(parser: BibParser, entries: list[BibEntry], original_stem: str) -> str:
    """Save ``entries`` to a new temp directory and return the file path.

    The file is named ``<stem>_refcheck_fixed.bib``; an empty stem falls back
    to ``references``.
    """
    stem = original_stem or 'references'
    target = str(Path(tempfile.mkdtemp(prefix="refcheck_")) / f"{stem}_refcheck_fixed.bib")
    parser.save_entries(target, entries)
    return target
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def _write_report(markdown: str) -> str:
|
| 280 |
+
out_dir = Path(tempfile.mkdtemp(prefix="refcheck_report_"))
|
| 281 |
+
out_path = out_dir / "refcheck_report.md"
|
| 282 |
+
out_path.write_text(markdown, encoding="utf-8")
|
| 283 |
+
return str(out_path)
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def _build_report(result: RefCheckResult, reports: list[EntryReport]) -> str:
|
| 287 |
+
lines = [
|
| 288 |
+
"## RefCheck Report",
|
| 289 |
+
"",
|
| 290 |
+
"### Summary",
|
| 291 |
+
"",
|
| 292 |
+
f"- Input entries: {result.total_input}",
|
| 293 |
+
f"- Output entries: {result.total_output}",
|
| 294 |
+
f"- Verified after fix: {result.verified}",
|
| 295 |
+
f"- Remaining issues: {result.issues}",
|
| 296 |
+
f"- Not found after fix: {result.not_found}",
|
| 297 |
+
f"- Local DB loaded: {'yes' if result.local_db_loaded else 'no'}",
|
| 298 |
+
f"- Local DB matches: {result.local_matches}",
|
| 299 |
+
"",
|
| 300 |
+
]
|
| 301 |
+
|
| 302 |
+
if result.removed_details:
|
| 303 |
+
lines.extend(["### Removed", ""])
|
| 304 |
+
for key, title, reason in result.removed_details:
|
| 305 |
+
lines.append(f"- `{key}`: {title} ({reason})")
|
| 306 |
+
lines.append("")
|
| 307 |
+
|
| 308 |
+
if result.fixed_details:
|
| 309 |
+
lines.extend(["### Fixed", ""])
|
| 310 |
+
for key, changes in sorted(result.fixed_details.items()):
|
| 311 |
+
lines.append(f"- `{key}`")
|
| 312 |
+
for change in changes:
|
| 313 |
+
lines.append(f" - {change}")
|
| 314 |
+
lines.append("")
|
| 315 |
+
|
| 316 |
+
if result.duplicate_details:
|
| 317 |
+
lines.extend(["### Duplicate Titles", ""])
|
| 318 |
+
for title, keys in result.duplicate_details.items():
|
| 319 |
+
lines.append(f"- `{', '.join(keys)}`: {title}")
|
| 320 |
+
lines.append("")
|
| 321 |
+
|
| 322 |
+
if result.review_details:
|
| 323 |
+
lines.extend(["### Needs Review", ""])
|
| 324 |
+
for item in result.review_details:
|
| 325 |
+
lines.append(f"- `{item['key']}`: {item['title']}")
|
| 326 |
+
lines.append(f" - Reason: {item['reason']}")
|
| 327 |
+
for candidate in item["candidates"]:
|
| 328 |
+
lines.append(
|
| 329 |
+
" - Candidate: "
|
| 330 |
+
f"{candidate['source']} "
|
| 331 |
+
f"(confidence {candidate['confidence']:.2f}) "
|
| 332 |
+
f"{candidate['title']} "
|
| 333 |
+
f"{candidate['year']} "
|
| 334 |
+
f"{candidate['doi']}".strip()
|
| 335 |
+
)
|
| 336 |
+
lines.append("")
|
| 337 |
+
|
| 338 |
+
remaining = [
|
| 339 |
+
report
|
| 340 |
+
for report in reports
|
| 341 |
+
if report.comparison and not report.comparison.is_match
|
| 342 |
+
]
|
| 343 |
+
if remaining:
|
| 344 |
+
lines.extend(["### Verification Issues", ""])
|
| 345 |
+
for report in remaining:
|
| 346 |
+
comparison = report.comparison
|
| 347 |
+
issues = "; ".join(comparison.issues) if comparison.issues else "Not matched"
|
| 348 |
+
lines.append(
|
| 349 |
+
f"- `{report.entry.key}` via {comparison.source} "
|
| 350 |
+
f"(confidence {comparison.confidence:.2f}): {issues}"
|
| 351 |
+
)
|
| 352 |
+
lines.append("")
|
| 353 |
+
|
| 354 |
+
return "\n".join(lines).strip() + "\n"
|
src/ui.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rich.console import Console
|
| 2 |
+
from rich.table import Table
|
| 3 |
+
from rich.panel import Panel
|
| 4 |
+
from rich.tree import Tree
|
| 5 |
+
import copy
|
| 6 |
+
|
| 7 |
+
class BibUI:
    """Terminal presentation layer for BibGuard, rendered with rich."""

    def __init__(self):
        self.console = Console()

    @staticmethod
    def _clip(text):
        """Return ``text`` cut to 60 characters plus '...' when it is longer than 60."""
        return text[:60] + '...' if len(text) > 60 else text

    def show_analysis_report(self, ok_entries, to_fix, to_review, to_remove):
        """Print the per-category counts of the initial analysis."""
        summary = Table(title="📊 Analysis Report", show_header=True, header_style="bold magenta")
        summary.add_column("Category", style="cyan")
        summary.add_column("Count", justify="right")
        summary.add_column("Description")

        rows = (
            ("✅ Correct", ok_entries, "Entries match valid metadata"),
            ("🛠️ To Fix", to_fix, "[green]High confidence auto-fixes[/green]"),
            ("🔍 Review", to_review, "[yellow]Ambiguous or low confidence[/yellow]"),
            ("🗑️ Remove", to_remove, "[red]No metadata found (Hallucinations)[/red]"),
        )
        for label, bucket, blurb in rows:
            summary.add_row(label, str(len(bucket)), blurb)

        self.console.print(summary)

        if to_fix or to_review or to_remove:
            return
        self.console.print(Panel("[green]✓ No issues found. All entries are valid.[/green]", title="Status"))

    def show_manual_review(self, entry, best_res, candidates, apply_fix_func):
        """Print one entry followed by a table of its candidate matches.

        ``apply_fix_func`` is injected by the caller; it is run against a deep
        copy of the entry for every candidate, so the changes can be previewed
        without modifying the real entry.
        """
        out = self.console
        out.print(f"\n[bold]Entry: {entry.key}[/bold]")
        out.print(f"Title: {entry.title}")
        out.print(f"Year: {entry.year}")
        out.print(f"Auth: {entry.author}")

        grid = Table(show_header=True, header_style="bold blue")
        grid.add_column("#", style="dim", width=4)
        grid.add_column("Source", style="cyan", width=12)
        grid.add_column("Conf", justify="right")
        grid.add_column("Candidate Metadata (Fetched)", style="white")
        grid.add_column("Proposed Changes", style="green")

        for position, cand in enumerate(candidates, 1):
            # Dry-run the fix on a throwaway copy to learn what would change.
            scratch = copy.deepcopy(entry)
            changes = apply_fix_func(scratch, cand.fetched_data)
            if changes:
                change_desc = "\n".join(changes)
            else:
                change_desc = "[dim]No changes[/dim]"

            if cand.confidence > 0.7:
                conf_style = "green"
            elif cand.confidence > 0.4:
                conf_style = "yellow"
            else:
                conf_style = "red"

            # Summarise whichever metadata fields the candidate actually has.
            fetched = cand.fetched_data
            details = []
            if getattr(fetched, 'title', None):
                details.append(f"[bold]Title:[/bold] {self._clip(fetched.title)}")
            if getattr(fetched, 'authors', None):
                details.append(f"[bold]Authors:[/bold] {self._clip(' and '.join(fetched.authors))}")
            if getattr(fetched, 'year', None):
                details.append(f"[bold]Year:[/bold] {fetched.year}")
            if getattr(fetched, 'doi', None):
                details.append(f"[bold]DOI:[/bold] {fetched.doi}")
            meta_desc = "\n".join(details) if details else "[dim]No metadata details[/dim]"

            grid.add_row(
                str(position),
                cand.source,
                f"[{conf_style}]{cand.confidence:.2f}[/{conf_style}]",
                meta_desc,
                change_desc,
            )

        out.print(grid)

    def show_final_report(self, total, verified, issues, not_found, reports, fixed_count, fixed_details, removed_details):
        """Print the final counts, any remaining issues and the modification tree."""
        out = self.console

        metrics = Table(box=None, padding=(0, 2))
        metrics.add_column("Metric", style="bold")
        metrics.add_column("Value", justify="right")
        metrics.add_row("Total Entries", str(total))
        metrics.add_row("Verified", f"[green]{verified}[/green]")
        metrics.add_row("Issues", f"[red]{issues}[/red]" if issues > 0 else "0")
        metrics.add_row("Not Found", f"[yellow]{not_found}[/yellow]" if not_found > 0 else "0")

        out.print(Panel(metrics, title="📊 Final Status", expand=False))

        if issues > 0:
            out.print("\n[bold red]⚠ Remaining Issues (Not Auto-Fixed):[/bold red]")
            for report in reports:
                comparison = report.comparison
                if comparison and comparison.has_issues:
                    out.print(f" - [bold]{report.entry.key}[/bold] (Conf: {comparison.confidence:.2f}): {', '.join(comparison.issues)}")

        if not (fixed_count > 0 or removed_details):
            out.print("\n[green]✓ No changes were needed.[/green]")
            return

        # Something was fixed or removed: show it as a tree.
        tree = Tree("✏️ Modifications Report")

        if removed_details:
            removed_branch = tree.add(f"[red]Removed {len(removed_details)} entries[/red]")
            for entry, reason in removed_details:
                removed_branch.add(f"[bold]{entry.key}[/bold]: \"{entry.title}\" ([italic]{reason}[/italic])")

        if fixed_count > 0:
            fixed_branch = tree.add(f"[green]Fixed {fixed_count} entries[/green]")
            for key, changes in fixed_details.items():
                entry_branch = fixed_branch.add(f"[bold]{key}[/bold]")
                for change in changes:
                    entry_branch.add(change)

        out.print(tree)
        out.print("\n[green]✓ Changes applied and saved to file.[/green]")

    def show_sanitize_report(self, sanitize_fixes: dict):
        """Print the sanitizer fixes as a tree grouped by fix category."""
        if not sanitize_fixes:
            self.console.print("[green]✓ No formatting issues found.[/green]\n")
            return

        # category id -> (icon, label, colour) used for the branch heading
        category_info = {
            "dblp_id": ("🔢", "DBLP Disambiguation ID Cleanup", "red"),
            "corporate_author": ("🏢", "Corporate Author Protection", "yellow"),
            "entry_type": ("📋", "Entry Type Correction", "cyan"),
            "title_case": ("🔤", "Title Capitalization Protection", "blue"),
            "doi_mismatch": ("🔗", "DOI Mismatch", "red"),
            "future_year": ("📅", "Future Year Detection", "magenta"),
            "field_cleanup": ("🧹", "Junk Field Removal", "dim"),
        }

        total_fixes = sum(len(fixes) for fixes in sanitize_fixes.values())
        tree = Tree(f"🧹 Sanitization Report ({total_fixes} fixes in {len(sanitize_fixes)} entries)")

        # Regroup the per-entry fixes by category, keeping first-seen order.
        by_category = {}
        for fixes in sanitize_fixes.values():
            for fix in fixes:
                by_category.setdefault(fix.category, []).append(fix)

        for cat, fixes in by_category.items():
            icon, label, color = category_info.get(cat, ("❓", cat, "white"))
            branch = tree.add(f"{icon} [{color}]{label} ({len(fixes)})[/{color}]")
            for fix in fixes:
                branch.add(f"[bold]{fix.entry_key}[/bold]: {fix.description}")

        self.console.print(tree)
        self.console.print("")
|
| 153 |
+
|
src/utils.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utilities for BibGuard: Normalization and Progress Display.
|
| 3 |
+
"""
|
| 4 |
+
import re
|
| 5 |
+
import unicodedata
|
| 6 |
+
import time
|
| 7 |
+
from contextlib import contextmanager
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from typing import Optional, List
|
| 10 |
+
from unidecode import unidecode
|
| 11 |
+
from rich.console import Console
|
| 12 |
+
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeElapsedColumn
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class TextNormalizer:
|
| 16 |
+
"""Utility class for normalizing text for comparison."""
|
| 17 |
+
|
| 18 |
+
# DBLP disambiguation ID pattern: 4-digit number at end of author name
|
| 19 |
+
# e.g. "Tian Tan 0019", "Wei Li 0119", "Zejun Ma 0001"
|
| 20 |
+
DBLP_DISAMBIG_PATTERN = re.compile(r'\s+\d{4}\s*$')
|
| 21 |
+
|
| 22 |
+
# LaTeX command patterns
|
| 23 |
+
LATEX_COMMANDS = [
|
| 24 |
+
(r'\\textbf\{([^}]*)\}', r'\1'),
|
| 25 |
+
(r'\\textit\{([^}]*)\}', r'\1'),
|
| 26 |
+
(r'\\emph\{([^}]*)\}', r'\1'),
|
| 27 |
+
(r'\\textrm\{([^}]*)\}', r'\1'),
|
| 28 |
+
(r'\\texttt\{([^}]*)\}', r'\1'),
|
| 29 |
+
(r'\\textsf\{([^}]*)\}', r'\1'),
|
| 30 |
+
(r'\\textsc\{([^}]*)\}', r'\1'),
|
| 31 |
+
(r'\\text\{([^}]*)\}', r'\1'),
|
| 32 |
+
(r'\\mathrm\{([^}]*)\}', r'\1'),
|
| 33 |
+
(r'\\mathbf\{([^}]*)\}', r'\1'),
|
| 34 |
+
(r'\\mathit\{([^}]*)\}', r'\1'),
|
| 35 |
+
(r'\\url\{([^}]*)\}', r'\1'),
|
| 36 |
+
(r'\\href\{[^}]*\}\{([^}]*)\}', r'\1'),
|
| 37 |
+
]
|
| 38 |
+
|
| 39 |
+
# LaTeX special character mappings
|
| 40 |
+
# Literal LaTeX sequence -> plain replacement. normalize_latex applies these
# with str.replace in dict order, so a longer sequence must come before any
# shorter sequence it contains.
LATEX_CHARS = {
    r'\&': '&',
    r'\%': '%',
    r'\$': '$',
    r'\#': '#',
    r'\_': '_',
    r'\{': '{',
    r'\}': '}',
    r'\~': '~',
    r'\^': '^',
    r'``': '"',
    r"''": '"',
    r'`': "'",
    r"'": "'",
    # '---' must precede '--'; otherwise '---' is rewritten to '–-' and the
    # em-dash replacement can never fire.
    r'---': '—',
    r'--': '–',
}
|
| 57 |
+
|
| 58 |
+
# LaTeX accent commands
|
| 59 |
+
# (regex, replacement) pairs that reduce an accented LaTeX letter to its base
# letter. The first group covers the bare form (\'e), the second the braced
# form (\'{e}).
LATEX_ACCENTS = [
    (r"\\'([aeiouAEIOU])", r'\1'),  # acute
    (r'\\`([aeiouAEIOU])', r'\1'),  # grave
    # The caret is escaped: an unescaped ^ is a start-of-string anchor, so the
    # pattern could never match after the backslash.
    (r'\\\^([aeiouAEIOU])', r'\1'),  # circumflex
    (r'\\"([aeiouAEIOU])', r'\1'),  # umlaut
    (r'\\~([nNaAoO])', r'\1'),  # tilde
    (r'\\c\{([cC])\}', r'\1'),  # cedilla
    (r"\\'\{([aeiouAEIOU])\}", r'\1'),
    (r'\\`\{([aeiouAEIOU])\}', r'\1'),
    (r'\\\^\{([aeiouAEIOU])\}', r'\1'),
    (r'\\"\{([aeiouAEIOU])\}', r'\1'),
    (r'\\~\{([nNaAoO])\}', r'\1'),
]
|
| 72 |
+
|
| 73 |
+
@classmethod
|
| 74 |
+
def normalize_latex(cls, text: str) -> str:
|
| 75 |
+
"""Remove LaTeX formatting commands."""
|
| 76 |
+
if not text: return ""
|
| 77 |
+
result = text
|
| 78 |
+
for pattern, replacement in cls.LATEX_COMMANDS:
|
| 79 |
+
result = re.sub(pattern, replacement, result)
|
| 80 |
+
for pattern, replacement in cls.LATEX_ACCENTS:
|
| 81 |
+
result = re.sub(pattern, replacement, result)
|
| 82 |
+
for latex_char, normal_char in cls.LATEX_CHARS.items():
|
| 83 |
+
result = result.replace(latex_char, normal_char)
|
| 84 |
+
return re.sub(r'[{}]', '', result)
|
| 85 |
+
|
| 86 |
+
@classmethod
def normalize_unicode(cls, text: str) -> str:
    """Transliterate *text* to ASCII.

    The input is NFKD-decomposed and then passed through ``unidecode``.
    Returns "" for empty or None input.
    """
    if text:
        decomposed = unicodedata.normalize('NFKD', text)
        return unidecode(decomposed)
    return ""
|
| 92 |
+
|
| 93 |
+
@classmethod
def normalize_for_comparison(cls, text: str) -> str:
    """Run the full normalization pipeline used before comparing strings.

    Steps: strip LaTeX, transliterate to ASCII, lowercase, collapse
    whitespace runs to single spaces and trim, then delete every
    character that is neither a word character nor whitespace.
    Returns "" for empty or None input.

    NOTE(review): punctuation is removed *after* whitespace is collapsed,
    so a free-standing punctuation mark leaves two adjacent spaces
    (``"a - b"`` -> ``"a  b"``).
    """
    if not text:
        return ""
    lowered = cls.normalize_unicode(cls.normalize_latex(text)).lower()
    collapsed = re.sub(r'\s+', ' ', lowered).strip()
    return re.sub(r'[^\w\s]', '', collapsed)
|
| 102 |
+
|
| 103 |
+
@classmethod
def strip_dblp_disambiguation_id(cls, name: str) -> str:
    """Remove a DBLP disambiguation suffix from an author name.

    DBLP distinguishes homonymous authors by appending a numeric code
    such as '0001' or '0019'; every match of DBLP_DISAMBIG_PATTERN is
    deleted and the result is trimmed.

        'Tian Tan 0019' -> 'Tian Tan'
        'Wei Li 0119'   -> 'Wei Li'

    A falsy *name* ("" or None) is returned unchanged.
    """
    return cls.DBLP_DISAMBIG_PATTERN.sub('', name).strip() if name else name
|
| 114 |
+
|
| 115 |
+
@classmethod
def has_dblp_disambiguation_id(cls, name: str) -> bool:
    """Return True when DBLP_DISAMBIG_PATTERN matches somewhere in *name*.

    Empty or None input yields False.
    """
    return bool(name) and cls.DBLP_DISAMBIG_PATTERN.search(name) is not None
|
| 121 |
+
|
| 122 |
+
@classmethod
def normalize_author_name(cls, name: str) -> str:
    """Reduce an author name to a canonical comparison form.

    Steps: strip LaTeX, transliterate to ASCII, drop a DBLP
    disambiguation ID, collapse whitespace, turn "Last, First" into
    "First Last" (split on the first comma only), lowercase, and delete
    every character that is neither a word character nor whitespace.
    Returns "" for empty or None input.
    """
    if not name:
        return ""
    cleaned = cls.normalize_unicode(cls.normalize_latex(name))
    # The DBLP suffix is removed before whitespace is collapsed.
    cleaned = cls.strip_dblp_disambiguation_id(cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    family, comma, given = cleaned.partition(',')
    if comma:
        # "Last, First" -> "First Last"
        cleaned = f"{given.strip()} {family.strip()}"
    return re.sub(r'[^\w\s]', '', cleaned.lower())
|
| 137 |
+
|
| 138 |
+
@classmethod
def parse_author_list(cls, authors: str) -> list[str]:
    """Split an author string into raw, un-normalized names.

    The separator is the word "and" surrounded by whitespace, matched
    case-insensitively. Names are returned exactly as written.
    Returns [] for empty or None input.
    """
    if not authors:
        return []
    separator = re.compile(r'\s+and\s+', flags=re.IGNORECASE)
    return separator.split(authors)
|
| 144 |
+
|
| 145 |
+
@classmethod
def normalize_author_list(cls, authors: str) -> list[str]:
    """Split an author string and normalize each name.

    Each piece from parse_author_list is stripped and passed through
    normalize_author_name; names that normalize to an empty string are
    dropped. Returns [] for empty or None input.
    """
    if not authors:
        return []
    candidates = (
        cls.normalize_author_name(raw.strip())
        for raw in cls.parse_author_list(authors)
    )
    return [canonical for canonical in candidates if canonical]
|
| 155 |
+
|
| 156 |
+
@classmethod
def similarity_ratio(cls, text1: str, text2: str) -> float:
    """Return the Jaccard similarity of the two strings' word sets.

    Words are whitespace-separated tokens. Returns 0.0 when either
    argument is empty or None, 1.0 when both are non-empty but contain
    only whitespace, and 0.0 when exactly one has no words.
    """
    if not (text1 and text2):
        return 0.0
    tokens_a = set(text1.split())
    tokens_b = set(text2.split())
    if not tokens_a or not tokens_b:
        # Reached only for whitespace-only input: both empty counts as equal.
        return 0.0 if (tokens_a or tokens_b) else 1.0
    shared = tokens_a.intersection(tokens_b)
    combined = tokens_a.union(tokens_b)
    return len(shared) / len(combined)
|
| 164 |
+
|
| 165 |
+
@classmethod
def levenshtein_similarity(cls, s1: str, s2: str) -> float:
    """Return the normalized Levenshtein similarity of *s1* and *s2*.

    The result is ``1 - distance / max(len(s1), len(s2))``: 1.0 for
    identical strings, 0.0 when every position differs. Two empty (or
    None) inputs give 1.0; exactly one empty input gives 0.0.

    Only the previous and current rows of the edit-distance table are
    kept, so memory use is O(len(s2)) rather than O(len(s1) * len(s2)).
    """
    if not s1 and not s2:
        return 1.0
    if not s1 or not s2:
        return 0.0
    m, n = len(s1), len(s2)
    # previous[j] = distance between the processed prefix of s1 and s2[:j].
    previous = list(range(n + 1))
    for i, ch1 in enumerate(s1, start=1):
        current = [i]  # distance from s1[:i] to the empty prefix of s2
        for j, ch2 in enumerate(s2, start=1):
            if ch1 == ch2:
                current.append(previous[j - 1])
            else:
                # deletion, insertion, substitution
                current.append(1 + min(previous[j], current[j - 1], previous[j - 1]))
        previous = current
    return 1.0 - (previous[n] / max(m, n))
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
@dataclass
class ProgressStats:
    """Counters shown by the progress display; every field starts at zero."""
    # Overwritten by ProgressDisplay.progress_context with its `total` argument.
    total: int = 0
    # Increased by the `advance` amount in ProgressDisplay.update.
    processed: int = 0
    # Incremented by ProgressDisplay.mark_success.
    success: int = 0
    # Incremented by ProgressDisplay.mark_warning.
    warnings: int = 0
    # Incremented by ProgressDisplay.mark_error.
    errors: int = 0
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
class ProgressDisplay:
|
| 189 |
+
"""Rich terminal progress display."""
|
| 190 |
+
|
| 191 |
+
def __init__(self):
    """Create the console and zeroed statistics; no progress bar is active yet."""
    self.console = Console()
    self.stats = ProgressStats()
    # Both are set by progress_context() on entry and reset to None on exit.
    self._progress: Optional[Progress] = None
    self._task = None
|
| 196 |
+
|
| 197 |
+
@contextmanager
def progress_context(self, total: int, description: str = "Processing"):
    """Show a progress bar for the duration of a ``with`` block.

    Stores *total* in ``self.stats.total``, opens a ``Progress`` display
    on ``self.console`` with one task labelled *description*, and yields
    this object. The active progress and task handles are cleared when
    the block exits, including when it exits by exception.
    """
    self.stats.total = total
    columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(bar_width=40),
        TaskProgressColumn(),
        TimeElapsedColumn(),
    )
    with Progress(*columns, console=self.console, transient=False) as bar:
        self._progress = bar
        self._task = bar.add_task(description, total=total)
        try:
            yield self
        finally:
            # update() checks these handles, so clear them on exit.
            self._progress = self._task = None
|
| 217 |
+
|
| 218 |
+
def update(self, entry_key: str = "", task: str = "", advance: int = 0):
|
| 219 |
+
"""Update progress display."""
|
| 220 |
+
if self._progress and self._task is not None:
|
| 221 |
+
desc = f"[cyan]{entry_key}[/cyan] - {task}" if entry_key else task
|
| 222 |
+
self._progress.update(self._task, description=desc, advance=advance)
|
| 223 |
+
self.stats.processed += advance
|
| 224 |
+
|
| 225 |
+
def mark_success(self):
    """Add one to ``self.stats.success``."""
    self.stats.success += 1
|
| 226 |
+
def mark_warning(self):
    """Add one to ``self.stats.warnings``."""
    self.stats.warnings += 1
|
| 227 |
+
def mark_error(self):
    """Add one to ``self.stats.errors``."""
    self.stats.errors += 1
|
| 228 |
+
def print_error(self, message: str):
|
| 229 |
+
self.console.print(f" [red]✗[/red] {message}")
|