RefCheck / src /local_db.py
voidful's picture
Add RefCheck Gradio Space
11a28db verified
"""
Local Conference Database: fast, offline title lookup against DBLP index.
This module provides a local database of conference/journal proceedings
downloaded from DBLP. It serves as a "ground truth" source that eliminates
the need for network API calls for entries that match known publications.
"""
import json
import re
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
def _normalize(title: str) -> str:
"""Normalize a title for index lookup (must match build_index.py)."""
title = re.sub(r'\{([^}]*)\}', r'\1', title)
title = re.sub(r'[^\w\s]', ' ', title.lower())
return re.sub(r'\s+', ' ', title).strip()
@dataclass
class LocalMatch:
    """Bibliographic record produced by a successful local index lookup.

    All fields are declared as strings and all are required by the
    generated constructor, in the order listed below.
    """

    # Core bibliographic fields.
    title: str
    author: str
    year: str

    # Venue: proceedings title and/or journal name.
    booktitle: str
    journal: str

    # Identifiers / locators.
    doi: str
    url: str

    # Position within the venue.
    pages: str
    volume: str

    # Entry kind, e.g. "inproceedings".
    entry_type: str
    # Identifier of the file the record was taken from.
    source_file: str
class LocalConferenceDB:
    """Title-based lookup against locally cached DBLP proceedings.

    The index maps a normalized title to a metadata record. It is read
    either from a directory of ``index_*.json`` shards or, when no shards
    are found, from a single legacy ``conference_index.json`` file.
    """

    def __init__(self, index_dir: Optional[str] = None):
        """Record where the index lives; nothing is read until ``load()``.

        Args:
            index_dir: Directory holding the ``index_*.json`` shards. The
                legacy single-file index is then expected in that
                directory's parent. When omitted, the shards default to
                ``<module dir>/../data/index_shards`` and the legacy file
                to ``<module dir>/../data/conference_index.json``.
        """
        if index_dir is None:
            base = Path(__file__).resolve().parent.parent / "data"
            self._shard_dir = base / "index_shards"
            self._legacy_path = base / "conference_index.json"
        else:
            self._shard_dir = Path(index_dir)
            self._legacy_path = self._shard_dir.parent / "conference_index.json"
        self._idx: dict = {}
        self._loaded = False

    @staticmethod
    def _read_index(path: Path) -> dict:
        """Parse one JSON index file, requiring a top-level JSON object."""
        data = json.loads(path.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            raise ValueError(
                f"{path.name}: expected a JSON object, got {type(data).__name__}"
            )
        return data

    def load(self) -> bool:
        """Load the index from shards, or from the legacy single file.

        Shards are merged in sorted filename order, so on duplicate keys the
        later shard wins. The new index replaces the old one only after every
        file has been read successfully; a failed load leaves whatever was
        loaded before untouched.

        Returns:
            True if an index was loaded, False if no index file was found or
            reading/parsing failed (the reason is printed).
        """
        try:
            # Try sharded index first.
            shard_files = (
                sorted(self._shard_dir.glob("index_*.json"))
                if self._shard_dir.exists()
                else []
            )
            if shard_files:
                # Merge into a scratch dict so a bad shard cannot leave a
                # half-populated index behind.
                merged: dict = {}
                for shard_path in shard_files:
                    merged.update(self._read_index(shard_path))
                self._idx = merged
                self._loaded = True
                print(f" 📚 Local DB: {len(self._idx):,} entries loaded ({len(shard_files)} shards).")
                return True

            # Fallback: legacy single file.
            if self._legacy_path.exists():
                self._idx = self._read_index(self._legacy_path)
                self._loaded = True
                print(f" 📚 Local DB: {len(self._idx):,} entries loaded.")
                return True

            print(" ⚠ Local DB not found. Run: python scripts/update_db.py && python scripts/build_index.py")
            return False
        except Exception as e:
            # Deliberately broad: loading is best-effort and callers only
            # receive a boolean, so any failure is reported and swallowed.
            print(f" ⚠ Failed to load local DB: {e}")
            return False

    @property
    def is_loaded(self) -> bool:
        """True once ``load()`` has succeeded and the index is non-empty."""
        return self._loaded and len(self._idx) > 0

    def lookup(self, title: str) -> Optional["LocalMatch"]:
        """Look up an entry by title.

        Args:
            title: Raw title; it is normalized before the index is consulted.

        Returns:
            A LocalMatch built from the stored record, with "" for missing
            keys ("inproceedings" for a missing ``_type``). None when the
            index is not loaded, the title is empty or normalizes to an
            empty key, or no usable record is stored under the key.
        """
        if not self._loaded or not title:
            return None
        key = _normalize(title)
        if not key:
            # Punctuation-only titles must not match an empty-key entry.
            return None
        data = self._idx.get(key)
        if not data or not isinstance(data, dict):
            return None
        return LocalMatch(
            title=data.get("title", ""),
            author=data.get("author", ""),
            year=data.get("year", ""),
            booktitle=data.get("booktitle", ""),
            journal=data.get("journal", ""),
            doi=data.get("doi", ""),
            url=data.get("url", ""),
            pages=data.get("pages", ""),
            volume=data.get("volume", ""),
            entry_type=data.get("_type", "inproceedings"),
            source_file=data.get("_source", ""),
        )