| """ |
| Local Conference Database: fast, offline title lookup against DBLP index. |
| |
| This module provides a local database of conference/journal proceedings |
| downloaded from DBLP. It serves as a "ground truth" source that eliminates |
| the need for network API calls for entries that match known publications. |
| """ |
| import json |
| import re |
| from pathlib import Path |
| from typing import Optional |
| from dataclasses import dataclass |
|
|
|
|
| def _normalize(title: str) -> str: |
| """Normalize a title for index lookup (must match build_index.py).""" |
| title = re.sub(r'\{([^}]*)\}', r'\1', title) |
| title = re.sub(r'[^\w\s]', ' ', title.lower()) |
| return re.sub(r'\s+', ' ', title).strip() |
|
|
|
|
@dataclass
class LocalMatch:
    """One record returned by a local index lookup.

    Every field is carried as a string. Field order is part of the
    constructor signature and must not be rearranged.
    """

    # Bibliographic core.
    title: str
    author: str
    year: str

    # Venue: proceedings title and/or journal name.
    booktitle: str
    journal: str

    # Locators.
    doi: str
    url: str
    pages: str
    volume: str

    # Provenance: entry kind and the file the record was taken from.
    entry_type: str
    source_file: str
|
|
|
|
class LocalConferenceDB:
    """Title-based lookup against locally cached DBLP proceedings.

    The index is a JSON object mapping a normalized title to a record dict.
    It is read either from a directory of ``index_*.json`` shards or, when
    no shard file is present, from a single legacy ``conference_index.json``.
    """

    def __init__(self, index_dir: Optional[str] = None):
        """Resolve the index locations; nothing is read until ``load()``.

        Args:
            index_dir: Directory holding the ``index_*.json`` shards. The
                legacy single-file index is then expected in its parent
                directory. When omitted, both live under the ``data``
                directory that is a sibling of this module's directory.
        """
        if index_dir is None:
            base = Path(__file__).resolve().parent.parent / "data"
            self._shard_dir = base / "index_shards"
            self._legacy_path = base / "conference_index.json"
        else:
            self._shard_dir = Path(index_dir)
            self._legacy_path = self._shard_dir.parent / "conference_index.json"
        self._idx: dict = {}
        self._loaded = False

    @staticmethod
    def _read_index_file(path: Path) -> dict:
        """Parse one JSON index file and require its top level to be an object."""
        payload = json.loads(path.read_text(encoding="utf-8"))
        if not isinstance(payload, dict):
            raise ValueError(
                f"{path.name}: expected a JSON object, got {type(payload).__name__}"
            )
        return payload

    def load(self) -> bool:
        """Load the index from shards, falling back to the legacy single file.

        The new index is assembled completely before it replaces the current
        one, so a failure part-way through leaves the previous state intact
        and a reload never retains keys from shards that no longer exist.

        Returns:
            True when an index was loaded; False when no index file exists
            or reading/parsing failed (the reason is printed).
        """
        try:
            shard_files = (
                sorted(self._shard_dir.glob("index_*.json"))
                if self._shard_dir.exists()
                else []
            )
            if shard_files:
                merged: dict = {}
                # Shards are merged in sorted filename order; on duplicate
                # keys the later shard wins.
                for shard_path in shard_files:
                    merged.update(self._read_index_file(shard_path))
                self._idx = merged
                self._loaded = True
                print(f" π Local DB: {len(self._idx):,} entries loaded ({len(shard_files)} shards).")
                return True

            if self._legacy_path.exists():
                self._idx = self._read_index_file(self._legacy_path)
                self._loaded = True
                print(f" π Local DB: {len(self._idx):,} entries loaded.")
                return True

            print(" β Local DB not found. Run: python scripts/update_db.py && python scripts/build_index.py")
            return False
        except Exception as e:
            # Deliberate best-effort boundary: a broken local index must not
            # abort the caller, so report the problem and signal failure.
            print(f" β Failed to load local DB: {e}")
            return False

    @property
    def is_loaded(self) -> bool:
        """True once ``load()`` has succeeded and the index is non-empty."""
        return self._loaded and bool(self._idx)

    def lookup(self, title: str) -> Optional["LocalMatch"]:
        """Look up an entry by title.

        Args:
            title: Raw title; it is normalized before the index is consulted.

        Returns:
            A LocalMatch built from the stored record, with missing fields
            defaulting to ``""`` (``entry_type`` defaults to
            ``"inproceedings"``). None when the index is not loaded, the
            title is not a string, or no usable record exists for it.
        """
        if not self._loaded:
            return None
        # A missing or non-string title cannot be normalized; treat it as
        # "no match" rather than raising from inside the regex calls.
        if not isinstance(title, str):
            return None

        record = self._idx.get(_normalize(title))
        # Empty or malformed (non-dict) records are treated as absent.
        if not record or not isinstance(record, dict):
            return None

        return LocalMatch(
            title=record.get("title", ""),
            author=record.get("author", ""),
            year=record.get("year", ""),
            booktitle=record.get("booktitle", ""),
            journal=record.get("journal", ""),
            doi=record.get("doi", ""),
            url=record.get("url", ""),
            pages=record.get("pages", ""),
            volume=record.get("volume", ""),
            entry_type=record.get("_type", "inproceedings"),
            source_file=record.get("_source", ""),
        )
|
|