| |
"""
Build a title-based index from downloaded DBLP bib files.

Reads all .bib files in data/raw/ and produces sharded JSON files
under data/index_shards/ (~25MB each) for GitHub-friendly storage.

Usage:
    python scripts/build_index.py
"""
import json
import os
import re
import shutil
import sys
from pathlib import Path

try:
    import bibtexparser
    from bibtexparser.bparser import BibTexParser
    from bibtexparser.customization import convert_to_unicode
except ImportError:
    print("Error: bibtexparser required. Install: pip install bibtexparser")
    sys.exit(1)
|
|
# Size limit, in MiB, that write_shards() applies to each serialized shard file.
MAX_SHARD_MB = 25
|
|
|
|
def normalize_title(title: str) -> str:
    """Normalize a title into the key used for index lookup.

    BibTeX protective braces are dropped at any nesting depth, the text is
    lower-cased, every character that is neither a word character nor
    whitespace becomes a space, and whitespace runs collapse to one space.

    Args:
        title: Raw title text, possibly containing BibTeX braces.

    Returns:
        The normalized key; an empty string when nothing indexable remains.
    """
    # Delete the braces themselves instead of letting the punctuation pass
    # turn them into spaces, so "{B}ayesian" and "{{B}}ayesian" both stay one
    # word. A single-level regex would leave the outer pair behind and split
    # the word into "b ayesian".
    unbraced = title.replace("{", "").replace("}", "")
    spaced = re.sub(r"[^\w\s]", " ", unbraced.lower())
    return re.sub(r"\s+", " ", spaced).strip()
|
|
|
|
def _write_shard(shard_dir: Path, shard_num: int, items: list) -> None:
    """Write one shard as ``index_NN.json`` and print a one-line summary."""
    path = shard_dir / f"index_{shard_num:02d}.json"
    path.write_text(
        json.dumps(dict(items), ensure_ascii=False),
        encoding="utf-8",
    )
    mb = path.stat().st_size / 1024 / 1024
    print(f"  ✓ {path.name}: {len(items):,} entries ({mb:.1f} MB)")


def write_shards(index: dict, shard_dir: Path, max_shard_mb=None) -> int:
    """Split the index into size-limited JSON shard files.

    Any existing ``shard_dir`` is deleted and recreated, so the directory
    only ever holds the shards of the latest run. Entries are packed in the
    iteration order of ``index``; a shard is flushed as soon as adding the
    next entry would push it past the size limit.

    Args:
        index: Mapping of normalized title to its JSON-serializable record.
        shard_dir: Directory that receives ``index_00.json``,
            ``index_01.json``, and so on.
        max_shard_mb: Size limit per shard in MiB. Defaults to the
            module-level ``MAX_SHARD_MB`` when omitted.

    Returns:
        The number of shard files written (0 for an empty index).
    """
    if max_shard_mb is None:
        max_shard_mb = MAX_SHARD_MB
    max_bytes = max_shard_mb * 1024 * 1024

    if shard_dir.exists():
        shutil.rmtree(shard_dir)
    shard_dir.mkdir(parents=True)

    shard_num = 0
    shard_items = []
    shard_size = 0

    for key, val in index.items():
        # Size of the entry serialized alone. The 2 bytes of braces it carries
        # stand in for the ", " separator the entry costs in the merged dump,
        # so the running total tracks the size of the shard file.
        entry_size = len(json.dumps({key: val}, ensure_ascii=False).encode("utf-8"))

        # Never flush an empty shard: a single entry larger than the limit
        # still gets a shard of its own.
        if shard_items and shard_size + entry_size > max_bytes:
            _write_shard(shard_dir, shard_num, shard_items)
            shard_num += 1
            shard_items = []
            shard_size = 0

        shard_items.append((key, val))
        shard_size += entry_size

    # Flush the final, partially filled shard.
    if shard_items:
        _write_shard(shard_dir, shard_num, shard_items)
        shard_num += 1

    return shard_num
|
|
|
|
def _entry_record(entry: dict, title: str, source: str) -> dict:
    """Project one parsed BibTeX entry onto the fields stored in the index."""
    return {
        "title": title.rstrip('.'),
        "author": entry.get("author", ""),
        "year": entry.get("year", ""),
        "booktitle": entry.get("booktitle", ""),
        "journal": entry.get("journal", ""),
        "doi": entry.get("doi", ""),
        "url": entry.get("url", ""),
        "pages": entry.get("pages", ""),
        "volume": entry.get("volume", ""),
        "_type": entry.get("ENTRYTYPE", "inproceedings"),
        "_source": source,
    }


def main():
    """Build the sharded title index from every .bib file in data/raw/.

    Files are parsed in sorted name order. Each entry with a non-empty title
    is keyed by its normalized title; the first entry seen for a key wins and
    later duplicates are ignored. Files that fail to parse are reported,
    counted, and skipped.

    Exits with status 1 when data/raw/ is missing, when it holds no .bib
    files, or when no indexable entry was found (in which case the existing
    shards are not touched).
    """
    project_root = Path(__file__).resolve().parent.parent
    raw_dir = project_root / "data" / "raw"
    shard_dir = project_root / "data" / "index_shards"

    if not raw_dir.exists():
        print(f"Error: {raw_dir} not found. Run: python scripts/update_db.py first")
        sys.exit(1)

    bib_files = sorted(raw_dir.glob("*.bib"))
    if not bib_files:
        print(f"No .bib files found in {raw_dir}")
        sys.exit(1)

    print(f"📦 Building index from {len(bib_files)} bib files...")

    index = {}
    skipped_files = 0

    for bib_file in bib_files:
        try:
            # A fresh parser per file — presumably so that entries parsed from
            # one file never carry over into the next; confirm against the
            # bibtexparser documentation before hoisting this out of the loop.
            parser = BibTexParser(common_strings=True)
            parser.customization = convert_to_unicode
            with open(bib_file, encoding="utf-8", errors="replace") as f:
                db = bibtexparser.load(f, parser=parser)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole build.
            print(f"  ⚠ Skip {bib_file.name}: {e}")
            skipped_files += 1
            continue

        for entry in db.entries:
            title = entry.get("title", "")
            if not title:
                continue

            key = normalize_title(title)
            if not key:
                continue

            # First occurrence wins; later duplicates of a title are dropped.
            if key not in index:
                index[key] = _entry_record(entry, title, bib_file.stem)

    # write_shards() deletes the shard directory before writing, so bail out
    # here instead of replacing a good index with nothing.
    if not index:
        print("Error: no indexable entries found; existing shards left untouched")
        sys.exit(1)

    print("\n📝 Writing sharded index...")
    n_shards = write_shards(index, shard_dir)

    total_mb = sum(f.stat().st_size for f in shard_dir.glob("*.json")) / 1024 / 1024
    print(f"\n✅ Index: {len(index):,} unique entries → {n_shards} shards ({total_mb:.1f} MB total)")
    print(f"   Saved to: {shard_dir}/")
    if skipped_files:
        print(f"   ⚠ {skipped_files} file(s) skipped due to parse errors")
|
|
|
|
# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|