#!/usr/bin/env python3
"""
Build a title-based index from downloaded DBLP bib files.

Reads all .bib files in data/raw/ and produces sharded JSON files
under data/index_shards/ (~25MB each) for GitHub-friendly storage.

Usage:
    python scripts/build_index.py
"""

import json
import os
import re
import shutil
import sys
from pathlib import Path

try:
    import bibtexparser
    from bibtexparser.bparser import BibTexParser
    from bibtexparser.customization import convert_to_unicode
except ImportError:
    # Defer the failure to main(): exiting at import time would make
    # normalize_title() and write_shards() unusable from other modules
    # (and from tests) whenever the parser package is not installed.
    bibtexparser = None
    BibTexParser = None
    convert_to_unicode = None

MAX_SHARD_MB = 25  # Target shard size in MB


def normalize_title(title: str) -> str:
    """Normalize a title for index lookup.

    Steps, in order: unwrap ``{...}`` brace groups (one level per match),
    lowercase, replace every character that is neither a word character
    nor whitespace with a space, then collapse whitespace runs and strip.

    Args:
        title: Raw title string as read from a bib entry.

    Returns:
        The normalized key; may be the empty string.
    """
    # NOTE(review): nested groups are only unwrapped one level, so
    # "{{B}}ayes" normalizes to "b ayes". Left unchanged on purpose --
    # the keys must stay compatible with already-built shards and with
    # whatever code normalizes titles at lookup time.
    title = re.sub(r'\{([^}]*)\}', r'\1', title)
    title = re.sub(r'[^\w\s]', ' ', title.lower())
    return re.sub(r'\s+', ' ', title).strip()


def _write_shard(shard_dir: Path, shard_num: int, shard_items: list) -> None:
    """Write one shard file named ``index_NN.json`` and print its size."""
    path = shard_dir / f"index_{shard_num:02d}.json"
    path.write_text(
        json.dumps(dict(shard_items), ensure_ascii=False),
        encoding="utf-8",
    )
    mb = path.stat().st_size / 1024 / 1024
    print(f" ✓ index_{shard_num:02d}.json: {len(shard_items):,} entries ({mb:.1f} MB)")


def write_shards(index: dict, shard_dir: Path, max_shard_mb=None) -> int:
    """Split index into JSON shard files of roughly ``max_shard_mb`` each.

    Any existing ``shard_dir`` is deleted and recreated first, so stale
    shards from a previous run never survive.

    Args:
        index: Mapping of normalized title -> JSON-serializable record.
        shard_dir: Directory that will contain the ``index_NN.json`` files.
        max_shard_mb: Target shard size in MB; defaults to ``MAX_SHARD_MB``.

    Returns:
        The number of shard files written (0 for an empty index).
    """
    if max_shard_mb is None:
        max_shard_mb = MAX_SHARD_MB

    if shard_dir.exists():
        shutil.rmtree(shard_dir)
    shard_dir.mkdir(parents=True)

    max_bytes = max_shard_mb * 1024 * 1024
    shard_num = 0
    shard_items = []
    shard_size = 0

    for key, val in index.items():
        # Size of this entry when serialized on its own, in UTF-8 bytes.
        entry_size = len(json.dumps({key: val}, ensure_ascii=False).encode('utf-8'))

        # Start a new shard once the limit would be exceeded. The
        # `shard_items` check guarantees a single oversized entry still
        # gets written instead of producing an empty shard.
        if shard_size + entry_size > max_bytes and shard_items:
            _write_shard(shard_dir, shard_num, shard_items)
            shard_num += 1
            shard_items = []
            shard_size = 0

        shard_items.append((key, val))
        shard_size += entry_size

    # Last (possibly only) shard
    if shard_items:
        _write_shard(shard_dir, shard_num, shard_items)
        shard_num += 1

    return shard_num


def _build_index(bib_files) -> tuple:
    """Parse the bib files and return ``(index, skipped_file_count)``.

    Files that fail to parse are reported and skipped. When several
    entries normalize to the same title, the first one seen wins.
    """
    index = {}
    skipped_files = 0

    for bib_file in bib_files:
        try:
            # A fresh parser is created for every file.
            parser = BibTexParser(common_strings=True)
            parser.customization = convert_to_unicode
            with open(bib_file, encoding="utf-8", errors="replace") as f:
                db = bibtexparser.load(f, parser=parser)
        except Exception as e:
            # Per-file boundary: one broken file must not abort the build.
            print(f" ⚠ Skip {bib_file.name}: {e}")
            skipped_files += 1
            continue

        for entry in db.entries:
            title = entry.get("title", "")
            if not title:
                continue
            key = normalize_title(title)
            if not key or key in index:
                continue
            index[key] = {
                "title": title.rstrip('.'),
                "author": entry.get("author", ""),
                "year": entry.get("year", ""),
                "booktitle": entry.get("booktitle", ""),
                "journal": entry.get("journal", ""),
                "doi": entry.get("doi", ""),
                "url": entry.get("url", ""),
                "pages": entry.get("pages", ""),
                "volume": entry.get("volume", ""),
                "_type": entry.get("ENTRYTYPE", "inproceedings"),
                "_source": bib_file.stem,
            }

    return index, skipped_files


def main() -> None:
    """Build the sharded title index from ``data/raw/*.bib``.

    Exits with status 1 when bibtexparser is missing, when the raw
    directory or its .bib files are missing, or when no entry could be
    indexed (in which case the existing shards are left untouched).
    """
    if bibtexparser is None:
        print("Error: bibtexparser required. Install: pip install bibtexparser")
        sys.exit(1)

    project_root = Path(__file__).resolve().parent.parent
    raw_dir = project_root / "data" / "raw"
    shard_dir = project_root / "data" / "index_shards"

    if not raw_dir.exists():
        print(f"Error: {raw_dir} not found. Run: python scripts/update_db.py first")
        sys.exit(1)

    bib_files = sorted(raw_dir.glob("*.bib"))
    if not bib_files:
        print(f"No .bib files found in {raw_dir}")
        sys.exit(1)

    print(f"📦 Building index from {len(bib_files)} bib files...")
    index, skipped_files = _build_index(bib_files)

    if not index:
        # write_shards() deletes the shard directory before writing, so
        # bail out here rather than replace a good index with nothing.
        print("Error: no indexable entries found; existing index left untouched")
        if skipped_files:
            print(f" ⚠ {skipped_files} file(s) skipped due to parse errors")
        sys.exit(1)

    print("\n📂 Writing sharded index...")
    n_shards = write_shards(index, shard_dir)

    total_mb = sum(f.stat().st_size for f in shard_dir.glob("*.json")) / 1024 / 1024
    print(f"\n✅ Index: {len(index):,} unique entries → {n_shards} shards ({total_mb:.1f} MB total)")
    print(f" Saved to: {shard_dir}/")
    if skipped_files:
        print(f" ⚠ {skipped_files} file(s) skipped due to parse errors")


if __name__ == "__main__":
    main()