RefCheck / scripts /build_index.py
voidful's picture
Add RefCheck Gradio Space
11a28db verified
#!/usr/bin/env python3
"""
Build a title-based index from downloaded DBLP bib files.
Reads all .bib files in data/raw/ and produces sharded JSON files
under data/index_shards/ (~25MB each) for GitHub-friendly storage.
Usage:
python scripts/build_index.py
"""
import json
import os
import re
import shutil
import sys
from pathlib import Path
try:
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode
except ImportError:
print("Error: bibtexparser required. Install: pip install bibtexparser")
sys.exit(1)
# Target upper bound for a single JSON shard, expressed in MB; write_shards
# converts it to bytes by multiplying by 1024 * 1024.
MAX_SHARD_MB = 25
def normalize_title(title: str) -> str:
    """Return the canonical lookup key for a title.

    The key is built in three passes: brace groups are unwrapped, the text is
    lower-cased with every character that is neither a word character nor
    whitespace replaced by a space, and finally whitespace runs are squeezed
    to a single space with both ends trimmed.
    """
    # Unwrap BibTeX-style brace groups, e.g. "{BERT}" -> "BERT".
    unbraced = re.sub(r'\{([^}]*)\}', r'\1', title)
    # Punctuation becomes a space so "pre-training" and "pre training" agree.
    words_only = re.sub(r'[^\w\s]', ' ', unbraced.lower())
    squeezed = re.sub(r'\s+', ' ', words_only)
    return squeezed.strip()
def _write_shard(shard_dir: Path, shard_num: int, shard_items: list) -> None:
    """Write one shard as ``index_NN.json`` in *shard_dir* and print its size."""
    path = shard_dir / f"index_{shard_num:02d}.json"
    path.write_text(
        json.dumps(dict(shard_items), ensure_ascii=False),
        encoding="utf-8",
    )
    mb = path.stat().st_size / 1024 / 1024
    print(f" ✓ index_{shard_num:02d}.json: {len(shard_items):,} entries ({mb:.1f} MB)")


def write_shards(index: dict, shard_dir: Path, max_shard_mb=None):
    """Split *index* into JSON shard files of bounded size.

    Any existing *shard_dir* is deleted and recreated, so stale shards from a
    previous run never survive. Entries are packed in iteration order; a new
    shard is started whenever adding the next entry would push the current
    one past the size limit. A single entry larger than the limit still gets
    written, alone in its own shard.

    Args:
        index: Mapping of lookup key to JSON-serializable record.
        shard_dir: Directory that will contain ``index_00.json``,
            ``index_01.json``, ...
        max_shard_mb: Size limit per shard in MB. ``None`` (the default)
            uses the module-level ``MAX_SHARD_MB``.

    Returns:
        The number of shard files written (0 for an empty index).
    """
    if max_shard_mb is None:
        max_shard_mb = MAX_SHARD_MB
    max_bytes = max_shard_mb * 1024 * 1024

    if shard_dir.exists():
        shutil.rmtree(shard_dir)
    shard_dir.mkdir(parents=True)

    shard_num = 0
    shard_items = []
    shard_size = 0

    for key, val in index.items():
        # Size of the entry serialized on its own; used as the running
        # estimate of the shard's on-disk size.
        entry_size = len(json.dumps({key: val}, ensure_ascii=False).encode("utf-8"))
        # Flush before appending, but never emit an empty shard.
        if shard_items and shard_size + entry_size > max_bytes:
            _write_shard(shard_dir, shard_num, shard_items)
            shard_num += 1
            shard_items = []
            shard_size = 0
        shard_items.append((key, val))
        shard_size += entry_size

    # Whatever is left over forms the last shard.
    if shard_items:
        _write_shard(shard_dir, shard_num, shard_items)
        shard_num += 1

    return shard_num
def main():
    """Build the sharded title index from the ``.bib`` files in ``data/raw``.

    Paths are resolved relative to the parent of this script's directory:
    input comes from ``data/raw/*.bib`` and output goes to
    ``data/index_shards/``. Files are processed in sorted order and the first
    entry seen for a given normalized title wins; later duplicates are
    ignored. A file that fails to load is reported and skipped rather than
    aborting the whole build.

    Exits with status 1 when the raw directory is missing or holds no
    ``.bib`` files.
    """
    project_root = Path(__file__).resolve().parent.parent
    raw_dir = project_root / "data" / "raw"
    shard_dir = project_root / "data" / "index_shards"

    if not raw_dir.exists():
        print(f"Error: {raw_dir} not found. Run: python scripts/update_db.py first")
        sys.exit(1)

    bib_files = sorted(raw_dir.glob("*.bib"))
    if not bib_files:
        print(f"No .bib files found in {raw_dir}")
        sys.exit(1)

    print(f"📦 Building index from {len(bib_files)} bib files...")

    index = {}
    skipped_files = 0
    for bib_file in bib_files:
        try:
            # A fresh parser is created for every file.
            parser = BibTexParser(common_strings=True)
            parser.customization = convert_to_unicode
            # errors="replace" keeps a stray undecodable byte from
            # rejecting the whole file.
            with open(bib_file, encoding="utf-8", errors="replace") as f:
                db = bibtexparser.load(f, parser=parser)
        except Exception as e:
            # Deliberate best-effort: one bad file must not stop the build.
            print(f" ⚠ Skip {bib_file.name}: {e}")
            skipped_files += 1
            continue

        for entry in db.entries:
            title = entry.get("title", "")
            if not title:
                continue
            key = normalize_title(title)
            # Skip titles that normalize to nothing and titles already
            # indexed (first occurrence wins).
            if not key or key in index:
                continue
            index[key] = {
                "title": title.rstrip('.'),
                "author": entry.get("author", ""),
                "year": entry.get("year", ""),
                "booktitle": entry.get("booktitle", ""),
                "journal": entry.get("journal", ""),
                "doi": entry.get("doi", ""),
                "url": entry.get("url", ""),
                "pages": entry.get("pages", ""),
                "volume": entry.get("volume", ""),
                "_type": entry.get("ENTRYTYPE", "inproceedings"),
                "_source": bib_file.stem,
            }

    print("\n📂 Writing sharded index...")
    n_shards = write_shards(index, shard_dir)

    total_mb = sum(f.stat().st_size for f in shard_dir.glob("*.json")) / 1024 / 1024
    print(f"\n✅ Index: {len(index):,} unique entries → {n_shards} shards ({total_mb:.1f} MB total)")
    print(f" Saved to: {shard_dir}/")
    if skipped_files:
        print(f" ⚠ {skipped_files} file(s) skipped due to parse errors")
# Run the index build only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()