RefCheck / scripts /update_db.py
voidful's picture
Add RefCheck Gradio Space
11a28db verified
#!/usr/bin/env python3
"""
Download conference/journal proceedings from DBLP as BibTeX files.

Uses the DBLP venue-based search API, which is more reliable than
the TOC-based .bht queries (which often return 404 or single entries).

API format:
    https://dblp.org/search/publ/api
        ?q=venue:{VenueName}: year:{year}:
        &h=1000         # max results per batch
        &f={offset}     # pagination offset
        &format=bib1    # BibTeX format

Usage:
    python scripts/update_db.py
"""
import requests
import time
import sys
from pathlib import Path
# Endpoint of DBLP's publication search API.
DBLP_API = "https://dblp.org/search/publ/api"

# Exclusive upper bound of the year ranges below; bump this single value
# to extend coverage by another year.
_END_YEAR = 2027
# Default coverage for venues that are listed for every year.
_ANNUAL = range(2018, _END_YEAR)

# (dblp_venue_name, output_prefix, years)
#   dblp_venue_name: exact venue string used in DBLP's venue: filter
#   output_prefix:   filename prefix for saved .bib files
#   years:           iterable of years to fetch (one .bib file per year)
CONFERENCES = [
    # ── Speech & Audio ──────────────────────────────────────────
    ("INTERSPEECH", "interspeech", _ANNUAL),
    ("ICASSP", "icassp", _ANNUAL),
    ("ASRU", "asru", [2019, 2021, 2023, 2025]),
    ("SLT", "slt", [2018, 2021, 2022, 2024]),
    # ── ML / AI ─────────────────────────────────────────────────
    ("ICML", "icml", _ANNUAL),
    ("NeurIPS", "neurips", range(2017, _END_YEAR)),
    ("ICLR", "iclr", _ANNUAL),
    ("AAAI", "aaai", _ANNUAL),
    ("IJCAI", "ijcai", _ANNUAL),
    ("CVPR", "cvpr", _ANNUAL),
    ("ECCV", "eccv", [2018, 2020, 2022, 2024]),
    ("ICCV", "iccv", [2019, 2021, 2023, 2025]),
    # ── NLP ─────────────────────────────────────────────────────
    ("ACL", "acl", _ANNUAL),  # includes Findings
    ("EMNLP", "emnlp", _ANNUAL),  # includes Findings
    ("NAACL", "naacl", _ANNUAL),
    ("EACL", "eacl", _ANNUAL),
    ("LREC/COLING", "coling", [2024, 2025]),
    # Older COLING uses different venue
    # ("COLING", "coling", [2018, 2020, 2022]),
    # ── IR / Web / Data ─────────────────────────────────────────
    ("SIGIR", "sigir", _ANNUAL),
    ("KDD", "kdd", _ANNUAL),
    ("WWW", "www", _ANNUAL),
    ("WSDM", "wsdm", _ANNUAL),
]

# Journals use venue search too; entries have the same shape as CONFERENCES.
JOURNALS = [
    ("IEEE ACM Trans Audio Speech Lang Process", "taslp", _ANNUAL),
    ("Trans. Assoc. Comput. Linguistics", "tacl", _ANNUAL),
]
def _count_bib_entries(text: str) -> int:
    """Return the number of BibTeX entries in *text*.

    Only lines that begin with ``@`` and open a brace (``@type{key,``) are
    counted, so ``@`` characters inside titles or e-mail addresses do not
    inflate the result.
    """
    return sum(
        1 for line in text.splitlines()
        if line.startswith("@") and "{" in line
    )


def download_venue(venue_name: str, prefix: str, year: int, out_dir: Path):
    """Download one venue/year from DBLP and save it as ``{prefix}{year}.bib``.

    Args:
        venue_name: Venue string placed in DBLP's ``venue:`` search filter.
        prefix: Filename prefix of the output file inside *out_dir*.
        year: Publication year placed in the ``year:`` search filter.
        out_dir: Directory that receives the ``.bib`` file.

    Returns:
        None. Progress is reported on stdout. Nothing is written when the
        target file already exists, when a request fails, or when DBLP
        returns no entries.
    """
    out_file = out_dir / f"{prefix}{year}.bib"
    if out_file.exists():
        return  # Skip if already downloaded

    page_size = 1000  # batch size requested through the ``h`` parameter
    query = f"venue:{venue_name}: year:{year}:"
    pages = []
    offset = 0
    while True:
        try:
            r = requests.get(
                DBLP_API,
                params={
                    "q": query, "h": page_size, "f": offset,
                    "format": "bib1",
                },
                timeout=30,
                headers={"User-Agent": "BibGuard/1.0"},
            )
            # Treat HTTP errors (rate limiting, server errors) as failures
            # instead of parsing the error body; otherwise a failure in the
            # middle of pagination would leave a truncated file that is
            # skipped on every later run.
            r.raise_for_status()
        except requests.RequestException as e:
            print(f" βœ— {prefix}{year}: network error ({e})")
            return
        text = r.text.strip()

        # Check for HTML error pages
        if not text or "<!DOCTYPE" in text[:100] or "@" not in text:
            break
        pages.append(text)

        # A short page means this was the last batch.
        if _count_bib_entries(text) < page_size:
            break
        offset += page_size
        time.sleep(1)  # pause between consecutive batch requests

    if pages:
        total = sum(_count_bib_entries(page) for page in pages)
        # Write to a temporary sibling and rename it into place so that an
        # interrupted write never leaves a partial file under the final name.
        tmp_file = out_file.with_name(out_file.name + ".part")
        tmp_file.write_text("\n\n".join(pages), encoding="utf-8")
        tmp_file.replace(out_file)
        print(f" βœ“ {prefix}{year}: {total} entries")
    else:
        print(f" βœ— {prefix}{year}: not on DBLP yet")
def main():
    """Download every configured conference and journal year into data/raw.

    The output directory is ``<parent of this script's directory>/data/raw``
    and is created if missing. Each (venue, year) pair from CONFERENCES and
    then JOURNALS is passed to download_venue().
    """
    out = Path(__file__).resolve().parent.parent / "data" / "raw"
    out.mkdir(parents=True, exist_ok=True)

    batches = (
        ("πŸ“₯ Downloading conference proceedings from DBLP...", CONFERENCES),
        ("\nπŸ“₯ Downloading journal volumes from DBLP...", JOURNALS),
    )
    for banner, venues in batches:
        print(banner)
        for venue, prefix, years in venues:
            for y in years:
                # download_venue() does nothing for files that already exist,
                # so skip them here and avoid the delay below; a fully cached
                # re-run then finishes immediately.
                if (out / f"{prefix}{y}.bib").exists():
                    continue
                download_venue(venue, prefix, y, out)
                time.sleep(0.5)  # pause between venue/year downloads

    print("\nβœ… Done. Run: python scripts/build_index.py")


# Run the downloader only when executed as a script.
if __name__ == "__main__":
    main()