#!/usr/bin/env python3
"""
Download conference/journal proceedings from DBLP as BibTeX files.

Uses the DBLP venue-based search API, which is more reliable than the
TOC-based .bht queries (those often return 404 or single entries).

API format:
    https://dblp.org/search/publ/api
        ?q=venue:{VenueName}: year:{year}:
        &h=1000          # max results per batch
        &f={offset}      # pagination offset
        &format=bib1     # BibTeX format

Usage:
    python scripts/update_db.py
"""

import requests
import time
import sys
from pathlib import Path

# Endpoint of the DBLP publication search API described in the module docstring.
DBLP_API = "https://dblp.org/search/publ/api"

# Each entry is a tuple: (dblp_venue_name, output_prefix, years)
#   dblp_venue_name: exact venue string used in DBLP's venue: filter
#   output_prefix:   filename prefix for saved .bib files
#   years:           years to fetch, either a range or an explicit list.
#                    Note that range(2018, 2027) covers 2018 through 2026
#                    (the upper bound is exclusive).
CONFERENCES = [
    # ── Speech & Audio ──────────────────────────────────────────
    ("INTERSPEECH", "interspeech", range(2018, 2027)),
    ("ICASSP", "icassp", range(2018, 2027)),
    ("ASRU", "asru", [2019, 2021, 2023, 2025]),
    ("SLT", "slt", [2018, 2021, 2022, 2024]),
    # ── ML / AI ─────────────────────────────────────────────────
    ("ICML", "icml", range(2018, 2027)),
    ("NeurIPS", "neurips", range(2017, 2027)),
    ("ICLR", "iclr", range(2018, 2027)),
    ("AAAI", "aaai", range(2018, 2027)),
    ("IJCAI", "ijcai", range(2018, 2027)),
    ("CVPR", "cvpr", range(2018, 2027)),
    ("ECCV", "eccv", [2018, 2020, 2022, 2024]),
    ("ICCV", "iccv", [2019, 2021, 2023, 2025]),
    # ── NLP ─────────────────────────────────────────────────────
    ("ACL", "acl", range(2018, 2027)),  # includes Findings
    ("EMNLP", "emnlp", range(2018, 2027)),  # includes Findings
    ("NAACL", "naacl", range(2018, 2027)),
    ("EACL", "eacl", range(2018, 2027)),
    ("LREC/COLING", "coling", [2024, 2025]),
    # Older COLING editions use a different venue name; entry kept disabled:
    # ("COLING", "coling", [2018, 2020, 2022]),
    # ── IR / Web / Data ─────────────────────────────────────────
    ("SIGIR", "sigir", range(2018, 2027)),
    ("KDD", "kdd", range(2018, 2027)),
    ("WWW", "www", range(2018, 2027)),
    ("WSDM", "wsdm", range(2018, 2027)),
]

# Journals use venue search too; entries share the tuple layout of CONFERENCES.
JOURNALS = [
    ("IEEE ACM Trans Audio Speech Lang Process", "taslp", range(2018, 2027)),
    ("Trans. Assoc. Comput. Linguistics", "tacl", range(2018, 2027)),
]


def download_venue(venue_name: str, prefix: str, year: int, out_dir: Path):
    """Download a conference/journal year from DBLP using venue search."""
    out_file = out_dir / f"{prefix}{year}.bib"
    if out_file.exists():
        return  # Skip if already downloaded

    query = f"venue:{venue_name}: year:{year}:"
    all_bib = []
    offset = 0

    while True:
        try:
            r = requests.get(DBLP_API, params={
                "q": query,
                "h": 1000,
                "f": offset,
                "format": "bib1",
            }, timeout=30, headers={"User-Agent": "BibGuard/1.0"})
            text = r.text.strip()
        except Exception as e:
            print(f" ✗ {prefix}{year}: network error ({e})")
            return

        # Check for HTML error pages
        if not text or "