| |
"""
Download conference/journal proceedings from DBLP as BibTeX files.

Uses the DBLP venue-based search API, which is more reliable than
the TOC-based .bht queries (which often return 404 or single entries).

API format:
    https://dblp.org/search/publ/api
        ?q=venue:{VenueName}: year:{year}:
        &h=1000        # max results per batch
        &f={offset}    # pagination offset
        &format=bib1   # BibTeX format

Usage:
    python scripts/update_db.py
"""
| import requests |
| import time |
| import sys |
| from pathlib import Path |
|
|
# Base URL of the DBLP publication search endpoint. The query, page size,
# offset and output format are passed as request parameters by the caller.
DBLP_API = "https://dblp.org/search/publ/api"
|
|
| |
| |
| |
# Conference proceedings to fetch.  Each entry is a tuple of
#   (venue name used in the DBLP search query,
#    file-name prefix for the output "<prefix><year>.bib",
#    iterable of years to request).
# NOTE: range(a, b) excludes b, so range(2018, 2027) covers 2018-2026.
CONFERENCES = [
    # Speech and audio
    ("INTERSPEECH", "interspeech", range(2018, 2027)),
    ("ICASSP", "icassp", range(2018, 2027)),
    ("ASRU", "asru", [2019, 2021, 2023, 2025]),
    ("SLT", "slt", [2018, 2021, 2022, 2024]),

    # Machine learning, AI and computer vision
    ("ICML", "icml", range(2018, 2027)),
    ("NeurIPS", "neurips", range(2017, 2027)),
    ("ICLR", "iclr", range(2018, 2027)),
    ("AAAI", "aaai", range(2018, 2027)),
    ("IJCAI", "ijcai", range(2018, 2027)),
    ("CVPR", "cvpr", range(2018, 2027)),
    ("ECCV", "eccv", [2018, 2020, 2022, 2024]),
    ("ICCV", "iccv", [2019, 2021, 2023, 2025]),

    # Natural language processing
    ("ACL", "acl", range(2018, 2027)),
    ("EMNLP", "emnlp", range(2018, 2027)),
    ("NAACL", "naacl", range(2018, 2027)),
    ("EACL", "eacl", range(2018, 2027)),
    ("LREC/COLING", "coling", [2024, 2025]),

    # Information retrieval and data mining
    ("SIGIR", "sigir", range(2018, 2027)),
    ("KDD", "kdd", range(2018, 2027)),
    ("WWW", "www", range(2018, 2027)),
    ("WSDM", "wsdm", range(2018, 2027)),
]
|
|
| |
# Journals to fetch, in the same (venue name, file prefix, years) layout as
# the conference list: the venue name goes into the DBLP search query and the
# prefix into the output file name "<prefix><year>.bib".
# NOTE: range(2018, 2027) excludes 2027, i.e. it covers 2018-2026.
JOURNALS = [
    ("IEEE ACM Trans Audio Speech Lang Process", "taslp", range(2018, 2027)),
    ("Trans. Assoc. Comput. Linguistics", "tacl", range(2018, 2027)),
]
|
|
|
|
# Number of hits requested per call (the "h" parameter); also used as the
# step for the pagination offset (the "f" parameter).
_PAGE_SIZE = 1000


def _count_bib_entries(bibtex: str) -> int:
    """Count BibTeX entries, i.e. lines that open with "@".

    Counting every "@" character would also count occurrences inside field
    values (for example a title containing "Recall@K").
    """
    return sum(
        1 for line in bibtex.splitlines() if line.lstrip().startswith("@")
    )


def download_venue(venue_name: str, prefix: str, year: int, out_dir: Path) -> None:
    """Download one venue-year from DBLP and save it as ``<prefix><year>.bib``.

    The venue is queried through the search endpoint with
    ``venue:{venue_name}: year:{year}:`` and fetched page by page until a page
    holds fewer than ``_PAGE_SIZE`` entries or carries no BibTeX at all.

    Args:
        venue_name: Venue name as it should appear in the DBLP query.
        prefix: File-name prefix of the output file.
        year: Publication year to fetch.
        out_dir: Directory the ``.bib`` file is written to.

    Returns:
        None. A status line is printed for every outcome. If the output file
        already exists the function returns immediately without any request.
        If any request fails, nothing is written, so the venue-year is
        retried on the next run instead of being cached half-complete.
    """
    out_file = out_dir / f"{prefix}{year}.bib"
    if out_file.exists():
        # Downloaded on an earlier run; existing files are never refreshed.
        return

    query = f"venue:{venue_name}: year:{year}:"
    pages = []
    total = 0
    offset = 0

    while True:
        try:
            response = requests.get(
                DBLP_API,
                params={
                    "q": query,
                    "h": _PAGE_SIZE,
                    "f": offset,
                    "format": "bib1",
                },
                timeout=30,
                headers={"User-Agent": "BibGuard/1.0"},
            )
            # Without this, an HTTP error page (e.g. rate limiting) in the
            # middle of pagination would look like "no more results" and the
            # truncated data would be written and kept forever.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f" ✗ {prefix}{year}: network error ({e})")
            return

        text = response.text.strip()

        # An empty body, an HTML page, or a body without any "@" means there
        # is no (further) BibTeX to collect.
        if not text or "<!DOCTYPE" in text[:100] or "@" not in text:
            break

        pages.append(text)
        n_entries = _count_bib_entries(text)
        total += n_entries
        if n_entries < _PAGE_SIZE:
            # A short page is the last page.
            break
        offset += _PAGE_SIZE
        time.sleep(1)  # pause between consecutive page requests

    if pages:
        out_file.write_text("\n\n".join(pages), encoding="utf-8")
        print(f" ✓ {prefix}{year}: {total} entries")
    else:
        print(f" ○ {prefix}{year}: not on DBLP yet")
|
|
|
|
def _download_all(venues, out_dir: Path) -> None:
    """Fetch every year of every (venue, prefix, years) entry in *venues*."""
    for venue, prefix, years in venues:
        for year in years:
            # download_venue returns immediately when "<prefix><year>.bib"
            # already exists, so only throttle when a request could have been
            # sent; otherwise a fully cached re-run would just sleep.
            already_cached = (out_dir / f"{prefix}{year}.bib").exists()
            download_venue(venue, prefix, year, out_dir)
            if not already_cached:
                time.sleep(0.5)


def main():
    """Download all configured conferences and journals into ``data/raw``.

    The output directory is ``data/raw`` under the parent of this script's
    directory and is created if missing. Conferences are fetched first, then
    journals; finally a hint for the next pipeline step is printed.
    """
    out = Path(__file__).resolve().parent.parent / "data" / "raw"
    out.mkdir(parents=True, exist_ok=True)

    print("📥 Downloading conference proceedings from DBLP...")
    _download_all(CONFERENCES, out)

    print("\n📥 Downloading journal volumes from DBLP...")
    _download_all(JOURNALS, out)

    print("\n✅ Done. Run: python scripts/build_index.py")
|
|
|
|
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|