| """ |
| Unified metadata fetchers for BibGuard. |
| """ |
| import re |
| import time |
| import random |
| import requests |
| import xml.etree.ElementTree as ET |
| from dataclasses import dataclass |
| from typing import Optional, Any |
| from urllib.parse import quote |
| from bs4 import BeautifulSoup |
|
|
| @dataclass |
| class FetchResult: |
| """Unified fetch result.""" |
| title: str = "" |
| authors: list[str] | str = "" |
| year: str = "" |
| doi: str = "" |
| url: str = "" |
| source: str = "" |
| conference_year: str = "" |
| year_source: str = "" |
|
|
| def __post_init__(self): |
| if self.authors is None: self.authors = [] |
| if isinstance(self.authors, str) and self.authors: |
| |
| self.authors = [a.strip() for a in re.split(r',| and ', self.authors) if a.strip()] |
|
|
class BaseFetcher:
    """Shared helpers for the metadata fetchers."""

    def _rate_limit(self, delay: float, last_time: float) -> float:
        """Sleep until at least ``delay`` seconds have passed since ``last_time``.

        Args:
            delay: Minimum spacing between two requests, in seconds.
            last_time: ``time.time()`` value recorded for the previous request.

        Returns:
            The current ``time.time()`` value after any wait, for the caller
            to pass back in as ``last_time`` on the next call.
        """
        remaining = delay - (time.time() - last_time)
        if remaining > 0:
            time.sleep(remaining)
        return time.time()
|
|
class ArxivFetcher(BaseFetcher):
    """Fetch paper metadata from the arXiv Atom API.

    Lookups are best-effort: transport failures, HTTP errors and unparsable
    responses are reported as "nothing found" (``None`` / ``[]``) rather than
    raised.
    """

    API_BASE = "http://export.arxiv.org/api/query"

    # XML namespaces used in arXiv's Atom responses.
    _NS = {
        'atom': 'http://www.w3.org/2005/Atom',
        'arxiv': 'http://arxiv.org/schemas/atom',
    }
    # First four-digit 19xx/20xx token in a journal reference.
    _YEAR_RE = re.compile(r'\b(19|20)\d{2}\b')

    def __init__(self):
        # time.time() of the previous request; consumed by ``_rate_limit``.
        self._last_req = 0.0

    def fetch_by_id(self, arxiv_id: str) -> Optional[FetchResult]:
        """Look up a single paper by arXiv identifier.

        Args:
            arxiv_id: Identifier such as ``1706.03762`` or ``arXiv:1706.03762``.

        Returns:
            The parsed record, or ``None`` when the request fails or no
            usable entry is returned.
        """
        self._last_req = self._rate_limit(3.0, self._last_req)
        # Strip whitespace first so the anchored prefix match also works for
        # input like " arXiv:1706.03762".
        clean_id = re.sub(r'^arXiv:', '', arxiv_id.strip(), flags=re.IGNORECASE).strip()
        try:
            resp = requests.get(self.API_BASE, params={'id_list': clean_id, 'max_results': 1}, timeout=30)
        except requests.RequestException:
            return None
        if resp.status_code >= 400:
            return None
        return self._parse(resp.text)

    def search_by_title(self, title: str) -> list[FetchResult]:
        """Search arXiv for papers whose title matches ``title``.

        Args:
            title: Title to search for; punctuation is replaced by spaces
                before it is placed in a quoted ``ti:`` query.

        Returns:
            Up to three parsed records; an empty list on failure or no match.
        """
        clean = re.sub(r'[^\w\s]', ' ', title).strip()
        if not clean:
            # Nothing searchable is left; skip the request entirely.
            return []
        self._last_req = self._rate_limit(3.0, self._last_req)
        try:
            resp = requests.get(self.API_BASE, params={'search_query': f'ti:"{clean}"', 'max_results': 3}, timeout=30)
        except requests.RequestException:
            return []
        if resp.status_code >= 400:
            return []
        return self._parse(resp.text, multiple=True)

    def _parse(self, xml: str, multiple=False) -> Optional[FetchResult] | list[FetchResult]:
        """Parse an Atom feed into one record or a list of records.

        Args:
            xml: Raw Atom XML returned by the API.
            multiple: When true return every usable entry as a list,
                otherwise return only the first one (or ``None``).

        Returns:
            ``list[FetchResult]`` when ``multiple`` is true, else a single
            ``FetchResult`` or ``None``.
        """
        try:
            root = ET.fromstring(xml)
        except (ET.ParseError, TypeError):
            return [] if multiple else None

        results = []
        for entry in root.findall('atom:entry', self._NS):
            # Entries are parsed independently so that one malformed entry
            # does not discard the remaining results.
            parsed = self._parse_entry(entry)
            if parsed is not None:
                results.append(parsed)

        if multiple:
            return results
        return results[0] if results else None

    @classmethod
    def _text(cls, node, path: str) -> str:
        """Return the whitespace-collapsed text of the first ``path`` match, or ""."""
        found = node.find(path, cls._NS)
        if found is None or not found.text:
            return ""
        return ' '.join(found.text.split())

    @classmethod
    def _parse_entry(cls, entry) -> Optional[FetchResult]:
        """Convert one ``<entry>`` element to a record, or ``None`` if unusable."""
        entry_id = cls._text(entry, 'atom:id')
        # Long titles come back wrapped over several lines; ``_text``
        # collapses the embedded newline and indentation to single spaces.
        title = cls._text(entry, 'atom:title')
        published = cls._text(entry, 'atom:published')
        if not (entry_id and title and published):
            return None
        # The API reports problems as a synthetic entry whose id points at
        # ".../api/errors#..."; that is not a paper.
        if '/api/errors' in entry_id:
            return None

        authors = []
        for author in entry.findall('atom:author', cls._NS):
            name = cls._text(author, 'atom:name')
            if name:
                authors.append(name)

        # A journal reference, when present, may carry the venue year, which
        # can differ from the submission year.
        conference_year = ""
        year_match = cls._YEAR_RE.search(cls._text(entry, 'arxiv:journal_ref'))
        if year_match:
            conference_year = year_match.group(0)

        return FetchResult(
            title=title,
            authors=authors,
            year=published[:4],
            doi=cls._text(entry, 'arxiv:doi'),
            url=entry_id,
            source="arxiv",
            conference_year=conference_year,
            year_source="arxiv_journal_ref" if conference_year else "arxiv_submission",
        )
|
|
class CrossRefFetcher(BaseFetcher):
    """Fetch paper metadata from the CrossRef REST API.

    Lookups are best-effort: any transport, decoding or payload-shape problem
    results in ``None`` rather than an exception.
    """

    API_BASE = "https://api.crossref.org/works"

    # Date fields consulted, in order of preference, when extracting the year.
    # Many records (e.g. online-only articles) have no ``published-print``.
    _DATE_FIELDS = ('published-print', 'published', 'published-online', 'issued')

    # Failures that mean "no usable result": transport errors, non-JSON
    # bodies (ValueError) and unexpected payload shapes.
    _FETCH_ERRORS = (requests.RequestException, ValueError, KeyError,
                     IndexError, TypeError, AttributeError)

    def __init__(self, email=None):
        """Create a fetcher.

        Args:
            email: Contact address placed in the ``User-Agent`` header;
                a placeholder address is used when omitted.
        """
        # time.time() of the previous request; consumed by ``_rate_limit``.
        self._last_req = 0.0
        self.headers = {'User-Agent': f'BibGuard/1.0 (mailto:{email or "user@example.com"})'}

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top bibliographic match for ``title``, or ``None``."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(self.API_BASE, params={'query.bibliographic': title, 'rows': 1}, headers=self.headers, timeout=10)
            if resp.status_code >= 400:
                return None
            items = resp.json()['message']['items']
            return self._parse(items[0]) if items else None
        except self._FETCH_ERRORS:
            return None

    def search_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the record registered for ``doi``, or ``None``."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(f"{self.API_BASE}/{quote(doi)}", headers=self.headers, timeout=10)
            if resp.status_code >= 400:
                return None
            return self._parse(resp.json()['message'])
        except self._FETCH_ERRORS:
            return None

    @classmethod
    def _extract_year(cls, item: dict) -> str:
        """Return the first available year among ``_DATE_FIELDS``, or ""."""
        for field_name in cls._DATE_FIELDS:
            parts = (item.get(field_name) or {}).get('date-parts') or []
            # ``date-parts`` looks like [[year, month, day]]; the year slot
            # may be missing or null.
            if parts and parts[0] and parts[0][0]:
                return str(parts[0][0])
        return ""

    def _parse(self, item: dict) -> FetchResult:
        """Convert one CrossRef work item into a ``FetchResult``."""
        # ``title`` is a list and may be absent or empty.
        titles = item.get('title') or ['']
        title = titles[0] or ''

        authors = []
        for a in item.get('author') or []:
            name = f"{a.get('given', '')} {a.get('family', '')}".strip()
            if not name:
                # Organisational authors carry a single ``name`` field.
                name = (a.get('name') or '').strip()
            if name:
                authors.append(name)

        return FetchResult(
            title=title,
            authors=authors,
            year=self._extract_year(item),
            doi=item.get('DOI') or '',
            url=item.get('URL') or '',
            source="crossref",
        )
|
|
class DBLPFetcher(BaseFetcher):
    """Looks up publications through the DBLP search API."""

    API_BASE = "https://dblp.org/search/publ/api"

    # Matches trailing whitespace followed by four digits at the end of a
    # name, e.g. the " 0001" in "Wei Wang 0001".
    _DISAMBIG_RE = re.compile(r'\s+\d{4}\s*$')

    def __init__(self):
        # time.time() of the previous request; consumed by ``_rate_limit``.
        self._last_req = 0.0

    @staticmethod
    def _strip_disambig(name: str) -> str:
        """Remove a trailing four-digit disambiguation suffix from ``name``."""
        without_suffix = DBLPFetcher._DISAMBIG_RE.sub('', name)
        return without_suffix.strip()

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top DBLP hit for ``title``.

        Returns ``None`` when there are no hits or when anything goes wrong
        while requesting or decoding the response.
        """
        self._last_req = self._rate_limit(1.0, self._last_req)
        try:
            query = {'q': title, 'format': 'json', 'h': 1}
            response = requests.get(self.API_BASE, params=query, timeout=10)
            hit_list = response.json().get('result', {}).get('hits', {}).get('hit', [])
            if not hit_list:
                return None

            info = hit_list[0]['info']
            # The ``author`` value is a single object for one author and a
            # list of objects for several; any other shape is passed through.
            names = info.get('authors', {}).get('author', [])
            if isinstance(names, dict):
                names = [self._strip_disambig(names.get('text', ''))]
            elif isinstance(names, list):
                names = [self._strip_disambig(person.get('text', '')) for person in names]

            return FetchResult(
                title=info.get('title', '').rstrip('.'),
                authors=names,
                year=info.get('year', ''),
                doi=info.get('doi', ''),
                url=info.get('url', ''),
                source="dblp",
            )
        except Exception:
            # Best-effort lookup: every failure is reported as "not found".
            return None
|
|
class SemanticScholarFetcher(BaseFetcher):
    """Fetch paper metadata from the Semantic Scholar Graph API.

    Lookups are best-effort: any transport, decoding or payload-shape problem
    results in ``None`` rather than an exception.
    """

    API_BASE = "https://api.semanticscholar.org/graph/v1/paper"

    # The Graph API exposes DOIs under ``externalIds``; NOTE(review): ``doi``
    # itself does not appear to be a selectable field, and asking for it
    # should make the request fail — confirm against the API reference.
    _FIELDS = 'title,authors,year,externalIds,url'

    # Failures that mean "no usable result": transport errors, non-JSON
    # bodies (ValueError) and unexpected payload shapes.
    _FETCH_ERRORS = (requests.RequestException, ValueError, KeyError,
                     IndexError, TypeError, AttributeError)

    def __init__(self):
        # time.time() of the previous request; consumed by ``_rate_limit``.
        self._last_req = 0.0

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top search hit for ``title``, or ``None``."""
        return self._fetch(f"{self.API_BASE}/search", {'query': title, 'limit': 1, 'fields': self._FIELDS})

    def fetch_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the paper registered under ``doi``, or ``None``."""
        return self._fetch(f"{self.API_BASE}/DOI:{doi}", {'fields': self._FIELDS})

    def _fetch(self, url, params) -> Optional[FetchResult]:
        """GET ``url`` and convert the response into a ``FetchResult``.

        Handles both response shapes used here: a search envelope with a
        ``data`` list, and a bare paper object.

        Returns:
            The parsed record, or ``None`` on failure, on an error payload,
            or when a search has no hits.
        """
        self._last_req = self._rate_limit(2.0, self._last_req)
        try:
            resp = requests.get(url, params=params, timeout=10)
            # Error bodies do not necessarily contain an ``error`` key, so
            # rely on the status code instead of parsing them as a paper.
            if resp.status_code >= 400:
                return None
            data = resp.json()
            if 'error' in data:
                return None
            if 'data' in data:
                hits = data['data']
                if not hits:
                    # An empty search must not become a blank record.
                    return None
                data = hits[0]
            return self._parse(data)
        except self._FETCH_ERRORS:
            return None

    @staticmethod
    def _parse(data: dict) -> FetchResult:
        """Convert one paper object into a ``FetchResult``."""
        authors = [a['name'] for a in (data.get('authors') or []) if a.get('name')]
        external_ids = data.get('externalIds') or {}
        # Fall back to a top-level ``doi`` key in case a payload carries one.
        doi = external_ids.get('DOI') or data.get('doi') or ''
        # ``year`` may be null; str(None) would store the text "None".
        year = data.get('year')
        return FetchResult(
            title=data.get('title') or '',
            authors=authors,
            year=str(year) if year else '',
            doi=doi,
            url=data.get('url') or '',
            source="semantic_scholar",
        )
|
|
class OpenAlexFetcher(BaseFetcher):
    """Fetch paper metadata from the OpenAlex works API.

    Lookups are best-effort: any transport, decoding or payload-shape problem
    results in ``None`` rather than an exception.
    """

    API_BASE = "https://api.openalex.org/works"

    # Resolver prefix that may precede a bare DOI.
    _DOI_PREFIX_RE = re.compile(r'^https?://(?:dx\.)?doi\.org/', re.IGNORECASE)

    # Failures that mean "no usable result": transport errors, non-JSON
    # bodies (ValueError) and unexpected payload shapes.
    _FETCH_ERRORS = (requests.RequestException, ValueError, KeyError,
                     IndexError, TypeError, AttributeError)

    def __init__(self):
        # time.time() of the previous request; consumed by ``_rate_limit``.
        self._last_req = 0.0

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top search hit for ``title``, or ``None``."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(self.API_BASE, params={'search': title, 'per-page': 1}, timeout=10)
            if resp.status_code >= 400:
                return None
            works = resp.json().get('results') or []
            return self._parse(works[0]) if works else None
        except self._FETCH_ERRORS:
            return None

    def fetch_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the work registered under ``doi``, or ``None``.

        Args:
            doi: Bare DOI, or a DOI already prefixed with a ``doi.org`` URL.
        """
        self._last_req = self._rate_limit(0.2, self._last_req)
        # Avoid building ".../https://doi.org/https://doi.org/..." when the
        # caller already supplies the resolver URL.
        bare_doi = self._DOI_PREFIX_RE.sub('', doi.strip())
        try:
            resp = requests.get(f"{self.API_BASE}/https://doi.org/{bare_doi}", timeout=10)
            # A failed lookup must not be parsed into a blank record.
            if resp.status_code >= 400:
                return None
            return self._parse(resp.json())
        except self._FETCH_ERRORS:
            return None

    def _parse(self, data: dict) -> FetchResult:
        """Convert one OpenAlex work object into a ``FetchResult``."""
        authors = []
        for authorship in data.get('authorships') or []:
            name = (authorship.get('author') or {}).get('display_name')
            if name:
                authors.append(name)

        # ``doi`` may be null; calling ``.replace`` on it used to raise and
        # discard the whole record.
        doi = (data.get('doi') or '').replace('https://doi.org/', '')
        # ``publication_year`` may be null; str(None) would store "None".
        year = data.get('publication_year')
        return FetchResult(
            title=data.get('title') or '',
            authors=authors,
            year=str(year) if year is not None else '',
            doi=doi,
            url=data.get('id') or '',
            source="openalex",
        )
|
|
class ScholarFetcher(BaseFetcher):
    """Google Scholar scraper, intended as a last-resort fallback.

    Once a response indicates that Scholar is throttling this client, the
    instance marks itself as blocked and stops issuing requests.
    """

    SEARCH_URL = "https://scholar.google.com/scholar"

    # Result-type markers that may precede the title text.
    _MARKER_RE = re.compile(r'^(?:\s*\[(?:PDF|HTML|BOOK|CITATION|DOC|B|C)\])+')
    # First four-digit 19xx/20xx token in the byline.
    _YEAR_RE = re.compile(r'\b(19|20)\d{2}\b')

    # Failures that mean "no usable result": transport errors and
    # unexpected markup.
    _FETCH_ERRORS = (requests.RequestException, ValueError, KeyError,
                     IndexError, TypeError, AttributeError)

    def __init__(self):
        # time.time() of the previous request; consumed by ``_rate_limit``.
        self._last_req = 0.0
        self._session = requests.Session()
        # Set once Scholar signals throttling; no further requests are made.
        self._blocked = False

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the first Scholar result for the exact-phrase ``title``.

        Returns:
            The parsed record, or ``None`` when blocked, when the request
            fails, or when no result block is found.
        """
        if self._blocked:
            return None
        # Randomised 5-8 s spacing between requests.
        self._last_req = self._rate_limit(5.0 + random.random() * 3, self._last_req)
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
            resp = self._session.get(self.SEARCH_URL, params={'q': f'"{title}"', 'hl': 'en', 'num': 1}, headers=headers, timeout=30)
            if resp.status_code == 429 or 'unusual traffic' in resp.text:
                self._blocked = True
                return None
            return self._parse(resp.text)
        except self._FETCH_ERRORS:
            return None

    @staticmethod
    def _squash(text: str) -> str:
        """Collapse all whitespace runs (including NBSP) to single spaces."""
        return ' '.join(text.split())

    def _parse(self, html: str) -> Optional[FetchResult]:
        """Extract title, URL, authors and year from a results page.

        Returns:
            A record for the first ``gs_ri`` block, or ``None`` when the
            page has no result block or the block has no title heading.
        """
        soup = BeautifulSoup(html, 'lxml')
        entry = soup.find('div', class_='gs_ri')
        if not entry:
            return None

        title_tag = entry.find('h3', class_='gs_rt')
        if title_tag is None:
            return None
        # Take the unstripped text and normalise whitespace afterwards:
        # stripping each fragment glues words together whenever the text is
        # split across nested tags.
        title = self._squash(self._MARKER_RE.sub(' ', title_tag.get_text()))
        link = title_tag.find('a')
        try:
            url = link['href'] if link else ""
        except KeyError:
            url = ""

        # A missing byline should not discard an otherwise usable title.
        meta_tag = entry.find('div', class_='gs_a')
        # ``_squash`` also turns the non-breaking space that can precede the
        # "- venue" separator into a plain space, so the split below works.
        meta = self._squash(meta_tag.get_text()) if meta_tag else ""

        year_match = self._YEAR_RE.search(meta)
        year = year_match.group(0) if year_match else ""

        # The byline reads "authors - venue, year - host"; a truncated author
        # list ends with an ellipsis, which is not part of a name.
        authors = meta.split(' - ')[0].rstrip(' …')

        return FetchResult(title, authors, year, "", url, "scholar")
|
|