"""
Unified metadata fetchers for BibGuard.

Each fetcher wraps one bibliographic service and converts its response into
a :class:`FetchResult`. All network calls are best-effort: any failure is
swallowed and reported as ``None`` (or an empty list) so that callers can
fall through to the next source.
"""
import random
import re
import time
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from typing import Any, Optional
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup


def _clean(value: Any) -> str:
    """Return *value* as a stripped string, mapping ``None`` to ``""``."""
    return "" if value is None else str(value).strip()


def _squash(value: Any) -> str:
    """Return *value* as a string with whitespace runs collapsed to one space.

    ``str.split()`` without arguments also splits on non-breaking spaces,
    so those are normalised as well.
    """
    return " ".join(_clean(value).split())


@dataclass
class FetchResult:
    """Unified fetch result.

    ``authors`` may be passed as a list of names or as a single string; a
    non-empty string is split on commas and ``" and "`` into a list.
    """
    title: str = ""
    authors: list[str] | str = ""
    year: str = ""
    doi: str = ""
    url: str = ""
    source: str = ""
    conference_year: str = ""  # Year from journal_ref / conference proceedings
    year_source: str = ""  # Where the year came from

    def __post_init__(self):
        if self.authors is None:
            self.authors = []
        if isinstance(self.authors, str) and self.authors:
            # Simple split if string provided
            self.authors = [a.strip() for a in re.split(r',| and ', self.authors) if a.strip()]


class BaseFetcher:
    """Base class for fetchers."""

    def _rate_limit(self, delay: float, last_time: float) -> float:
        """Sleep so that at least *delay* seconds separate two requests.

        Args:
            delay: Minimum spacing between requests, in seconds.
            last_time: ``time.time()`` value of the previous request.

        Returns:
            The current ``time.time()``, to be stored as the new last time.
        """
        elapsed = time.time() - last_time
        # A negative elapsed means the wall clock moved backwards; never
        # wait longer than one full delay in that case.
        wait = min(delay - elapsed, delay)
        if wait > 0:
            time.sleep(wait)
        return time.time()


class ArxivFetcher(BaseFetcher):
    """Fetches metadata from arXiv API."""

    API_BASE = "http://export.arxiv.org/api/query"
    _NS = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}

    def __init__(self):
        self._last_req = 0.0

    def fetch_by_id(self, arxiv_id: str) -> Optional[FetchResult]:
        """Look up a single paper by arXiv id; return ``None`` on any failure."""
        self._last_req = self._rate_limit(3.0, self._last_req)
        # Strip surrounding whitespace first so the anchored prefix can match.
        clean_id = re.sub(r'^arXiv:', '', arxiv_id.strip(), flags=re.IGNORECASE).strip()
        try:
            resp = requests.get(self.API_BASE, params={'id_list': clean_id, 'max_results': 1}, timeout=30)
            resp.raise_for_status()
            return self._parse(resp.text)
        except Exception:
            return None

    def search_by_title(self, title: str) -> list[FetchResult]:
        """Search by title; return up to three results (empty list on failure)."""
        self._last_req = self._rate_limit(3.0, self._last_req)
        clean = re.sub(r'[^\w\s]', ' ', title).strip()
        try:
            resp = requests.get(self.API_BASE, params={'search_query': f'ti:"{clean}"', 'max_results': 3}, timeout=30)
            resp.raise_for_status()
            return self._parse(resp.text, multiple=True)
        except Exception:
            return []

    def _parse(self, xml: str, multiple=False) -> Optional[FetchResult] | list[FetchResult]:
        """Parse an Atom feed returned by the arXiv API.

        Args:
            xml: The response body.
            multiple: When true return a list of all usable entries,
                otherwise return the first one (or ``None``).

        Returns:
            A list when *multiple* is true, else a single result or ``None``.
            Unparseable input yields ``[]`` / ``None`` respectively.
        """
        try:
            root = ET.fromstring(xml)
            # Parse entries one by one so a single malformed entry does not
            # discard the remaining results.
            results = []
            for entry in root.findall('atom:entry', self._NS):
                parsed = self._parse_entry(entry)
                if parsed is not None:
                    results.append(parsed)
        except Exception:
            return [] if multiple else None

        if multiple:
            return results
        return results[0] if results else None

    def _parse_entry(self, entry) -> Optional[FetchResult]:
        """Convert one Atom ``<entry>`` to a result, or ``None`` if unusable."""
        ns = self._NS

        def text(path: str) -> str:
            return _clean(entry.findtext(path, default="", namespaces=ns))

        id_txt = text('atom:id')
        # arXiv hard-wraps long titles; collapse the embedded newlines.
        title = _squash(text('atom:title'))
        # The API reports bad queries as an entry whose id is an errors URL.
        if not id_txt or not title or '/api/errors' in id_txt:
            return None

        authors = []
        for author in entry.findall('atom:author', ns):
            name = _clean(author.findtext('atom:name', default="", namespaces=ns))
            if name:
                authors.append(name)

        year = text('atom:published')[:4]
        doi = text('arxiv:doi')

        # Extract conference year from journal_ref if available
        conference_year = ""
        jr_text = text('arxiv:journal_ref')
        if jr_text:
            year_match = re.search(r'\b(19|20)\d{2}\b', jr_text)
            if year_match:
                conference_year = year_match.group(0)

        return FetchResult(
            title=title,
            authors=authors,
            year=year,
            doi=doi,
            url=id_txt,
            source="arxiv",
            conference_year=conference_year,
            year_source="arxiv_journal_ref" if conference_year else "arxiv_submission",
        )


class CrossRefFetcher(BaseFetcher):
    """Fetches from CrossRef API."""

    API_BASE = "https://api.crossref.org/works"
    # Date fields tried in order; print date first to keep the prior behavior.
    _DATE_FIELDS = ('published-print', 'published-online', 'published', 'issued')

    def __init__(self, email=None):
        self._last_req = 0.0
        self.headers = {'User-Agent': f'BibGuard/1.0 (mailto:{email or "user@example.com"})'}

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top bibliographic match for *title*, or ``None``."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(self.API_BASE, params={'query.bibliographic': title, 'rows': 1}, headers=self.headers, timeout=10)
            resp.raise_for_status()
            data = resp.json()['message']['items']
            if data:
                return self._parse(data[0])
        except Exception:
            pass
        return None

    def search_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the record registered for *doi*, or ``None``."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(f"{self.API_BASE}/{quote(doi)}", headers=self.headers, timeout=10)
            resp.raise_for_status()
            return self._parse(resp.json()['message'])
        except Exception:
            return None

    def _extract_year(self, item: dict) -> str:
        """Return the first usable year among the known date fields."""
        for field in self._DATE_FIELDS:
            parts = (item.get(field) or {}).get('date-parts') or []
            if parts and parts[0] and parts[0][0]:
                return str(parts[0][0])
        return ""

    def _parse(self, item: dict) -> FetchResult:
        """Convert one CrossRef work record into a :class:`FetchResult`."""
        # 'title' can be missing or an empty list.
        titles = item.get('title') or ['']
        title = _squash(titles[0])

        authors = []
        for a in item.get('author') or []:
            name = f"{_clean(a.get('given'))} {_clean(a.get('family'))}".strip()
            # Organisational authors carry only a 'name' key.
            name = name or _clean(a.get('name'))
            if name:
                authors.append(name)

        year = self._extract_year(item)
        return FetchResult(title, authors, year, item.get('DOI') or '', item.get('URL') or '', "crossref")


class DBLPFetcher(BaseFetcher):
    """Fetches from DBLP."""

    API_BASE = "https://dblp.org/search/publ/api"

    # DBLP disambiguation ID: 4-digit suffix appended to author names
    # e.g. "Tian Tan 0019", "Wei Li 0119"
    _DISAMBIG_RE = re.compile(r'\s+\d{4}\s*$')

    def __init__(self):
        self._last_req = 0.0

    @staticmethod
    def _strip_disambig(name: str) -> str:
        """Strip DBLP disambiguation suffix from author name."""
        return DBLPFetcher._DISAMBIG_RE.sub('', name).strip()

    @classmethod
    def _author_names(cls, raw: Any) -> list[str]:
        """Normalise the ``author`` value (dict, string, or list of either)."""
        if isinstance(raw, (dict, str)):
            raw = [raw]
        names = []
        for a in raw or []:
            name = a.get('text', '') if isinstance(a, dict) else _clean(a)
            name = cls._strip_disambig(name or '')
            if name:
                names.append(name)
        return names

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top DBLP hit for *title*, or ``None``."""
        self._last_req = self._rate_limit(1.0, self._last_req)
        try:
            resp = requests.get(self.API_BASE, params={'q': title, 'format': 'json', 'h': 1}, timeout=10)
            resp.raise_for_status()
            hits = resp.json().get('result', {}).get('hits', {}).get('hit', [])
            if hits:
                info = hits[0]['info']
                authors = self._author_names((info.get('authors') or {}).get('author'))
                return FetchResult(
                    _clean(info.get('title')).rstrip('.'),
                    authors,
                    _clean(info.get('year')),
                    info.get('doi') or '',
                    info.get('url') or '',
                    "dblp",
                )
        except Exception:
            pass
        return None


class SemanticScholarFetcher(BaseFetcher):
    """Fetches from Semantic Scholar."""

    API_BASE = "https://api.semanticscholar.org/graph/v1/paper"
    # The DOI is requested via 'externalIds' and read from its 'DOI' key.
    FIELDS = 'title,authors,year,externalIds,url'

    def __init__(self):
        self._last_req = 0.0

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top search match for *title*, or ``None``."""
        return self._fetch(f"{self.API_BASE}/search", {'query': title, 'limit': 1, 'fields': self.FIELDS})

    def fetch_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the paper registered under *doi*, or ``None``."""
        return self._fetch(f"{self.API_BASE}/DOI:{quote(doi)}", {'fields': self.FIELDS})

    def _fetch(self, url, params) -> Optional[FetchResult]:
        """Issue one request and convert the paper payload.

        Handles both the search shape (``{"data": [...]}``) and the
        single-paper shape. Returns ``None`` on errors or empty searches.
        """
        self._last_req = self._rate_limit(2.0, self._last_req)
        try:
            resp = requests.get(url, params=params, timeout=10)
            resp.raise_for_status()
            data = resp.json()
            if 'data' in data:
                # Handle search result; an empty list means "no match".
                if not data['data']:
                    return None
                data = data['data'][0]
            if 'error' in data:
                return None

            authors = []
            for a in data.get('authors') or []:
                name = _clean(a.get('name'))
                if name:
                    authors.append(name)

            doi = (data.get('externalIds') or {}).get('DOI') or data.get('doi') or ''
            # 'year' may be null; avoid producing the string "None".
            year = _clean(data.get('year'))
            return FetchResult(_clean(data.get('title')), authors, year, doi, data.get('url') or '', "semantic_scholar")
        except Exception:
            return None


class OpenAlexFetcher(BaseFetcher):
    """Fetches from OpenAlex."""

    API_BASE = "https://api.openalex.org/works"

    def __init__(self):
        self._last_req = 0.0

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the top search match for *title*, or ``None``."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(self.API_BASE, params={'search': title, 'per-page': 1}, timeout=10)
            resp.raise_for_status()
            data = resp.json().get('results', [])
            if data:
                return self._parse(data[0])
        except Exception:
            pass
        return None

    def fetch_by_doi(self, doi: str) -> Optional[FetchResult]:
        """Return the work registered under *doi*, or ``None``."""
        self._last_req = self._rate_limit(0.2, self._last_req)
        try:
            resp = requests.get(f"{self.API_BASE}/https://doi.org/{quote(doi)}", timeout=10)
            resp.raise_for_status()
            return self._parse(resp.json())
        except Exception:
            return None

    def _parse(self, data: dict) -> FetchResult:
        """Convert one OpenAlex work object into a :class:`FetchResult`."""
        authors = []
        for authorship in data.get('authorships') or []:
            name = _clean((authorship.get('author') or {}).get('display_name'))
            if name:
                authors.append(name)

        # 'doi' and 'publication_year' can be null; guard before using them.
        doi = (data.get('doi') or '').replace('https://doi.org/', '')
        year = _clean(data.get('publication_year'))
        return FetchResult(_clean(data.get('title')), authors, year, doi, data.get('id') or '', "openalex")


class ScholarFetcher(BaseFetcher):
    """Google Scholar Scraper (Fallback)."""

    SEARCH_URL = "https://scholar.google.com/scholar"
    # Leading type markers such as "[PDF]" or "[HTML]" in front of a title.
    _TITLE_TAG_RE = re.compile(r'^(?:\[[A-Z]+\]\s*)+')

    def __init__(self):
        self._last_req = 0.0
        self._session = requests.Session()
        self._blocked = False

    def search_by_title(self, title: str) -> Optional[FetchResult]:
        """Return the first result for an exact-title query, or ``None``.

        Once a block is detected (HTTP 429 or an "unusual traffic" page)
        the fetcher stops issuing requests for the rest of its lifetime.
        """
        if self._blocked:
            return None

        self._last_req = self._rate_limit(5.0 + random.random() * 3, self._last_req)  # Polite delay
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
            resp = self._session.get(self.SEARCH_URL, params={'q': f'"{title}"', 'hl': 'en', 'num': 1}, headers=headers, timeout=30)
            if resp.status_code == 429 or 'unusual traffic' in resp.text:
                self._blocked = True
                return None
            resp.raise_for_status()
            return self._parse(resp.text)
        except Exception:
            return None

    def _parse(self, html: str) -> Optional[FetchResult]:
        """Extract the first result from a results page, or ``None``."""
        soup = BeautifulSoup(html, 'lxml')
        entry = soup.find('div', class_='gs_ri')
        if entry is None:
            return None

        title_tag = entry.find('h3', class_='gs_rt')
        if title_tag is None:
            return None
        # get_text(strip=True) would glue adjacent text nodes together and
        # drop the spaces around inline tags; normalise whitespace instead.
        title = self._TITLE_TAG_RE.sub('', _squash(title_tag.get_text())).strip()
        link = title_tag.find('a')
        url = link.get('href', '') if link is not None else ""

        meta_tag = entry.find('div', class_='gs_a')
        # _squash also turns non-breaking spaces into plain ones, so the
        # " - " separator below is matched reliably.
        meta = _squash(meta_tag.get_text()) if meta_tag is not None else ""

        # Attempt to extract year
        year_match = re.search(r'\b(19|20)\d{2}\b', meta)
        year = year_match.group(0) if year_match else ""

        # Attempt to extract authors (before " - ")
        authors = meta.split(' - ')[0]

        return FetchResult(title, authors, year, "", url, "scholar")