"""
Metadata comparison between bib entries and fetched metadata.
"""
from datetime import datetime
from dataclasses import dataclass, field
from typing import Optional, List, Union, Any, Tuple

from .parser import BibEntry
from .utils import TextNormalizer

# Evaluated once at import time; used to reject years that lie in the future.
CURRENT_YEAR = datetime.now().year

# Year source priority: lower number = more trustworthy
YEAR_SOURCE_PRIORITY = {
    "crossref": 0,           # DOI-verified, most accurate
    "dblp": 1,               # Conference proceedings
    "openalex": 2,
    "semantic_scholar": 3,
    "arxiv_journal_ref": 4,  # arXiv's journal_ref field
    "scholar": 5,
    "arxiv": 99,             # arXiv submission date — last resort
}


def resolve_year(candidates: list, bib_year: str = "") -> Tuple[Optional[str], Optional[str]]:
    """
    Pick the best year across all candidate results using source priority.

    Conference/journal year always beats arXiv submission year.
    Never returns a future year.

    Args:
        candidates: list of ComparisonResult objects. Falsy items and items
            without ``fetched_data`` are skipped.
        bib_year: the current bib entry year. Accepted for interface
            compatibility; this function does not consult it.

    Returns:
        (best_year, best_source) or (None, None) when no usable year exists.
    """
    pool = []
    for cand in candidates or []:
        if not cand or not cand.fetched_data:
            continue

        source = cand.source
        data = cand.fetched_data

        # isdecimal() (rather than isdigit()) guarantees that int() below
        # can parse the string; an empty string is rejected as well.
        #
        # The conference year taken from arXiv's journal_ref is considered on
        # its own, so it is still usable when the submission year is missing.
        conf_year = str(getattr(data, 'conference_year', '') or '').strip()
        if source == "arxiv" and conf_year.isdecimal():
            pool.append(
                (YEAR_SOURCE_PRIORITY.get("arxiv_journal_ref", 4), conf_year, "arxiv_journal_ref")
            )

        fetched_year = str(getattr(data, 'year', '') or '').strip()
        if fetched_year.isdecimal():
            # Sources missing from the priority table rank between the known
            # sources and the raw arXiv submission date.
            pool.append((YEAR_SOURCE_PRIORITY.get(source, 50), fetched_year, source))

    # Tuples sort by priority first, so the most trustworthy source comes first.
    pool.sort()

    # Pick best year that isn't in the future
    for _, year, source in pool:
        if int(year) <= CURRENT_YEAR:
            return year, source

    # No candidates, or all years are in the future
    return None, None


@dataclass
class ComparisonResult:
    """Result of comparing bib entry with fetched metadata."""
    entry_key: str

    # Title comparison
    title_match: bool
    title_similarity: float
    bib_title: str
    fetched_title: str

    # Author comparison
    author_match: bool
    author_similarity: float
    bib_authors: list[str]
    fetched_authors: list[str]

    # Year comparison
    year_match: bool
    bib_year: str
    fetched_year: str

    # Overall assessment
    is_match: bool
    confidence: float
    issues: list[str]
    source: str

    # Raw metadata for auto-fixing
    fetched_data: Any = None

    # Author initial conflict flag
    author_initial_conflict: bool = False

    @property
    def has_issues(self) -> bool:
        """True when at least one issue was recorded for this comparison."""
        return bool(self.issues)


@dataclass
class EntryReport:
    """Complete report for a single bib entry."""
    entry: BibEntry
    comparison: Optional[ComparisonResult]
    evaluations: list = field(default_factory=list)

    def __post_init__(self):
        # Callers that pass evaluations=None explicitly still get a list.
        if self.evaluations is None:
            self.evaluations = []


class MetadataComparator:
    """Compares bibliography entries with fetched metadata."""

    # Thresholds for matching
    TITLE_THRESHOLD = 0.8
    AUTHOR_THRESHOLD = 0.6

    def __init__(self):
        # The class itself is stored, not an instance; every helper is called
        # through this reference.
        # NOTE(review): presumably TextNormalizer exposes static/class
        # methods — confirm in utils.
        self.normalizer = TextNormalizer

    def compare(self, bib_entry: BibEntry, fetched_data: Any, source_name: str) -> ComparisonResult:
        """
        Generic comparison method for any data source.

        fetched_data must have 'title', 'year', and 'authors' attributes.
        A missing or None 'authors'/'year' is treated as empty.

        Args:
            bib_entry: the bibliography entry (its key, title, author and
                year attributes are read).
            fetched_data: metadata object returned by a source.
            source_name: label of the source; stored on the result and used
                in issue messages.

        Returns:
            A ComparisonResult with per-field matches, similarity scores,
            a weighted confidence and the list of detected issues.
        """
        issues = []

        # --- Title Comparison ---
        bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title)
        fetched_title_norm = self.normalizer.normalize_for_comparison(fetched_data.title)

        title_similarity = self.normalizer.similarity_ratio(bib_title_norm, fetched_title_norm)
        # For titles under 100 normalized characters the Levenshtein score is
        # also computed and the better of the two scores is kept.
        if len(bib_title_norm) < 100:
            lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, fetched_title_norm)
            title_similarity = max(title_similarity, lev_sim)

        title_match = title_similarity >= self.TITLE_THRESHOLD
        if not title_match:
            issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")

        # --- Author Comparison ---
        bib_authors = self.normalizer.normalize_author_list(bib_entry.author)

        # Check for DBLP disambiguation IDs in bib entry author names
        raw_author_list = self.normalizer.parse_author_list(bib_entry.author)
        for raw_auth in raw_author_list:
            if self.normalizer.has_dblp_disambiguation_id(raw_auth.strip()):
                issues.append(f"DBLP disambiguation ID in author: '{raw_auth.strip()}'")

        # Handle different author formats (list vs string); None becomes [].
        fetched_authors_raw = getattr(fetched_data, 'authors', None) or []
        if isinstance(fetched_authors_raw, str):
            # Scholar style: "Author1, Author2". Empty pieces (e.g. from a
            # trailing comma) are dropped so they do not count as authors.
            fetched_authors_raw = [
                piece.strip() for piece in fetched_authors_raw.split(',') if piece.strip()
            ]

        fetched_authors = [
            self.normalizer.normalize_author_name(str(a)) for a in fetched_authors_raw
        ]

        author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
        author_match = author_similarity >= self.AUTHOR_THRESHOLD

        # "and others" / "et al." in the bib entry signals an intentionally
        # shortened author list, so a count difference is tolerated.
        allows_truncated_authors = any(
            token in str(raw_author).lower()
            for raw_author in raw_author_list
            for token in ("others", "et al")
        )
        if (
            author_match
            and bib_authors
            and fetched_authors
            and len(bib_authors) != len(fetched_authors)
            and not allows_truncated_authors
        ):
            author_match = False
            issues.append(
                f"Author count mismatch: bib={len(bib_authors)}, fetched={len(fetched_authors)}"
            )

        if not author_match:
            issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")

        # --- Year Comparison ---
        # "or ''" keeps a None year from turning into the truthy string
        # "None", which would be reported as a mismatch instead of missing.
        bib_year = str(bib_entry.year or '').strip()
        fetched_year = str(getattr(fetched_data, 'year', '') or '').strip()
        conference_year = str(getattr(fetched_data, 'conference_year', '') or '').strip()
        # For arXiv sources the conference year replaces the submission year.
        if source_name.startswith("arxiv") and conference_year and conference_year.isdigit():
            fetched_year = conference_year

        year_match = bool(bib_year and fetched_year and bib_year == fetched_year)
        if not bib_year:
            issues.append("Missing year in BibTeX entry")
        elif not fetched_year:
            issues.append(f"Missing year from {source_name} metadata")
        elif not year_match:
            issues.append(f"Year mismatch: bib={bib_year}, {source_name}={fetched_year}")

        # --- Overall Assessment ---
        is_match = title_match and author_match and year_match

        # Simple weighted confidence score
        confidence = (
            title_similarity * 0.5
            + author_similarity * 0.3
            + (1.0 if year_match else 0.5) * 0.2
        )
        if not year_match:
            # A title/author match with the wrong year is not safe enough to auto-fix.
            confidence = min(confidence, 0.8)

        # --- Author Initial Conflict Detection ---
        author_initial_conflict = self._check_author_initial_conflict(
            bib_authors,
            fetched_authors,
            raw_author_list,  # already parsed above; no need to parse again
            fetched_authors_raw,
        )
        if author_initial_conflict:
            issues.append("Author initial conflict detected (e.g., first-name initials differ)")
            # Cap confidence — don't auto-adopt these authors
            confidence = min(confidence, 0.7)

        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=title_match,
            title_similarity=title_similarity,
            bib_title=bib_entry.title,
            fetched_title=fetched_data.title,
            author_match=author_match,
            author_similarity=author_similarity,
            bib_authors=bib_authors,
            fetched_authors=fetched_authors,
            year_match=year_match,
            bib_year=bib_year,
            fetched_year=fetched_year,
            is_match=is_match,
            confidence=confidence,
            issues=issues,
            source=source_name,
            fetched_data=fetched_data,
            author_initial_conflict=author_initial_conflict,
        )

    def create_unable_result(
        self, bib_entry: BibEntry, reason: str = "Unable to fetch metadata"
    ) -> ComparisonResult:
        """
        Create result when metadata couldn't be fetched.

        Args:
            bib_entry: the entry for which no metadata is available.
            reason: text recorded as the single issue on the result.

        Returns:
            A non-matching ComparisonResult with zero confidence and
            source set to "unable".
        """
        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=False,
            title_similarity=0.0,
            bib_title=bib_entry.title,
            fetched_title="",
            author_match=False,
            author_similarity=0.0,
            bib_authors=self.normalizer.normalize_author_list(bib_entry.author),
            fetched_authors=[],
            year_match=False,
            # Normalized the same way compare() does, so bib_year is a str.
            bib_year=str(bib_entry.year or '').strip(),
            fetched_year="",
            is_match=False,
            confidence=0.0,
            issues=[reason],
            source="unable",
            fetched_data=None,
        )

    def _compare_author_lists(self, list1: list[str], list2: list[str]) -> float:
        """
        Compare two author lists.

        Each name in list1 is scored against its best counterpart in list2
        (1.0 on a name match, otherwise the highest similarity ratio) and
        the scores are averaged over list1.

        Returns:
            1.0 when both lists are empty, 0.0 when exactly one is empty,
            otherwise the average best-match score.
        """
        if not list1 and not list2:
            return 1.0
        if not list1 or not list2:
            return 0.0

        total_similarity = 0.0
        for author1 in list1:
            best_match = 0.0
            for author2 in list2:
                if self._names_match(author1, author2):
                    best_match = 1.0
                    break  # cannot do better than an exact name match
                best_match = max(best_match, self.normalizer.similarity_ratio(author1, author2))
            total_similarity += best_match

        return total_similarity / len(list1)

    def _names_match(self, name1: str, name2: str) -> bool:
        """
        Check if two names match (handles abbreviated names).

        Names are lower-cased, periods removed, and split on whitespace.
        The last words must be equal. When both names have more than one
        word, the first words must be equal, or share the first letter if
        either of them is a single-letter initial.
        """
        def split_name(n):
            return n.lower().replace('.', '').split()

        words1 = split_name(name1)
        words2 = split_name(name2)

        if not words1 or not words2:
            return False

        # Last name must match (assuming last word is last name)
        if words1[-1] != words2[-1]:
            return False

        # First name check; skipped when either name is a single word.
        if len(words1) > 1 and len(words2) > 1:
            f1 = words1[0]
            f2 = words2[0]
            if len(f1) == 1 or len(f2) == 1:
                # If one is just an initial, only the first letter is compared
                if f1[0] != f2[0]:
                    return False
            elif f1 != f2:
                # Both full names - must match
                return False

        return True

    def _check_author_initial_conflict(
        self,
        bib_authors_norm: list[str],
        fetched_authors_norm: list[str],
        bib_authors_raw: list[str],
        fetched_authors_raw: list,
    ) -> bool:
        """
        Detect when first-name initials clearly conflict between bib entry and fetched data.

        e.g., "Y. Zhou" (bib) vs "Henry Zhou" (fetched) → True (Y ≠ H)
        This prevents blindly overwriting authors with wrong names.

        Args:
            bib_authors_norm: normalized author names from the bib entry.
            fetched_authors_norm: normalized author names from the source.
            bib_authors_raw: not used by the current implementation.
            fetched_authors_raw: not used by the current implementation.

        Returns:
            True as soon as one position-aligned pair shares a last word
            but differs in the first letter of the first word.
        """
        # Compare by position — aligned authors; zip stops at the shorter list.
        for bib_name, fetched_name in zip(bib_authors_norm, fetched_authors_norm):
            bib_parts = bib_name.split()
            fetched_parts = fetched_name.split()

            # Need at least a first and a last word on both sides.
            if len(bib_parts) < 2 or len(fetched_parts) < 2:
                continue

            # Last name must match to consider this a potential conflict
            if bib_parts[-1] != fetched_parts[-1]:
                continue

            # If initials differ, it's a conflict
            if bib_parts[0][0] != fetched_parts[0][0]:
                return True

        return False