RefCheck / src /comparator.py
voidful's picture
Require primary evidence for verified references
a3caaa3 verified
"""
Metadata comparison between bib entries and fetched metadata.
"""
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, List, Union, Any, Tuple

from .parser import BibEntry
from .utils import TextNormalizer
# Calendar year captured once, when the module is imported.
CURRENT_YEAR: int = datetime.now().year

# Trust ranking for the origin of a publication year.
# A lower number means a more trustworthy source.
YEAR_SOURCE_PRIORITY: dict[str, int] = {
    # DOI-verified, most accurate
    "crossref": 0,
    # Conference proceedings
    "dblp": 1,
    "openalex": 2,
    "semantic_scholar": 3,
    # arXiv's journal_ref field
    "arxiv_journal_ref": 4,
    "scholar": 5,
    # arXiv submission date: last resort
    "arxiv": 99,
}
def resolve_year(candidates: list, bib_year: str = "") -> Tuple[Optional[str], Optional[str]]:
    """
    Pick the best year across all candidate results using source priority.

    Every candidate contributes its fetched ``year`` under its own source
    name. A candidate whose source is exactly ``"arxiv"`` additionally
    contributes its ``conference_year`` (if numeric) under the
    ``"arxiv_journal_ref"`` source, so a conference/journal year beats the
    arXiv submission year. Never returns a future year.

    Args:
        candidates: list of ComparisonResult-like objects exposing
            ``source`` and ``fetched_data``. Falsy items and items without
            ``fetched_data`` are ignored.
        bib_year: the current bib entry year. Accepted for interface
            compatibility; this function does not read it.

    Returns:
        ``(best_year, best_source)``, or ``(None, None)`` when no candidate
        offers a numeric year that is not later than ``CURRENT_YEAR``.
    """
    def _numeric_year(value) -> str:
        """Return *value* as a stripped numeric string, or '' if unusable."""
        text = str(value or '').strip()
        # isdecimal() rather than isdigit(): isdigit() also accepts
        # characters such as superscript digits, which int() cannot parse.
        return text if text.isdecimal() else ''

    pool = []
    for cand in candidates:
        if not cand or not cand.fetched_data:
            continue
        source = cand.source
        fetched_year = _numeric_year(getattr(cand.fetched_data, 'year', ''))
        conf_year = _numeric_year(getattr(cand.fetched_data, 'conference_year', ''))

        # The journal_ref year is considered independently of the submission
        # year, so it is not lost when the latter is missing or malformed.
        if source == "arxiv" and conf_year:
            pool.append((YEAR_SOURCE_PRIORITY.get("arxiv_journal_ref", 4), conf_year, "arxiv_journal_ref"))
        if fetched_year:
            # Sources absent from the priority table rank at 50.
            pool.append((YEAR_SOURCE_PRIORITY.get(source, 50), fetched_year, source))

    # Tuples sort by priority first; ties fall back to the year string and
    # then the source name.
    pool.sort()

    # Pick the best-ranked year that isn't in the future.
    for _, year, source in pool:
        if int(year) <= CURRENT_YEAR:
            return year, source

    # Pool empty, or every year lies in the future.
    return None, None
@dataclass
class ComparisonResult:
    """Outcome of checking one bib entry against one fetched metadata record."""

    # Key of the bib entry this result describes.
    entry_key: str

    # --- Title ---
    title_match: bool
    title_similarity: float
    bib_title: str
    fetched_title: str

    # --- Authors ---
    author_match: bool
    author_similarity: float
    bib_authors: list[str]
    fetched_authors: list[str]

    # --- Year ---
    year_match: bool
    bib_year: str
    fetched_year: str

    # --- Overall verdict ---
    is_match: bool
    confidence: float
    issues: list[str]
    source: str

    # Raw fetched metadata object, kept for auto-fixing.
    fetched_data: Any = None

    # Set when first-name initials conflict between bib and fetched authors.
    author_initial_conflict: bool = False

    @property
    def has_issues(self) -> bool:
        """Return True when at least one issue has been recorded."""
        return len(self.issues) != 0
@dataclass
class EntryReport:
    """Bundle of everything reported for one bib entry."""

    # The bib entry being reported on.
    entry: BibEntry
    # Comparison outcome for the entry; may be None.
    comparison: Optional[ComparisonResult]
    # None is a sentinel replaced in __post_init__, so that instances do not
    # share one default list.
    evaluations: list = None

    def __post_init__(self):
        """Give the instance its own empty list when none was supplied."""
        if self.evaluations is not None:
            return
        self.evaluations = []
class MetadataComparator:
    """Compares bibliography entries with fetched metadata."""

    # Minimum similarity for a title / an author list to count as a match.
    TITLE_THRESHOLD = 0.8
    AUTHOR_THRESHOLD = 0.6

    # Whole-word markers of a deliberately truncated author list
    # ("and others", "et al."). Word boundaries keep ordinary names that
    # merely contain these letters ("Smothers", "Janet Alvarez") from matching.
    _TRUNCATION_MARKER = re.compile(r"\b(?:others|et\s+al)\b", re.IGNORECASE)

    def __init__(self):
        # The imported TextNormalizer object is stored as-is; it is not
        # called here.
        self.normalizer = TextNormalizer

    @staticmethod
    def _clean_year(value: Any) -> str:
        """Return *value* as a stripped string; None and other falsy values become ''."""
        return str(value or '').strip()

    @classmethod
    def _has_truncation_marker(cls, raw_authors: list) -> bool:
        """Return True if any raw author string contains a truncation marker."""
        return any(cls._TRUNCATION_MARKER.search(str(raw)) for raw in raw_authors)

    def compare(self, bib_entry: BibEntry, fetched_data: Any, source_name: str) -> ComparisonResult:
        """
        Generic comparison method for any data source.

        Args:
            bib_entry: entry providing ``key``, ``title``, ``author`` and ``year``.
            fetched_data: object with a ``title`` attribute. ``authors``
                (list or comma-separated string), ``year`` and
                ``conference_year`` are read with ``getattr`` and may be
                missing or None.
            source_name: label of the metadata source, used in issue
                messages and stored on the result.

        Returns:
            ComparisonResult with per-field match flags, similarities, the
            collected issue messages and a weighted confidence score.
        """
        issues = []

        # --- Title Comparison ---
        bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title)
        fetched_title_norm = self.normalizer.normalize_for_comparison(fetched_data.title)
        title_similarity = self.normalizer.similarity_ratio(bib_title_norm, fetched_title_norm)
        # For shorter titles also try the Levenshtein score and keep the better one.
        if len(bib_title_norm) < 100:
            lev_sim = self.normalizer.levenshtein_similarity(bib_title_norm, fetched_title_norm)
            title_similarity = max(title_similarity, lev_sim)
        title_match = title_similarity >= self.TITLE_THRESHOLD
        if not title_match:
            issues.append(f"Title mismatch (similarity: {title_similarity:.2%})")

        # --- Author Comparison ---
        bib_authors = self.normalizer.normalize_author_list(bib_entry.author)

        # Check for DBLP disambiguation IDs in bib entry author names.
        # The parsed list is reused below; this assumes parse_author_list is
        # free of side effects (TODO confirm).
        raw_author_list = self.normalizer.parse_author_list(bib_entry.author)
        for raw_auth in raw_author_list:
            stripped = raw_auth.strip()
            if self.normalizer.has_dblp_disambiguation_id(stripped):
                issues.append(f"DBLP disambiguation ID in author: '{stripped}'")

        # Handle different author formats (list vs string); None counts as
        # "no authors" instead of crashing the iteration below.
        fetched_authors_raw = getattr(fetched_data, 'authors', None) or []
        if isinstance(fetched_authors_raw, str):
            # Scholar style: "Author1, Author2". Empty fragments are dropped
            # so they do not become phantom authors.
            fetched_authors_raw = [
                part.strip()
                for part in fetched_authors_raw.split(',')
                if part.strip()
            ]
        fetched_authors = [
            self.normalizer.normalize_author_name(str(a))
            for a in fetched_authors_raw
        ]

        author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
        author_match = author_similarity >= self.AUTHOR_THRESHOLD

        # A bib entry that declares truncation may list fewer authors.
        allows_truncated_authors = self._has_truncation_marker(raw_author_list)
        if (
            author_match
            and bib_authors
            and fetched_authors
            and len(bib_authors) != len(fetched_authors)
            and not allows_truncated_authors
        ):
            author_match = False
            issues.append(f"Author count mismatch: bib={len(bib_authors)}, fetched={len(fetched_authors)}")
        if not author_match:
            issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")

        # --- Year Comparison ---
        # _clean_year maps None to '' so an absent year is reported as
        # missing rather than compared as the literal text "None".
        bib_year = self._clean_year(bib_entry.year)
        fetched_year = self._clean_year(getattr(fetched_data, 'year', ''))
        conference_year = self._clean_year(getattr(fetched_data, 'conference_year', ''))
        # For arXiv sources a numeric conference year replaces the fetched year.
        if source_name.startswith("arxiv") and conference_year and conference_year.isdigit():
            fetched_year = conference_year
        year_match = bool(bib_year and fetched_year and bib_year == fetched_year)
        if not bib_year:
            issues.append("Missing year in BibTeX entry")
        elif not fetched_year:
            issues.append(f"Missing year from {source_name} metadata")
        elif not year_match:
            issues.append(f"Year mismatch: bib={bib_year}, {source_name}={fetched_year}")

        # --- Overall Assessment ---
        is_match = title_match and author_match and year_match

        # Simple weighted confidence score
        confidence = (
            title_similarity * 0.5 +
            author_similarity * 0.3 +
            (1.0 if year_match else 0.5) * 0.2
        )
        if not year_match:
            # A title/author match with the wrong year is not safe enough to auto-fix.
            confidence = min(confidence, 0.8)

        # --- Author Initial Conflict Detection ---
        author_initial_conflict = self._check_author_initial_conflict(
            bib_authors, fetched_authors,
            raw_author_list,
            fetched_authors_raw
        )
        if author_initial_conflict:
            issues.append("Author initial conflict detected (e.g., first-name initials differ)")
            # Cap confidence: don't auto-adopt these authors
            confidence = min(confidence, 0.7)

        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=title_match,
            title_similarity=title_similarity,
            bib_title=bib_entry.title,
            fetched_title=fetched_data.title,
            author_match=author_match,
            author_similarity=author_similarity,
            bib_authors=bib_authors,
            fetched_authors=fetched_authors,
            year_match=year_match,
            bib_year=bib_year,
            fetched_year=fetched_year,
            is_match=is_match,
            confidence=confidence,
            issues=issues,
            source=source_name,
            fetched_data=fetched_data,
            author_initial_conflict=author_initial_conflict
        )

    def create_unable_result(self, bib_entry: BibEntry, reason: str = "Unable to fetch metadata") -> ComparisonResult:
        """Create result when metadata couldn't be fetched.

        Every match flag is False, similarities and confidence are 0.0,
        ``source`` is ``"unable"`` and *reason* is the only issue.
        """
        return ComparisonResult(
            entry_key=bib_entry.key,
            title_match=False, title_similarity=0.0,
            bib_title=bib_entry.title, fetched_title="",
            author_match=False, author_similarity=0.0,
            bib_authors=self.normalizer.normalize_author_list(bib_entry.author), fetched_authors=[],
            # Normalised the same way compare() normalises the bib year.
            year_match=False, bib_year=self._clean_year(bib_entry.year), fetched_year="",
            is_match=False, confidence=0.0,
            issues=[reason], source="unable",
            fetched_data=None
        )

    def _compare_author_lists(self, list1: list[str], list2: list[str]) -> float:
        """Compare two author lists.

        Returns the mean, over the authors of *list1*, of each author's best
        score against *list2*: 1.0 when ``_names_match`` accepts some pairing,
        otherwise the highest ``similarity_ratio``. Two empty lists score 1.0;
        exactly one empty list scores 0.0.
        """
        if not list1 and not list2:
            return 1.0
        if not list1 or not list2:
            return 0.0

        total_similarity = 0.0
        for author1 in list1:
            if any(self._names_match(author1, author2) for author2 in list2):
                total_similarity += 1.0
                continue
            # No exact-style match: fall back to the best fuzzy score.
            best_match = 0.0
            for author2 in list2:
                sim = self.normalizer.similarity_ratio(author1, author2)
                best_match = max(best_match, sim)
            total_similarity += best_match
        return total_similarity / len(list1)

    def _names_match(self, name1: str, name2: str) -> bool:
        """Check if two names match (handles abbreviated names).

        Names are lower-cased, stripped of periods and split on whitespace.
        The last words must be equal. When both names have more than one
        word, the first words must be equal too, unless one of them is a
        single letter, in which case only the first letters must agree.
        """
        def split_name(n):
            return n.lower().replace('.', '').split()

        words1 = split_name(name1)
        words2 = split_name(name2)
        if not words1 or not words2:
            return False

        # Last name must match (assuming last word is last name)
        if words1[-1] != words2[-1]:
            return False

        # First name check: only possible when both names carry one.
        if len(words1) > 1 and len(words2) > 1:
            f1 = words1[0]
            f2 = words2[0]
            if len(f1) == 1 or len(f2) == 1:
                # One side is just an initial: compare initials.
                if f1[0] != f2[0]:
                    return False
            elif f1 != f2:
                # Both full names: must match exactly.
                return False
        return True

    def _check_author_initial_conflict(
        self,
        bib_authors_norm: list[str],
        fetched_authors_norm: list[str],
        bib_authors_raw: list[str],
        fetched_authors_raw: list,
    ) -> bool:
        """
        Detect when first-name initials clearly conflict between
        bib entry and fetched data.

        e.g., "Y. Zhou" (bib) vs "Henry Zhou" (fetched) -> True (Y != H)

        This prevents blindly overwriting authors with wrong names.
        Authors are compared position by position up to the shorter list.
        The two ``*_raw`` arguments are accepted but not read.
        """
        # zip() stops at the shorter list and yields nothing when either is empty.
        for bib_name, fetched_name in zip(bib_authors_norm, fetched_authors_norm):
            bib_parts = bib_name.split()
            fetched_parts = fetched_name.split()
            if len(bib_parts) < 2 or len(fetched_parts) < 2:
                continue
            # Last name must match to consider this a potential conflict
            if bib_parts[-1] != fetched_parts[-1]:
                continue
            # split() never yields empty tokens, so [0] is always safe here.
            if bib_parts[0][0] != fetched_parts[0][0]:
                return True
        return False