Spaces:

voidful
/

RefCheck

Running

App Files Community

voidful committed 1 day ago

Commit

838da49

verified ·

1 Parent(s): a2ec1b5

Make RefCheck verification strict

Browse files

Files changed (4) hide show

app.py +1 -1
main.py +93 -55
src/comparator.py +26 -4
src/space_service.py +46 -18

app.py CHANGED Viewed

@@ -258,7 +258,7 @@ with gr.Blocks(title="RefCheck") as demo:
                 review_action = gr.Radio(label="Candidate/action", choices=[])
                 with gr.Row():
                     test_button = gr.Button("Test selected")
-                    apply_button = gr.Button("Apply selected", variant="primary")
                 review_preview = gr.Markdown()
     run_button.click(

                 review_action = gr.Radio(label="Candidate/action", choices=[])
                 with gr.Row():
                     test_button = gr.Button("Test selected")
+                    apply_button = gr.Button("Apply exact selected", variant="primary")
                 review_preview = gr.Markdown()
     run_button.click(

main.py CHANGED Viewed

@@ -142,27 +142,16 @@ def run_fix_and_verify(bib_path: Path, workflow):
     local_db = LocalConferenceDB()
     local_db_loaded = local_db.load()
-    api_needed_entries = entries  # Default: all entries need API
     if local_db_loaded:
-        api_needed_entries = []
         local_matched_count = 0
         for entry in entries:
             official = local_db.lookup(entry.title)
             if official:
-                # Apply local DB fix
-                changes = apply_local_fix(entry, official)
-                if changes:
-                    local_matched_count += 1
-                    if entry.key not in fixed_details:
-                        fixed_details[entry.key] = []
-                    fixed_details[entry.key].extend(changes)
-                    fixed_count += 1
-            else:
-                api_needed_entries.append(entry)
         if local_matched_count > 0:
-            print(f"  📚 Local DB matched: {local_matched_count}, API needed: {len(api_needed_entries)}")
-            bib_parser.save_entries(str(bib_path), entries)
     # --- Phase 1: Analysis (API Fetch) ---
     analysis_results = []
@@ -195,13 +184,8 @@ def run_fix_and_verify(bib_path: Path, workflow):
             ok_entries.append(entry)
             continue
-        # Entries flagged for forced API lookup (e.g., future year) always go to to_fix
-        if getattr(entry, '_force_api_lookup', False) and best_result.fetched_data:
             to_fix.append((entry, best_result, candidates))
-        elif best_result.confidence > 0.85 and best_result.fetched_data:
-            to_fix.append((entry, best_result, candidates))
-        elif best_result.is_match:
-            ok_entries.append(entry)
         elif candidates:
             to_review.append((entry, best_result, candidates))
         else:
@@ -222,7 +206,7 @@ def run_fix_and_verify(bib_path: Path, workflow):
     # Process Fixes
     for entry, best_result, candidates in to_fix:
-        changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates)
         if changes:
              fixed_count += 1
              fixed_details[entry.key] = changes
@@ -270,7 +254,10 @@ def run_fix_and_verify(bib_path: Path, workflow):
                     idx = int(choice) - 1
                     if 0 <= idx < len(candidates):
                         selected = candidates[idx]
-                        changes = apply_fix(entry, selected.fetched_data)
                         if changes:
                             fixed_count += 1
                             if entry.key not in fixed_details: fixed_details[entry.key] = []
@@ -337,19 +324,12 @@ def run_fix_and_verify(bib_path: Path, workflow):
 def apply_local_fix(entry, official) -> list:
     """
-    Apply fixes from local conference DB (ground truth).
-    Only updates year, booktitle, and entry type — not authors or title,
-    since DBLP data for those may have different formatting conventions.
     """
     changes = []
-    # Year: conference year is ground truth
-    if official.year and official.year != entry.year:
-        year_int = int(official.year) if official.year.isdigit() else 0
-        if 1950 <= year_int <= CURRENT_YEAR:
-            changes.append(f"Year: {entry.year} -> {official.year} [local_db]")
-            entry.year = official.year
     # Entry type upgrade: misc/article → inproceedings if booktitle exists
     if official.booktitle and entry.entry_type.lower() in ('misc', 'article'):
         old_type = entry.entry_type
@@ -378,8 +358,20 @@ def apply_local_fix(entry, official) -> list:
     return changes
-def apply_fix(entry, data, all_candidates=None) -> list:
-    """Update entry metadata from fetched data. Returns list of changes strings."""
     changes = []
     # Helper to clean string
@@ -388,27 +380,29 @@ def apply_fix(entry, data, all_candidates=None) -> list:
     # Title
     new_title = clean(data.title)
     if new_title and new_title.lower() != entry.title.lower():
-        changes.append(f"Title: {entry.title} -> {new_title}")
-        entry.title = new_title
     # Year: Use resolve_year() if we have multiple candidates
-    if all_candidates:
-        best_year, year_src = resolve_year(all_candidates, bib_year=entry.year)
-        if best_year and best_year != entry.year:
-            if int(best_year) > CURRENT_YEAR:
-                changes.append(f"⚠ Skip suspicious future year {best_year} from {year_src}")
-            else:
-                changes.append(f"Year: {entry.year} -> {best_year} [{year_src}]")
-                entry.year = best_year
-    else:
-        # Single candidate fallback
-        new_year = clean(getattr(data, 'year', ''))
-        if new_year and new_year != entry.year:
-            if new_year.isdigit() and int(new_year) > CURRENT_YEAR:
-                changes.append(f"⚠ Skip suspicious future year {new_year}")
-            else:
-                changes.append(f"Year: {entry.year} -> {new_year}")
-                entry.year = new_year
     # Author: Smart Merge Strategy
     # Check for author initial conflict first
@@ -419,7 +413,9 @@ def apply_fix(entry, data, all_candidates=None) -> list:
                 has_initial_conflict = True
                 break
-    if has_initial_conflict:
         # Don't overwrite authors when initials conflict
         changes.append(f"⚠ Author initial conflict detected — preserving bib authors")
     else:
@@ -471,13 +467,24 @@ def apply_fix(entry, data, all_candidates=None) -> list:
              entry.author = new_author_str
     # Optional fields (doi, journal, etc.)
-    if hasattr(data, 'doi') and data.doi and not entry.doi:
         changes.append(f"DOI: [Added] {data.doi}")
         entry.doi = data.doi
     return changes
 def validate_entry(entry, workflow, fetchers, comparator):
     """Validate a single entry against configured data sources. Returns (best_result, all_results)."""
     from src.utils import TextNormalizer
@@ -548,12 +555,43 @@ def validate_entry(entry, workflow, fetchers, comparator):
     if results:
         best = max(results, key=lambda r: r.confidence)
         return best, results
     # No results
     return comparator.create_unable_result(entry, "Not found in any data source"), []

     local_db = LocalConferenceDB()
     local_db_loaded = local_db.load()
+    api_needed_entries = entries  # Always verify against live/network sources.
     if local_db_loaded:
         local_matched_count = 0
         for entry in entries:
             official = local_db.lookup(entry.title)
             if official:
+                local_matched_count += 1
         if local_matched_count > 0:
+            print(f"  📚 Local DB matched: {local_matched_count}; still verifying all entries online")
     # --- Phase 1: Analysis (API Fetch) ---
     analysis_results = []
             ok_entries.append(entry)
             continue
+        if best_result.is_match and best_result.fetched_data:
             to_fix.append((entry, best_result, candidates))
         elif candidates:
             to_review.append((entry, best_result, candidates))
         else:
     # Process Fixes
     for entry, best_result, candidates in to_fix:
+        changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates, allow_optional_updates=True)
         if changes:
              fixed_count += 1
              fixed_details[entry.key] = changes
                     idx = int(choice) - 1
                     if 0 <= idx < len(candidates):
                         selected = candidates[idx]
+                        if not _candidate_exact_match(selected):
+                            print("Cannot apply: selected candidate is not an exact title/author/year match.")
+                            continue
+                        changes = apply_fix(entry, selected.fetched_data, allow_optional_updates=True)
                         if changes:
                             fixed_count += 1
                             if entry.key not in fixed_details: fixed_details[entry.key] = []
 def apply_local_fix(entry, official) -> list:
     """
+    Apply non-core fixes from local conference DB.
+    This never changes title, authors, or year; those fields define the
+    reference identity and must be verified against live metadata.
     """
     changes = []
     # Entry type upgrade: misc/article → inproceedings if booktitle exists
     if official.booktitle and entry.entry_type.lower() in ('misc', 'article'):
         old_type = entry.entry_type
     return changes
+def apply_fix(
+    entry,
+    data,
+    all_candidates=None,
+    *,
+    allow_core_updates: bool = False,
+    allow_optional_updates: bool = False,
+) -> list:
+    """Update only safe metadata by default.
+    Core identity fields (title, author, year) are not overwritten unless
+    allow_core_updates=True. RefCheck should validate references, not transform
+    a nearby candidate into a different citation.
+    """
     changes = []
     # Helper to clean string
     # Title
     new_title = clean(data.title)
     if new_title and new_title.lower() != entry.title.lower():
+        if allow_core_updates:
+            changes.append(f"Title: {entry.title} -> {new_title}")
+            entry.title = new_title
     # Year: Use resolve_year() if we have multiple candidates
+    if allow_core_updates:
+        if all_candidates:
+            best_year, year_src = resolve_year(all_candidates, bib_year=entry.year)
+            if best_year and best_year != entry.year:
+                if int(best_year) > CURRENT_YEAR:
+                    changes.append(f"⚠ Skip suspicious future year {best_year} from {year_src}")
+                else:
+                    changes.append(f"Year: {entry.year} -> {best_year} [{year_src}]")
+                    entry.year = best_year
+        else:
+            # Single candidate fallback
+            new_year = clean(getattr(data, 'year', ''))
+            if new_year and new_year != entry.year:
+                if new_year.isdigit() and int(new_year) > CURRENT_YEAR:
+                    changes.append(f"⚠ Skip suspicious future year {new_year}")
+                else:
+                    changes.append(f"Year: {entry.year} -> {new_year}")
+                    entry.year = new_year
     # Author: Smart Merge Strategy
     # Check for author initial conflict first
                 has_initial_conflict = True
                 break
+    if not allow_core_updates:
+        pass
+    elif has_initial_conflict:
         # Don't overwrite authors when initials conflict
         changes.append(f"⚠ Author initial conflict detected — preserving bib authors")
     else:
              entry.author = new_author_str
     # Optional fields (doi, journal, etc.)
+    if allow_optional_updates and hasattr(data, 'doi') and data.doi and not entry.doi:
         changes.append(f"DOI: [Added] {data.doi}")
         entry.doi = data.doi
     return changes
+def _candidate_exact_match(candidate) -> bool:
+    return bool(
+        candidate
+        and getattr(candidate, "is_match", False)
+        and getattr(candidate, "title_match", False)
+        and getattr(candidate, "author_match", False)
+        and getattr(candidate, "year_match", False)
+        and not getattr(candidate, "author_initial_conflict", False)
+    )
 def validate_entry(entry, workflow, fetchers, comparator):
     """Validate a single entry against configured data sources. Returns (best_result, all_results)."""
     from src.utils import TextNormalizer
     if results:
         best = max(results, key=lambda r: r.confidence)
+        _apply_cross_source_conflict_guard(best, results)
         return best, results
     # No results
     return comparator.create_unable_result(entry, "Not found in any data source"), []
+def _apply_cross_source_conflict_guard(best, results) -> None:
+    """Reject candidates when exact-title sources disagree on core metadata."""
+    if not best or not getattr(best, "fetched_title", ""):
+        return
+    conflicts = []
+    for result in results:
+        if result is best:
+            continue
+        if getattr(result, "title_similarity", 0.0) < 0.95:
+            continue
+        best_year = str(getattr(best, "fetched_year", "") or "").strip()
+        other_year = str(getattr(result, "fetched_year", "") or "").strip()
+        if best_year and other_year and best_year != other_year:
+            conflicts.append(f"{result.source}={other_year}")
+    if not conflicts:
+        return
+    issue = (
+        f"Cross-source year conflict: best {best.source}={best.fetched_year}, "
+        f"also found {'; '.join(dict.fromkeys(conflicts))}"
+    )
+    if issue not in best.issues:
+        best.issues.append(issue)
+    best.is_match = False
+    best.confidence = min(best.confidence, 0.8)

src/comparator.py CHANGED Viewed

@@ -169,6 +169,21 @@ class MetadataComparator:
         author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
         author_match = author_similarity >= self.AUTHOR_THRESHOLD
         if not author_match:
             issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")
@@ -176,19 +191,26 @@ class MetadataComparator:
         # --- Year Comparison ---
         bib_year = str(bib_entry.year).strip()
         fetched_year = str(getattr(fetched_data, 'year', '')).strip()
-        year_match = bib_year == fetched_year
-        if not year_match and bib_year and fetched_year:
             issues.append(f"Year mismatch: bib={bib_year}, {source_name}={fetched_year}")
         # --- Overall Assessment ---
-        is_match = title_match and author_match
         # Simple weighted confidence score
         confidence = (
             title_similarity * 0.5 +
             author_similarity * 0.3 +
             (1.0 if year_match else 0.5) * 0.2
         )
         # --- Author Initial Conflict Detection ---
         author_initial_conflict = self._check_author_initial_conflict(

         author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
         author_match = author_similarity >= self.AUTHOR_THRESHOLD
+        allows_truncated_authors = any(
+            token in str(raw_author).lower()
+            for raw_author in raw_author_list
+            for token in ("others", "et al")
+        )
+        if (
+            author_match
+            and bib_authors
+            and fetched_authors
+            and len(bib_authors) != len(fetched_authors)
+            and not allows_truncated_authors
+        ):
+            author_match = False
+            issues.append(f"Author count mismatch: bib={len(bib_authors)}, fetched={len(fetched_authors)}")
         if not author_match:
             issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")
         # --- Year Comparison ---
         bib_year = str(bib_entry.year).strip()
         fetched_year = str(getattr(fetched_data, 'year', '')).strip()
+        year_match = bool(bib_year and fetched_year and bib_year == fetched_year)
+        if not bib_year:
+            issues.append("Missing year in BibTeX entry")
+        elif not fetched_year:
+            issues.append(f"Missing year from {source_name} metadata")
+        elif not year_match:
             issues.append(f"Year mismatch: bib={bib_year}, {source_name}={fetched_year}")
         # --- Overall Assessment ---
+        is_match = title_match and author_match and year_match
         # Simple weighted confidence score
         confidence = (
             title_similarity * 0.5 +
             author_similarity * 0.3 +
             (1.0 if year_match else 0.5) * 0.2
         )
+        if not year_match:
+            # A title/author match with the wrong year is not safe enough to auto-fix.
+            confidence = min(confidence, 0.8)
         # --- Author Initial Conflict Detection ---
         author_initial_conflict = self._check_author_initial_conflict(

src/space_service.py CHANGED Viewed

@@ -13,7 +13,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from main import (
     apply_fix,
-    apply_local_fix,
     get_default_workflow,
     validate_entry,
 )
@@ -99,12 +98,8 @@ def run_refcheck_file(file_path: str | Path, options: RefCheckOptions | None = N
     for entry, best_result, candidates in analysis:
         if not best_result:
             actions[entry.key] = ("keep", None, [])
-        elif getattr(entry, "_force_api_lookup", False) and best_result.fetched_data:
             actions[entry.key] = ("fix", best_result, candidates)
-        elif best_result.confidence > 0.85 and best_result.fetched_data:
-            actions[entry.key] = ("fix", best_result, candidates)
-        elif best_result.is_match:
-            actions[entry.key] = ("keep", best_result, candidates)
         elif candidates:
             actions[entry.key] = ("review", best_result, candidates)
         else:
@@ -204,8 +199,20 @@ def preview_review_action(
         return "Select a candidate first."
     candidate = candidates[candidate_index]
     temp_entry = copy.deepcopy(entry)
-    changes = apply_fix(temp_entry, candidate.fetched_data)
     if not changes:
         changes = ["No field-level changes are needed for this candidate."]
@@ -258,7 +265,11 @@ def apply_review_action(
         if candidate_index is None or candidate_index < 0 or candidate_index >= len(candidates):
             raise ValueError("Select a candidate first.")
         candidate = candidates[candidate_index]
-        changes = apply_fix(entry, candidate.fetched_data)
         changes.append(f"Resolved manually with candidate from {candidate.source}.")
         result.fixed_details.setdefault(entry.key, []).extend(changes)
     elif action == "remove":
@@ -280,6 +291,30 @@ def _find_entry(entries: list[BibEntry], key: str) -> BibEntry | None:
     return None
 def _entry_preview_markdown(entry: BibEntry, title: str, lines: list[str]) -> str:
     body = "\n".join(f"- {line}" for line in lines)
     return (
@@ -360,20 +395,13 @@ def _apply_local_db(
     if not local_db.is_loaded:
         return False, entries, 0
-    api_entries = []
     match_count = 0
     for entry in entries:
         official = local_db.lookup(entry.title)
-        if not official:
-            api_entries.append(entry)
-            continue
-        changes = apply_local_fix(entry, official)
-        match_count += 1
-        if changes:
-            fixed_details.setdefault(entry.key, []).extend(changes)
-    return True, api_entries, match_count
 @lru_cache(maxsize=1)

 from main import (
     apply_fix,
     get_default_workflow,
     validate_entry,
 )
     for entry, best_result, candidates in analysis:
         if not best_result:
             actions[entry.key] = ("keep", None, [])
+        elif best_result.is_match and best_result.fetched_data:
             actions[entry.key] = ("fix", best_result, candidates)
         elif candidates:
             actions[entry.key] = ("review", best_result, candidates)
         else:
         return "Select a candidate first."
     candidate = candidates[candidate_index]
+    if not _candidate_exact_match(candidate):
+        return _entry_preview_markdown(
+            entry,
+            "Candidate blocked",
+            [
+                "This candidate is not an exact title/author/year match, so RefCheck will not auto-apply it.",
+                f"Candidate source: {candidate.source}",
+                f"Candidate confidence: {candidate.confidence:.2f}",
+                *_candidate_issue_lines(candidate),
+            ],
+        )
     temp_entry = copy.deepcopy(entry)
+    changes = apply_fix(temp_entry, candidate.fetched_data, allow_optional_updates=True)
     if not changes:
         changes = ["No field-level changes are needed for this candidate."]
         if candidate_index is None or candidate_index < 0 or candidate_index >= len(candidates):
             raise ValueError("Select a candidate first.")
         candidate = candidates[candidate_index]
+        if not _candidate_exact_match(candidate):
+            raise ValueError(
+                "Selected candidate is not an exact title/author/year match; RefCheck will not auto-overwrite core metadata."
+            )
+        changes = apply_fix(entry, candidate.fetched_data, allow_optional_updates=True)
         changes.append(f"Resolved manually with candidate from {candidate.source}.")
         result.fixed_details.setdefault(entry.key, []).extend(changes)
     elif action == "remove":
     return None
+def _candidate_exact_match(candidate: Any) -> bool:
+    return bool(
+        candidate
+        and getattr(candidate, "is_match", False)
+        and getattr(candidate, "title_match", False)
+        and getattr(candidate, "author_match", False)
+        and getattr(candidate, "year_match", False)
+        and not getattr(candidate, "author_initial_conflict", False)
+    )
+def _candidate_issue_lines(candidate: Any) -> list[str]:
+    lines = list(getattr(candidate, "issues", []) or [])
+    if not getattr(candidate, "title_match", False):
+        lines.append("Title is not an exact-enough match")
+    if not getattr(candidate, "author_match", False):
+        lines.append("Authors are not an exact-enough match")
+    if not getattr(candidate, "year_match", False):
+        bib_year = getattr(candidate, "bib_year", "") or "[missing]"
+        fetched_year = getattr(candidate, "fetched_year", "") or "[missing]"
+        lines.append(f"Year mismatch: bib={bib_year}, candidate={fetched_year}")
+    return [f"Blocking issue: {line}" for line in dict.fromkeys(lines)]
 def _entry_preview_markdown(entry: BibEntry, title: str, lines: list[str]) -> str:
     body = "\n".join(f"- {line}" for line in lines)
     return (
     if not local_db.is_loaded:
         return False, entries, 0
     match_count = 0
     for entry in entries:
         official = local_db.lookup(entry.title)
+        if official:
+            match_count += 1
+    return True, entries, match_count
 @lru_cache(maxsize=1)