Make RefCheck verification strict
Browse files
- app.py +1 -1
- main.py +93 -55
- src/comparator.py +26 -4
- src/space_service.py +46 -18
app.py
CHANGED
|
@@ -258,7 +258,7 @@ with gr.Blocks(title="RefCheck") as demo:
|
|
| 258 |
review_action = gr.Radio(label="Candidate/action", choices=[])
|
| 259 |
with gr.Row():
|
| 260 |
test_button = gr.Button("Test selected")
|
| 261 |
-
apply_button = gr.Button("Apply selected", variant="primary")
|
| 262 |
review_preview = gr.Markdown()
|
| 263 |
|
| 264 |
run_button.click(
|
|
|
|
| 258 |
review_action = gr.Radio(label="Candidate/action", choices=[])
|
| 259 |
with gr.Row():
|
| 260 |
test_button = gr.Button("Test selected")
|
| 261 |
+
apply_button = gr.Button("Apply exact selected", variant="primary")
|
| 262 |
review_preview = gr.Markdown()
|
| 263 |
|
| 264 |
run_button.click(
|
main.py
CHANGED
|
@@ -142,27 +142,16 @@ def run_fix_and_verify(bib_path: Path, workflow):
|
|
| 142 |
local_db = LocalConferenceDB()
|
| 143 |
local_db_loaded = local_db.load()
|
| 144 |
|
| 145 |
-
api_needed_entries = entries #
|
| 146 |
if local_db_loaded:
|
| 147 |
-
api_needed_entries = []
|
| 148 |
local_matched_count = 0
|
| 149 |
for entry in entries:
|
| 150 |
official = local_db.lookup(entry.title)
|
| 151 |
if official:
|
| 152 |
-
|
| 153 |
-
changes = apply_local_fix(entry, official)
|
| 154 |
-
if changes:
|
| 155 |
-
local_matched_count += 1
|
| 156 |
-
if entry.key not in fixed_details:
|
| 157 |
-
fixed_details[entry.key] = []
|
| 158 |
-
fixed_details[entry.key].extend(changes)
|
| 159 |
-
fixed_count += 1
|
| 160 |
-
else:
|
| 161 |
-
api_needed_entries.append(entry)
|
| 162 |
|
| 163 |
if local_matched_count > 0:
|
| 164 |
-
print(f" 📚 Local DB matched: {local_matched_count}
|
| 165 |
-
bib_parser.save_entries(str(bib_path), entries)
|
| 166 |
|
| 167 |
# --- Phase 1: Analysis (API Fetch) ---
|
| 168 |
analysis_results = []
|
|
@@ -195,13 +184,8 @@ def run_fix_and_verify(bib_path: Path, workflow):
|
|
| 195 |
ok_entries.append(entry)
|
| 196 |
continue
|
| 197 |
|
| 198 |
-
|
| 199 |
-
if getattr(entry, '_force_api_lookup', False) and best_result.fetched_data:
|
| 200 |
to_fix.append((entry, best_result, candidates))
|
| 201 |
-
elif best_result.confidence > 0.85 and best_result.fetched_data:
|
| 202 |
-
to_fix.append((entry, best_result, candidates))
|
| 203 |
-
elif best_result.is_match:
|
| 204 |
-
ok_entries.append(entry)
|
| 205 |
elif candidates:
|
| 206 |
to_review.append((entry, best_result, candidates))
|
| 207 |
else:
|
|
@@ -222,7 +206,7 @@ def run_fix_and_verify(bib_path: Path, workflow):
|
|
| 222 |
|
| 223 |
# Process Fixes
|
| 224 |
for entry, best_result, candidates in to_fix:
|
| 225 |
-
changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates)
|
| 226 |
if changes:
|
| 227 |
fixed_count += 1
|
| 228 |
fixed_details[entry.key] = changes
|
|
@@ -270,7 +254,10 @@ def run_fix_and_verify(bib_path: Path, workflow):
|
|
| 270 |
idx = int(choice) - 1
|
| 271 |
if 0 <= idx < len(candidates):
|
| 272 |
selected = candidates[idx]
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
| 274 |
if changes:
|
| 275 |
fixed_count += 1
|
| 276 |
if entry.key not in fixed_details: fixed_details[entry.key] = []
|
|
@@ -337,19 +324,12 @@ def run_fix_and_verify(bib_path: Path, workflow):
|
|
| 337 |
|
| 338 |
def apply_local_fix(entry, official) -> list:
|
| 339 |
"""
|
| 340 |
-
Apply fixes from local conference DB
|
| 341 |
-
|
| 342 |
-
|
| 343 |
"""
|
| 344 |
changes = []
|
| 345 |
|
| 346 |
-
# Year: conference year is ground truth
|
| 347 |
-
if official.year and official.year != entry.year:
|
| 348 |
-
year_int = int(official.year) if official.year.isdigit() else 0
|
| 349 |
-
if 1950 <= year_int <= CURRENT_YEAR:
|
| 350 |
-
changes.append(f"Year: {entry.year} -> {official.year} [local_db]")
|
| 351 |
-
entry.year = official.year
|
| 352 |
-
|
| 353 |
# Entry type upgrade: misc/article → inproceedings if booktitle exists
|
| 354 |
if official.booktitle and entry.entry_type.lower() in ('misc', 'article'):
|
| 355 |
old_type = entry.entry_type
|
|
@@ -378,8 +358,20 @@ def apply_local_fix(entry, official) -> list:
|
|
| 378 |
return changes
|
| 379 |
|
| 380 |
|
| 381 |
-
def apply_fix(
|
| 382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
changes = []
|
| 384 |
|
| 385 |
# Helper to clean string
|
|
@@ -388,27 +380,29 @@ def apply_fix(entry, data, all_candidates=None) -> list:
|
|
| 388 |
# Title
|
| 389 |
new_title = clean(data.title)
|
| 390 |
if new_title and new_title.lower() != entry.title.lower():
|
| 391 |
-
|
| 392 |
-
|
|
|
|
| 393 |
|
| 394 |
# Year: Use resolve_year() if we have multiple candidates
|
| 395 |
-
if
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
if
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
if new_year
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
|
|
|
| 412 |
|
| 413 |
# Author: Smart Merge Strategy
|
| 414 |
# Check for author initial conflict first
|
|
@@ -419,7 +413,9 @@ def apply_fix(entry, data, all_candidates=None) -> list:
|
|
| 419 |
has_initial_conflict = True
|
| 420 |
break
|
| 421 |
|
| 422 |
-
if
|
|
|
|
|
|
|
| 423 |
# Don't overwrite authors when initials conflict
|
| 424 |
changes.append(f"⚠ Author initial conflict detected — preserving bib authors")
|
| 425 |
else:
|
|
@@ -471,13 +467,24 @@ def apply_fix(entry, data, all_candidates=None) -> list:
|
|
| 471 |
entry.author = new_author_str
|
| 472 |
|
| 473 |
# Optional fields (doi, journal, etc.)
|
| 474 |
-
if hasattr(data, 'doi') and data.doi and not entry.doi:
|
| 475 |
changes.append(f"DOI: [Added] {data.doi}")
|
| 476 |
entry.doi = data.doi
|
| 477 |
|
| 478 |
return changes
|
| 479 |
|
| 480 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
def validate_entry(entry, workflow, fetchers, comparator):
|
| 482 |
"""Validate a single entry against configured data sources. Returns (best_result, all_results)."""
|
| 483 |
from src.utils import TextNormalizer
|
|
@@ -548,12 +555,43 @@ def validate_entry(entry, workflow, fetchers, comparator):
|
|
| 548 |
|
| 549 |
if results:
|
| 550 |
best = max(results, key=lambda r: r.confidence)
|
|
|
|
| 551 |
return best, results
|
| 552 |
|
| 553 |
# No results
|
| 554 |
return comparator.create_unable_result(entry, "Not found in any data source"), []
|
| 555 |
|
| 556 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
|
| 558 |
|
| 559 |
|
|
|
|
| 142 |
local_db = LocalConferenceDB()
|
| 143 |
local_db_loaded = local_db.load()
|
| 144 |
|
| 145 |
+
api_needed_entries = entries # Always verify against live/network sources.
|
| 146 |
if local_db_loaded:
|
|
|
|
| 147 |
local_matched_count = 0
|
| 148 |
for entry in entries:
|
| 149 |
official = local_db.lookup(entry.title)
|
| 150 |
if official:
|
| 151 |
+
local_matched_count += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
if local_matched_count > 0:
|
| 154 |
+
print(f" 📚 Local DB matched: {local_matched_count}; still verifying all entries online")
|
|
|
|
| 155 |
|
| 156 |
# --- Phase 1: Analysis (API Fetch) ---
|
| 157 |
analysis_results = []
|
|
|
|
| 184 |
ok_entries.append(entry)
|
| 185 |
continue
|
| 186 |
|
| 187 |
+
if best_result.is_match and best_result.fetched_data:
|
|
|
|
| 188 |
to_fix.append((entry, best_result, candidates))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
elif candidates:
|
| 190 |
to_review.append((entry, best_result, candidates))
|
| 191 |
else:
|
|
|
|
| 206 |
|
| 207 |
# Process Fixes
|
| 208 |
for entry, best_result, candidates in to_fix:
|
| 209 |
+
changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates, allow_optional_updates=True)
|
| 210 |
if changes:
|
| 211 |
fixed_count += 1
|
| 212 |
fixed_details[entry.key] = changes
|
|
|
|
| 254 |
idx = int(choice) - 1
|
| 255 |
if 0 <= idx < len(candidates):
|
| 256 |
selected = candidates[idx]
|
| 257 |
+
if not _candidate_exact_match(selected):
|
| 258 |
+
print("Cannot apply: selected candidate is not an exact title/author/year match.")
|
| 259 |
+
continue
|
| 260 |
+
changes = apply_fix(entry, selected.fetched_data, allow_optional_updates=True)
|
| 261 |
if changes:
|
| 262 |
fixed_count += 1
|
| 263 |
if entry.key not in fixed_details: fixed_details[entry.key] = []
|
|
|
|
| 324 |
|
| 325 |
def apply_local_fix(entry, official) -> list:
|
| 326 |
"""
|
| 327 |
+
Apply non-core fixes from local conference DB.
|
| 328 |
+
This never changes title, authors, or year; those fields define the
|
| 329 |
+
reference identity and must be verified against live metadata.
|
| 330 |
"""
|
| 331 |
changes = []
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
# Entry type upgrade: misc/article → inproceedings if booktitle exists
|
| 334 |
if official.booktitle and entry.entry_type.lower() in ('misc', 'article'):
|
| 335 |
old_type = entry.entry_type
|
|
|
|
| 358 |
return changes
|
| 359 |
|
| 360 |
|
| 361 |
+
def apply_fix(
|
| 362 |
+
entry,
|
| 363 |
+
data,
|
| 364 |
+
all_candidates=None,
|
| 365 |
+
*,
|
| 366 |
+
allow_core_updates: bool = False,
|
| 367 |
+
allow_optional_updates: bool = False,
|
| 368 |
+
) -> list:
|
| 369 |
+
"""Update only safe metadata by default.
|
| 370 |
+
|
| 371 |
+
Core identity fields (title, author, year) are not overwritten unless
|
| 372 |
+
allow_core_updates=True. RefCheck should validate references, not transform
|
| 373 |
+
a nearby candidate into a different citation.
|
| 374 |
+
"""
|
| 375 |
changes = []
|
| 376 |
|
| 377 |
# Helper to clean string
|
|
|
|
| 380 |
# Title
|
| 381 |
new_title = clean(data.title)
|
| 382 |
if new_title and new_title.lower() != entry.title.lower():
|
| 383 |
+
if allow_core_updates:
|
| 384 |
+
changes.append(f"Title: {entry.title} -> {new_title}")
|
| 385 |
+
entry.title = new_title
|
| 386 |
|
| 387 |
# Year: Use resolve_year() if we have multiple candidates
|
| 388 |
+
if allow_core_updates:
|
| 389 |
+
if all_candidates:
|
| 390 |
+
best_year, year_src = resolve_year(all_candidates, bib_year=entry.year)
|
| 391 |
+
if best_year and best_year != entry.year:
|
| 392 |
+
if int(best_year) > CURRENT_YEAR:
|
| 393 |
+
changes.append(f"⚠ Skip suspicious future year {best_year} from {year_src}")
|
| 394 |
+
else:
|
| 395 |
+
changes.append(f"Year: {entry.year} -> {best_year} [{year_src}]")
|
| 396 |
+
entry.year = best_year
|
| 397 |
+
else:
|
| 398 |
+
# Single candidate fallback
|
| 399 |
+
new_year = clean(getattr(data, 'year', ''))
|
| 400 |
+
if new_year and new_year != entry.year:
|
| 401 |
+
if new_year.isdigit() and int(new_year) > CURRENT_YEAR:
|
| 402 |
+
changes.append(f"⚠ Skip suspicious future year {new_year}")
|
| 403 |
+
else:
|
| 404 |
+
changes.append(f"Year: {entry.year} -> {new_year}")
|
| 405 |
+
entry.year = new_year
|
| 406 |
|
| 407 |
# Author: Smart Merge Strategy
|
| 408 |
# Check for author initial conflict first
|
|
|
|
| 413 |
has_initial_conflict = True
|
| 414 |
break
|
| 415 |
|
| 416 |
+
if not allow_core_updates:
|
| 417 |
+
pass
|
| 418 |
+
elif has_initial_conflict:
|
| 419 |
# Don't overwrite authors when initials conflict
|
| 420 |
changes.append(f"⚠ Author initial conflict detected — preserving bib authors")
|
| 421 |
else:
|
|
|
|
| 467 |
entry.author = new_author_str
|
| 468 |
|
| 469 |
# Optional fields (doi, journal, etc.)
|
| 470 |
+
if allow_optional_updates and hasattr(data, 'doi') and data.doi and not entry.doi:
|
| 471 |
changes.append(f"DOI: [Added] {data.doi}")
|
| 472 |
entry.doi = data.doi
|
| 473 |
|
| 474 |
return changes
|
| 475 |
|
| 476 |
|
| 477 |
+
def _candidate_exact_match(candidate) -> bool:
    """Tell whether a candidate is a strict match on every core field.

    A candidate qualifies only when it is truthy, its ``is_match``,
    ``title_match``, ``author_match`` and ``year_match`` attributes are all
    truthy, and its ``author_initial_conflict`` attribute is falsy. Missing
    attributes are treated as ``False``.

    Args:
        candidate: Comparison result object (or ``None``) to inspect.

    Returns:
        ``True`` if every condition above holds, otherwise ``False``.
    """
    if not candidate:
        return False

    required_flags = ("is_match", "title_match", "author_match", "year_match")
    if not all(getattr(candidate, flag, False) for flag in required_flags):
        return False

    # An initial conflict vetoes the match even when all flags are set.
    return not getattr(candidate, "author_initial_conflict", False)
|
| 486 |
+
|
| 487 |
+
|
| 488 |
def validate_entry(entry, workflow, fetchers, comparator):
|
| 489 |
"""Validate a single entry against configured data sources. Returns (best_result, all_results)."""
|
| 490 |
from src.utils import TextNormalizer
|
|
|
|
| 555 |
|
| 556 |
if results:
|
| 557 |
best = max(results, key=lambda r: r.confidence)
|
| 558 |
+
_apply_cross_source_conflict_guard(best, results)
|
| 559 |
return best, results
|
| 560 |
|
| 561 |
# No results
|
| 562 |
return comparator.create_unable_result(entry, "Not found in any data source"), []
|
| 563 |
|
| 564 |
|
| 565 |
+
def _apply_cross_source_conflict_guard(best, results) -> None:
    """Demote ``best`` when a near-identical-title result reports another year.

    Every result other than ``best`` whose ``title_similarity`` is at least
    0.95 is compared with ``best`` on ``fetched_year`` (both values are
    stringified and stripped). If any such result carries a different,
    non-empty year, ``best`` is mutated in place: a "Cross-source year
    conflict" message is appended to ``best.issues`` (once), ``best.is_match``
    is set to ``False`` and ``best.confidence`` is capped at 0.8.

    Args:
        best: The highest-confidence result; modified in place on conflict.
        results: All results for the entry, normally including ``best``.

    Returns:
        None. The function does nothing when ``best`` is falsy, has no
        ``fetched_title``, has no ``fetched_year``, or no conflict is found.
    """
    if not best or not getattr(best, "fetched_title", ""):
        return

    # The best candidate's year does not change during the scan, so it is
    # normalised once; without a year no conflict can ever be reported.
    best_year = str(getattr(best, "fetched_year", "") or "").strip()
    if not best_year:
        return

    conflicts = []
    for result in results:
        if result is best:
            continue
        # Only results whose title is essentially identical are comparable.
        if getattr(result, "title_similarity", 0.0) < 0.95:
            continue

        other_year = str(getattr(result, "fetched_year", "") or "").strip()
        if other_year and other_year != best_year:
            conflicts.append(f"{result.source}={other_year}")

    if not conflicts:
        return

    # dict.fromkeys de-duplicates repeated "source=year" pairs, keeping order.
    issue = (
        f"Cross-source year conflict: best {best.source}={best.fetched_year}, "
        f"also found {'; '.join(dict.fromkeys(conflicts))}"
    )
    if issue not in best.issues:
        best.issues.append(issue)
    best.is_match = False
    best.confidence = min(best.confidence, 0.8)
|
| 593 |
+
|
| 594 |
+
|
| 595 |
|
| 596 |
|
| 597 |
|
src/comparator.py
CHANGED
|
@@ -169,6 +169,21 @@ class MetadataComparator:
|
|
| 169 |
|
| 170 |
author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
|
| 171 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
if not author_match:
|
| 174 |
issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")
|
|
@@ -176,19 +191,26 @@ class MetadataComparator:
|
|
| 176 |
# --- Year Comparison ---
|
| 177 |
bib_year = str(bib_entry.year).strip()
|
| 178 |
fetched_year = str(getattr(fetched_data, 'year', '')).strip()
|
| 179 |
-
year_match = bib_year == fetched_year
|
| 180 |
-
|
| 181 |
-
if not
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
issues.append(f"Year mismatch: bib={bib_year}, {source_name}={fetched_year}")
|
| 183 |
|
| 184 |
# --- Overall Assessment ---
|
| 185 |
-
is_match = title_match and author_match
|
| 186 |
# Simple weighted confidence score
|
| 187 |
confidence = (
|
| 188 |
title_similarity * 0.5 +
|
| 189 |
author_similarity * 0.3 +
|
| 190 |
(1.0 if year_match else 0.5) * 0.2
|
| 191 |
)
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
# --- Author Initial Conflict Detection ---
|
| 194 |
author_initial_conflict = self._check_author_initial_conflict(
|
|
|
|
| 169 |
|
| 170 |
author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
|
| 171 |
author_match = author_similarity >= self.AUTHOR_THRESHOLD
|
| 172 |
+
|
| 173 |
+
allows_truncated_authors = any(
|
| 174 |
+
token in str(raw_author).lower()
|
| 175 |
+
for raw_author in raw_author_list
|
| 176 |
+
for token in ("others", "et al")
|
| 177 |
+
)
|
| 178 |
+
if (
|
| 179 |
+
author_match
|
| 180 |
+
and bib_authors
|
| 181 |
+
and fetched_authors
|
| 182 |
+
and len(bib_authors) != len(fetched_authors)
|
| 183 |
+
and not allows_truncated_authors
|
| 184 |
+
):
|
| 185 |
+
author_match = False
|
| 186 |
+
issues.append(f"Author count mismatch: bib={len(bib_authors)}, fetched={len(fetched_authors)}")
|
| 187 |
|
| 188 |
if not author_match:
|
| 189 |
issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")
|
|
|
|
| 191 |
# --- Year Comparison ---
|
| 192 |
bib_year = str(bib_entry.year).strip()
|
| 193 |
fetched_year = str(getattr(fetched_data, 'year', '')).strip()
|
| 194 |
+
year_match = bool(bib_year and fetched_year and bib_year == fetched_year)
|
| 195 |
+
|
| 196 |
+
if not bib_year:
|
| 197 |
+
issues.append("Missing year in BibTeX entry")
|
| 198 |
+
elif not fetched_year:
|
| 199 |
+
issues.append(f"Missing year from {source_name} metadata")
|
| 200 |
+
elif not year_match:
|
| 201 |
issues.append(f"Year mismatch: bib={bib_year}, {source_name}={fetched_year}")
|
| 202 |
|
| 203 |
# --- Overall Assessment ---
|
| 204 |
+
is_match = title_match and author_match and year_match
|
| 205 |
# Simple weighted confidence score
|
| 206 |
confidence = (
|
| 207 |
title_similarity * 0.5 +
|
| 208 |
author_similarity * 0.3 +
|
| 209 |
(1.0 if year_match else 0.5) * 0.2
|
| 210 |
)
|
| 211 |
+
if not year_match:
|
| 212 |
+
# A title/author match with the wrong year is not safe enough to auto-fix.
|
| 213 |
+
confidence = min(confidence, 0.8)
|
| 214 |
|
| 215 |
# --- Author Initial Conflict Detection ---
|
| 216 |
author_initial_conflict = self._check_author_initial_conflict(
|
src/space_service.py
CHANGED
|
@@ -13,7 +13,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
| 13 |
|
| 14 |
from main import (
|
| 15 |
apply_fix,
|
| 16 |
-
apply_local_fix,
|
| 17 |
get_default_workflow,
|
| 18 |
validate_entry,
|
| 19 |
)
|
|
@@ -99,12 +98,8 @@ def run_refcheck_file(file_path: str | Path, options: RefCheckOptions | None = N
|
|
| 99 |
for entry, best_result, candidates in analysis:
|
| 100 |
if not best_result:
|
| 101 |
actions[entry.key] = ("keep", None, [])
|
| 102 |
-
elif
|
| 103 |
actions[entry.key] = ("fix", best_result, candidates)
|
| 104 |
-
elif best_result.confidence > 0.85 and best_result.fetched_data:
|
| 105 |
-
actions[entry.key] = ("fix", best_result, candidates)
|
| 106 |
-
elif best_result.is_match:
|
| 107 |
-
actions[entry.key] = ("keep", best_result, candidates)
|
| 108 |
elif candidates:
|
| 109 |
actions[entry.key] = ("review", best_result, candidates)
|
| 110 |
else:
|
|
@@ -204,8 +199,20 @@ def preview_review_action(
|
|
| 204 |
return "Select a candidate first."
|
| 205 |
|
| 206 |
candidate = candidates[candidate_index]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
temp_entry = copy.deepcopy(entry)
|
| 208 |
-
changes = apply_fix(temp_entry, candidate.fetched_data)
|
| 209 |
if not changes:
|
| 210 |
changes = ["No field-level changes are needed for this candidate."]
|
| 211 |
|
|
@@ -258,7 +265,11 @@ def apply_review_action(
|
|
| 258 |
if candidate_index is None or candidate_index < 0 or candidate_index >= len(candidates):
|
| 259 |
raise ValueError("Select a candidate first.")
|
| 260 |
candidate = candidates[candidate_index]
|
| 261 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
changes.append(f"Resolved manually with candidate from {candidate.source}.")
|
| 263 |
result.fixed_details.setdefault(entry.key, []).extend(changes)
|
| 264 |
elif action == "remove":
|
|
@@ -280,6 +291,30 @@ def _find_entry(entries: list[BibEntry], key: str) -> BibEntry | None:
|
|
| 280 |
return None
|
| 281 |
|
| 282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
def _entry_preview_markdown(entry: BibEntry, title: str, lines: list[str]) -> str:
|
| 284 |
body = "\n".join(f"- {line}" for line in lines)
|
| 285 |
return (
|
|
@@ -360,20 +395,13 @@ def _apply_local_db(
|
|
| 360 |
if not local_db.is_loaded:
|
| 361 |
return False, entries, 0
|
| 362 |
|
| 363 |
-
api_entries = []
|
| 364 |
match_count = 0
|
| 365 |
for entry in entries:
|
| 366 |
official = local_db.lookup(entry.title)
|
| 367 |
-
if
|
| 368 |
-
|
| 369 |
-
continue
|
| 370 |
-
|
| 371 |
-
changes = apply_local_fix(entry, official)
|
| 372 |
-
match_count += 1
|
| 373 |
-
if changes:
|
| 374 |
-
fixed_details.setdefault(entry.key, []).extend(changes)
|
| 375 |
|
| 376 |
-
return True,
|
| 377 |
|
| 378 |
|
| 379 |
@lru_cache(maxsize=1)
|
|
|
|
| 13 |
|
| 14 |
from main import (
|
| 15 |
apply_fix,
|
|
|
|
| 16 |
get_default_workflow,
|
| 17 |
validate_entry,
|
| 18 |
)
|
|
|
|
| 98 |
for entry, best_result, candidates in analysis:
|
| 99 |
if not best_result:
|
| 100 |
actions[entry.key] = ("keep", None, [])
|
| 101 |
+
elif best_result.is_match and best_result.fetched_data:
|
| 102 |
actions[entry.key] = ("fix", best_result, candidates)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
elif candidates:
|
| 104 |
actions[entry.key] = ("review", best_result, candidates)
|
| 105 |
else:
|
|
|
|
| 199 |
return "Select a candidate first."
|
| 200 |
|
| 201 |
candidate = candidates[candidate_index]
|
| 202 |
+
if not _candidate_exact_match(candidate):
|
| 203 |
+
return _entry_preview_markdown(
|
| 204 |
+
entry,
|
| 205 |
+
"Candidate blocked",
|
| 206 |
+
[
|
| 207 |
+
"This candidate is not an exact title/author/year match, so RefCheck will not auto-apply it.",
|
| 208 |
+
f"Candidate source: {candidate.source}",
|
| 209 |
+
f"Candidate confidence: {candidate.confidence:.2f}",
|
| 210 |
+
*_candidate_issue_lines(candidate),
|
| 211 |
+
],
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
temp_entry = copy.deepcopy(entry)
|
| 215 |
+
changes = apply_fix(temp_entry, candidate.fetched_data, allow_optional_updates=True)
|
| 216 |
if not changes:
|
| 217 |
changes = ["No field-level changes are needed for this candidate."]
|
| 218 |
|
|
|
|
| 265 |
if candidate_index is None or candidate_index < 0 or candidate_index >= len(candidates):
|
| 266 |
raise ValueError("Select a candidate first.")
|
| 267 |
candidate = candidates[candidate_index]
|
| 268 |
+
if not _candidate_exact_match(candidate):
|
| 269 |
+
raise ValueError(
|
| 270 |
+
"Selected candidate is not an exact title/author/year match; RefCheck will not auto-overwrite core metadata."
|
| 271 |
+
)
|
| 272 |
+
changes = apply_fix(entry, candidate.fetched_data, allow_optional_updates=True)
|
| 273 |
changes.append(f"Resolved manually with candidate from {candidate.source}.")
|
| 274 |
result.fixed_details.setdefault(entry.key, []).extend(changes)
|
| 275 |
elif action == "remove":
|
|
|
|
| 291 |
return None
|
| 292 |
|
| 293 |
|
| 294 |
+
def _candidate_exact_match(candidate: Any) -> bool:
    """Report whether a review candidate matches exactly on all core fields.

    The candidate must be truthy, must have truthy ``is_match``,
    ``title_match``, ``author_match`` and ``year_match`` attributes, and must
    not have a truthy ``author_initial_conflict`` attribute. Attributes that
    are absent count as ``False``.

    Args:
        candidate: Comparison result object (or ``None``) selected for review.

    Returns:
        ``True`` only when all of the checks pass.
    """
    if not candidate:
        return False

    for flag in ("is_match", "title_match", "author_match", "year_match"):
        if not getattr(candidate, flag, False):
            return False

    # A detected initial conflict blocks the candidate regardless of flags.
    if getattr(candidate, "author_initial_conflict", False):
        return False
    return True
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def _candidate_issue_lines(candidate: Any) -> list[str]:
    """Build the "Blocking issue: ..." lines explaining why a candidate is blocked.

    Starts from the candidate's own ``issues`` (a missing or falsy value is
    treated as empty), then adds one message per core field whose
    ``title_match`` / ``author_match`` / ``year_match`` attribute is falsy or
    absent. The year message shows ``bib_year`` and ``fetched_year``, using
    ``[missing]`` for empty values. Duplicate messages are removed while
    keeping first-seen order.

    Args:
        candidate: Comparison result object to describe.

    Returns:
        The de-duplicated messages, each prefixed with ``"Blocking issue: "``.
    """
    reasons = [*(getattr(candidate, "issues", []) or [])]

    field_messages = (
        ("title_match", "Title is not an exact-enough match"),
        ("author_match", "Authors are not an exact-enough match"),
    )
    for flag, message in field_messages:
        if not getattr(candidate, flag, False):
            reasons.append(message)

    if not getattr(candidate, "year_match", False):
        bib = getattr(candidate, "bib_year", "") or "[missing]"
        fetched = getattr(candidate, "fetched_year", "") or "[missing]"
        reasons.append(f"Year mismatch: bib={bib}, candidate={fetched}")

    # dict.fromkeys acts as an ordered set, so repeated reasons appear once.
    unique_reasons = dict.fromkeys(reasons)
    return [f"Blocking issue: {reason}" for reason in unique_reasons]
|
| 316 |
+
|
| 317 |
+
|
| 318 |
def _entry_preview_markdown(entry: BibEntry, title: str, lines: list[str]) -> str:
|
| 319 |
body = "\n".join(f"- {line}" for line in lines)
|
| 320 |
return (
|
|
|
|
| 395 |
if not local_db.is_loaded:
|
| 396 |
return False, entries, 0
|
| 397 |
|
|
|
|
| 398 |
match_count = 0
|
| 399 |
for entry in entries:
|
| 400 |
official = local_db.lookup(entry.title)
|
| 401 |
+
if official:
|
| 402 |
+
match_count += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
|
| 404 |
+
return True, entries, match_count
|
| 405 |
|
| 406 |
|
| 407 |
@lru_cache(maxsize=1)
|