voidful committed on
Commit
838da49
·
verified ·
1 Parent(s): a2ec1b5

Make RefCheck verification strict

Browse files
Files changed (4) hide show
  1. app.py +1 -1
  2. main.py +93 -55
  3. src/comparator.py +26 -4
  4. src/space_service.py +46 -18
app.py CHANGED
@@ -258,7 +258,7 @@ with gr.Blocks(title="RefCheck") as demo:
258
  review_action = gr.Radio(label="Candidate/action", choices=[])
259
  with gr.Row():
260
  test_button = gr.Button("Test selected")
261
- apply_button = gr.Button("Apply selected", variant="primary")
262
  review_preview = gr.Markdown()
263
 
264
  run_button.click(
 
258
  review_action = gr.Radio(label="Candidate/action", choices=[])
259
  with gr.Row():
260
  test_button = gr.Button("Test selected")
261
+ apply_button = gr.Button("Apply exact selected", variant="primary")
262
  review_preview = gr.Markdown()
263
 
264
  run_button.click(
main.py CHANGED
@@ -142,27 +142,16 @@ def run_fix_and_verify(bib_path: Path, workflow):
142
  local_db = LocalConferenceDB()
143
  local_db_loaded = local_db.load()
144
 
145
- api_needed_entries = entries # Default: all entries need API
146
  if local_db_loaded:
147
- api_needed_entries = []
148
  local_matched_count = 0
149
  for entry in entries:
150
  official = local_db.lookup(entry.title)
151
  if official:
152
- # Apply local DB fix
153
- changes = apply_local_fix(entry, official)
154
- if changes:
155
- local_matched_count += 1
156
- if entry.key not in fixed_details:
157
- fixed_details[entry.key] = []
158
- fixed_details[entry.key].extend(changes)
159
- fixed_count += 1
160
- else:
161
- api_needed_entries.append(entry)
162
 
163
  if local_matched_count > 0:
164
- print(f" 📚 Local DB matched: {local_matched_count}, API needed: {len(api_needed_entries)}")
165
- bib_parser.save_entries(str(bib_path), entries)
166
 
167
  # --- Phase 1: Analysis (API Fetch) ---
168
  analysis_results = []
@@ -195,13 +184,8 @@ def run_fix_and_verify(bib_path: Path, workflow):
195
  ok_entries.append(entry)
196
  continue
197
 
198
- # Entries flagged for forced API lookup (e.g., future year) always go to to_fix
199
- if getattr(entry, '_force_api_lookup', False) and best_result.fetched_data:
200
  to_fix.append((entry, best_result, candidates))
201
- elif best_result.confidence > 0.85 and best_result.fetched_data:
202
- to_fix.append((entry, best_result, candidates))
203
- elif best_result.is_match:
204
- ok_entries.append(entry)
205
  elif candidates:
206
  to_review.append((entry, best_result, candidates))
207
  else:
@@ -222,7 +206,7 @@ def run_fix_and_verify(bib_path: Path, workflow):
222
 
223
  # Process Fixes
224
  for entry, best_result, candidates in to_fix:
225
- changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates)
226
  if changes:
227
  fixed_count += 1
228
  fixed_details[entry.key] = changes
@@ -270,7 +254,10 @@ def run_fix_and_verify(bib_path: Path, workflow):
270
  idx = int(choice) - 1
271
  if 0 <= idx < len(candidates):
272
  selected = candidates[idx]
273
- changes = apply_fix(entry, selected.fetched_data)
 
 
 
274
  if changes:
275
  fixed_count += 1
276
  if entry.key not in fixed_details: fixed_details[entry.key] = []
@@ -337,19 +324,12 @@ def run_fix_and_verify(bib_path: Path, workflow):
337
 
338
  def apply_local_fix(entry, official) -> list:
339
  """
340
- Apply fixes from local conference DB (ground truth).
341
- Only updates year, booktitle, and entry type not authors or title,
342
- since DBLP data for those may have different formatting conventions.
343
  """
344
  changes = []
345
 
346
- # Year: conference year is ground truth
347
- if official.year and official.year != entry.year:
348
- year_int = int(official.year) if official.year.isdigit() else 0
349
- if 1950 <= year_int <= CURRENT_YEAR:
350
- changes.append(f"Year: {entry.year} -> {official.year} [local_db]")
351
- entry.year = official.year
352
-
353
  # Entry type upgrade: misc/article → inproceedings if booktitle exists
354
  if official.booktitle and entry.entry_type.lower() in ('misc', 'article'):
355
  old_type = entry.entry_type
@@ -378,8 +358,20 @@ def apply_local_fix(entry, official) -> list:
378
  return changes
379
 
380
 
381
- def apply_fix(entry, data, all_candidates=None) -> list:
382
- """Update entry metadata from fetched data. Returns list of changes strings."""
 
 
 
 
 
 
 
 
 
 
 
 
383
  changes = []
384
 
385
  # Helper to clean string
@@ -388,27 +380,29 @@ def apply_fix(entry, data, all_candidates=None) -> list:
388
  # Title
389
  new_title = clean(data.title)
390
  if new_title and new_title.lower() != entry.title.lower():
391
- changes.append(f"Title: {entry.title} -> {new_title}")
392
- entry.title = new_title
 
393
 
394
  # Year: Use resolve_year() if we have multiple candidates
395
- if all_candidates:
396
- best_year, year_src = resolve_year(all_candidates, bib_year=entry.year)
397
- if best_year and best_year != entry.year:
398
- if int(best_year) > CURRENT_YEAR:
399
- changes.append(f"⚠ Skip suspicious future year {best_year} from {year_src}")
400
- else:
401
- changes.append(f"Year: {entry.year} -> {best_year} [{year_src}]")
402
- entry.year = best_year
403
- else:
404
- # Single candidate fallback
405
- new_year = clean(getattr(data, 'year', ''))
406
- if new_year and new_year != entry.year:
407
- if new_year.isdigit() and int(new_year) > CURRENT_YEAR:
408
- changes.append(f"⚠ Skip suspicious future year {new_year}")
409
- else:
410
- changes.append(f"Year: {entry.year} -> {new_year}")
411
- entry.year = new_year
 
412
 
413
  # Author: Smart Merge Strategy
414
  # Check for author initial conflict first
@@ -419,7 +413,9 @@ def apply_fix(entry, data, all_candidates=None) -> list:
419
  has_initial_conflict = True
420
  break
421
 
422
- if has_initial_conflict:
 
 
423
  # Don't overwrite authors when initials conflict
424
  changes.append(f"⚠ Author initial conflict detected — preserving bib authors")
425
  else:
@@ -471,13 +467,24 @@ def apply_fix(entry, data, all_candidates=None) -> list:
471
  entry.author = new_author_str
472
 
473
  # Optional fields (doi, journal, etc.)
474
- if hasattr(data, 'doi') and data.doi and not entry.doi:
475
  changes.append(f"DOI: [Added] {data.doi}")
476
  entry.doi = data.doi
477
 
478
  return changes
479
 
480
 
 
 
 
 
 
 
 
 
 
 
 
481
  def validate_entry(entry, workflow, fetchers, comparator):
482
  """Validate a single entry against configured data sources. Returns (best_result, all_results)."""
483
  from src.utils import TextNormalizer
@@ -548,12 +555,43 @@ def validate_entry(entry, workflow, fetchers, comparator):
548
 
549
  if results:
550
  best = max(results, key=lambda r: r.confidence)
 
551
  return best, results
552
 
553
  # No results
554
  return comparator.create_unable_result(entry, "Not found in any data source"), []
555
 
556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
557
 
558
 
559
 
 
142
  local_db = LocalConferenceDB()
143
  local_db_loaded = local_db.load()
144
 
145
+ api_needed_entries = entries # Always verify against live/network sources.
146
  if local_db_loaded:
 
147
  local_matched_count = 0
148
  for entry in entries:
149
  official = local_db.lookup(entry.title)
150
  if official:
151
+ local_matched_count += 1
 
 
 
 
 
 
 
 
 
152
 
153
  if local_matched_count > 0:
154
+ print(f" 📚 Local DB matched: {local_matched_count}; still verifying all entries online")
 
155
 
156
  # --- Phase 1: Analysis (API Fetch) ---
157
  analysis_results = []
 
184
  ok_entries.append(entry)
185
  continue
186
 
187
+ if best_result.is_match and best_result.fetched_data:
 
188
  to_fix.append((entry, best_result, candidates))
 
 
 
 
189
  elif candidates:
190
  to_review.append((entry, best_result, candidates))
191
  else:
 
206
 
207
  # Process Fixes
208
  for entry, best_result, candidates in to_fix:
209
+ changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates, allow_optional_updates=True)
210
  if changes:
211
  fixed_count += 1
212
  fixed_details[entry.key] = changes
 
254
  idx = int(choice) - 1
255
  if 0 <= idx < len(candidates):
256
  selected = candidates[idx]
257
+ if not _candidate_exact_match(selected):
258
+ print("Cannot apply: selected candidate is not an exact title/author/year match.")
259
+ continue
260
+ changes = apply_fix(entry, selected.fetched_data, allow_optional_updates=True)
261
  if changes:
262
  fixed_count += 1
263
  if entry.key not in fixed_details: fixed_details[entry.key] = []
 
324
 
325
  def apply_local_fix(entry, official) -> list:
326
  """
327
+ Apply non-core fixes from local conference DB.
328
+ This never changes title, authors, or year; those fields define the
329
+ reference identity and must be verified against live metadata.
330
  """
331
  changes = []
332
 
 
 
 
 
 
 
 
333
  # Entry type upgrade: misc/article → inproceedings if booktitle exists
334
  if official.booktitle and entry.entry_type.lower() in ('misc', 'article'):
335
  old_type = entry.entry_type
 
358
  return changes
359
 
360
 
361
+ def apply_fix(
362
+ entry,
363
+ data,
364
+ all_candidates=None,
365
+ *,
366
+ allow_core_updates: bool = False,
367
+ allow_optional_updates: bool = False,
368
+ ) -> list:
369
+ """Update only safe metadata by default.
370
+
371
+ Core identity fields (title, author, year) are not overwritten unless
372
+ allow_core_updates=True. RefCheck should validate references, not transform
373
+ a nearby candidate into a different citation.
374
+ """
375
  changes = []
376
 
377
  # Helper to clean string
 
380
  # Title
381
  new_title = clean(data.title)
382
  if new_title and new_title.lower() != entry.title.lower():
383
+ if allow_core_updates:
384
+ changes.append(f"Title: {entry.title} -> {new_title}")
385
+ entry.title = new_title
386
 
387
  # Year: Use resolve_year() if we have multiple candidates
388
+ if allow_core_updates:
389
+ if all_candidates:
390
+ best_year, year_src = resolve_year(all_candidates, bib_year=entry.year)
391
+ if best_year and best_year != entry.year:
392
+ if int(best_year) > CURRENT_YEAR:
393
+ changes.append(f"⚠ Skip suspicious future year {best_year} from {year_src}")
394
+ else:
395
+ changes.append(f"Year: {entry.year} -> {best_year} [{year_src}]")
396
+ entry.year = best_year
397
+ else:
398
+ # Single candidate fallback
399
+ new_year = clean(getattr(data, 'year', ''))
400
+ if new_year and new_year != entry.year:
401
+ if new_year.isdigit() and int(new_year) > CURRENT_YEAR:
402
+ changes.append(f"⚠ Skip suspicious future year {new_year}")
403
+ else:
404
+ changes.append(f"Year: {entry.year} -> {new_year}")
405
+ entry.year = new_year
406
 
407
  # Author: Smart Merge Strategy
408
  # Check for author initial conflict first
 
413
  has_initial_conflict = True
414
  break
415
 
416
+ if not allow_core_updates:
417
+ pass
418
+ elif has_initial_conflict:
419
  # Don't overwrite authors when initials conflict
420
  changes.append(f"⚠ Author initial conflict detected — preserving bib authors")
421
  else:
 
467
  entry.author = new_author_str
468
 
469
  # Optional fields (doi, journal, etc.)
470
+ if allow_optional_updates and hasattr(data, 'doi') and data.doi and not entry.doi:
471
  changes.append(f"DOI: [Added] {data.doi}")
472
  entry.doi = data.doi
473
 
474
  return changes
475
 
476
 
477
+ def _candidate_exact_match(candidate) -> bool:
478
+ return bool(
479
+ candidate
480
+ and getattr(candidate, "is_match", False)
481
+ and getattr(candidate, "title_match", False)
482
+ and getattr(candidate, "author_match", False)
483
+ and getattr(candidate, "year_match", False)
484
+ and not getattr(candidate, "author_initial_conflict", False)
485
+ )
486
+
487
+
488
  def validate_entry(entry, workflow, fetchers, comparator):
489
  """Validate a single entry against configured data sources. Returns (best_result, all_results)."""
490
  from src.utils import TextNormalizer
 
555
 
556
  if results:
557
  best = max(results, key=lambda r: r.confidence)
558
+ _apply_cross_source_conflict_guard(best, results)
559
  return best, results
560
 
561
  # No results
562
  return comparator.create_unable_result(entry, "Not found in any data source"), []
563
 
564
 
565
+ def _apply_cross_source_conflict_guard(best, results) -> None:
566
+ """Reject candidates when exact-title sources disagree on core metadata."""
567
+ if not best or not getattr(best, "fetched_title", ""):
568
+ return
569
+
570
+ conflicts = []
571
+ for result in results:
572
+ if result is best:
573
+ continue
574
+ if getattr(result, "title_similarity", 0.0) < 0.95:
575
+ continue
576
+
577
+ best_year = str(getattr(best, "fetched_year", "") or "").strip()
578
+ other_year = str(getattr(result, "fetched_year", "") or "").strip()
579
+ if best_year and other_year and best_year != other_year:
580
+ conflicts.append(f"{result.source}={other_year}")
581
+
582
+ if not conflicts:
583
+ return
584
+
585
+ issue = (
586
+ f"Cross-source year conflict: best {best.source}={best.fetched_year}, "
587
+ f"also found {'; '.join(dict.fromkeys(conflicts))}"
588
+ )
589
+ if issue not in best.issues:
590
+ best.issues.append(issue)
591
+ best.is_match = False
592
+ best.confidence = min(best.confidence, 0.8)
593
+
594
+
595
 
596
 
597
 
src/comparator.py CHANGED
@@ -169,6 +169,21 @@ class MetadataComparator:
169
 
170
  author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
171
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  if not author_match:
174
  issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")
@@ -176,19 +191,26 @@ class MetadataComparator:
176
  # --- Year Comparison ---
177
  bib_year = str(bib_entry.year).strip()
178
  fetched_year = str(getattr(fetched_data, 'year', '')).strip()
179
- year_match = bib_year == fetched_year
180
-
181
- if not year_match and bib_year and fetched_year:
 
 
 
 
182
  issues.append(f"Year mismatch: bib={bib_year}, {source_name}={fetched_year}")
183
 
184
  # --- Overall Assessment ---
185
- is_match = title_match and author_match
186
  # Simple weighted confidence score
187
  confidence = (
188
  title_similarity * 0.5 +
189
  author_similarity * 0.3 +
190
  (1.0 if year_match else 0.5) * 0.2
191
  )
 
 
 
192
 
193
  # --- Author Initial Conflict Detection ---
194
  author_initial_conflict = self._check_author_initial_conflict(
 
169
 
170
  author_similarity = self._compare_author_lists(bib_authors, fetched_authors)
171
  author_match = author_similarity >= self.AUTHOR_THRESHOLD
172
+
173
+ allows_truncated_authors = any(
174
+ token in str(raw_author).lower()
175
+ for raw_author in raw_author_list
176
+ for token in ("others", "et al")
177
+ )
178
+ if (
179
+ author_match
180
+ and bib_authors
181
+ and fetched_authors
182
+ and len(bib_authors) != len(fetched_authors)
183
+ and not allows_truncated_authors
184
+ ):
185
+ author_match = False
186
+ issues.append(f"Author count mismatch: bib={len(bib_authors)}, fetched={len(fetched_authors)}")
187
 
188
  if not author_match:
189
  issues.append(f"Author mismatch (similarity: {author_similarity:.2%})")
 
191
  # --- Year Comparison ---
192
  bib_year = str(bib_entry.year).strip()
193
  fetched_year = str(getattr(fetched_data, 'year', '')).strip()
194
+ year_match = bool(bib_year and fetched_year and bib_year == fetched_year)
195
+
196
+ if not bib_year:
197
+ issues.append("Missing year in BibTeX entry")
198
+ elif not fetched_year:
199
+ issues.append(f"Missing year from {source_name} metadata")
200
+ elif not year_match:
201
  issues.append(f"Year mismatch: bib={bib_year}, {source_name}={fetched_year}")
202
 
203
  # --- Overall Assessment ---
204
+ is_match = title_match and author_match and year_match
205
  # Simple weighted confidence score
206
  confidence = (
207
  title_similarity * 0.5 +
208
  author_similarity * 0.3 +
209
  (1.0 if year_match else 0.5) * 0.2
210
  )
211
+ if not year_match:
212
+ # A title/author match with the wrong year is not safe enough to auto-fix.
213
+ confidence = min(confidence, 0.8)
214
 
215
  # --- Author Initial Conflict Detection ---
216
  author_initial_conflict = self._check_author_initial_conflict(
src/space_service.py CHANGED
@@ -13,7 +13,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
13
 
14
  from main import (
15
  apply_fix,
16
- apply_local_fix,
17
  get_default_workflow,
18
  validate_entry,
19
  )
@@ -99,12 +98,8 @@ def run_refcheck_file(file_path: str | Path, options: RefCheckOptions | None = N
99
  for entry, best_result, candidates in analysis:
100
  if not best_result:
101
  actions[entry.key] = ("keep", None, [])
102
- elif getattr(entry, "_force_api_lookup", False) and best_result.fetched_data:
103
  actions[entry.key] = ("fix", best_result, candidates)
104
- elif best_result.confidence > 0.85 and best_result.fetched_data:
105
- actions[entry.key] = ("fix", best_result, candidates)
106
- elif best_result.is_match:
107
- actions[entry.key] = ("keep", best_result, candidates)
108
  elif candidates:
109
  actions[entry.key] = ("review", best_result, candidates)
110
  else:
@@ -204,8 +199,20 @@ def preview_review_action(
204
  return "Select a candidate first."
205
 
206
  candidate = candidates[candidate_index]
 
 
 
 
 
 
 
 
 
 
 
 
207
  temp_entry = copy.deepcopy(entry)
208
- changes = apply_fix(temp_entry, candidate.fetched_data)
209
  if not changes:
210
  changes = ["No field-level changes are needed for this candidate."]
211
 
@@ -258,7 +265,11 @@ def apply_review_action(
258
  if candidate_index is None or candidate_index < 0 or candidate_index >= len(candidates):
259
  raise ValueError("Select a candidate first.")
260
  candidate = candidates[candidate_index]
261
- changes = apply_fix(entry, candidate.fetched_data)
 
 
 
 
262
  changes.append(f"Resolved manually with candidate from {candidate.source}.")
263
  result.fixed_details.setdefault(entry.key, []).extend(changes)
264
  elif action == "remove":
@@ -280,6 +291,30 @@ def _find_entry(entries: list[BibEntry], key: str) -> BibEntry | None:
280
  return None
281
 
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  def _entry_preview_markdown(entry: BibEntry, title: str, lines: list[str]) -> str:
284
  body = "\n".join(f"- {line}" for line in lines)
285
  return (
@@ -360,20 +395,13 @@ def _apply_local_db(
360
  if not local_db.is_loaded:
361
  return False, entries, 0
362
 
363
- api_entries = []
364
  match_count = 0
365
  for entry in entries:
366
  official = local_db.lookup(entry.title)
367
- if not official:
368
- api_entries.append(entry)
369
- continue
370
-
371
- changes = apply_local_fix(entry, official)
372
- match_count += 1
373
- if changes:
374
- fixed_details.setdefault(entry.key, []).extend(changes)
375
 
376
- return True, api_entries, match_count
377
 
378
 
379
  @lru_cache(maxsize=1)
 
13
 
14
  from main import (
15
  apply_fix,
 
16
  get_default_workflow,
17
  validate_entry,
18
  )
 
98
  for entry, best_result, candidates in analysis:
99
  if not best_result:
100
  actions[entry.key] = ("keep", None, [])
101
+ elif best_result.is_match and best_result.fetched_data:
102
  actions[entry.key] = ("fix", best_result, candidates)
 
 
 
 
103
  elif candidates:
104
  actions[entry.key] = ("review", best_result, candidates)
105
  else:
 
199
  return "Select a candidate first."
200
 
201
  candidate = candidates[candidate_index]
202
+ if not _candidate_exact_match(candidate):
203
+ return _entry_preview_markdown(
204
+ entry,
205
+ "Candidate blocked",
206
+ [
207
+ "This candidate is not an exact title/author/year match, so RefCheck will not auto-apply it.",
208
+ f"Candidate source: {candidate.source}",
209
+ f"Candidate confidence: {candidate.confidence:.2f}",
210
+ *_candidate_issue_lines(candidate),
211
+ ],
212
+ )
213
+
214
  temp_entry = copy.deepcopy(entry)
215
+ changes = apply_fix(temp_entry, candidate.fetched_data, allow_optional_updates=True)
216
  if not changes:
217
  changes = ["No field-level changes are needed for this candidate."]
218
 
 
265
  if candidate_index is None or candidate_index < 0 or candidate_index >= len(candidates):
266
  raise ValueError("Select a candidate first.")
267
  candidate = candidates[candidate_index]
268
+ if not _candidate_exact_match(candidate):
269
+ raise ValueError(
270
+ "Selected candidate is not an exact title/author/year match; RefCheck will not auto-overwrite core metadata."
271
+ )
272
+ changes = apply_fix(entry, candidate.fetched_data, allow_optional_updates=True)
273
  changes.append(f"Resolved manually with candidate from {candidate.source}.")
274
  result.fixed_details.setdefault(entry.key, []).extend(changes)
275
  elif action == "remove":
 
291
  return None
292
 
293
 
294
+ def _candidate_exact_match(candidate: Any) -> bool:
295
+ return bool(
296
+ candidate
297
+ and getattr(candidate, "is_match", False)
298
+ and getattr(candidate, "title_match", False)
299
+ and getattr(candidate, "author_match", False)
300
+ and getattr(candidate, "year_match", False)
301
+ and not getattr(candidate, "author_initial_conflict", False)
302
+ )
303
+
304
+
305
+ def _candidate_issue_lines(candidate: Any) -> list[str]:
306
+ lines = list(getattr(candidate, "issues", []) or [])
307
+ if not getattr(candidate, "title_match", False):
308
+ lines.append("Title is not an exact-enough match")
309
+ if not getattr(candidate, "author_match", False):
310
+ lines.append("Authors are not an exact-enough match")
311
+ if not getattr(candidate, "year_match", False):
312
+ bib_year = getattr(candidate, "bib_year", "") or "[missing]"
313
+ fetched_year = getattr(candidate, "fetched_year", "") or "[missing]"
314
+ lines.append(f"Year mismatch: bib={bib_year}, candidate={fetched_year}")
315
+ return [f"Blocking issue: {line}" for line in dict.fromkeys(lines)]
316
+
317
+
318
  def _entry_preview_markdown(entry: BibEntry, title: str, lines: list[str]) -> str:
319
  body = "\n".join(f"- {line}" for line in lines)
320
  return (
 
395
  if not local_db.is_loaded:
396
  return False, entries, 0
397
 
 
398
  match_count = 0
399
  for entry in entries:
400
  official = local_db.lookup(entry.title)
401
+ if official:
402
+ match_count += 1
 
 
 
 
 
 
403
 
404
+ return True, entries, match_count
405
 
406
 
407
  @lru_cache(maxsize=1)