ai-talent-finder-backend / scripts /cleanup_test_profiles.py
ilyass yani
Deploiement backend dans HF Spaces
9df97a2
Raw
History Blame
3.34 kB
#!/usr/bin/env python3
"""Safe cleanup for test/demo candidate profiles.
Creates a backup JSON of affected candidates and removes their DB rows and uploaded CV files.
Run from project root with the same python env used for the backend.
Example:
DATABASE_URL=sqlite:///./ai_talent_finder.db .venv/bin/python backend/scripts/cleanup_test_profiles.py
"""
import os
import json
import sqlite3
from pathlib import Path
import shutil
DB_PATH = os.environ.get("DATABASE_URL", "sqlite:///./ai_talent_finder.db")
def sqlite_path_from_url(url: str) -> Path:
# supports sqlite:///./relative.db or sqlite:////abs/path.db
if url.startswith("sqlite:///"):
path = url.replace("sqlite:///", "")
return Path(path).resolve()
raise ValueError("Unsupported DB URL: " + url)
def main():
db_path = sqlite_path_from_url(DB_PATH)
if not db_path.exists():
print(f"DB not found: {db_path}")
return
out_dir = Path("archive") / "cleanups"
out_dir.mkdir(parents=True, exist_ok=True)
backup_file = out_dir / f"candidates_backup_{os.getpid()}.json"
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
cur = conn.cursor()
# Identify candidate rows to remove: low quality or placeholder names/emails
query = """
SELECT id, full_name, email, cv_path, extraction_quality_score, is_fully_extracted
FROM candidates
WHERE (extraction_quality_score IS NULL OR extraction_quality_score < 80)
OR lower(full_name) LIKE '%unknown%'
OR lower(full_name) LIKE '%test%'
OR lower(full_name) LIKE '%demo%'
OR lower(email) LIKE 'candidate-%'
"""
rows = cur.execute(query).fetchall()
rows_data = [dict(r) for r in rows]
if not rows_data:
print("No candidate rows matched the cleanup criteria.")
return
# Backup
with backup_file.open("w", encoding="utf-8") as f:
json.dump(rows_data, f, ensure_ascii=False, indent=2)
print(f"Backed up {len(rows_data)} candidates to {backup_file}")
# Remove uploaded CV files and related candidate_skill links if present
uploads_root = Path(__file__).resolve().parents[2] / "uploads"
for r in rows_data:
cv = r.get("cv_path")
if cv:
file_path = Path(__file__).resolve().parents[2] / cv
try:
if file_path.exists():
file_path.unlink()
print(f"Removed upload: {file_path}")
except Exception as e:
print(f"Failed to remove {file_path}: {e}")
# Delete candidates from DB (and cascade if FK constraints exist)
ids = [str(r["id"]) for r in rows_data]
id_list = ",".join(ids)
try:
cur.execute(f"PRAGMA foreign_keys = ON;")
cur.execute(f"DELETE FROM candidate_skills WHERE candidate_id IN ({id_list})")
cur.execute(f"DELETE FROM favorites WHERE candidate_id IN ({id_list})")
cur.execute(f"DELETE FROM match_results WHERE candidate_id IN ({id_list})")
cur.execute(f"DELETE FROM candidates WHERE id IN ({id_list})")
conn.commit()
print(f"Deleted {len(ids)} candidate rows from DB.")
except Exception as e:
conn.rollback()
print(f"DB deletion failed: {e}")
finally:
conn.close()
if __name__ == '__main__':
main()