""" Skill Quality Metrics — Monitor Dictionary Health Track usage, coverage, and quality of the skill dictionary. """ from typing import Dict, List, Any from collections import Counter import logging logger = logging.getLogger(__name__) class SkillQualityAnalyzer: """Analyze and report on skill dictionary quality.""" def compute_metrics(self, db: Any) -> Dict: """Compute comprehensive skill quality metrics.""" try: from app.models.models import Skill, CandidateSkill except ImportError: logger.warning("Could not import models") return {} # Get all skills and usage all_skills = db.query(Skill).all() all_candidate_skills = db.query(CandidateSkill).all() # Compute metrics total_skills = len(all_skills) skill_usage = Counter([cs.skill.name for cs in all_candidate_skills if cs.skill]) unique_skills = len(skill_usage) average_usage = round((sum(skill_usage.values()) / max(1, unique_skills)), 2) # Find unused skills used_skill_names = set(skill_usage.keys()) unused_skills = [s.name for s in all_skills if s.name not in used_skill_names] # Compute coverage (Pareto) total_usage = sum(skill_usage.values()) cumulative = 0 skills_for_80_percent = 0 for skill, count in skill_usage.most_common(): cumulative += count skills_for_80_percent += 1 if cumulative >= total_usage * 0.80: break coverage_ratio = skills_for_80_percent / max(1, len(skill_usage)) coverage_percentage = round((unique_skills / max(1, total_skills)) * 100, 1) quality_score = self._compute_quality_score( total_skills, len(unused_skills), coverage_ratio ) if quality_score >= 85: health_status = "excellent" elif quality_score >= 70: health_status = "good" elif quality_score >= 50: health_status = "fair" else: health_status = "poor" pareto_analysis = { "skills_for_80_percent": skills_for_80_percent, "pareto_ratio": round(coverage_ratio, 2), "coverage_percent": coverage_percentage, } trending_missing = self.detect_trending_skills(db) return { "total_skills": total_skills, "skills_in_use": len(skill_usage), "unique_skills": unique_skills, "average_usage": average_usage, "coverage_percentage": coverage_percentage, "unused_skills_count": len(unused_skills), "unused_skills": unused_skills[:20], "unused_skills_list": unused_skills[:20], # Backward compatibility "most_used_skills": [ {"skill": name, "usage_count": count} for name, count in skill_usage.most_common(10) ], "coverage": { "total_usage_records": total_usage, "skills_for_80_percent": skills_for_80_percent, "pareto_ratio": round(coverage_ratio, 2), "coverage_percent": coverage_percentage, }, "quality_score": quality_score, "health_status": health_status, "pareto_analysis": pareto_analysis, "trending_missing": trending_missing, "recommendations": self._generate_recommendations( unused_skills, skill_usage, total_skills ), } def _compute_quality_score(self, total_skills: int, unused_count: int, coverage_ratio: float) -> float: """ Compute overall quality score (0-100). Factors: - Unused skills (penalty) - Coverage concentration (bonus if concentrated) """ # Start at 100 score = 100.0 # Penalize unused skills unused_ratio = unused_count / max(1, total_skills) score -= unused_ratio * 20 # Penalize if too scattered (good coverage is ~0.15-0.25) if coverage_ratio > 0.30: score -= (coverage_ratio - 0.30) * 10 # Bonus if well-concentrated if 0.10 <= coverage_ratio <= 0.25: score += 10 return max(0, min(100, round(score, 1))) def _generate_recommendations(self, unused_skills: List[str], skill_usage: Counter, total_skills: int) -> List[str]: """Generate actionable recommendations.""" recommendations = [] # Recommendation 1: Remove unused if len(unused_skills) > total_skills * 0.2: recommendations.append( f"🗑️ Remove {len(unused_skills)} unused skills to reduce clutter" ) # Recommendation 2: Add trending if skill_usage: top_skill_count = max(skill_usage.values()) if top_skill_count < 100: recommendations.append( "📈 Consider adding more high-demand skills (usage < 100 records)" ) # Recommendation 3: Balance if len(unused_skills) > 0: recommendations.append( f"⚖️ Review/consolidate remaining {len(unused_skills)} unused skills" ) if not recommendations: recommendations.append("✅ Skill dictionary in good health!") return recommendations def get_skill_health_report(self, db: Any) -> str: """Generate human-readable health report.""" metrics = self.compute_metrics(db) if not metrics: return "Unable to compute skill metrics" lines = [ "📊 SKILL DICTIONARY HEALTH REPORT", "=" * 50, "", f"Total Skills: {metrics['total_skills']}", f"In Active Use: {metrics['skills_in_use']}", f"Unused: {metrics['unused_skills_count']}", f"Quality Score: {metrics['quality_score']}/100", "", "🎯 Coverage (Pareto):", f" {metrics['coverage']['skills_for_80_percent']} skills cover 80% of usage", f" Coverage ratio: {metrics['coverage']['pareto_ratio']}", f" Dictionary utilization: {metrics['coverage']['coverage_percent']}%", "", "⭐ Top 5 Most Used:", ] for item in metrics.get('most_used_skills', [])[:5]: lines.append(f" • {item['skill']}: {item['usage_count']} uses") if len(metrics.get('unused_skills_list', [])) > 0: lines.append("") lines.append("⚠️ Unused Skills (sample):") for skill in metrics['unused_skills_list'][:5]: lines.append(f" • {skill}") if len(metrics.get('unused_skills_list', [])) > 5: lines.append(f" ... and {len(metrics['unused_skills_list']) - 5} more") lines.append("") lines.append("💡 Recommendations:") for rec in metrics.get('recommendations', []): lines.append(f" {rec}") return "\n".join(lines) def detect_trending_skills(self, db: Any, candidate_count_threshold: int = 2) -> List[str]: """Detect skills appearing frequently but not in our dict.""" try: from app.models.models import Skill, CandidateSkill except ImportError: return [] all_candidate_skills = db.query(CandidateSkill).all() dict_skills = {s.name.lower() for s in db.query(Skill).all()} # Count extracted skills extracted_skill_counts = Counter() for cs in all_candidate_skills: if cs.skill: extracted_skill_counts[cs.skill.name.lower()] += 1 # Find frequently used but possibly missing trending = [ skill for skill, count in extracted_skill_counts.items() if count >= candidate_count_threshold and skill not in dict_skills ] return sorted(trending, key=lambda s: extracted_skill_counts[s], reverse=True)