#!/usr/bin/env python3
"""
PHASE 1 IMPLEMENTATION STARTER — Quick Wins

Three concrete implementations to get started right away:
1. Adaptive Thresholds (1-2 hours)
2. Smart Deduplication (1-2 hours)
3. Enhanced Explainability (2-3 hours)

Ready to copy-paste into your codebase.
"""

# ============================================================================
# 1. ADAPTIVE THRESHOLDS (backend/ai_module/matching/adaptive_thresholds.py)
# ============================================================================

import re
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple

class AdaptiveThresholdEngine:
    """
    Select matching thresholds based on the domain inferred from a job title.

    The domain is found by counting, for every entry of ``DOMAIN_KEYWORDS``,
    how many keywords occur in the title as whole words/phrases. The domain
    with the most hits wins (ties go to the domain declared first) and its
    entry in ``DOMAIN_CONFIG`` supplies the thresholds.

    Usage:
        engine = AdaptiveThresholdEngine()
        thresholds = engine.get_thresholds("Senior Data Scientist")
        # → {"accept": 0.75, "review": 0.45, "confidence": 0.9}
    """

    # Per-domain thresholds; "default" is used when no keyword matches.
    DOMAIN_CONFIG = {
        # Data Science: very high bar (rare talent)
        "data_science": {
            "accept": 0.75,
            "review": 0.45,
            "confidence": 0.90,
            "description": "High bar - specialized domain"
        },
        # Finance: safety/compliance first
        "finance": {
            "accept": 0.80,
            "review": 0.50,
            "confidence": 0.85,
            "description": "Very strict - compliance critical"
        },
        # Backend: mixed skills are acceptable
        "backend": {
            "accept": 0.70,
            "review": 0.40,
            "confidence": 0.85,
            "description": "Moderate - diverse tech ok"
        },
        # Frontend: creative + technical
        "frontend": {
            "accept": 0.70,
            "review": 0.38,
            "confidence": 0.80,
            "description": "Moderate - UX skills valuable"
        },
        # Startup: high flexibility
        "startup": {
            "accept": 0.60,
            "review": 0.30,
            "confidence": 0.75,
            "description": "Low bar - versatility valued"
        },
        # Product/PM: soft skills are important
        "product": {
            "accept": 0.65,
            "review": 0.35,
            "confidence": 0.80,
            "description": "Moderate - soft skills matter"
        },
        # Sales/Marketing: personality + skills
        "business": {
            "accept": 0.60,
            "review": 0.32,
            "confidence": 0.75,
            "description": "Low bar - personality critical"
        },
        # ML/AI: highly specialized
        "machine_learning": {
            "accept": 0.78,
            "review": 0.48,
            "confidence": 0.90,
            "description": "High bar - specialized"
        },
        # DevOps/Infrastructure
        "devops": {
            "accept": 0.72,
            "review": 0.42,
            "confidence": 0.85,
            "description": "High bar - reliability critical"
        },
        # Default fallback
        "default": {
            "accept": 0.80,
            "review": 0.50,
            "confidence": 0.80,
            "description": "Standard thresholds"
        }
    }

    # Lowercase keywords used for domain detection. Declaration order matters:
    # on a tie the domain listed first wins.
    DOMAIN_KEYWORDS = {
        "data_science": [
            "data scientist", "data science", "analytics", "statistical",
            "machine learning", "ml engineer", "data engineer", "big data"
        ],
        "finance": [
            "financial", "accountant", "trader", "analyst", "finance",
            "risk", "banking", "investment", "portfolio"
        ],
        "backend": [
            "backend", "server", "api", "python", "java", "golang",
            "infrastructure", "architect", "systems engineer", "performance"
        ],
        "frontend": [
            "frontend", "ui", "ux", "react", "vue", "angular",
            "web developer", "designer", "visual", "css", "javascript"
        ],
        "startup": [
            "startups", "founder", "early stage", "mvp", "bootstrapped",
            "rapid", "agile", "full stack", "jack of all trades"
        ],
        "product": [
            "product manager", "pm", "product owner", "po",
            "roadmap", "strategy", "vision", "user experience"
        ],
        "business": [
            "sales", "business development", "marketing", "bd",
            "account manager", "customer", "commercial", "partnership"
        ],
        "machine_learning": [
            "machine learning", "ml", "deep learning", "neural",
            "tensorflow", "pytorch", "ai engineer", "ai scientist"
        ],
        "devops": [
            "devops", "sre", "kubernetes", "docker", "infrastructure",
            "ci/cd", "deployment", "cloud", "aws", "gcp", "azure"
        ],
    }

    @staticmethod
    def _contains_keyword(text: str, keyword: str) -> bool:
        """Return True when *keyword* occurs in *text* as a whole word/phrase.

        A plain substring test lets short keywords fire inside unrelated
        words ("ui" in "recruiter", "po" in "support", "ml" in "html").
        Lookarounds are used instead of ``\\b`` so that keywords containing
        punctuation (e.g. "ci/cd") are still delimited correctly.
        """
        pattern = rf"(?<!\w){re.escape(keyword)}(?!\w)"
        return re.search(pattern, text) is not None

    def detect_domain(self, job_title: str) -> str:
        """
        Detect the job domain from its title.

        Args:
            job_title: e.g. "Senior Data Scientist"

        Returns:
            Domain slug, e.g. "data_science"; "default" when the title is
            empty or no keyword matches.
        """
        if not job_title:
            return "default"

        # Lowercase and collapse whitespace runs so multi-word keywords still
        # match titles such as "Data   Scientist".
        title = " ".join(job_title.lower().split())

        # Score each domain by the number of its keywords found in the title.
        domain_scores = {
            domain: sum(1 for kw in keywords if self._contains_keyword(title, kw))
            for domain, keywords in self.DOMAIN_KEYWORDS.items()
        }

        # max() returns the first maximal key, i.e. the earliest-declared
        # domain on ties; the default guards against an empty keyword table.
        best_domain = max(domain_scores, key=domain_scores.get, default="default")

        # No keyword matched at all: fall back to the default thresholds.
        if domain_scores.get(best_domain, 0) == 0:
            return "default"

        return best_domain

    def _config_for(self, job_title: str) -> Tuple[str, Dict]:
        """Return ``(domain, config)`` for *job_title*, falling back to "default"."""
        domain = self.detect_domain(job_title)
        config = self.DOMAIN_CONFIG.get(domain, self.DOMAIN_CONFIG["default"])
        return domain, config

    @staticmethod
    def _numeric_thresholds(config: Dict) -> Dict[str, float]:
        """Extract only the numeric thresholds from a config entry (no description)."""
        return {
            "accept": config["accept"],
            "review": config["review"],
            "confidence": config["confidence"],
        }

    def get_thresholds(self, job_title: str) -> Dict[str, float]:
        """
        Return the adaptive thresholds for a job.

        Args:
            job_title: Title of the job

        Returns:
            A new dict such as {"accept": 0.70, "review": 0.40, "confidence": 0.85}
        """
        _, config = self._config_for(job_title)
        return self._numeric_thresholds(config)

    def get_thresholds_with_explanation(self, job_title: str) -> Dict:
        """Return the thresholds together with the detected domain and its rationale.

        ``detected_at`` is the current UTC time as a naive ISO-8601 string
        (no offset suffix).
        """
        domain, config = self._config_for(job_title)

        return {
            "domain": domain,
            "job_title": job_title,
            "thresholds": self._numeric_thresholds(config),
            "rationale": config["description"],
            # datetime.utcnow() is deprecated; dropping tzinfo keeps the same
            # naive string format it used to produce.
            "detected_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        }


# ============================================================================
# 2. SMART DEDUPLICATION (backend/ai_module/nlp/smart_dedup.py)
# ============================================================================

from typing import List, Set
from numpy import ndarray
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class SmartSkillDeduplicator:
    """
    Deduplicate skill names: first case-insensitively on the exact string,
    then (only when an embedder is supplied) by semantic similarity.

    Example:
        dedup = SmartSkillDeduplicator()          # no embedder: string dedup only
        dedup.deduplicate(["Python", "python", "Python 3.11"])
        # → ["Python", "Python 3.11"]

        With an embedder, names whose embeddings are similar enough are merged
        and the longest name of each group is kept.
    """

    def __init__(self, embedder=None, similarity_threshold: float = 0.82):
        """
        Args:
            embedder: Object exposing ``encode(list_of_str)`` and returning one
                vector per string (presumably a SentenceTransformer — TODO
                confirm). When None, only exact string dedup is performed.
            similarity_threshold: Cosine similarity that must be strictly
                exceeded for two skills to be merged (0.0-1.0).
        """
        self.embedder = embedder
        self.similarity_threshold = similarity_threshold

    def deduplicate(self, skills: List[str]) -> List[str]:
        """
        Deduplicate a list of skills.

        Blank entries are dropped, surrounding whitespace is stripped, and
        names that differ only by case are collapsed onto the first spelling
        seen. If an embedder is configured, the survivors are additionally
        clustered by semantic similarity; if that step raises, a warning is
        printed and the string-level result is returned instead.

        Args:
            skills: e.g. ["Python", "python", "ML", "Machine Learning"]

        Returns:
            Deduplicated skills in first-seen order, in their original casing.
        """
        if not skills:
            return []

        # First pass: exact, case-insensitive dedup. Keep the first original
        # spelling for each lowercase key (dicts preserve insertion order).
        first_spelling = {}
        for raw in skills:
            cleaned = raw.strip()
            if cleaned:
                first_spelling.setdefault(cleaned.lower(), cleaned)

        normalized = list(first_spelling)           # lowercase keys, fed to the embedder
        originals = list(first_spelling.values())   # same order, original casing

        if len(normalized) <= 1 or not self.embedder:
            return originals

        # Second pass: semantic clustering. Cluster indices refer to
        # `normalized`, so canonical names must be picked from `originals`
        # (index-aligned with it), not from the raw input list.
        try:
            clusters = self._cluster_by_similarity(normalized)
            return self._extract_canonical(originals, clusters)
        except Exception as e:
            # Deliberate best-effort: fall back to the string-level result if
            # embedding or similarity computation fails.
            print(f"Warning: Embedding failed ({e}), using string dedup")
            return originals

    def _cluster_by_similarity(self, skills: List[str]) -> List[List[int]]:
        """
        Group skill indices by embedding similarity.

        Greedy, single pass: each not-yet-assigned skill seeds a cluster and
        absorbs every later unassigned skill whose similarity *to the seed*
        exceeds the threshold. Similarity is not propagated transitively
        between members.

        Returns:
            e.g. [[0, 1], [2, 3]] — indices into ``skills``; every index
            appears in exactly one cluster.
        """
        # One embedding per skill.
        embeddings = self.embedder.encode(skills)

        # Pairwise cosine similarity, shape (N, N).
        similarity_matrix = cosine_similarity(embeddings)

        clusters = []
        used = set()

        for i in range(len(skills)):
            if i in used:
                continue

            # Start a new cluster seeded by skill i.
            cluster = [i]
            used.add(i)

            # Absorb every later, still-unassigned skill close to the seed.
            for j in range(i + 1, len(skills)):
                if j in used:
                    continue

                if similarity_matrix[i][j] > self.similarity_threshold:
                    cluster.append(j)
                    used.add(j)

            clusters.append(cluster)

        return clusters

    def _extract_canonical(self, original_skills: List[str],
                          clusters: List[List[int]]) -> List[str]:
        """
        Pick one canonical name per cluster.

        Args:
            original_skills: Names index-aligned with the list that was
                clustered (cluster indices are looked up here).
            clusters: Index groups as returned by ``_cluster_by_similarity``.

        Returns:
            The longest name of each cluster (first one on ties), in cluster order.
        """
        # Heuristic: the longest name is treated as the most descriptive one.
        return [
            max((original_skills[i] for i in cluster), key=len)
            for cluster in clusters
        ]


# ============================================================================
# 3. ENHANCED EXPLAINABILITY (backend/ai_module/matching/explainability.py)
# ============================================================================

from datetime import datetime
from enum import Enum

class SkillMatchStatus(str, Enum):
    # Status attached to each skill entry in the skills breakdown. The `str`
    # mixin makes members compare equal to their plain string values.

    # The required skill was found among the candidate's skills.
    MATCHED = "matched"
    # The required skill was not found among the candidate's skills.
    MISSING = "missing"
    # Not assigned anywhere in this file; presumably marks extra,
    # non-required skills — TODO confirm.
    BONUS = "bonus"

class ExplainabilityEngine:
    """
    Explain a matching score component by component.

    ``explain_score`` returns a dict shaped like:
        {
            "timestamp": "...",
            "candidate": {"id": ..., "name": ..., "email": ...},
            "criteria": {"id": ..., "title": ...},
            "score": {"total": 0.847, "percentage": "84.7%", "components": {...}},
            "breakdown": {"skills": {...}, "semantic": {...}, ...},
            "insights": {"strengths": [...], "gaps": [...]},
            "decision": {"recommendation": "...", "rationale": "..."},
            "confidence": 0.88
        }
    """

    # Weight of each component in the overall score.
    WEIGHTS = {
        "skills": 0.50,
        "semantic": 0.20,
        "experience": 0.15,
        "education": 0.10,
        "bonus": 0.05,
    }

    # Decision cut-offs used when the caller does not supply thresholds.
    DEFAULT_ACCEPT_THRESHOLD = 0.80
    DEFAULT_REVIEW_THRESHOLD = 0.50

    def explain_score(self, candidate, criteria, total_score: float,
                      thresholds: Optional[Dict[str, float]] = None) -> Dict:
        """
        Build the full explanation for a score.

        Args:
            candidate: Candidate model instance (reads id, full_name, email,
                candidate_skills, years_experience, extracted_education).
            criteria: JobCriteria model instance (reads id, title,
                criteria_skills).
            total_score: Match score (0.0-1.0), computed by the caller.
            thresholds: Optional mapping with "accept" and/or "review"
                cut-offs (e.g. the dict returned by
                ``AdaptiveThresholdEngine.get_thresholds``). Missing keys fall
                back to the class defaults (0.80 / 0.50).

        Returns:
            Structured explanation (see class docstring).
        """
        # Per-component evaluations.
        skills_breakdown = self._explain_skills(candidate, criteria)
        semantic_breakdown = self._explain_semantic(candidate, criteria)
        experience_breakdown = self._explain_experience(candidate, criteria)
        education_breakdown = self._explain_education(candidate, criteria)

        # Strengths and gaps are derived from the skills breakdown only.
        strengths = self._identify_strengths(skills_breakdown)
        gaps = self._identify_gaps(skills_breakdown)

        # Recommendation, using caller-provided cut-offs when available.
        limits = thresholds or {}
        recommendation = self._recommend_action(
            total_score,
            accept_threshold=limits.get("accept", self.DEFAULT_ACCEPT_THRESHOLD),
            review_threshold=limits.get("review", self.DEFAULT_REVIEW_THRESHOLD),
        )

        confidence = self._calculate_confidence(
            candidate, criteria, skills_breakdown
        )

        return {
            # datetime.utcnow() is deprecated; dropping tzinfo keeps the same
            # naive ISO string it used to produce.
            "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
            "candidate": {
                "id": candidate.id,
                "name": candidate.full_name,
                "email": candidate.email,
            },
            "criteria": {
                "id": criteria.id,
                "title": criteria.title,
            },
            "score": {
                "total": round(total_score, 3),
                "percentage": f"{total_score*100:.1f}%",
                "components": {
                    "skills": round(skills_breakdown["score"], 3),
                    "semantic": round(semantic_breakdown["score"], 3),
                    "experience": round(experience_breakdown["score"], 3),
                    "education": round(education_breakdown["score"], 3),
                },
            },
            "breakdown": {
                "skills": skills_breakdown,
                "semantic": semantic_breakdown,
                "experience": experience_breakdown,
                "education": education_breakdown,
            },
            "insights": {
                "strengths": strengths,
                "gaps": gaps,
            },
            "decision": {
                "recommendation": recommendation["action"],
                "rationale": recommendation["rationale"],
            },
            "confidence": confidence,
        }

    def _explain_skills(self, candidate, criteria) -> Dict:
        """Detail which required skills the candidate has or lacks.

        Matching is a case-insensitive comparison of skill names. ``score`` is
        the unweighted fraction of criteria matched; each entry's
        ``contribution`` is its share of the total criterion weight scaled by
        ``WEIGHTS["skills"]``.
        """
        matched = []
        missing = []

        # Lowercased names of the candidate's skills, for O(1) membership tests.
        candidate_skill_names = {
            s.skill.name.lower()
            for s in candidate.candidate_skills
            if s.skill
        }

        # Only criteria that reference a skill are evaluated, so only they
        # take part in the weight normalisation. A missing weight counts as 0.
        scored_criteria = [c for c in criteria.criteria_skills if c.skill]
        # Fall back to 100 when the weights sum to zero to avoid dividing by zero.
        total_weight = sum((c.weight or 0) for c in scored_criteria) or 100

        for criterion in scored_criteria:
            skill_name = criterion.skill.name
            weight = criterion.weight or 0
            is_present = skill_name.lower() in candidate_skill_names

            contribution = (weight / total_weight) if is_present else 0

            skill_info = {
                "skill": skill_name,
                "weight": weight,
                "status": SkillMatchStatus.MATCHED if is_present else SkillMatchStatus.MISSING,
                "contribution": round(contribution * self.WEIGHTS["skills"], 3),
            }

            if is_present:
                matched.append(skill_info)
            else:
                missing.append(skill_info)

        evaluated = len(matched) + len(missing)
        score = len(matched) / max(1, evaluated)

        return {
            "score": score,
            "matched": matched,
            "missing": missing,
            "coverage": f"{len(matched)}/{evaluated} core skills",
            "summary": f"Matched {len(matched)}/{evaluated} required skills"
        }

    def _explain_semantic(self, candidate, criteria) -> Dict:
        """Semantic similarity between CV and job description.

        Placeholder: returns fixed example values and ignores both arguments.
        """
        return {
            "score": 0.75,
            "reason": "Strong alignment with job description keywords",
            "keywords_matched": ["python", "leadership", "frontend"],
            "keywords_missing": ["kubernetes"],
        }

    def _explain_experience(self, candidate, criteria) -> Dict:
        """Score experience as years/10, clamped to [0.0, 1.0].

        A missing ``years_experience`` is treated as 0; five or more years is
        labelled "Senior level".
        """
        years = candidate.years_experience or 0
        return {
            # Clamp both ends: >10 years caps at 1.0, bad negative data at 0.0.
            "score": min(max(years / 10.0, 0.0), 1.0),
            "years": years,
            "assessment": "Senior level" if years >= 5 else "Junior-Mid level",
        }

    def _explain_education(self, candidate, criteria) -> Dict:
        """Education evaluation.

        Placeholder: the score and assessment are fixed; only the degree text
        comes from the candidate.
        """
        return {
            "score": 0.8,
            "degree": candidate.extracted_education or "Not specified",
            "assessment": "Relevant background",
        }

    @staticmethod
    def _top_skills(entries: List[Dict], limit: int = 3) -> List[str]:
        """Format the *limit* heaviest skill entries as "name (weight%)".

        The sort is stable, so entries with equal weight keep their input order.
        """
        top = sorted(entries, key=lambda x: x["weight"], reverse=True)[:limit]
        return [f"{s['skill']} ({s['weight']}%)" for s in top]

    def _identify_strengths(self, skills_breakdown: Dict) -> List[str]:
        """Return up to three matched skills with the highest weight."""
        return self._top_skills(skills_breakdown.get("matched", []))

    def _identify_gaps(self, skills_breakdown: Dict) -> List[str]:
        """Return up to three missing skills with the highest weight."""
        return self._top_skills(skills_breakdown.get("missing", []))

    def _recommend_action(self, score: float,
                          accept_threshold: float = DEFAULT_ACCEPT_THRESHOLD,
                          review_threshold: float = DEFAULT_REVIEW_THRESHOLD) -> Dict:
        """Map a score to an action.

        Args:
            score: Match score.
            accept_threshold: Scores at or above this are accepted.
            review_threshold: Scores at or above this (but below accept) go
                to review; anything lower is passed on.

        Returns:
            Dict with "action", "rationale" and a textual "confidence".
        """
        if score >= accept_threshold:
            return {
                "action": "ACCEPT - Interview now",
                "rationale": "Strong match on core criteria",
                "confidence": "High"
            }
        if score >= review_threshold:
            return {
                "action": "REVIEW - Phone screen first",
                "rationale": "Good match but verify specific skills",
                "confidence": "Medium"
            }
        return {
            "action": "PASS - Not aligned",
            "rationale": "Missing too many core skills",
            "confidence": "High"
        }

    def _calculate_confidence(self, candidate, criteria, skills_breakdown: Dict) -> float:
        """Confidence in the scoring (0.0-1.0), rounded to two decimals.

        Starts at 0.8, is multiplied by 0.7 when fewer than three criteria
        skills exist, and by 1.1 (capped at 1.0) when every evaluated skill
        matched.
        """
        confidence = 0.8  # Base

        # Penalise when the criteria list is too short to be informative.
        if len(criteria.criteria_skills) < 3:
            confidence *= 0.7

        # Boost only when something was actually matched and nothing is
        # missing; an empty criteria list must not count as a "full match".
        if skills_breakdown.get("matched") and not skills_breakdown.get("missing"):
            confidence = min(confidence * 1.1, 1.0)

        return round(confidence, 2)


# ============================================================================
# USAGE EXAMPLES
# ============================================================================

if __name__ == "__main__":
    # Console demo of the three Phase 1 components.
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    print(heavy_rule)
    print("Phase 1 Implementation Examples")
    print(heavy_rule)

    # --- Demo 1: domain detection and the thresholds it selects ------------
    print("\n1️⃣ ADAPTIVE THRESHOLDS")
    print(light_rule)

    threshold_engine = AdaptiveThresholdEngine()

    sample_titles = (
        "Senior Data Scientist",
        "Financial Analyst",
        "Startup Full Stack Developer",
    )

    for title in sample_titles:
        report = threshold_engine.get_thresholds_with_explanation(title)
        limits = report["thresholds"]
        print(f"\nJob: {report['job_title']}")
        print(f"Domain: {report['domain']}")
        print(f"Thresholds: Accept={limits['accept']:.0%}, Review={limits['review']:.0%}")
        print(f"Rationale: {report['rationale']}")

    # --- Demo 2: deduplication without an embedder (string-level only) -----
    print("\n\n2️⃣ SMART DEDUPLICATION")
    print(light_rule)

    dedup = SmartSkillDeduplicator(similarity_threshold=0.82)

    skill_batches = (
        ["Python", "python", "python3"],
        ["JavaScript", "JS", "Node.js", "TypeScript"],
        ["Data Analysis", "Analytics", "Data Analytics"],
    )

    for batch in skill_batches:
        unique_skills = dedup.deduplicate(batch)
        print(f"\nInput: {batch}")
        print(f"Output: {unique_skills}")

    # --- Demo 3: static sample of an explanation payload --------------------
    print("\n\n3️⃣ ENHANCED EXPLAINABILITY")
    print(light_rule)
    print("\nExample output structure:")
    example_payload = """
{
  "timestamp": "2026-05-12T23:50:00.000000",
  "candidate": {
    "id": 1,
    "name": "Ahmed Ben",
    "email": "ahmed@example.com"
  },
  "score": {
    "total": 0.847,
    "percentage": "84.7%",
    "components": {
      "skills": 0.85,
      "semantic": 0.72,
      "experience": 0.9,
      "education": 0.8
    }
  },
  "insights": {
    "strengths": ["Python (25%)", "Leadership (20%)", "Cloud (15%)"],
    "gaps": ["Kubernetes (15%)", "DevOps (10%)"]
  },
  "decision": {
    "recommendation": "ACCEPT - Interview now",
    "rationale": "Strong match on core criteria",
    "confidence": 0.92
  }
}
    """
    print(example_payload)

    print("\n" + heavy_rule)
    print("✅ Phase 1 Examples Complete")
    print(heavy_rule)