"""
Indonesian Skill Extractor Model v1.0
A rule-based NER system for extracting technical and soft skills from Indonesian job postings

Author: Herlambang Haryo Putro
GitHub: https://github.com/herlambangharyoputro
License: MIT
Repository: https://github.com/herlambangharyoputro/job-market-intelligence-platform
HuggingFace: https://huggingface.co/herlambangharyoputro/indonesian-skill-extractor-v1
"""

import re
import json
from typing import Dict, List, Any, Optional, Tuple
from collections import Counter


class IndonesianSkillExtractor:
    """
    Rule-based skill extraction model for Indonesian job postings

    Features:
    - Splits free text into skill candidates on common list delimiters
    - Normalizes candidates (lower-casing, punctuation cleanup, aliases)
    - Categorizes skills into 7 categories (anything else is 'other')
    - Handles Indonesian language keywords
    - Proficiency level detection
    - Per-category grouping and unique-skill counting

    Categories:
    - programming: Programming languages
    - frontend: Frontend frameworks and tools
    - backend: Backend frameworks
    - database: Database systems
    - cloud: Cloud platforms and DevOps
    - data_science: Data science and ML tools
    - soft_skills: Soft and interpersonal skills
    """

    VERSION = "1.0.0"

    # Characters that separate one skill from the next in free text.
    _DELIMITER_RE = re.compile(r'[,;•·\n|/]')
    # Parenthesised / bracketed remarks such as "(Advanced)" or "[required]".
    _PAREN_RE = re.compile(r'\([^)]*\)')
    _BRACKET_RE = re.compile(r'\[[^\]]*\]')
    # Experience durations such as "3 years", "2+ yrs", "5 tahun".
    # The plural "s" is part of the match so no stray letter is left behind.
    _EXPERIENCE_RE = re.compile(r'\d+\+?\s*(?:years?|yrs?|tahun)\b', re.IGNORECASE)
    # Leading list markers: "1." / "2)" or a "-" / "*" bullet.
    _LIST_MARKER_RE = re.compile(r'^\s*(?:\d+[.)]\s*|[-*]\s+)')
    # Everything except word characters, whitespace and the punctuation that
    # appears inside taxonomy entries ("node.js", "c++", "c#", "scikit-learn").
    _DISALLOWED_CHARS_RE = re.compile(r'[^\w\s.+#-]')
    # Hyphens that do not join two word characters (bullets, dashes).
    _STRAY_HYPHEN_RE = re.compile(r'(?<!\w)-|-(?!\w)')
    # Punctuation trimmed from the edges of a token before phrase matching.
    _TOKEN_EDGE_CHARS = ',;:!?()[]{}"\''

    def __init__(self):
        """Initialize the skill extractor with skill taxonomy"""
        self.skill_categories = self._load_skill_taxonomy()
        self.skill_aliases = self._load_skill_aliases()
        self.proficiency_keywords = self._load_proficiency_keywords()

    def _load_skill_taxonomy(self) -> Dict[str, set]:
        """
        Load the built-in skill taxonomy

        Returns:
            Mapping of category name to the set of lower-case skill names
            belonging to that category
        """
        return {
            'programming': {
                # Languages
                'python', 'java', 'javascript', 'typescript', 'php', 'ruby',
                'c', 'c++', 'c#', 'go', 'golang', 'rust', 'swift', 'kotlin',
                'scala', 'perl', 'r', 'matlab', 'dart', 'elixir', 'haskell',
                'clojure', 'f#', 'groovy', 'lua', 'assembly', 'shell', 'bash',
                'powershell', 'vba', 'fortran', 'cobol', 'pascal', 'delphi',
            },
            'frontend': {
                # Frameworks & Libraries
                'html', 'html5', 'css', 'css3', 'react', 'reactjs', 'react.js',
                'vue', 'vuejs', 'vue.js', 'angular', 'angularjs', 'svelte',
                'next.js', 'nextjs', 'nuxt', 'nuxtjs', 'gatsby', 'remix',
                # UI Libraries
                'jquery', 'bootstrap', 'tailwind', 'tailwindcss', 'material-ui',
                'mui', 'ant design', 'chakra ui', 'styled-components',
                # Build Tools
                'sass', 'scss', 'less', 'webpack', 'vite', 'parcel', 'rollup',
                'babel', 'postcss',
                # Testing
                'jest', 'enzyme', 'cypress', 'playwright', 'selenium',
            },
            'backend': {
                # Frameworks
                'node.js', 'nodejs', 'express', 'expressjs', 'nest.js', 'nestjs',
                'django', 'flask', 'fastapi', 'tornado', 'pyramid',
                'laravel', 'symfony', 'codeigniter', 'slim', 'lumen',
                'spring', 'spring boot', 'hibernate', 'struts',
                'rails', 'ruby on rails', 'sinatra',
                '.net', 'asp.net', '.net core', 'entity framework',
                'gin', 'echo', 'fiber',
                # API
                'rest', 'restful', 'graphql', 'grpc', 'soap', 'api',
            },
            'database': {
                # SQL
                'mysql', 'postgresql', 'postgres', 'sql server', 'mssql',
                'oracle', 'oracle db', 'mariadb', 'sqlite', 'db2',
                # NoSQL
                'mongodb', 'mongo', 'redis', 'cassandra', 'couchdb',
                'dynamodb', 'neo4j', 'elasticsearch', 'solr',
                # Cloud DB
                'firestore', 'firebase', 'supabase', 'planetscale',
                # ORMs
                'sequelize', 'typeorm', 'prisma', 'mongoose',
                # Query Languages
                'sql', 'nosql', 'plsql', 't-sql',
            },
            'cloud': {
                # Cloud Providers
                'aws', 'amazon web services', 'azure', 'microsoft azure',
                'gcp', 'google cloud', 'google cloud platform',
                'alibaba cloud', 'oracle cloud', 'ibm cloud', 'digital ocean',
                # Containers & Orchestration
                'docker', 'kubernetes', 'k8s', 'podman', 'containerd',
                'docker compose', 'docker swarm', 'helm', 'rancher',
                # CI/CD
                'jenkins', 'gitlab ci', 'github actions', 'circle ci',
                'travis ci', 'bamboo', 'teamcity', 'argo cd', 'flux',
                # IaC
                'terraform', 'ansible', 'puppet', 'chef', 'cloudformation',
                'pulumi', 'vagrant',
                # Monitoring
                'prometheus', 'grafana', 'elk', 'elastic stack', 'datadog',
                'new relic', 'splunk', 'nagios',
                # Service Mesh
                'istio', 'linkerd', 'consul',
            },
            'data_science': {
                # Python Libraries
                'pandas', 'numpy', 'scipy', 'matplotlib', 'seaborn',
                'plotly', 'bokeh',
                # ML Frameworks
                'tensorflow', 'keras', 'pytorch', 'scikit-learn', 'sklearn',
                'xgboost', 'lightgbm', 'catboost',
                # Deep Learning
                'transformers', 'hugging face', 'langchain', 'llama',
                # BI Tools
                'tableau', 'power bi', 'looker', 'metabase', 'superset',
                'qlik', 'sisense',
                # Big Data
                'spark', 'pyspark', 'hadoop', 'hive', 'pig', 'kafka',
                'flink', 'storm', 'airflow',
                # Notebooks
                'jupyter', 'colab', 'databricks',
            },
            'soft_skills': {
                # Communication
                'communication', 'komunikasi', 'public speaking',
                'presentation', 'presentasi', 'writing', 'menulis',
                # Collaboration
                'teamwork', 'kerja sama', 'kerja sama tim', 'collaboration',
                'kolaborasi', 'interpersonal',
                # Leadership
                'leadership', 'kepemimpinan', 'management', 'manajemen',
                'mentoring', 'coaching',
                # Problem Solving
                'problem solving', 'analytical', 'analitis', 'analytical thinking',
                'berpikir analitis', 'critical thinking', 'berpikir kritis',
                'creativity', 'kreativitas', 'innovation', 'inovasi',
                # Organization
                'organization', 'organizational', 'time management',
                'manajemen waktu', 'planning', 'perencanaan',
                # Adaptability
                'adaptability', 'adaptasi', 'flexibility', 'fleksibilitas',
                'learning agility', 'fast learner', 'cepat belajar',
            }
        }

    def _load_skill_aliases(self) -> Dict[str, str]:
        """
        Load skill aliases for normalization

        Returns:
            Mapping of lower-case alias to its canonical skill name
        """
        return {
            # Programming
            'js': 'javascript',
            'ts': 'typescript',
            'py': 'python',
            # Frontend
            'react.js': 'react',
            'reactjs': 'react',
            'vue.js': 'vue',
            'vuejs': 'vue',
            'next.js': 'nextjs',
            'nuxt.js': 'nuxtjs',
            # Backend
            'node': 'node.js',
            'nodejs': 'node.js',
            'nest': 'nest.js',
            'nestjs': 'nest.js',
            'express.js': 'express',
            'expressjs': 'express',
            'django rest framework': 'django',
            'drf': 'django',
            # Database
            'pg': 'postgresql',
            'postgres': 'postgresql',
            'mongo': 'mongodb',
            'psql': 'postgresql',
            # Cloud
            'k8s': 'kubernetes',
            'aws lambda': 'aws',
            'ec2': 'aws',
            's3': 'aws',
            'rds': 'aws',
            # Data Science
            'sklearn': 'scikit-learn',
            'tf': 'tensorflow',
            'hf': 'hugging face',
        }

    def _load_proficiency_keywords(self) -> Dict[str, List[str]]:
        """
        Load proficiency level keywords

        Returns:
            Mapping of proficiency level to the English and Indonesian
            keywords that signal it. Levels are checked in this order.
        """
        return {
            'expert': ['expert', 'advanced', 'mahir', 'ahli', 'mastery', 'proficient'],
            'intermediate': ['intermediate', 'menengah', 'cukup', 'moderate', 'competent'],
            'beginner': ['beginner', 'basic', 'pemula', 'dasar', 'fundamental', 'novice']
        }

    def _find_category(self, name: str) -> Optional[str]:
        """Return the first category whose skill set contains `name` exactly, else None"""
        for category, skill_set in self.skill_categories.items():
            if name in skill_set:
                return category
        return None

    @classmethod
    def _tokenize(cls, text: str) -> List[str]:
        """Split text on whitespace and trim edge punctuation from each token"""
        tokens = []
        for raw_token in text.split():
            # Trailing dots are sentence punctuation; leading dots are kept
            # because they are significant in names such as ".net".
            token = raw_token.strip(cls._TOKEN_EDGE_CHARS).rstrip('.')
            if token:
                tokens.append(token)
        return tokens

    def _split_segments(self, text: str) -> List[Tuple[str, str]]:
        """
        Split text on delimiters and clean each piece

        Returns:
            List of (raw_segment, cleaned_skill) pairs. The raw segment still
            contains parenthesised remarks, so proficiency hints such as
            "(Advanced)" remain available to the caller.
        """
        if not text or not isinstance(text, str):
            return []

        pairs = []
        for segment in self._DELIMITER_RE.split(text):
            cleaned = self._PAREN_RE.sub('', segment)
            cleaned = self._BRACKET_RE.sub('', cleaned)
            cleaned = self._EXPERIENCE_RE.sub('', cleaned)
            cleaned = self._LIST_MARKER_RE.sub('', cleaned)
            # Collapse the gaps left by the removals above.
            cleaned = ' '.join(cleaned.split())

            # Single characters are normally noise, but the taxonomy contains
            # one-letter skills ("c", "r") that must survive.
            if len(cleaned) > 1 or self._find_category(cleaned.lower()) is not None:
                pairs.append((segment.strip(), cleaned))

        return pairs

    def parse_skills_from_text(self, text: str) -> List[str]:
        """
        Parse skills from text by splitting on delimiters

        Each piece has parenthesised/bracketed remarks, experience durations
        ("3 years", "2 tahun") and leading list markers removed. Pieces that
        end up empty, or that are a single character which is not a known
        skill, are dropped.

        Args:
            text: Raw skill text (e.g., "Python, React, MySQL")

        Returns:
            List of individual skill strings (empty for empty or non-string input)
        """
        return [cleaned for _, cleaned in self._split_segments(text)]

    def normalize_skill(self, skill: str) -> str:
        """
        Normalize skill name using aliases

        The name is lower-cased, stripped of punctuation other than the
        characters used inside skill names (".", "+", "#" and hyphens joining
        two word characters), whitespace-collapsed, and finally mapped through
        the alias table.

        Args:
            skill: Raw skill name

        Returns:
            Normalized skill name
        """
        skill_lower = self._DISALLOWED_CHARS_RE.sub('', skill.lower())
        # Keep "scikit-learn" intact but turn bullet-style dashes into spaces.
        skill_lower = self._STRAY_HYPHEN_RE.sub(' ', skill_lower)
        # Collapse whitespace and drop sentence-ending dots only after the
        # removals, so they cannot leave stray spaces behind.
        skill_lower = ' '.join(skill_lower.split()).rstrip(' .')

        return self.skill_aliases.get(skill_lower, skill_lower)

    def categorize_skill(self, skill: str) -> str:
        """
        Categorize a skill into one of 7 categories

        Matching is attempted in three steps:
        1. exact match against the taxonomy;
        2. the longest known skill that appears in `skill` as a whole-word
           phrase (e.g. "advanced react" -> frontend);
        3. `skill` appearing as a whole-word phrase inside a longer known
           skill (e.g. "google" -> cloud via "google cloud").
        Whole-word matching prevents one-letter skills such as "c" or "r"
        from matching arbitrary text.

        Args:
            skill: Normalized skill name

        Returns:
            Category name or 'other'
        """
        skill_lower = skill.lower().strip()
        if not skill_lower:
            return 'other'

        # Step 1: direct match
        category = self._find_category(skill_lower)
        if category is not None:
            return category

        tokens = self._tokenize(skill_lower)
        if not tokens:
            return 'other'

        # Step 2: known skill contained in the text, longest phrase first
        longest_known = max(
            (len(known.split())
             for skill_set in self.skill_categories.values()
             for known in skill_set),
            default=1,
        )
        for size in range(min(len(tokens), longest_known), 0, -1):
            for start in range(len(tokens) - size + 1):
                category = self._find_category(' '.join(tokens[start:start + size]))
                if category is not None:
                    return category

        # Step 3: text contained in a longer known skill
        width = len(tokens)
        for category, skill_set in self.skill_categories.items():
            for known in skill_set:
                known_tokens = known.split()
                if len(known_tokens) <= width:
                    continue
                for start in range(len(known_tokens) - width + 1):
                    if known_tokens[start:start + width] == tokens:
                        return category

        return 'other'

    def extract_proficiency(self, text: str) -> Optional[str]:
        """
        Extract proficiency level from text

        Keywords are matched as whole words, case-insensitively, so that a
        keyword embedded in a longer word (e.g. "dasar" in "berdasarkan") is
        not reported. Levels are tested in the order of
        `self.proficiency_keywords`; the first level with a hit wins.

        Args:
            text: Text containing potential proficiency keywords

        Returns:
            Proficiency level or None
        """
        if not text or not isinstance(text, str):
            return None

        text_lower = text.lower()

        for level, keywords in self.proficiency_keywords.items():
            if not keywords:
                # An empty alternation would match at every word boundary.
                continue
            alternatives = '|'.join(re.escape(keyword.lower()) for keyword in keywords)
            if re.search(r'\b(?:' + alternatives + r')\b', text_lower):
                return level

        return None

    def extract(self, text: str) -> Dict[str, Any]:
        """
        Extract skills with full metadata

        Args:
            text: Text containing skills (job description, requirements, etc.)

        Returns:
            Dictionary with keys:
            - 'skills': list of dicts with 'original', 'normalized',
              'category' and 'proficiency'
            - 'total_count': number of extracted skills
            - 'unique_count': number of distinct normalized names
            - 'by_category': category -> list of skill dicts
            - 'category_counts': category -> number of skills
            - 'proficiency_detected': True if any skill has a proficiency
            The same keys are present for empty or non-string input.
        """
        processed_skills = []
        by_category = {}
        category_counts = Counter()

        for raw_segment, cleaned in self._split_segments(text):
            normalized = self.normalize_skill(cleaned)
            if not normalized:
                # Nothing but punctuation was left; not a skill.
                continue

            category = self.categorize_skill(normalized)
            skill_data = {
                'original': cleaned,
                'normalized': normalized,
                'category': category,
                # Look at the uncleaned segment so hints inside parentheses,
                # e.g. "Python (Advanced)", are still seen.
                'proficiency': self.extract_proficiency(raw_segment)
            }

            processed_skills.append(skill_data)
            by_category.setdefault(category, []).append(skill_data)
            category_counts[category] += 1

        return {
            'skills': processed_skills,
            'total_count': len(processed_skills),
            'unique_count': len({s['normalized'] for s in processed_skills}),
            'by_category': by_category,
            'category_counts': dict(category_counts),
            'proficiency_detected': any(s['proficiency'] for s in processed_skills)
        }

    def extract_simple(self, text: str) -> List[str]:
        """
        Simple extraction returning just skill names

        Args:
            text: Text containing skills

        Returns:
            List of normalized skill names
        """
        result = self.extract(text)
        return [s['normalized'] for s in result['skills']]

    def batch_extract(self, texts: List[str]) -> List[Dict[str, Any]]:
        """
        Extract skills from multiple texts

        Args:
            texts: List of texts to process

        Returns:
            List of extraction results, one per input text, in input order
        """
        return [self.extract(text) for text in texts]

    def get_top_skills(self, texts: List[str], top_n: int = 10) -> List[Tuple[str, int]]:
        """
        Get top N most frequent skills across texts

        Args:
            texts: List of texts to analyze
            top_n: Number of top skills to return

        Returns:
            List of (skill, count) tuples, most frequent first
        """
        skill_counts = Counter()
        for text in texts:
            skill_counts.update(self.extract_simple(text))

        return skill_counts.most_common(top_n)

    def get_stats(self) -> Dict[str, Any]:
        """
        Get model statistics

        Returns:
            Dictionary with the model version, total and per-category skill
            counts, number of aliases and number of proficiency levels
        """
        category_breakdown = {
            cat: len(skills)
            for cat, skills in self.skill_categories.items()
        }

        return {
            'version': self.VERSION,
            'total_skills': sum(category_breakdown.values()),
            'categories': len(self.skill_categories),
            'category_breakdown': category_breakdown,
            'total_aliases': len(self.skill_aliases),
            'proficiency_levels': len(self.proficiency_keywords)
        }


# Convenience function
def extract_skills(text: str) -> List[str]:
    """
    One-shot helper: build a fresh extractor and return the skill names

    Args:
        text: Text containing skills

    Returns:
        List of normalized skill names
    """
    return IndonesianSkillExtractor().extract_simple(text)


if __name__ == "__main__":
    # Command-line demo: prints model statistics and three sample extractions.
    print(
        "=" * 70,
        "Indonesian Skill Extractor v1.0",
        "Author: Herlambang Haryo Putro",
        "=" * 70,
        sep="\n",
    )
    print()

    # Build the extractor used for the statistics and the detailed examples
    extractor = IndonesianSkillExtractor()

    # Summary of the loaded taxonomy
    stats = extractor.get_stats()
    print(
        "Model Statistics:",
        f"  Version: {stats['version']}",
        f"  Total Skills: {stats['total_skills']}",
        f"  Categories: {stats['categories']}",
        f"  Aliases: {stats['total_aliases']}",
        sep="\n",
    )
    print()

    # Example 1: plain comma-separated list through the convenience function
    text1 = "Python, React, MySQL, AWS, Docker"
    print(
        "Example 1: Simple text",
        f"Input: {text1}",
        sep="\n",
    )
    print(f"Output: {extract_skills(text1)}")
    print()

    # Example 2: Indonesian phrasing mixed with technical skills
    text2 = "Menguasai Python, JavaScript, komunikasi yang baik, kerja sama tim"
    print(
        "Example 2: Indonesian text",
        f"Input: {text2}",
        sep="\n",
    )
    result = extractor.extract(text2)
    print(
        f"Total skills: {result['total_count']}",
        f"Categories: {list(result['by_category'].keys())}",
        sep="\n",
    )
    print()

    # Example 3: entries carrying proficiency keywords
    text3 = "Expert in Python, Advanced React, Basic MySQL"
    print(
        "Example 3: With proficiency",
        f"Input: {text3}",
        sep="\n",
    )
    result = extractor.extract(text3)
    for skill in result['skills']:
        print(f"  - {skill['normalized']} ({skill['category']}): {skill['proficiency'] or 'N/A'}")
    print()

    print(
        "=" * 70,
        "Ready for production use!",
        "GitHub: https://github.com/herlambangharyoputro",
        "=" * 70,
        sep="\n",
    )