""" Indonesian Skill Extractor Model v1.0 A rule-based NER system for extracting technical and soft skills from Indonesian job postings Author: Herlambang Haryo Putro GitHub: https://github.com/herlambangharyoputro License: MIT Repository: https://github.com/herlambangharyoputro/job-market-intelligence-platform HuggingFace: https://huggingface.co/herlambangharyoputro/indonesian-skill-extractor-v1 """ import re import json from typing import Dict, List, Any, Optional, Tuple from collections import Counter class IndonesianSkillExtractor: """ Production-ready skill extraction model for Indonesian job postings Features: - Extracts technical and soft skills from text - Categorizes skills into 7 categories - Handles Indonesian language patterns - Proficiency level detection - Skill normalization and deduplication Categories: - programming: Programming languages - frontend: Frontend frameworks and tools - backend: Backend frameworks - database: Database systems - cloud: Cloud platforms and DevOps - data_science: Data science and ML tools - soft_skills: Soft and interpersonal skills """ VERSION = "1.0.0" def __init__(self): """Initialize the skill extractor with skill taxonomy""" self.skill_categories = self._load_skill_taxonomy() self.skill_aliases = self._load_skill_aliases() self.proficiency_keywords = self._load_proficiency_keywords() def _load_skill_taxonomy(self) -> Dict[str, set]: """Load comprehensive skill taxonomy (200+ skills)""" return { 'programming': { # Languages 'python', 'java', 'javascript', 'typescript', 'php', 'ruby', 'c', 'c++', 'c#', 'go', 'golang', 'rust', 'swift', 'kotlin', 'scala', 'perl', 'r', 'matlab', 'dart', 'elixir', 'haskell', 'clojure', 'f#', 'groovy', 'lua', 'assembly', 'shell', 'bash', 'powershell', 'vba', 'fortran', 'cobol', 'pascal', 'delphi', }, 'frontend': { # Frameworks & Libraries 'html', 'html5', 'css', 'css3', 'react', 'reactjs', 'react.js', 'vue', 'vuejs', 'vue.js', 'angular', 'angularjs', 'svelte', 'next.js', 'nextjs', 'nuxt', 'nuxtjs', 'gatsby', 'remix', # UI Libraries 'jquery', 'bootstrap', 'tailwind', 'tailwindcss', 'material-ui', 'mui', 'ant design', 'chakra ui', 'styled-components', # Build Tools 'sass', 'scss', 'less', 'webpack', 'vite', 'parcel', 'rollup', 'babel', 'postcss', # Testing 'jest', 'enzyme', 'cypress', 'playwright', 'selenium', }, 'backend': { # Frameworks 'node.js', 'nodejs', 'express', 'expressjs', 'nest.js', 'nestjs', 'django', 'flask', 'fastapi', 'tornado', 'pyramid', 'laravel', 'symfony', 'codeigniter', 'slim', 'lumen', 'spring', 'spring boot', 'hibernate', 'struts', 'rails', 'ruby on rails', 'sinatra', '.net', 'asp.net', '.net core', 'entity framework', 'gin', 'echo', 'fiber', # API 'rest', 'restful', 'graphql', 'grpc', 'soap', 'api', }, 'database': { # SQL 'mysql', 'postgresql', 'postgres', 'sql server', 'mssql', 'oracle', 'oracle db', 'mariadb', 'sqlite', 'db2', # NoSQL 'mongodb', 'mongo', 'redis', 'cassandra', 'couchdb', 'dynamodb', 'neo4j', 'elasticsearch', 'solr', # Cloud DB 'firestore', 'firebase', 'supabase', 'planetscale', # ORMs 'sequelize', 'typeorm', 'prisma', 'mongoose', # Query Languages 'sql', 'nosql', 'plsql', 't-sql', }, 'cloud': { # Cloud Providers 'aws', 'amazon web services', 'azure', 'microsoft azure', 'gcp', 'google cloud', 'google cloud platform', 'alibaba cloud', 'oracle cloud', 'ibm cloud', 'digital ocean', # Containers & Orchestration 'docker', 'kubernetes', 'k8s', 'podman', 'containerd', 'docker compose', 'docker swarm', 'helm', 'rancher', # CI/CD 'jenkins', 'gitlab ci', 'github actions', 'circle ci', 'travis ci', 'bamboo', 'teamcity', 'argo cd', 'flux', # IaC 'terraform', 'ansible', 'puppet', 'chef', 'cloudformation', 'pulumi', 'vagrant', # Monitoring 'prometheus', 'grafana', 'elk', 'elastic stack', 'datadog', 'new relic', 'splunk', 'nagios', # Service Mesh 'istio', 'linkerd', 'consul', }, 'data_science': { # Python Libraries 'pandas', 'numpy', 'scipy', 'matplotlib', 'seaborn', 'plotly', 'bokeh', # ML Frameworks 'tensorflow', 'keras', 'pytorch', 'scikit-learn', 'sklearn', 'xgboost', 'lightgbm', 'catboost', # Deep Learning 'transformers', 'hugging face', 'langchain', 'llama', # BI Tools 'tableau', 'power bi', 'looker', 'metabase', 'superset', 'qlik', 'sisense', # Big Data 'spark', 'pyspark', 'hadoop', 'hive', 'pig', 'kafka', 'flink', 'storm', 'airflow', # Notebooks 'jupyter', 'colab', 'databricks', }, 'soft_skills': { # Communication 'communication', 'komunikasi', 'public speaking', 'presentation', 'presentasi', 'writing', 'menulis', # Collaboration 'teamwork', 'kerja sama', 'kerja sama tim', 'collaboration', 'kolaborasi', 'interpersonal', # Leadership 'leadership', 'kepemimpinan', 'management', 'manajemen', 'mentoring', 'coaching', # Problem Solving 'problem solving', 'analytical', 'analitis', 'analytical thinking', 'berpikir analitis', 'critical thinking', 'berpikir kritis', 'creativity', 'kreativitas', 'innovation', 'inovasi', # Organization 'organization', 'organizational', 'time management', 'manajemen waktu', 'planning', 'perencanaan', # Adaptability 'adaptability', 'adaptasi', 'flexibility', 'fleksibilitas', 'learning agility', 'fast learner', 'cepat belajar', } } def _load_skill_aliases(self) -> Dict[str, str]: """Load skill aliases for normalization""" return { # Programming 'js': 'javascript', 'ts': 'typescript', 'py': 'python', # Frontend 'react.js': 'react', 'reactjs': 'react', 'vue.js': 'vue', 'vuejs': 'vue', 'next.js': 'nextjs', 'nuxt.js': 'nuxtjs', # Backend 'node': 'node.js', 'nodejs': 'node.js', 'nest': 'nest.js', 'nestjs': 'nest.js', 'express.js': 'express', 'expressjs': 'express', 'django rest framework': 'django', 'drf': 'django', # Database 'pg': 'postgresql', 'postgres': 'postgresql', 'mongo': 'mongodb', 'psql': 'postgresql', # Cloud 'k8s': 'kubernetes', 'aws lambda': 'aws', 'ec2': 'aws', 's3': 'aws', 'rds': 'aws', # Data Science 'sklearn': 'scikit-learn', 'tf': 'tensorflow', 'hf': 'hugging face', } def _load_proficiency_keywords(self) -> Dict[str, List[str]]: """Load proficiency level keywords""" return { 'expert': ['expert', 'advanced', 'mahir', 'ahli', 'mastery', 'proficient'], 'intermediate': ['intermediate', 'menengah', 'cukup', 'moderate', 'competent'], 'beginner': ['beginner', 'basic', 'pemula', 'dasar', 'fundamental', 'novice'] } def parse_skills_from_text(self, text: str) -> List[str]: """ Parse skills from text by splitting on delimiters Args: text: Raw skill text (e.g., "Python, React, MySQL") Returns: List of individual skill strings """ if not text or not isinstance(text, str): return [] # Split by common delimiters delimiters = [',', ';', '•', '·', '\n', '|', '/'] pattern = '|'.join(map(re.escape, delimiters)) skills = re.split(pattern, text) # Clean each skill cleaned_skills = [] for skill in skills: # Remove brackets, numbers, proficiency levels skill = re.sub(r'\([^)]*\)', '', skill) # Remove (parentheses) skill = re.sub(r'\[[^\]]*\]', '', skill) # Remove [brackets] skill = re.sub(r'\d+\s*(year|tahun|yr)', '', skill, flags=re.IGNORECASE) skill = re.sub(r'^\d+[\.\)]\s*', '', skill) # Remove numbered lists # Strip whitespace skill = skill.strip() if skill and len(skill) > 1: cleaned_skills.append(skill) return cleaned_skills def normalize_skill(self, skill: str) -> str: """ Normalize skill name using aliases Args: skill: Raw skill name Returns: Normalized skill name """ skill_lower = skill.lower().strip() # Remove special characters except dots and spaces skill_lower = re.sub(r'[^\w\s\.\+\#]', '', skill_lower) # Apply aliases if skill_lower in self.skill_aliases: return self.skill_aliases[skill_lower] return skill_lower def categorize_skill(self, skill: str) -> str: """ Categorize a skill into one of 7 categories Args: skill: Normalized skill name Returns: Category name or 'other' """ skill_lower = skill.lower() # Direct match for category, skill_set in self.skill_categories.items(): if skill_lower in skill_set: return category # Partial match for compound skills for category, skill_set in self.skill_categories.items(): for known_skill in skill_set: if known_skill in skill_lower or skill_lower in known_skill: return category return 'other' def extract_proficiency(self, text: str) -> Optional[str]: """ Extract proficiency level from text Args: text: Text containing potential proficiency keywords Returns: Proficiency level or None """ text_lower = text.lower() for level, keywords in self.proficiency_keywords.items(): for keyword in keywords: if keyword in text_lower: return level return None def extract(self, text: str) -> Dict[str, Any]: """ Extract skills with full metadata Args: text: Text containing skills (job description, requirements, etc.) Returns: Dictionary with extracted skills and metadata """ if not text or not isinstance(text, str): return { 'skills': [], 'total_count': 0, 'unique_count': 0, 'by_category': {}, 'proficiency_detected': False } # Parse skills raw_skills = self.parse_skills_from_text(text) # Process each skill processed_skills = [] category_counts = Counter() for raw_skill in raw_skills: normalized = self.normalize_skill(raw_skill) category = self.categorize_skill(normalized) proficiency = self.extract_proficiency(raw_skill) skill_data = { 'original': raw_skill, 'normalized': normalized, 'category': category, 'proficiency': proficiency } processed_skills.append(skill_data) category_counts[category] += 1 # Group by category by_category = {} for skill in processed_skills: category = skill['category'] if category not in by_category: by_category[category] = [] by_category[category].append(skill) return { 'skills': processed_skills, 'total_count': len(processed_skills), 'unique_count': len(set(s['normalized'] for s in processed_skills)), 'by_category': by_category, 'category_counts': dict(category_counts), 'proficiency_detected': any(s['proficiency'] for s in processed_skills) } def extract_simple(self, text: str) -> List[str]: """ Simple extraction returning just skill names Args: text: Text containing skills Returns: List of normalized skill names """ result = self.extract(text) return [s['normalized'] for s in result['skills']] def batch_extract(self, texts: List[str]) -> List[Dict[str, Any]]: """ Extract skills from multiple texts Args: texts: List of texts to process Returns: List of extraction results """ return [self.extract(text) for text in texts] def get_top_skills(self, texts: List[str], top_n: int = 10) -> List[Tuple[str, int]]: """ Get top N most frequent skills across texts Args: texts: List of texts to analyze top_n: Number of top skills to return Returns: List of (skill, count) tuples """ all_skills = [] for text in texts: skills = self.extract_simple(text) all_skills.extend(skills) skill_counts = Counter(all_skills) return skill_counts.most_common(top_n) def get_stats(self) -> Dict[str, Any]: """Get model statistics""" total_skills = sum(len(skills) for skills in self.skill_categories.values()) return { 'version': self.VERSION, 'total_skills': total_skills, 'categories': len(self.skill_categories), 'category_breakdown': { cat: len(skills) for cat, skills in self.skill_categories.items() }, 'total_aliases': len(self.skill_aliases), 'proficiency_levels': len(self.proficiency_keywords) } # Convenience function def extract_skills(text: str) -> List[str]: """ Convenience function for quick skill extraction Args: text: Text containing skills Returns: List of normalized skill names """ extractor = IndonesianSkillExtractor() return extractor.extract_simple(text) if __name__ == "__main__": # Demo print("=" * 70) print("Indonesian Skill Extractor v1.0") print("Author: Herlambang Haryo Putro") print("=" * 70) print() # Initialize extractor = IndonesianSkillExtractor() # Show stats stats = extractor.get_stats() print("Model Statistics:") print(f" Version: {stats['version']}") print(f" Total Skills: {stats['total_skills']}") print(f" Categories: {stats['categories']}") print(f" Aliases: {stats['total_aliases']}") print() # Example 1: Simple extraction text1 = "Python, React, MySQL, AWS, Docker" print("Example 1: Simple text") print(f"Input: {text1}") print(f"Output: {extract_skills(text1)}") print() # Example 2: Indonesian text text2 = "Menguasai Python, JavaScript, komunikasi yang baik, kerja sama tim" print("Example 2: Indonesian text") print(f"Input: {text2}") result = extractor.extract(text2) print(f"Total skills: {result['total_count']}") print(f"Categories: {list(result['by_category'].keys())}") print() # Example 3: With proficiency text3 = "Expert in Python, Advanced React, Basic MySQL" print("Example 3: With proficiency") print(f"Input: {text3}") result = extractor.extract(text3) for skill in result['skills']: print(f" - {skill['normalized']} ({skill['category']}): {skill['proficiency'] or 'N/A'}") print() print("=" * 70) print("Ready for production use!") print("GitHub: https://github.com/herlambangharyoputro") print("=" * 70)