"""Gradio app for exploring metadata of cybersecurity datasets on HuggingFace.

The app works purely on the static ``DATASETS_METADATA`` table defined below:
it filters it, renders summary charts with Plotly, shows per-dataset details
and exports the filtered rows to CSV/JSON.  No dataset content is downloaded.
"""

import json
import os
import re
import tempfile
from datetime import datetime

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datasets import load_dataset

# NOTE(review): json, re, datetime, go and load_dataset are not referenced by
# the code in this module; they are kept in case other tooling relies on them.

# Dataset metadata from HuggingFace search
DATASETS_METADATA = [
    {"name": "ethanolivertroy/nist-cybersecurity-training", "downloads": 8000, "likes": 48, "size": "100K-1M", "language": "en",
     "tags": ["cybersecurity", "nist", "compliance", "security-controls", "zero-trust", "privacy"],
     "category": "compliance",
     "description": "NIST Cybersecurity Training Dataset v1.1 - The largest open-source NIST cybersecurity training dataset for fine-tuning LLMs"},
    {"name": "clydeiii/cybersecurity", "downloads": 4000, "likes": 6, "size": "100K-1M", "language": "unknown",
     "tags": ["APT", "threat-intelligence"],
     "category": "offensive",
     "description": "APT notes dataset from GitHub"},
    {"name": "vinitvek/cybersecurityattacks", "downloads": 2300, "likes": 5, "size": "10K-100K", "language": "en",
     "tags": ["attacks", "security"],
     "category": "offensive",
     "description": "Cybersecurity attacks dataset"},
    {"name": "Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset", "downloads": 786, "likes": 78, "size": "10K-100K", "language": "en",
     "tags": ["cybersecurity", "defensive-security", "instruction-tuning", "threat-intelligence", "incident-response", "security-operations"],
     "category": "defensive",
     "description": "53,202 meticulously curated system/user/assistant instruction-tuning examples covering defensive security"},
    {"name": "AlicanKiraz0/Cybersecurity-Dataset-Fenrir-v2.0", "downloads": 353, "likes": 10, "size": "10K-100K", "language": "en",
     "tags": ["cybersecurity", "defensive-security", "instruction-tuning"],
     "category": "defensive",
     "description": "83,920 high-quality system/user/assistant triples for defensive cybersecurity"},
    {"name": "AlicanKiraz0/Cybersecurity-Dataset-Heimdall-v1.1", "downloads": 192, "likes": 13, "size": "10K-100K", "language": "en",
     "tags": ["cybersecurity", "defensive-security", "instruction-tuning"],
     "category": "defensive",
     "description": "21,258 high-quality system/user/assistant triples for training alignment-safe, defensive-cybersecurity LLMs"},
    {"name": "Chemically-motivated/CyberSecurityDataset", "downloads": 180, "likes": 3, "size": "<1K", "language": "en",
     "tags": ["cybersecurity", "machine learning", "pentesting", "exploits"],
     "category": "offensive",
     "description": "Curated data points related to penetration testing, known exploits, and vulnerabilities"},
    {"name": "ChaoticNeutrals/Cybersecurity-ShareGPT", "downloads": 175, "likes": 15, "size": "10K-100K", "language": "en",
     "tags": ["cybersecurity", "ShareGPT"],
     "category": "ai",
     "description": "Converted, deslopped, min-hash deduplicated cybersecurity ShareGPT dataset"},
    {"name": "Mohabahmed03/Alpaca_Dataset_CyberSecurity_Smaller_2.0", "downloads": 145, "likes": 0, "size": "10K-100K", "language": "en",
     "tags": ["Cyber", "Security", "Cybersecurity", "LLM", "Pentest", "RedTeam", "BlueTeam"],
     "category": "ai",
     "description": "Alpaca format cybersecurity dataset"},
    {"name": "Bouquets/DeepSeek-V3-Distill-Cybersecurity-en", "downloads": 140, "likes": 0, "size": "1K-10K", "language": "en",
     "tags": ["cybersecurity", "penetration-testing", "distilled"],
     "category": "offensive",
     "description": "High-quality distilled dataset specialized in cybersecurity penetration testing domain"},
    {"name": "Druva-S-Kumar/cybersecurity-qa-dataset", "downloads": 123, "likes": 0, "size": "<1K", "language": "en",
     "tags": ["qa", "cybersecurity"],
     "category": "ai",
     "description": "Cybersecurity Q&A dataset"},
    {"name": "Rowden/CybersecurityQAA", "downloads": 119, "likes": 4, "size": "1K-10K", "language": "en",
     "tags": ["cybersecurity", "QAA"],
     "category": "ai",
     "description": "Cybersecurity Question-Answer-Assertion (QAA) Dataset designed to evaluate LLM capabilities"},
    {"name": "luckwa/cybersecurity-dataset", "downloads": 119, "likes": 1, "size": "1K-10K", "language": "en",
     "tags": ["cybersecurity"],
     "category": "defensive",
     "description": "General cybersecurity dataset"},
    {"name": "Vanessasml/cybersecurity_32k_instruction_input_output", "downloads": 114, "likes": 17, "size": "10K-100K", "language": "en",
     "tags": ["NIST", "ITC EBA", "threat-identification"],
     "category": "compliance",
     "description": "Q&As focused on identification of cyber threats, and text classification under NIST taxonomy"},
    {"name": "AlicanKiraz0/Cybersecurity-Dataset-v1", "downloads": 98, "likes": 12, "size": "1K-10K", "language": "en",
     "tags": ["cybersecurity"],
     "category": "defensive",
     "description": "2,500 high-quality instruction-response pairs focused on defensive cybersecurity education"},
    {"name": "mariiazhiv/cybersecurity_qa", "downloads": 97, "likes": 1, "size": "<1K", "language": "en",
     "tags": ["question-answering", "cybersecurity"],
     "category": "ai",
     "description": "Instruction-response pairs focused on cybersecurity concepts"},
    {"name": "CyberNative/CyberSecurityEval", "downloads": 84, "likes": 19, "size": "<1K", "language": "en",
     "tags": ["cybersecurity", "infosec", "IT", "evaluation"],
     "category": "ai",
     "description": "CyberNative AI for CyberSecurity Q/A Evaluation - NOT FOR TRAINING"},
    {"name": "whybe-choi/kovidore-v2-cybersecurity-beir", "downloads": 80, "likes": 1, "size": "1K-10K", "language": "ko",
     "tags": ["Visual Retrieving", "Industrial RAG"],
     "category": "defensive",
     "description": "Corpus of technical reports on cyber threat trends and security incident responses in Korea"},
    {"name": "Canstralian/Purple-Team-Cybersecurity-Dataset", "downloads": 73, "likes": 9, "size": "10K-100K", "language": "en",
     "tags": ["purple-team", "code"],
     "category": "defensive",
     "description": "Synthetic collection designed to simulate collaborative cybersecurity exercises"},
    {"name": "Bouquets/Cybersecurity-LLM-CVE", "downloads": 46, "likes": 15, "size": "100K-1M", "language": "en",
     "tags": ["CVE", "vulnerabilities"],
     "category": "defensive",
     "description": "CVE vulnerability database for cybersecurity"},
    {"name": "theResearchNinja/benchmarkResults_violentUTF_cybersecurityBehavior", "downloads": 37, "likes": 1, "size": "100K-1M", "language": "en",
     "tags": ["benchmark", "results"],
     "category": "ai",
     "description": "Interdependent cybersecurity benchmark results"},
    {"name": "schooly/Cyber-Security-Breaches", "downloads": 36, "likes": 11, "size": "1K-10K", "language": "en",
     "tags": ["breaches", "incidents"],
     "category": "offensive",
     "description": "Cyber security breaches dataset"},
    {"name": "jcordon5/cybersecurity-rules", "downloads": 36, "likes": 9, "size": "<1K", "language": "en",
     "tags": ["SIGMA", "YARA", "Suricata", "detection-rules"],
     "category": "defensive",
     "description": "950 detection rules from official SIGMA, YARA, and Suricata repositories"},
    {"name": "Tiamz/cybersecurity-instruction-dataset", "downloads": 33, "likes": 0, "size": "10K-100K", "language": "en",
     "tags": ["instruction", "cybersecurity"],
     "category": "ai",
     "description": "Cybersecurity instruction dataset"},
    {"name": "zeroshot/cybersecurity-corpus", "downloads": 29, "likes": 9, "size": "1K-10K", "language": "en",
     "tags": ["corpus"],
     "category": "ai",
     "description": "Cybersecurity corpus for training"},
    {"name": "mteb/kovidore-v2-cybersecurity-mteb", "downloads": 29, "likes": 0, "size": "1K-10K", "language": "ko",
     "tags": ["MTEB", "retrieval"],
     "category": "ai",
     "description": "MTEB cybersecurity retrieval dataset in Korean"},
    {"name": "electricsheepafrica/nigerian-telecom-cybersecurity-incident-logs", "downloads": 27, "likes": 0, "size": "10K-100K", "language": "en",
     "tags": ["telecom", "cybersecurity", "incident", "logs"],
     "category": "defensive",
     "description": "Security events including intrusions, DDoS attacks, and malware on telecom infrastructure"},
    {"name": "CyberNative/github_cybersecurity_READMEs", "downloads": 26, "likes": 14, "size": "1K-10K", "language": "en",
     "tags": ["github", "README"],
     "category": "ai",
     "description": "GitHub cybersecurity README files"},
    {"name": "Mohabahmed03/Alpaca_Dataset_CyberSecurity_2.0", "downloads": 26, "likes": 0, "size": "100K-1M", "language": "en",
     "tags": ["Cyber", "Security", "Pentest", "Cybersecurity", "LLM", "BlueTeam"],
     "category": "ai",
     "description": "Alpaca format cybersecurity dataset v2.0"},
    {"name": "hcnote/Cybersecurity-Dataset", "downloads": 26, "likes": 0, "size": "10K-100K", "language": "en",
     "tags": ["code", "question-answering"],
     "category": "ai",
     "description": "High-quality cybersecurity dataset"},
    {"name": "Zeo6/CyberSecurity-FineTune", "downloads": 25, "likes": 0, "size": "unknown", "language": "en",
     "tags": ["finetune"],
     "category": "ai",
     "description": "Cybersecurity fine-tuning dataset"},
    {"name": "ystemsrx/Cybersecurity-ShareGPT-Chinese", "downloads": 24, "likes": 21, "size": "10K-100K", "language": "zh",
     "tags": ["code", "Chinese"],
     "category": "ai",
     "description": "Chinese cybersecurity dataset in ShareGPT format"},
    {"name": "whybe-choi/kovidore-v2-cybersecurity-mteb", "downloads": 24, "likes": 0, "size": "1K-10K", "language": "ko",
     "tags": ["MTEB", "retrieval"],
     "category": "ai",
     "description": "MTEB cybersecurity retrieval dataset"},
    {"name": "princemaxp/cybersecurity-keywords", "downloads": 22, "likes": 1, "size": "<1K", "language": "en",
     "tags": ["cybersecurity", "keywords"],
     "category": "ai",
     "description": "Common cybersecurity keywords list"},
    {"name": "madox81/cybersecurity_attack_conversational_dataset", "downloads": 20, "likes": 0, "size": "unknown", "language": "en",
     "tags": ["conversational", "attacks"],
     "category": "offensive",
     "description": "Conversational cybersecurity attack dataset"},
    {"name": "safouene99999/Cybersecurity_QA", "downloads": 19, "likes": 0, "size": "10K-100K", "language": "en",
     "tags": ["QA"],
     "category": "ai",
     "description": "Cybersecurity Q&A dataset"},
    {"name": "hcnote/High-quality-cybersecurity-datasets", "downloads": 19, "likes": 0, "size": "100K-1M", "language": "en",
     "tags": ["high-quality"],
     "category": "ai",
     "description": "277,707 high-quality cybersecurity records with AI annotation"},
    {"name": "theResearchNinja/violentutf_cybersecurityBehavior", "downloads": 18, "likes": 3, "size": "10K-100K", "language": "en",
     "tags": ["cybersecurity", "cognitive behavioral psychology", "benchmark"],
     "category": "ai",
     "description": "LLM cybersecurity behavior benchmark dataset"},
    {"name": "GotThatData/nist-cybersecurity-framework", "downloads": 18, "likes": 7, "size": "1K-10K", "language": "en",
     "tags": ["NIST", "Cybersecurity", "Framework"],
     "category": "compliance",
     "description": "NIST Cybersecurity Publications Dataset"},
    {"name": "Mohabahmed03/Alpaca_Dataset_General_CyberSecurity", "downloads": 18, "likes": 0, "size": "100K-1M", "language": "en",
     "tags": ["General", "Alpaca", "CyberSecurity"],
     "category": "ai",
     "description": "General Alpaca format cybersecurity dataset"},
    {"name": "vnovaai19/CYBERSECURITY_JSONL_V1", "downloads": 18, "likes": 0, "size": "<1K", "language": "en",
     "tags": ["cybersecurity", "synthetic-data", "safety", "phishing", "fraud-detection"],
     "category": "defensive",
     "description": "100 synthetic cybersecurity threat scenarios with educational AI responses"},
    {"name": "Mattimax/Cybersecurity-ShareGPT-Italian", "downloads": 18, "likes": 0, "size": "1K-10K", "language": "it",
     "tags": ["Italian", "ShareGPT"],
     "category": "ai",
     "description": "Italian cybersecurity ShareGPT dataset"},
    {"name": "olgazigbeehub/cybersecurity-news-dataset-english-3000", "downloads": 18, "likes": 0, "size": "1K-10K", "language": "en",
     "tags": ["news", "cybersecurity", "media-analysis"],
     "category": "defensive",
     "description": "3,000 English-language cybersecurity news metadata rows"},
    {"name": "hcnote/Cybersecurity-High-Quality-Dataset", "downloads": 17, "likes": 0, "size": "100K-1M", "language": "en",
     "tags": ["high-quality"],
     "category": "ai",
     "description": "270,271 high-quality Chinese-English Q&A cybersecurity dataset"},
    {"name": "ScoutieAutoML/cybersecurity_news_telegram_dataset", "downloads": 16, "likes": 2, "size": "10K-100K", "language": "ru",
     "tags": ["russia", "cybersecurity", "media", "news"],
     "category": "defensive",
     "description": "Russian-language Telegram news channels on cybersecurity"},
    {"name": "savaniDhruv/Cybersecurity_Attack_Dataset", "downloads": 16, "likes": 2, "size": "10K-100K", "language": "en",
     "tags": ["attacks"],
     "category": "offensive",
     "description": "Cybersecurity attack dataset"},
    {"name": "pyToshka/cyber-security-events", "downloads": 16, "likes": 0, "size": "10K-100K", "language": "en",
     "tags": ["cybersecurity", "honeypot", "threat-intelligence"],
     "category": "defensive",
     "description": "Cybersecurity events collected from honeypot infrastructure"},
    {"name": "ahmadkaab/Trendyol-Cybersecurity-Instruction-Tuning-Dataset", "downloads": 16, "likes": 0, "size": "10K-100K", "language": "en",
     "tags": ["cybersecurity", "defensive-security", "instruction-tuning"],
     "category": "defensive",
     "description": "53,202 defensive security instruction-tuning examples"},
    {"name": "MCP-1st-Birthday/smoltrace-cybersecurity-tasks", "downloads": 15, "likes": 0, "size": "<1K", "language": "en",
     "tags": ["smoltrace", "synthetic-data", "agent-evaluation"],
     "category": "ai",
     "description": "SMOLTRACE synthetic dataset for agent evaluation"},
    {"name": "ErebusTN/The-Ultimate-CyberSecurity-Dataset-Collection", "downloads": 14, "likes": 1, "size": "unknown", "language": "en",
     "tags": ["collection"],
     "category": "ai",
     "description": "Ultimate cybersecurity dataset collection"},
    {"name": "NewsDataHub/cybersecurity-news-dataset-english-3000", "downloads": 14, "likes": 1, "size": "1K-10K", "language": "en",
     "tags": ["news", "cybersecurity"],
     "category": "defensive",
     "description": "3,000 English cybersecurity news metadata rows"},
    {"name": "AYI-NEDJIMI/ai-cybersecurity-en", "downloads": 14, "likes": 0, "size": "<1K", "language": "en",
     "tags": ["artificial-intelligence", "cybersecurity", "offensive-ai", "defensive-ai", "deepfake"],
     "category": "ai",
     "description": "AI in Offensive and Defensive Cybersecurity - English Dataset"},
    {"name": "AR2021/cybersecurity-corpus-llama2-1k", "downloads": 13, "likes": 1, "size": "<1K", "language": "en",
     "tags": ["llama2"],
     "category": "ai",
     "description": "Cybersecurity corpus for Llama2"},
    {"name": "boapro/Purple-Team-Cybersecurity-Dataset", "downloads": 13, "likes": 0, "size": "10K-100K", "language": "en",
     "tags": ["code", "purple-team"],
     "category": "defensive",
     "description": "Synthetic purple team cybersecurity exercises"},
    {"name": "tuandunghcmut/Trendyol-Cybersecurity-Instruction-Tuning-Dataset", "downloads": 13, "likes": 1, "size": "10K-100K", "language": "en",
     "tags": ["cybersecurity", "security", "cyber-defense", "conversational"],
     "category": "defensive",
     "description": "GPT format conversational cybersecurity dataset"},
    {"name": "AYI-NEDJIMI/ai-cybersecurity-fr", "downloads": 13, "likes": 0, "size": "<1K", "language": "fr",
     "tags": ["artificial-intelligence", "cybersecurity", "offensive-ai", "defensive-ai"],
     "category": "ai",
     "description": "AI in Offensive and Defensive Cybersecurity - French Dataset"},
    {"name": "pki/autonlp-data-cybersecurity", "downloads": 12, "likes": 0, "size": "unknown", "language": "en",
     "tags": ["autonlp"],
     "category": "ai",
     "description": "AutoNLP cybersecurity data"},
    {"name": "Hadihilman/cybersecurity-dataset", "downloads": 12, "likes": 0, "size": "<1K", "language": "en",
     "tags": ["images"],
     "category": "defensive",
     "description": "Cybersecurity image dataset"},
    {"name": "AnodeAI/Elite_quality_cybersecurity", "downloads": 12, "likes": 1, "size": "10K-100K", "language": "en",
     "tags": ["legal", "finance"],
     "category": "ai",
     "description": "Elite quality cybersecurity dataset"},
    {"name": "hcnote/Cybersecurity-bigDataset", "downloads": 12, "likes": 0, "size": "100K-1M", "language": "en",
     "tags": ["large-scale"],
     "category": "ai",
     "description": "Global first open-source mega-scale cybersecurity dataset"},
    {"name": "bnsapa/cybersecurity-ner", "downloads": 11, "likes": 2, "size": "1K-10K", "language": "en",
     "tags": ["token-classification", "NER"],
     "category": "ai",
     "description": "Cybersecurity named entity recognition dataset"},
    {"name": "baig31/Cybersecurity_penetration_testing_books", "downloads": 11, "likes": 12, "size": "unknown", "language": "en",
     "tags": ["books", "penetration-testing"],
     "category": "offensive",
     "description": "Cybersecurity penetration testing books"},
    {"name": "beldua/english-cybersecurity-basics-30", "downloads": 11, "likes": 0, "size": "<1K", "language": "en",
     "tags": ["basics"],
     "category": "ai",
     "description": "English cybersecurity basics"},
    {"name": "ahmedds10/finetuning_cybersecurity", "downloads": 10, "likes": 0, "size": "<1K", "language": "en",
     "tags": ["finetuning"],
     "category": "ai",
     "description": "Cybersecurity fine-tuning dataset"},
    {"name": "Mohabahmed03/Alpaca_Dataset_CyberSecurity_Smaller", "downloads": 10, "likes": 0, "size": "10K-100K", "language": "en",
     "tags": ["CyberSecurity", "Finetune"],
     "category": "ai",
     "description": "Smaller Alpaca cybersecurity dataset"},
    {"name": "ChavyvAkvar/Trendyol-Cybersecurity-Instruction-Tuning-Dataset-Converted", "downloads": 10, "likes": 1, "size": "10K-100K", "language": "en",
     "tags": ["converted"],
     "category": "defensive",
     "description": "Converted Trendyol cybersecurity dataset"},
    {"name": "tandevllc/cybersecurity-atom-rss-feeds-2025", "downloads": 10, "likes": 1, "size": "unknown", "language": "en",
     "tags": ["news", "rss", "feeds"],
     "category": "defensive",
     "description": "Cybersecurity Atom/RSS feeds 2025"},
    {"name": "tandevllc/cybersecurity-wiki-slices", "downloads": 10, "likes": 1, "size": "10K-100K", "language": "en",
     "tags": ["wikipedia", "cybersecurity"],
     "category": "ai",
     "description": "Curated collection of English Wikipedia pages covering cybersecurity"},
    {"name": "antitheft159/CybersecurityAttacks", "downloads": 9, "likes": 1, "size": "unknown", "language": "en",
     "tags": ["attacks"],
     "category": "offensive",
     "description": "Cybersecurity attacks dataset"},
    {"name": "Tiamz/cybersecurity-raw-json-datasets", "downloads": 9, "likes": 0, "size": "unknown", "language": "en",
     "tags": ["raw", "json"],
     "category": "ai",
     "description": "Raw JSON cybersecurity datasets"},
    {"name": "burpsuite/Cybersecurity-Dataset-v1", "downloads": 9, "likes": 0, "size": "1K-10K", "language": "en",
     "tags": ["cybersecurity"],
     "category": "defensive",
     "description": "2,500 defensive cybersecurity instruction-response pairs"},
    {"name": "Deshaune/Global-Cybersecurity-Threats-2015_2024", "downloads": 8, "likes": 1, "size": "1K-10K", "language": "en",
     "tags": ["global-threats"],
     "category": "defensive",
     "description": "Global cybersecurity threats from 2015-2024"},
    {"name": "oceancharcoal/Cybersecurity_attack_dataset", "downloads": 8, "likes": 0, "size": "10K-100K", "language": "en",
     "tags": ["attacks"],
     "category": "offensive",
     "description": "Cybersecurity attack dataset"},
    {"name": "pyToshka/cyber-security-events-full", "downloads": 8, "likes": 0, "size": "100K-1M", "language": "en",
     "tags": ["cybersecurity", "honeypot", "threat-intelligence"],
     "category": "defensive",
     "description": "Full cybersecurity events from honeypot infrastructure"},
    {"name": "dattaraj/rag_eval_cybersecurity", "downloads": 7, "likes": 0, "size": "<1K", "language": "en",
     "tags": ["RAG", "evaluation"],
     "category": "ai",
     "description": "RAG evaluation for cybersecurity"},
    {"name": "lianghsun/tw-cybersecurity", "downloads": 7, "likes": 0, "size": "1K-10K", "language": "zh",
     "tags": ["Taiwan", "cybersecurity", "ISO-27001"],
     "category": "compliance",
     "description": "Taiwan cybersecurity dataset with ISO/IEC 27001"},
    {"name": "mariiazhiv/Cybersecurity_messages", "downloads": 7, "likes": 0, "size": "1K-10K", "language": "en",
     "tags": ["messages"],
     "category": "ai",
     "description": "Cybersecurity messages dataset"},
    {"name": "MichaelPrimez/cybersecurity-questionaire", "downloads": 6, "likes": 0, "size": "<1K", "language": "en",
     "tags": ["questionnaire", "synthetic", "distilabel"],
     "category": "ai",
     "description": "Cybersecurity questionnaire dataset"},
    {"name": "lianghsun/tw-cybersecurity-chat", "downloads": 5, "likes": 0, "size": "1K-10K", "language": "zh",
     "tags": ["Taiwan", "cybersecurity", "chat"],
     "category": "ai",
     "description": "Taiwan cybersecurity chat dataset"},
    {"name": "WhoIsShe/CyberSecurity-big", "downloads": 5, "likes": 1, "size": "1M-10M", "language": "en",
     "tags": ["large-scale"],
     "category": "ai",
     "description": "Large-scale cybersecurity dataset"},
]

# Static Markdown shown in the UI.  Kept at column 0 so the rendered output
# does not depend on the Markdown component stripping code indentation.
HEADER_MARKDOWN = """
# 🔐 Cybersecurity Dataset Explorer

Explore and analyze 80+ cybersecurity datasets from HuggingFace

**Features:**
- Search by keyword, language, category
- View detailed statistics and visualizations
- Export datasets metadata to CSV/JSON
- Preview dataset information
- Direct links to HuggingFace repositories
"""

ABOUT_MARKDOWN = """
## About Dataset Explorer

This application provides a comprehensive interface to explore 80 cybersecurity datasets from HuggingFace.

### Features:
1. **Search & Filter**: Find datasets by keyword, language, category, popularity
2. **Dataset Details**: View comprehensive information about each dataset
3. **Statistics**: Visual analytics with interactive charts
4. **Export**: Download filtered results as CSV or JSON
5. **Direct Links**: Access to HuggingFace repositories

### Categories:
- **AI**: Datasets for training AI/ML models
- **Defensive**: Blue team, threat detection, incident response
- **Offensive**: Red team, penetration testing, exploits
- **Compliance**: NIST, ISO 27001, regulatory frameworks

### Data Sources:
All datasets are publicly available on HuggingFace Hub.
This explorer provides metadata and filtering capabilities.
To access the actual dataset content, click the HuggingFace URL for any dataset.

### Technologies:
- **Gradio**: Interactive web interface
- **Pandas**: Data manipulation
- **Plotly**: Interactive visualizations
- **HuggingFace Datasets**: Dataset metadata

---

**Created by:** AYI-NEDJIMI

**Version:** 1.0

**Last Updated:** February 2026
"""

FOOTER_MARKDOWN = """
---
💡 **Tip**: Use the search feature to find datasets by specific topics like "NIST", "penetration testing", "threat intelligence", etc.
"""


def create_dataframe() -> pd.DataFrame:
    """Return a new DataFrame built from ``DATASETS_METADATA``.

    One row per metadata entry, with an extra ``url`` column pointing at the
    dataset's page on huggingface.co.
    """
    df = pd.DataFrame(DATASETS_METADATA)
    df["url"] = df["name"].apply(lambda name: f"https://huggingface.co/datasets/{name}")
    return df


def get_dataset_stats() -> dict:
    """Return headline statistics as a ``{label: value}`` dict.

    Download and like totals are pre-formatted strings with thousands
    separators; the remaining values are plain integers.
    """
    df = create_dataframe()
    return {
        "Total Datasets": len(df),
        "Total Downloads": f"{df['downloads'].sum():,}",
        "Total Likes": f"{df['likes'].sum():,}",
        "Languages": len(df["language"].unique()),
        "Categories": len(df["category"].unique()),
    }


def filter_datasets(keyword, language, category, min_downloads, min_likes) -> pd.DataFrame:
    """Return the metadata rows that satisfy every supplied criterion.

    Args:
        keyword: Case-insensitive substring looked up in the name, the
            description and each tag.  Empty/``None``/whitespace disables it.
        language: Exact language code, or ``"All"``/empty for no filter.
        category: Exact category, or ``"All"``/empty for no filter.
        min_downloads: Lower bound (inclusive) on downloads; falsy disables it.
        min_likes: Lower bound (inclusive) on likes; falsy disables it.

    Returns:
        A DataFrame with the same columns as ``create_dataframe()``.
    """
    df = create_dataframe()

    # Filter by keyword.  regex=False makes this a literal substring match:
    # the pandas default (regex=True) would raise on input such as "c++" or
    # "(" and would treat "." as a wildcard.
    keyword = (keyword or "").strip()
    if keyword:
        needle = keyword.lower()
        mask = (
            df["name"].str.contains(keyword, case=False, na=False, regex=False)
            | df["description"].str.contains(keyword, case=False, na=False, regex=False)
            | df["tags"].apply(lambda tags: any(needle in tag.lower() for tag in tags))
        )
        df = df[mask]

    # Filter by language
    if language and language != "All":
        df = df[df["language"] == language]

    # Filter by category
    if category and category != "All":
        df = df[df["category"] == category]

    # Filter by downloads
    if min_downloads:
        df = df[df["downloads"] >= min_downloads]

    # Filter by likes
    if min_likes:
        df = df[df["likes"] >= min_likes]

    return df


def search_datasets(keyword, language, category, min_downloads, min_likes):
    """Filter the datasets and shape the result for the results table.

    Returns:
        A ``(display_df, result_text)`` tuple: the filtered rows with
        human-readable column headers, and a one-line summary of the count.
    """
    df = filter_datasets(keyword, language, category, min_downloads, min_likes)

    # Format for display
    display_df = df[
        ["name", "downloads", "likes", "size", "language", "category", "description"]
    ].copy()
    display_df.columns = [
        "Dataset Name", "Downloads", "Likes", "Size", "Language", "Category", "Description",
    ]

    result_text = f"Found {len(df)} datasets matching your criteria"
    return display_df, result_text


def get_dataset_details(dataset_name):
    """Return details for one dataset as ``(markdown, preview_df, url)``.

    When ``dataset_name`` is empty or unknown, the first element is an
    explanatory message and the other two are ``None``.  The preview table is
    placeholder data; no dataset content is loaded.
    """
    df = create_dataframe()

    if not dataset_name:
        return "Please select a dataset from the list above", None, None

    matches = df[df["name"] == dataset_name]
    if matches.empty:
        return "Dataset not found", None, None

    dataset = matches.iloc[0]

    # Built line by line so the Markdown has no code indentation in it.
    details = "\n".join([
        f"## {dataset['name']}",
        "",
        f"**Description:** {dataset['description']}",
        "",
        "**Statistics:**",
        f"- Downloads: {dataset['downloads']:,}",
        f"- Likes: {dataset['likes']}",
        f"- Size: {dataset['size']}",
        f"- Language: {dataset['language']}",
        f"- Category: {dataset['category']}",
        "",
        f"**Tags:** {', '.join(dataset['tags'])}",
        "",
        f"**HuggingFace URL:** [{dataset['url']}]({dataset['url']})",
        "",
        "---",
        "",
        "*Note: To preview dataset samples, you would need to load the actual dataset "
        "using the HuggingFace datasets library. This demo shows metadata only. "
        "For full dataset access, click the URL above.*",
    ])

    # Create a simple preview table (mock data since we're not loading actual datasets)
    preview_df = pd.DataFrame({
        "Column": ["Feature 1", "Feature 2", "Feature 3"],
        "Type": ["text", "text", "category"],
        "Sample": ["Sample data...", "Sample data...", "Sample category..."],
    })

    return details, preview_df, dataset["url"]


def _apply_dark_layout(fig, **extra):
    """Give ``fig`` a transparent background and white text; return ``fig``.

    Additional ``update_layout`` keyword arguments are passed through.
    """
    fig.update_layout(
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(color="white"),
        **extra,
    )
    return fig


def create_category_chart():
    """Create a donut chart of the number of datasets per category."""
    category_counts = create_dataframe()["category"].value_counts()

    fig = px.pie(
        values=category_counts.values,
        names=category_counts.index,
        title="Datasets by Category",
        color_discrete_sequence=px.colors.sequential.RdBu,
        hole=0.3,
    )
    return _apply_dark_layout(fig)


def create_language_chart():
    """Create a bar chart of the ten most common dataset languages."""
    language_counts = create_dataframe()["language"].value_counts().head(10)

    fig = px.bar(
        x=language_counts.index,
        y=language_counts.values,
        title="Top 10 Languages",
        labels={"x": "Language", "y": "Number of Datasets"},
        color=language_counts.values,
        color_continuous_scale="Viridis",
    )
    return _apply_dark_layout(fig, showlegend=False)


def create_downloads_chart():
    """Create a horizontal bar chart of the 15 most-downloaded datasets."""
    df = create_dataframe()
    # .copy() so that adding a column below does not write into a slice of df.
    top_downloads = df.nlargest(15, "downloads")[["name", "downloads"]].copy()
    # Label bars with the repository name only (owner dropped), max 30 chars.
    top_downloads["short_name"] = top_downloads["name"].apply(
        lambda name: name.split("/")[-1][:30]
    )

    fig = px.bar(
        top_downloads,
        x="downloads",
        y="short_name",
        orientation="h",
        title="Top 15 Datasets by Downloads",
        labels={"downloads": "Downloads", "short_name": "Dataset"},
        color="downloads",
        color_continuous_scale="Plasma",
    )
    return _apply_dark_layout(fig, height=600, showlegend=False)


def create_size_distribution_chart():
    """Create a bar chart of the number of datasets per size bucket."""
    size_counts = create_dataframe()["size"].value_counts()

    fig = px.bar(
        x=size_counts.index,
        y=size_counts.values,
        title="Dataset Size Distribution",
        labels={"x": "Size Category", "y": "Number of Datasets"},
        color=size_counts.values,
        color_continuous_scale="Cividis",
    )
    return _apply_dark_layout(fig, showlegend=False)


def refresh_all_charts():
    """Build all four charts, in the order the UI outputs expect them."""
    return (
        create_category_chart(),
        create_language_chart(),
        create_downloads_chart(),
        create_size_distribution_chart(),
    )


def _new_export_path(filename: str) -> str:
    """Return a path for ``filename`` inside a freshly created temp directory.

    A new directory per export keeps concurrent sessions from overwriting each
    other's file and avoids hard-coding a POSIX-only ``/tmp`` location.
    """
    return os.path.join(tempfile.mkdtemp(prefix="cybersecurity_datasets_"), filename)


def export_to_csv(keyword, language, category, min_downloads, min_likes) -> str:
    """Write the filtered datasets to a CSV file and return its path."""
    df = filter_datasets(keyword, language, category, min_downloads, min_likes)
    output_path = _new_export_path("cybersecurity_datasets.csv")
    df.to_csv(output_path, index=False)
    return output_path


def export_to_json(keyword, language, category, min_downloads, min_likes) -> str:
    """Write the filtered datasets to a JSON file (list of records) and return its path."""
    df = filter_datasets(keyword, language, category, min_downloads, min_likes)
    output_path = _new_export_path("cybersecurity_datasets.json")
    df.to_json(output_path, orient="records", indent=2)
    return output_path


# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple").set(
    body_background_fill='*primary_900',
    body_background_fill_dark='*primary_950',
    block_background_fill='*primary_800',
    block_background_fill_dark='*primary_900',
    block_border_color='*primary_600',
    input_background_fill='*primary_700',
    button_primary_background_fill='*primary_600',
    button_primary_background_fill_hover='*primary_500',
)) as demo:

    gr.Markdown(HEADER_MARKDOWN)

    # Statistics overview
    with gr.Row():
        stats = get_dataset_stats()
        for key, value in stats.items():
            gr.Markdown(f"### {key}\n## {value}")

    # Main tabs
    with gr.Tabs():

        # Search & Filter Tab
        with gr.Tab("🔍 Search & Filter"):
            with gr.Row():
                with gr.Column(scale=1):
                    keyword_input = gr.Textbox(
                        label="Search Keyword",
                        placeholder="Enter keyword (name, description, or tags)...",
                        lines=1,
                    )
                    language_dropdown = gr.Dropdown(
                        label="Language",
                        choices=["All"] + sorted({d['language'] for d in DATASETS_METADATA}),
                        value="All",
                    )
                    category_dropdown = gr.Dropdown(
                        label="Category",
                        choices=["All", "ai", "defensive", "offensive", "compliance"],
                        value="All",
                    )
                    min_downloads_slider = gr.Slider(
                        label="Minimum Downloads",
                        minimum=0,
                        maximum=10000,
                        value=0,
                        step=100,
                    )
                    min_likes_slider = gr.Slider(
                        label="Minimum Likes",
                        minimum=0,
                        maximum=100,
                        value=0,
                        step=1,
                    )
                    search_btn = gr.Button("🔍 Search Datasets", variant="primary")

                with gr.Column(scale=3):
                    result_text = gr.Textbox(label="Search Results", lines=1)
                    results_table = gr.Dataframe(
                        label="Datasets",
                        wrap=True,
                        interactive=False,
                    )

                    with gr.Row():
                        export_csv_btn = gr.Button("📥 Export to CSV")
                        export_json_btn = gr.Button("📥 Export to JSON")

                    with gr.Row():
                        csv_file = gr.File(label="CSV Download")
                        json_file = gr.File(label="JSON Download")

            # The search and both exports read the same five filter widgets.
            filter_inputs = [
                keyword_input,
                language_dropdown,
                category_dropdown,
                min_downloads_slider,
                min_likes_slider,
            ]

            search_btn.click(
                fn=search_datasets,
                inputs=filter_inputs,
                outputs=[results_table, result_text],
            )

            export_csv_btn.click(
                fn=export_to_csv,
                inputs=filter_inputs,
                outputs=csv_file,
            )

            export_json_btn.click(
                fn=export_to_json,
                inputs=filter_inputs,
                outputs=json_file,
            )

        # Dataset Details Tab
        with gr.Tab("📊 Dataset Details"):
            dataset_selector = gr.Dropdown(
                label="Select Dataset",
                choices=[d['name'] for d in DATASETS_METADATA],
                value=DATASETS_METADATA[0]['name'] if DATASETS_METADATA else None,
            )
            view_details_btn = gr.Button("View Details", variant="primary")

            dataset_details = gr.Markdown(label="Dataset Information")
            preview_table = gr.Dataframe(label="Preview (Mock Data)")
            dataset_link = gr.Textbox(label="HuggingFace URL")

            view_details_btn.click(
                fn=get_dataset_details,
                inputs=dataset_selector,
                outputs=[dataset_details, preview_table, dataset_link],
            )

        # Statistics & Visualizations Tab
        with gr.Tab("📈 Statistics & Charts"):
            gr.Markdown("## Dataset Analytics Dashboard")

            with gr.Row():
                category_chart = gr.Plot(label="Category Distribution")
                language_chart = gr.Plot(label="Language Distribution")

            with gr.Row():
                downloads_chart = gr.Plot(label="Top Downloads")

            with gr.Row():
                size_chart = gr.Plot(label="Size Distribution")

            refresh_charts_btn = gr.Button("🔄 Refresh Charts", variant="primary")

            refresh_charts_btn.click(
                fn=refresh_all_charts,
                outputs=[category_chart, language_chart, downloads_chart, size_chart],
            )

            # Load charts on startup
            demo.load(
                fn=refresh_all_charts,
                outputs=[category_chart, language_chart, downloads_chart, size_chart],
            )

        # About Tab
        with gr.Tab("ℹ️ About"):
            gr.Markdown(ABOUT_MARKDOWN)

    # Footer
    gr.Markdown(FOOTER_MARKDOWN)


if __name__ == "__main__":
    demo.launch()