#!/usr/bin/env python3
"""
Load and examine the Bengali Math dataset.

Dataset: hamim-87/Ashrafur_bangla_math
"""

import json
from datetime import datetime

from datasets import load_dataset
import pandas as pd  # noqa: F401  (kept: Dataset.to_pandas() needs pandas installed)

# Hub identifier of the dataset this script inspects.
DATASET_NAME = "hamim-87/Ashrafur_bangla_math"
# Where the JSON summary of the dataset is written.
DATASET_INFO_PATH = "/workspace/dataset_info.json"

# Display limits used while printing samples.
_MAX_DISPLAY_CHARS = 200
_SAMPLE_EXAMPLES = 3
_SAMPLE_ROWS_FOR_TYPES = 10


def _truncate(text, limit=_MAX_DISPLAY_CHARS):
    """Return *text* cut to *limit* characters, with "..." appended if it was cut."""
    if len(text) > limit:
        return text[:limit] + "..."
    return text


def _print_samples(split_data):
    """Print the first few examples of a split, truncating long values."""
    print("\n🔍 Sample Data:")
    for i in range(min(_SAMPLE_EXAMPLES, len(split_data))):
        example = split_data[i]
        print(f"\nExample {i+1}:")
        for key, value in example.items():
            print(f" {key}: {_truncate(str(value))}")


def _print_column_info(split_data):
    """Print the Python type and an approximate unique count for each column.

    Only the first few rows are sampled, so the unique count is an estimate.
    """
    print("\n📋 Column Information:")
    # Fetch each sampled row once instead of re-reading it for every column.
    rows = [
        split_data[i]
        for i in range(min(_SAMPLE_ROWS_FOR_TYPES, len(split_data)))
    ]
    for col in split_data.column_names:
        sample_values = [row[col] for row in rows if row[col] is not None]
        if sample_values:
            type_name = type(sample_values[0]).__name__
            # Values may be unhashable (lists, dicts), so compare their str() form.
            unique_count = len({str(v) for v in sample_values})
            print(f" {col}: {type_name}, ~{unique_count} unique values")
        else:
            print(f" {col}: All values are None")


def _examine_split(split_name, split_data):
    """Print size, columns, sample rows and column info for one split."""
    print(f"📊 {split_name.upper()} SPLIT ANALYSIS:")
    print("-" * 30)
    print(f"Number of examples: {len(split_data)}")
    print(f"Columns: {split_data.column_names}")

    _print_samples(split_data)
    _print_column_info(split_data)

    print("\n" + "=" * 50)


def _save_dataset_info(dataset_info, path=DATASET_INFO_PATH):
    """Write *dataset_info* as JSON to *path*; return True on success.

    Saving is best-effort: a missing or unwritable directory is reported
    but must not throw away a dataset that was loaded successfully.
    """
    try:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(dataset_info, f, indent=2)
    except OSError as e:
        print(f"\n⚠️ Could not save dataset information to {path}: {e}")
        return False
    print(f"\n💾 Dataset information saved to: {path}")
    return True


def _print_statistics(all_data):
    """Print row/column counts and column dtypes for every split DataFrame."""
    print("\n📊 DATASET STATISTICS:")
    print("-" * 25)
    for split_name, df in all_data.items():
        print(f"{split_name}:")
        print(f" Rows: {len(df)}")
        print(f" Columns: {len(df.columns)}")

        print(" Data types:")
        for col in df.columns:
            print(f" {col}: {df[col].dtype}")
        print()


def load_and_examine_dataset():
    """Load the Bengali math dataset and examine its structure.

    Prints a per-split report (size, columns, sample rows, column types),
    converts every split to a pandas DataFrame, and writes a small JSON
    summary to ``DATASET_INFO_PATH``.

    Returns:
        A ``(ds, all_data)`` tuple where ``ds`` is the object returned by
        ``load_dataset`` and ``all_data`` maps split name to DataFrame.
        Returns ``(None, None)`` if loading or examining the dataset fails.
    """
    print("🔄 Loading Bengali Math Dataset...")
    print(f"Dataset: {DATASET_NAME}")
    print("=" * 50)

    # Top-level boundary of the script: any failure is reported and turned
    # into the (None, None) result that the caller checks for.
    try:
        ds = load_dataset(DATASET_NAME)
        # Record when the data was actually loaded, not a fixed date.
        loaded_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        print("✅ Dataset loaded successfully!")
        print(f"Dataset splits: {list(ds.keys())}")
        print()

        for split_name, split_data in ds.items():
            _examine_split(split_name, split_data)

        dataset_info = {
            "dataset_name": DATASET_NAME,
            "splits": {name: len(split) for name, split in ds.items()},
            "columns": {name: split.column_names for name, split in ds.items()},
            "loaded_at": loaded_at,
        }

        print("\n📈 Creating DataFrame for easier inspection...")
        all_data = {}
        for split_name, split_data in ds.items():
            df = split_data.to_pandas()
            all_data[split_name] = df
            print(f"✅ Converted {split_name} to DataFrame with {len(df)} rows")

        _save_dataset_info(dataset_info)
        _print_statistics(all_data)

        return ds, all_data

    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        return None, None


def show_next_steps():
    """Print a menu of possible training approaches for the dataset."""
    print("\n🎯 POSSIBLE TRAINING APPROACHES:")
    print("=" * 40)

    print("1. 🤖 Language Model Fine-tuning")
    print(" - Train a Bengali language model on math content")
    print(" - Use for math problem solving or educational assistance")

    print("\n2. 📝 Text Classification")
    print(" - Classify math problems by difficulty level")
    print(" - Identify math problem types (arithmetic, algebra, etc.)")

    print("\n3. 🔍 Question Answering")
    print(" - Train a QA model for math problems")
    print(" - Provide step-by-step solutions")

    print("\n4. 📊 Data Analysis")
    print(" - Analyze patterns in math problem structure")
    print(" - Generate new similar problems")

    print("\n5. 🎓 Educational Tool")
    print(" - Create interactive math learning assistant")
    print(" - Adaptive problem generation")

    print("\n💡 What would you like to train?")


if __name__ == "__main__":
    ds, dataframes = load_and_examine_dataset()

    if ds is not None:
        show_next_steps()
    else:
        print("❌ Failed to load dataset. Please check the dataset name and your internet connection.")