Spaces:

kimtaeyeong1229
/

AI-Study-Roadmap

Sleeping

App Files Files Community

kimtaeyeong1229 committed on Feb 21

Commit

fdef6a0

verified ·

1 Parent(s): 138d3c8

Upload utils/models.py with huggingface_hub

Browse files

Files changed (1) hide show

utils/models.py +215 -0

utils/models.py ADDED Viewed

	@@ -0,0 +1,215 @@

+"""ML models and PyTorch MLP class (reused from notebook)."""
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.svm import SVC
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.metrics import accuracy_score, confusion_matrix
+from sklearn.model_selection import cross_val_score
+# xgboost is an optional dependency: record whether the import succeeded in
+# XGBOOST_AVAILABLE instead of letting a missing package break module import.
+try:
+    from xgboost import XGBClassifier
+    XGBOOST_AVAILABLE = True
+except ImportError:
+    XGBOOST_AVAILABLE = False
+# Passed as random_state to the estimators built in this module.
+SEED = 42
+# Tensors and the MLP created in this module are moved to this device.
+DEVICE = torch.device('cpu')  # HF Spaces CPU tier
+# ── TitanicMLP (ported from the notebook) ──
+class TitanicMLP(nn.Module):
+    def __init__(self, input_dim, hidden_dims=None, dropout=0.3):
+        super(TitanicMLP, self).__init__()
+        if hidden_dims is None:
+            hidden_dims = [64, 32]
+        layers = []
+        prev_dim = input_dim
+        for hidden_dim in hidden_dims:
+            layers.extend([
+                nn.Linear(prev_dim, hidden_dim),
+                nn.BatchNorm1d(hidden_dim),
+                nn.ReLU(),
+                nn.Dropout(dropout),
+            ])
+            prev_dim = hidden_dim
+        layers.append(nn.Linear(prev_dim, 1))
+        layers.append(nn.Sigmoid())
+        self.network = nn.Sequential(*layers)
+    def forward(self, x):
+        return self.network(x).squeeze()
+def make_dataloader(X_arr, y_arr, batch_size=32, shuffle=True):
+    X_tensor = torch.FloatTensor(np.array(X_arr)).to(DEVICE)
+    y_tensor = torch.FloatTensor(np.array(y_arr)).to(DEVICE)
+    dataset = TensorDataset(X_tensor, y_tensor)
+    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
+def build_sklearn_model(algo: str, params: dict):
+    """Build a sklearn model by algorithm name and hyperparameter dict."""
+    if algo == 'Logistic Regression':
+        return LogisticRegression(
+            C=params.get('C', 1.0),
+            max_iter=1000,
+            random_state=SEED,
+        )
+    elif algo == 'Decision Tree':
+        return DecisionTreeClassifier(
+            max_depth=params.get('max_depth', 4),
+            min_samples_leaf=params.get('min_samples_leaf', 1),
+            random_state=SEED,
+        )
+    elif algo == 'Random Forest':
+        return RandomForestClassifier(
+            n_estimators=params.get('n_estimators', 100),
+            max_depth=params.get('max_depth', 5),
+            random_state=SEED,
+        )
+    elif algo == 'SVM (RBF)':
+        return SVC(
+            C=params.get('C', 1.0),
+            gamma=params.get('gamma', 'scale'),
+            kernel='rbf',
+            probability=True,
+            random_state=SEED,
+        )
+    elif algo == 'KNN':
+        return KNeighborsClassifier(
+            n_neighbors=params.get('n_neighbors', 7),
+            weights=params.get('weights', 'uniform'),
+        )
+    elif algo == 'Gradient Boosting':
+        return GradientBoostingClassifier(
+            n_estimators=params.get('n_estimators', 100),
+            learning_rate=params.get('learning_rate', 0.1),
+            max_depth=params.get('max_depth', 3),
+            random_state=SEED,
+        )
+    elif algo == 'XGBoost':
+        return XGBClassifier(
+            n_estimators=params.get('n_estimators', 100),
+            learning_rate=params.get('learning_rate', 0.1),
+            max_depth=params.get('max_depth', 3),
+            random_state=SEED,
+            eval_metric='logloss',
+            verbosity=0,
+        )
+    elif algo == 'Naive Bayes':
+        return GaussianNB()
+    else:
+        raise ValueError(f"Unknown algorithm: {algo}")
+def train_sklearn_model(model, X_train, X_test, y_train, y_test, cv_folds=5):
+    """Train and evaluate a sklearn model. Returns metrics dict."""
+    model.fit(X_train, y_train)
+    y_pred = model.predict(X_test)
+    acc = accuracy_score(y_test, y_pred)
+    cm = confusion_matrix(y_test, y_pred)
+    cv_scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring='accuracy')
+    feature_importances = None
+    if hasattr(model, 'feature_importances_'):
+        feature_importances = model.feature_importances_
+    elif hasattr(model, 'coef_'):
+        feature_importances = model.coef_[0]
+    return {
+        'model': model,
+        'accuracy': acc,
+        'y_pred': y_pred,
+        'confusion_matrix': cm,
+        'cv_mean': cv_scores.mean(),
+        'cv_std': cv_scores.std(),
+        'feature_importances': feature_importances,
+    }
+def train_mlp(X_train_scaled, X_test_scaled, y_train, y_test,
+              hidden_dims, epochs, lr, batch_size, dropout,
+              progress_callback=None):
+    """Train TitanicMLP and return training history + metrics."""
+    input_dim = X_train_scaled.shape[1]
+    mlp = TitanicMLP(input_dim=input_dim, hidden_dims=hidden_dims, dropout=dropout).to(DEVICE)
+    train_loader = make_dataloader(X_train_scaled, y_train.values, batch_size, shuffle=True)
+    test_loader = make_dataloader(X_test_scaled, y_test.values, batch_size, shuffle=False)
+    criterion = nn.BCELoss()
+    optimizer = optim.Adam(mlp.parameters(), lr=lr, weight_decay=1e-4)
+    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=max(1, epochs // 3), gamma=0.5)
+    train_losses, test_losses = [], []
+    train_accs, test_accs = [], []
+    for epoch in range(1, epochs + 1):
+        # Train
+        mlp.train()
+        epoch_loss, correct, total = 0.0, 0, 0
+        for X_batch, y_batch in train_loader:
+            optimizer.zero_grad()
+            output = mlp(X_batch)
+            loss = criterion(output, y_batch)
+            loss.backward()
+            optimizer.step()
+            epoch_loss += loss.item()
+            pred = (output >= 0.5).float()
+            correct += (pred == y_batch).sum().item()
+            total += len(y_batch)
+        train_losses.append(epoch_loss / len(train_loader))
+        train_accs.append(correct / total)
+        # Eval
+        mlp.eval()
+        with torch.no_grad():
+            t_loss, t_correct, t_total = 0.0, 0, 0
+            for X_batch, y_batch in test_loader:
+                output = mlp(X_batch)
+                t_loss += criterion(output, y_batch).item()
+                pred = (output >= 0.5).float()
+                t_correct += (pred == y_batch).sum().item()
+                t_total += len(y_batch)
+        test_losses.append(t_loss / len(test_loader))
+        test_accs.append(t_correct / t_total)
+        scheduler.step()
+        if progress_callback:
+            progress_callback(epoch, epochs, train_losses[-1], train_accs[-1],
+                              test_losses[-1], test_accs[-1])
+    # Final predictions for confusion matrix
+    mlp.eval()
+    y_pred_list = []
+    with torch.no_grad():
+        for X_batch, _ in test_loader:
+            output = mlp(X_batch)
+            y_pred_list.extend((output >= 0.5).cpu().numpy().astype(int))
+    cm = confusion_matrix(y_test, y_pred_list)
+    return {
+        'train_losses': train_losses,
+        'test_losses': test_losses,
+        'train_accs': train_accs,
+        'test_accs': test_accs,
+        'final_acc': test_accs[-1],
+        'confusion_matrix': cm,
+        'y_pred': y_pred_list,
+    }