"""Data loading and preprocessing utilities for Titanic dataset.""" import numpy as np import pandas as pd import seaborn as sns from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler import streamlit as st SEED = 42 @st.cache_data def load_titanic(): """Load Titanic dataset from seaborn (no Kaggle account needed).""" return sns.load_dataset('titanic') def preprocess_titanic(df): """타이타닉 데이터 전처리 함수 (노트북 코드 재사용)""" data = df.copy() features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'] target = 'survived' data = data[features + [target]].copy() # 결측값 처리 data['age'].fillna(data['age'].median(), inplace=True) data['fare'].fillna(data['fare'].median(), inplace=True) data['embarked'].fillna(data['embarked'].mode()[0], inplace=True) # 범주형 → 숫자 data['sex'] = (data['sex'] == 'male').astype(int) embarked_map = {'S': 0, 'C': 1, 'Q': 2} data['embarked'] = data['embarked'].map(embarked_map) # 파생 피처 data['family_size'] = data['sibsp'] + data['parch'] return data @st.cache_data def get_train_test_data(): """Return preprocessed train/test split with scaling.""" df = load_titanic() data = preprocess_titanic(df) X = data.drop('survived', axis=1) y = data['survived'] X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=SEED, stratify=y ) scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) return X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled, scaler def get_feature_names(): return ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'family_size']