kimtaeyeong1229 committed on
Commit
a45270b
·
verified ·
1 Parent(s): 1e7ec49

Upload utils/data.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. utils/data.py +64 -0
utils/data.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data loading and preprocessing utilities for Titanic dataset."""
2
+ import numpy as np
3
+ import pandas as pd
4
+ import seaborn as sns
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.preprocessing import StandardScaler
7
+ import streamlit as st
8
+
9
+ SEED = 42  # passed as random_state to train_test_split in get_train_test_data
10
+
11
+
12
@st.cache_data
def load_titanic():
    """Fetch the Titanic passenger table through seaborn.

    Going through ``sns.load_dataset`` means no Kaggle account is needed.
    The function is wrapped with Streamlit's ``st.cache_data`` decorator.

    Returns:
        The object returned by ``sns.load_dataset('titanic')``.
    """
    titanic = sns.load_dataset('titanic')
    return titanic
16
+
17
+
18
def preprocess_titanic(df):
    """Preprocess the Titanic dataset (logic reused from the notebook).

    Keeps the model's input columns plus the target, fills missing values,
    encodes the categorical columns as integers and adds one derived
    feature. The input frame is never modified.

    Args:
        df: DataFrame containing at least the columns ``pclass``, ``sex``,
            ``age``, ``sibsp``, ``parch``, ``fare``, ``embarked`` and
            ``survived``. Any other columns are dropped.

    Returns:
        A new DataFrame with the columns listed above followed by
        ``family_size`` (``sibsp + parch``). ``sex`` is 1 for ``'male'`` and
        0 otherwise; ``embarked`` is mapped S -> 0, C -> 1, Q -> 2 (any other
        value becomes NaN).

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
    target = 'survived'

    # Work on an independent copy so the caller's frame stays untouched.
    data = df[features + [target]].copy()

    # Missing-value handling.
    # Assign the filled column back instead of using
    # ``data[col].fillna(..., inplace=True)``: the chained in-place form
    # operates on a temporary under pandas Copy-on-Write and would leave the
    # NaNs in ``data``.
    data['age'] = data['age'].fillna(data['age'].median())
    data['fare'] = data['fare'].fillna(data['fare'].median())
    embarked_mode = data['embarked'].mode()
    if not embarked_mode.empty:
        # mode() is empty when the column has no non-null values; in that
        # case there is nothing sensible to fill with.
        data['embarked'] = data['embarked'].fillna(embarked_mode.iloc[0])

    # Categorical -> numeric.
    data['sex'] = (data['sex'] == 'male').astype(int)
    embarked_map = {'S': 0, 'C': 1, 'Q': 2}
    data['embarked'] = data['embarked'].map(embarked_map)

    # Derived feature.
    data['family_size'] = data['sibsp'] + data['parch']

    return data
41
+
42
+
43
@st.cache_data
def get_train_test_data():
    """Build the preprocessed, stratified train/test split plus scaled copies.

    Loads the dataset with ``load_titanic``, runs ``preprocess_titanic`` on
    it, holds out 20% of the rows (stratified on ``survived``, seeded with
    ``SEED``) and standardises the features with a ``StandardScaler`` that is
    fitted on the training portion only.

    Returns:
        A 7-tuple ``(X_train, X_test, y_train, y_test, X_train_scaled,
        X_test_scaled, scaler)`` where the ``*_scaled`` entries are the
        scaler's outputs and ``scaler`` is the fitted ``StandardScaler``.
    """
    prepared = preprocess_titanic(load_titanic())

    labels = prepared['survived']
    inputs = prepared.drop(columns='survived')

    split = train_test_split(
        inputs,
        labels,
        test_size=0.2,
        random_state=SEED,
        stratify=labels,
    )
    X_train, X_test, y_train, y_test = split

    # Fit on the training rows only, then reuse those statistics for the
    # held-out rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return (
        X_train,
        X_test,
        y_train,
        y_test,
        X_train_scaled,
        X_test_scaled,
        scaler,
    )
61
+
62
+
63
def get_feature_names():
    """Return the feature column names as a freshly built list.

    The list holds the seven raw input columns followed by the derived
    ``family_size`` column.
    """
    raw_columns = ('pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked')
    return [*raw_columns, 'family_size']