{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "overview",
   "metadata": {},
   "source": [
    "# Readmission baseline: TF-IDF + logistic regression on MIMIC-III discharge summaries\n",
    "\n",
    "This notebook loads the MIMIC-III `NOTEEVENTS` table, keeps the discharge summaries, applies light text cleaning, and fits a TF-IDF + logistic-regression baseline.\n",
    "\n",
    "**Caveat:** the labels used below are random placeholders (to be replaced with `READMITTED_30D`). The reported metrics therefore only show that the pipeline runs end to end; expect a ROC-AUC close to 0.5."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "install-deps",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install dependencies with %pip so they land in the environment of the running kernel.\n",
    "# TODO: pin versions once the environment is settled.\n",
    "%pip install -q datasets scikit-learn pandas numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "imports",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from datasets import load_dataset\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import classification_report, roc_auc_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.pipeline import Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives in this cell.\n",
    "SEED = 42\n",
    "\n",
    "DATASET_NAME = \"ntphuc149/MIMIC-III-Clinical-Database\"\n",
    "DATASET_CONFIG = \"NOTEEVENTS\"\n",
    "DISCHARGE_CATEGORY = \"Discharge summary\"\n",
    "\n",
    "PLACEHOLDER_POSITIVE_RATE = 0.35  # share of positives among the temporary random labels\n",
    "TEST_SIZE = 0.2\n",
    "\n",
    "MAX_FEATURES = 20000\n",
    "NGRAM_RANGE = (1, 2)\n",
    "MAX_ITER = 1000"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "load-header",
   "metadata": {},
   "source": [
    "## Load discharge summaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d9c9ac7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_discharge_summary(category):\n",
    "    \"\"\"Return True when a CATEGORY value equals DISCHARGE_CATEGORY.\"\"\"\n",
    "    return category == DISCHARGE_CATEGORY\n",
    "\n",
    "\n",
    "notes_raw = load_dataset(DATASET_NAME, DATASET_CONFIG, split=\"train\")\n",
    "\n",
    "# input_columns passes only the CATEGORY value to the predicate, so the\n",
    "# filter does not need the (long) TEXT field of every note.\n",
    "discharge_notes = notes_raw.filter(is_discharge_summary, input_columns=[\"CATEGORY\"])\n",
    "\n",
    "# Fail early with a readable message instead of an obscure error at split time.\n",
    "assert len(discharge_notes) > 0, f\"no notes with CATEGORY == {DISCHARGE_CATEGORY!r}\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "clean-header",
   "metadata": {},
   "source": [
    "## Clean text\n",
    "\n",
    "Minimal, clinically safe cleaning: lower-casing, removal of `[** ... **]` de-identification placeholders, and whitespace normalisation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "clean-text-def",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DOTALL lets a placeholder that was wrapped across a line break still match\n",
    "# (assumes such wrapped placeholders occur in the notes - TODO confirm).\n",
    "DEID_PLACEHOLDER = re.compile(r\"\\[\\*\\*.*?\\*\\*\\]\", flags=re.DOTALL)\n",
    "WHITESPACE_RUN = re.compile(r\"\\s+\")\n",
    "\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Lower-case a note, drop [** ... **] placeholders and normalise whitespace.\n",
    "\n",
    "    Args:\n",
    "        text: Raw note text. None or any non-string value is treated as empty.\n",
    "\n",
    "    Returns:\n",
    "        The cleaned text on a single line with single spaces between tokens,\n",
    "        or an empty string when there is nothing to clean.\n",
    "    \"\"\"\n",
    "    if not isinstance(text, str):\n",
    "        return \"\"\n",
    "    lowered = text.lower()\n",
    "    # Substitute a space rather than nothing so neighbouring words are not fused.\n",
    "    without_placeholders = DEID_PLACEHOLDER.sub(\" \", lowered)\n",
    "    return WHITESPACE_RUN.sub(\" \", without_placeholders).strip()\n",
    "\n",
    "\n",
    "# Inline sanity checks (synthetic strings only, no patient data).\n",
    "assert clean_text(\"Seen by [**Doctor\\nName**]today.\\n\\nStable.\") == \"seen by today. stable.\"\n",
    "assert clean_text(None) == \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "clean-text-apply",
   "metadata": {},
   "outputs": [],
   "source": [
    "notes = discharge_notes.map(lambda row: {\"clean_text\": clean_text(row[\"TEXT\"])})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "labels-header",
   "metadata": {},
   "source": [
    "## Placeholder labels\n",
    "\n",
    "**TODO:** replace the random labels with the real `READMITTED_30D` outcome."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "labels",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TEMPORARY labels (replace with READMITTED_30D later)\n",
    "rng = np.random.default_rng(SEED)\n",
    "labels = rng.binomial(1, PLACEHOLDER_POSITIVE_RATE, size=len(notes))\n",
    "\n",
    "df = pd.DataFrame({\n",
    "    # list(...) materialises the column whether the installed `datasets`\n",
    "    # release returns a plain list or a column object for notes[...].\n",
    "    \"text\": list(notes[\"clean_text\"]),\n",
    "    \"label\": labels,\n",
    "})\n",
    "assert len(df) == len(notes)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "split-header",
   "metadata": {},
   "source": [
    "## Train / test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "split",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    df[\"text\"],\n",
    "    df[\"label\"],\n",
    "    test_size=TEST_SIZE,\n",
    "    stratify=df[\"label\"],\n",
    "    random_state=SEED,\n",
    ")\n",
    "assert len(X_train) + len(X_test) == len(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "model-header",
   "metadata": {},
   "source": [
    "## TF-IDF baseline model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "model",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Pipeline([\n",
    "    (\"tfidf\", TfidfVectorizer(\n",
    "        max_features=MAX_FEATURES,\n",
    "        ngram_range=NGRAM_RANGE,\n",
    "        stop_words=\"english\",\n",
    "    )),\n",
    "    (\"clf\", LogisticRegression(\n",
    "        max_iter=MAX_ITER,\n",
    "        class_weight=\"balanced\",\n",
    "    )),\n",
    "])\n",
    "\n",
    "model.fit(X_train, y_train);"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "evaluate-header",
   "metadata": {},
   "source": [
    "## Evaluate\n",
    "\n",
    "With the placeholder labels these numbers carry no clinical meaning; they only confirm that training and scoring work."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "evaluate",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict(X_test)\n",
    "y_prob = model.predict_proba(X_test)[:, 1]\n",
    "\n",
    "print(classification_report(y_test, y_pred))\n",
    "print(\"ROC-AUC:\", roc_auc_score(y_test, y_prob))"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}