{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a1f0c3d2",
   "metadata": {},
   "source": [
    "# MIMIC-III discharge summaries: quick look\n",
    "\n",
    "Loads the `NOTEEVENTS` table from the Hugging Face Hub, keeps the rows whose `CATEGORY` is `Discharge summary`, and prints the beginning of a few of them.\n",
    "\n",
    "**Note:** the printed text is clinical note content. Clear the cell outputs before committing or sharing this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27d3ad11",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install dependencies.\n",
    "# %pip (not !pip) installs into the environment of the running kernel.\n",
    "%pip install datasets pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b8e2c74",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "import pandas as pd\n",
    "\n",
    "# Configuration: everything a reader might want to tune lives here.\n",
    "DATASET_REPO = \"ntphuc149/MIMIC-III-Clinical-Database\"\n",
    "DATASET_CONFIG = \"NOTEEVENTS\"\n",
    "NOTE_CATEGORY = \"Discharge summary\"\n",
    "N_SAMPLES = 5  # number of summaries to preview\n",
    "MAX_CHARS = 2000  # characters printed per summary"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9d4e6f1",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e31b0a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load NOTEEVENTS directly from Hugging Face (no manual download)\n",
    "notes = load_dataset(DATASET_REPO, DATASET_CONFIG, split=\"train\")\n",
    "\n",
    "# Inspect columns\n",
    "print(\"Columns:\", notes.column_names)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d2a7f5b3",
   "metadata": {},
   "source": [
    "## Filter discharge summaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f4c1d96",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only the CATEGORY column is handed to the predicate, so the long TEXT\n",
    "# field does not have to be materialised for every row while filtering.\n",
    "discharge_notes = notes.filter(\n",
    "    lambda category: category == NOTE_CATEGORY,\n",
    "    input_columns=[\"CATEGORY\"],\n",
    ")\n",
    "\n",
    "print(\"Total discharge summaries:\", len(discharge_notes))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6b9a0c5",
   "metadata": {},
   "source": [
    "## Inspect a small sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a5d7e92",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clamp the sample size: select() raises IndexError for out-of-range indices,\n",
    "# which would happen if fewer than N_SAMPLES rows matched the filter.\n",
    "sample_size = min(N_SAMPLES, len(discharge_notes))\n",
    "\n",
    "# Convert small sample to pandas for inspection\n",
    "df = discharge_notes.select(range(sample_size)).to_pandas()\n",
    "\n",
    "# Print sample text\n",
    "for i, text in enumerate(df[\"TEXT\"], start=1):\n",
    "    print(f\"\\n--- Discharge Summary {i} ---\\n\")\n",
    "    # A null TEXT value prints as an empty string instead of raising TypeError.\n",
    "    print((text or \"\")[:MAX_CHARS])"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}