{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![Urgences - Image CC0 - pexels.com](img/pexels-pixabay-263402.jpg \"Urgences\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Challenge - [ED Lab Prediction]\n", "_Nom à trouver_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Objectif" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ce notebook effectue le pre-processing des données. \n", "Il exploite les données stockées dans la base sqlite, téléchargées à partir de `download_data.py` et les exporte dans un fichier csv à destination de l'entrainement de la data-visualisation l'algorithme." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Chargement des données" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import sqlite3\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [], "source": [ "# Sqlite connection\n", "conn = sqlite3.connect(\"./data/mimic-iv.sqlite\")\n", "\n", "# Classification des items de biologie\n", "items = pd.read_csv(\"./config/lab_items.csv\").dropna()\n", "items_list = items[\"item_id\"].astype(\"str\").tolist()\n", "\n", "# Classification ATC des médicaments\n", "drugs_rules = pd.read_csv(\"./config/atc_items.csv\")\n", "drugs_rules_list = drugs_rules[\"gsn\"].drop_duplicates().astype(\"str\").tolist()" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [], "source": [ "# Création d'un index pour accélérer les requêtes\n", "conn.execute(\"CREATE INDEX IF NOT EXISTS biological_index ON labevents (stay_id, itemid)\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Stays dataset" ] }, { "cell_type": "code", "execution_count": 301, "metadata": {}, "outputs": [], "source": [ "stays = pd.read_sql(f\"\"\"\n", " SELECT \n", " s.stay_id,\n", " s.intime intime,\n", " p.gender gender,\n", " p.anchor_age age,\n", " t.temperature,\n", " t.heartrate,\n", " t.resprate,\n", " t.o2sat,\n", " t.sbp,\n", " t.dbp,\n", " t.pain,\n", " t.chiefcomplaint\n", " FROM edstays s\n", " LEFT JOIN patients p\n", " ON p.subject_id = s.subject_id\n", " LEFT Join triage t\n", " ON t.stay_id = s.stay_id\n", "\"\"\", conn)" ] }, { "cell_type": "code", "execution_count": 302, "metadata": {}, "outputs": [], "source": [ "stays[\"intime\"] = pd.to_datetime(stays[\"intime\"])\n", "stays[\"gender\"] = stays[\"gender\"].astype(\"string\") # Pas de valeurs manquantes en gender\n", "stays[\"chiefcomplaint\"] = stays[\"chiefcomplaint\"].fillna(\"\").astype(\"string\") # ¨Chiefcomplaint manquant = chiefcomplaint vide" ] }, { "cell_type": "code", "execution_count": 284, "metadata": {}, "outputs": [], "source": [ "drugs = pd.read_sql(f\"\"\"\n", " SELECT stay_id, gsn\n", " FROM medrecon\n", " WHERE gsn IN ({','.join(drugs_rules_list)})\n", "\"\"\", conn)" ] }, { "cell_type": "code", "execution_count": 285, "metadata": {}, "outputs": [], "source": [ "# Liste des codes ATC pour chaque séjour\n", "atc_stays = pd.merge(\n", " drugs,\n", " drugs_rules,\n", " left_on=\"gsn\",\n", " right_on=\"gsn\"\n", ").drop_duplicates([\"stay_id\",\"atc\"])\n", "\n", "atc_stays[\"atc_2\"] = atc_stays[\"atc\"].str.slice(0, 3)" ] }, { "cell_type": "code", "execution_count": 286, "metadata": {}, "outputs": [], "source": [ "# Considérons 2 niveaux de granularité\n", "## Le code ATC complet (Anatomique, Thérapeutique et Pharmacologique), ATC IV\n", "\n", "atc_stays_pivoted_4 = pd.pivot_table(\n", " atc_stays[[\"stay_id\",\"atc\"]] \\\n", " .assign(value=1),\n", " columns=[\"atc\"],\n", " index=[\"stay_id\"],\n", " values=\"value\"\n", ").fillna(0).reset_index()" ] }, { "cell_type": "code", "execution_count": 287, "metadata": {}, "outputs": [], "source": [ "## Le code ATC 2 (Anatomique et Thérapeutique)\n", "\n", "atc_stays_pivoted_2 = pd.pivot_table(\n", " atc_stays[[\"stay_id\",\"atc_2\"]] \\\n", " .drop_duplicates() \\\n", " .rename(columns={\"atc_2\":\"atc\"}) \\\n", " .assign(value=1),\n", " columns=[\"atc\"],\n", " index=[\"stay_id\"],\n", " values=\"value\"\n", ").fillna(0).reset_index()" ] }, { "cell_type": "code", "execution_count": 304, "metadata": {}, "outputs": [], "source": [ "stays_atc_4 = pd.merge(\n", " stays,\n", " atc_stays_pivoted_4,\n", " left_on=\"stay_id\",\n", " right_on=\"stay_id\",\n", " how=\"left\"\n", ").fillna(0)\n", "\n", "stays_atc_2 = pd.merge(\n", " stays,\n", " atc_stays_pivoted_2,\n", " left_on=\"stay_id\",\n", " right_on=\"stay_id\",\n", " how=\"left\"\n", ").fillna(0)" ] }, { "cell_type": "code", "execution_count": 306, "metadata": {}, "outputs": [], "source": [ "# Ecriture du featues dataset\n", "# On écrit en parquet pour optimiser le stockage et les temps d'io\n", "\n", "stays_atc_2.to_parquet(\"./data/features_atc2.parquet\", engine=\"pyarrow\", index=False)\n", "stays_atc_4.to_parquet(\"./data/features_atc4.parquet\", engine=\"pyarrow\", index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lab dataset" ] }, { "cell_type": "code", "execution_count": 312, "metadata": {}, "outputs": [], "source": [ "labs = pd.read_sql(f\"\"\"\n", " SELECT \n", " le.stay_id,\n", " le.itemid item_id\n", " FROM labevents le\n", " WHERE le.itemid IN ('{\"','\".join(items_list)}')\n", " GROUP BY\n", " le.stay_id,\n", " le.itemid\n", "\"\"\", conn)" ] }, { "cell_type": "code", "execution_count": 315, "metadata": {}, "outputs": [], "source": [ "labs_deduplicate = pd.merge(\n", " items[[\"item_id\",\"3\"]].rename(columns={\"3\":\"label\"}),\n", " labs,\n", " left_on=\"item_id\",\n", " right_on=\"item_id\"\n", ") \\\n", " .drop_duplicates([\"stay_id\", \"label\"])[[\"stay_id\",\"label\"]] \\\n", " .reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 316, "metadata": {}, "outputs": [], "source": [ "labs_deduplicate_pivot = pd.pivot_table(\n", " labs_deduplicate.assign(value=1),\n", " index=[\"stay_id\"],\n", " columns=[\"label\"],\n", " values=\"value\"\n", ").fillna(0)" ] }, { "cell_type": "code", "execution_count": 317, "metadata": {}, "outputs": [], "source": [ "labs_deduplicate_pivot_final = labs_deduplicate_pivot.join(\n", " stays[[\"stay_id\"]].set_index(\"stay_id\"),\n", " how=\"right\"\n", ").fillna(0).astype(\"int8\").reset_index()" ] }, { "cell_type": "code", "execution_count": 319, "metadata": {}, "outputs": [], "source": [ "labs_deduplicate_pivot_final.to_parquet(\"./data/labels.parquet\", index=False)" ] } ], "metadata": { "interpreter": { "hash": "28b293e0c0671e44c7281dde6399c7c7419d3faca031d22494da8635907ada72" }, "kernelspec": { "display_name": "Python 3.9.7 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }