Sfoglia il codice sorgente

Ajout des médicaments dans le pre-processing

Ali Bellamine 3 anni fa
parent
commit
2edeb2ab4f
2 file modificati con 118 aggiunte e 15 eliminazioni
  1. 115 15
      0_Preprocessing.ipynb
  2. 3 0
      requirements.txt

+ 115 - 15
0_Preprocessing.ipynb

@@ -56,9 +56,13 @@
     "# Sqlite connection\n",
     "conn = sqlite3.connect(\"./data/mimic-iv.sqlite\")\n",
     "\n",
-    "# Custom lab items classification\n",
+    "# Classification des items de biologie\n",
     "items = pd.read_csv(\"./config/lab_items.csv\").dropna()\n",
-    "items_list = items[\"item_id\"].astype(\"str\").tolist()"
+    "items_list = items[\"item_id\"].astype(\"str\").tolist()\n",
+    "\n",
+    "# Classification ATC des médicaments\n",
+    "drugs_rules = pd.read_csv(\"./config/atc_items.csv\")\n",
+    "drugs_rules_list = drugs_rules[\"gsn\"].drop_duplicates().astype(\"str\").tolist()"
    ]
   },
   {
@@ -80,7 +84,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 148,
+   "execution_count": 301,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -108,21 +112,116 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 151,
+   "execution_count": 302,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stays[\"intime\"] = pd.to_datetime(stays[\"intime\"])\n",
+    "stays[\"gender\"] = stays[\"gender\"].astype(\"string\") # Pas de valeurs manquantes en gender\n",
+    "stays[\"chiefcomplaint\"] = stays[\"chiefcomplaint\"].fillna(\"\").astype(\"string\") # Chiefcomplaint manquant = chiefcomplaint vide"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 284,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "drugs = pd.read_sql(f\"\"\"\n",
+    "    SELECT stay_id, gsn\n",
+    "    FROM medrecon\n",
+    "    WHERE gsn IN ({','.join(drugs_rules_list)})\n",
+    "\"\"\", conn)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 285,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# @TODO : include drugs"
+    "# Liste des codes ATC pour chaque séjour\n",
+    "atc_stays = pd.merge(\n",
+    "    drugs,\n",
+    "    drugs_rules,\n",
+    "    left_on=\"gsn\",\n",
+    "    right_on=\"gsn\"\n",
+    ").drop_duplicates([\"stay_id\",\"atc\"])\n",
+    "\n",
+    "atc_stays[\"atc_2\"] = atc_stays[\"atc\"].str.slice(0, 3)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 178,
+   "execution_count": 286,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Writting\n",
-    "stays.to_csv(\"./data/features.csv\")"
+    "# Considérons 2 niveaux de granularité\n",
+    "## Le code ATC complet (Anatomique, Thérapeutique et Pharmacologique), ATC IV\n",
+    "\n",
+    "atc_stays_pivoted_4 = pd.pivot_table(\n",
+    "    atc_stays[[\"stay_id\",\"atc\"]] \\\n",
+    "        .assign(value=1),\n",
+    "    columns=[\"atc\"],\n",
+    "    index=[\"stay_id\"],\n",
+    "    values=\"value\"\n",
+    ").fillna(0).reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 287,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Le code ATC 2 (Anatomique et Thérapeutique)\n",
+    "\n",
+    "atc_stays_pivoted_2 = pd.pivot_table(\n",
+    "    atc_stays[[\"stay_id\",\"atc_2\"]] \\\n",
+    "        .drop_duplicates() \\\n",
+    "        .rename(columns={\"atc_2\":\"atc\"}) \\\n",
+    "        .assign(value=1),\n",
+    "    columns=[\"atc\"],\n",
+    "    index=[\"stay_id\"],\n",
+    "    values=\"value\"\n",
+    ").fillna(0).reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 304,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stays_atc_4 = pd.merge(\n",
+    "    stays,\n",
+    "    atc_stays_pivoted_4,\n",
+    "    left_on=\"stay_id\",\n",
+    "    right_on=\"stay_id\",\n",
+    "    how=\"left\"\n",
+    ").fillna(0)\n",
+    "\n",
+    "stays_atc_2 = pd.merge(\n",
+    "    stays,\n",
+    "    atc_stays_pivoted_2,\n",
+    "    left_on=\"stay_id\",\n",
+    "    right_on=\"stay_id\",\n",
+    "    how=\"left\"\n",
+    ").fillna(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 306,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ecriture du features dataset\n",
+    "# On écrit en parquet pour optimiser le stockage et les temps d'io\n",
+    "\n",
+    "stays_atc_2.to_parquet(\"./data/features_atc2.parquet\", engine=\"pyarrow\", index=False)\n",
+    "stays_atc_4.to_parquet(\"./data/features_atc4.parquet\", engine=\"pyarrow\", index=False)"
    ]
   },
   {
@@ -134,7 +233,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": 312,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -152,7 +251,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 119,
+   "execution_count": 315,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -161,13 +260,14 @@
     "    labs,\n",
     "    left_on=\"item_id\",\n",
     "    right_on=\"item_id\"\n",
-    ").drop_duplicates([\"stay_id\", \"label\"])[[\"stay_id\",\"label\"]] \\\n",
+    ") \\\n",
+    " .drop_duplicates([\"stay_id\", \"label\"])[[\"stay_id\",\"label\"]] \\\n",
     " .reset_index(drop=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 170,
+   "execution_count": 316,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -181,7 +281,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 176,
+   "execution_count": 317,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -193,11 +293,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 179,
+   "execution_count": 319,
    "metadata": {},
    "outputs": [],
    "source": [
-    "labs_deduplicate_pivot_final.to_csv(\"./data/labels.csv\")"
+    "labs_deduplicate_pivot_final.to_parquet(\"./data/labels.parquet\", index=False)"
    ]
   }
  ],

+ 3 - 0
requirements.txt

@@ -1 +1,4 @@
 progress
+pyarrow
+pandas
+sqlite3