|
@@ -56,9 +56,13 @@
|
|
|
"# Sqlite connection\n",
|
|
|
"conn = sqlite3.connect(\"./data/mimic-iv.sqlite\")\n",
|
|
|
"\n",
|
|
|
- "# Custom lab items classification\n",
|
|
|
+ "# Classification des items de biologie\n",
|
|
|
"items = pd.read_csv(\"./config/lab_items.csv\").dropna()\n",
|
|
|
- "items_list = items[\"item_id\"].astype(\"str\").tolist()"
|
|
|
+ "items_list = items[\"item_id\"].astype(\"str\").tolist()\n",
|
|
|
+ "\n",
|
|
|
+ "# Classification ATC des médicaments\n",
|
|
|
+ "drugs_rules = pd.read_csv(\"./config/atc_items.csv\")\n",
|
|
|
+ "drugs_rules_list = drugs_rules[\"gsn\"].drop_duplicates().astype(\"str\").tolist()"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
@@ -80,7 +84,7 @@
|
|
|
},
|
|
|
{
|
|
|
"cell_type": "code",
|
|
|
- "execution_count": 148,
|
|
|
+ "execution_count": 301,
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
@@ -108,21 +112,116 @@
|
|
|
},
|
|
|
{
|
|
|
"cell_type": "code",
|
|
|
- "execution_count": 151,
|
|
|
+ "execution_count": 302,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "stays[\"intime\"] = pd.to_datetime(stays[\"intime\"])\n",
|
|
|
+ "stays[\"gender\"] = stays[\"gender\"].astype(\"string\") # Pas de valeurs manquantes en gender\n",
|
|
|
+ "stays[\"chiefcomplaint\"] = stays[\"chiefcomplaint\"].fillna(\"\").astype(\"string\") # Chiefcomplaint manquant = chiefcomplaint vide"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 284,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "drugs = pd.read_sql(f\"\"\"\n",
|
|
|
+ " SELECT stay_id, gsn\n",
|
|
|
+ " FROM medrecon\n",
|
|
|
+ " WHERE gsn IN ({','.join(drugs_rules_list)})\n",
|
|
|
+ "\"\"\", conn)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 285,
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "# @TODO : include drugs"
|
|
|
+ "# Liste des codes ATC pour chaque séjour\n",
|
|
|
+ "atc_stays = pd.merge(\n",
|
|
|
+ " drugs,\n",
|
|
|
+ " drugs_rules,\n",
|
|
|
+ " left_on=\"gsn\",\n",
|
|
|
+ " right_on=\"gsn\"\n",
|
|
|
+ ").drop_duplicates([\"stay_id\",\"atc\"])\n",
|
|
|
+ "\n",
|
|
|
+ "atc_stays[\"atc_2\"] = atc_stays[\"atc\"].str.slice(0, 3)"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
|
"cell_type": "code",
|
|
|
- "execution_count": 178,
|
|
|
+ "execution_count": 286,
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "# Writting\n",
|
|
|
- "stays.to_csv(\"./data/features.csv\")"
|
|
|
+ "# Considérons 2 niveaux de granularité\n",
|
|
|
+ "## Le code ATC complet (Anatomique, Thérapeutique et Pharmacologique), ATC IV\n",
|
|
|
+ "\n",
|
|
|
+ "atc_stays_pivoted_4 = pd.pivot_table(\n",
|
|
|
+ " atc_stays[[\"stay_id\",\"atc\"]] \\\n",
|
|
|
+ " .assign(value=1),\n",
|
|
|
+ " columns=[\"atc\"],\n",
|
|
|
+ " index=[\"stay_id\"],\n",
|
|
|
+ " values=\"value\"\n",
|
|
|
+ ").fillna(0).reset_index()"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 287,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "## Le code ATC 2 (Anatomique et Thérapeutique)\n",
|
|
|
+ "\n",
|
|
|
+ "atc_stays_pivoted_2 = pd.pivot_table(\n",
|
|
|
+ " atc_stays[[\"stay_id\",\"atc_2\"]] \\\n",
|
|
|
+ " .drop_duplicates() \\\n",
|
|
|
+ " .rename(columns={\"atc_2\":\"atc\"}) \\\n",
|
|
|
+ " .assign(value=1),\n",
|
|
|
+ " columns=[\"atc\"],\n",
|
|
|
+ " index=[\"stay_id\"],\n",
|
|
|
+ " values=\"value\"\n",
|
|
|
+ ").fillna(0).reset_index()"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 304,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "stays_atc_4 = pd.merge(\n",
|
|
|
+ " stays,\n",
|
|
|
+ " atc_stays_pivoted_4,\n",
|
|
|
+ " left_on=\"stay_id\",\n",
|
|
|
+ " right_on=\"stay_id\",\n",
|
|
|
+ " how=\"left\"\n",
|
|
|
+ ").fillna(0)\n",
|
|
|
+ "\n",
|
|
|
+ "stays_atc_2 = pd.merge(\n",
|
|
|
+ " stays,\n",
|
|
|
+ " atc_stays_pivoted_2,\n",
|
|
|
+ " left_on=\"stay_id\",\n",
|
|
|
+ " right_on=\"stay_id\",\n",
|
|
|
+ " how=\"left\"\n",
|
|
|
+ ").fillna(0)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 306,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Ecriture du features dataset\n",
|
|
|
+ "# On écrit en parquet pour optimiser le stockage et les temps d'io\n",
|
|
|
+ "\n",
|
|
|
+ "stays_atc_2.to_parquet(\"./data/features_atc2.parquet\", engine=\"pyarrow\", index=False)\n",
|
|
|
+ "stays_atc_4.to_parquet(\"./data/features_atc4.parquet\", engine=\"pyarrow\", index=False)"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
@@ -134,7 +233,7 @@
|
|
|
},
|
|
|
{
|
|
|
"cell_type": "code",
|
|
|
- "execution_count": 112,
|
|
|
+ "execution_count": 312,
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
@@ -152,7 +251,7 @@
|
|
|
},
|
|
|
{
|
|
|
"cell_type": "code",
|
|
|
- "execution_count": 119,
|
|
|
+ "execution_count": 315,
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
@@ -161,13 +260,14 @@
|
|
|
" labs,\n",
|
|
|
" left_on=\"item_id\",\n",
|
|
|
" right_on=\"item_id\"\n",
|
|
|
- ").drop_duplicates([\"stay_id\", \"label\"])[[\"stay_id\",\"label\"]] \\\n",
|
|
|
+ ") \\\n",
|
|
|
+ " .drop_duplicates([\"stay_id\", \"label\"])[[\"stay_id\",\"label\"]] \\\n",
|
|
|
" .reset_index(drop=True)"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
|
"cell_type": "code",
|
|
|
- "execution_count": 170,
|
|
|
+ "execution_count": 316,
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
@@ -181,7 +281,7 @@
|
|
|
},
|
|
|
{
|
|
|
"cell_type": "code",
|
|
|
- "execution_count": 176,
|
|
|
+ "execution_count": 317,
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
@@ -193,11 +293,11 @@
|
|
|
},
|
|
|
{
|
|
|
"cell_type": "code",
|
|
|
- "execution_count": 179,
|
|
|
+ "execution_count": 319,
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "labs_deduplicate_pivot_final.to_csv(\"./data/labels.csv\")"
|
|
|
+ "labs_deduplicate_pivot_final.to_parquet(\"./data/labels.parquet\", index=False)"
|
|
|
]
|
|
|
}
|
|
|
],
|