Browse Source

Prototype

Ali 3 years ago
parent
commit
77440c02a7
2 changed files with 89 additions and 49 deletions
  1. 74 32
      0_Preprocessing.ipynb
  2. 15 17
      1_Prototype_MLP.ipynb

+ 74 - 32
0_Preprocessing.ipynb

@@ -39,7 +39,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -49,7 +49,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -67,7 +67,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -84,7 +84,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -112,7 +112,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -123,12 +123,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
     "drugs = pd.read_sql(f\"\"\"\n",
-    "    SELECT stay_id, gsn\n",
+    "    SELECT stay_id, gsn, etccode, 1 n\n",
     "    FROM medrecon\n",
     "    WHERE gsn IN ({','.join(drugs_rules_list)})\n",
     "\"\"\", conn)"
@@ -136,7 +136,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -146,14 +146,15 @@
     "    drugs_rules,\n",
     "    left_on=\"gsn\",\n",
     "    right_on=\"gsn\"\n",
-    ").drop_duplicates([\"stay_id\",\"atc\"])\n",
+    ").groupby([\"stay_id\",\"atc\"])[\"n\"].sum() \\\n",
+    " .reset_index()\n",
     "\n",
     "atc_stays[\"atc_2\"] = atc_stays[\"atc\"].str.slice(0, 3)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -161,36 +162,55 @@
     "## Le code ATC complet (Anatomique, Thérapeutique et Pharmacologique), ATC IV\n",
     "\n",
     "atc_stays_pivoted_4 = pd.pivot_table(\n",
-    "    atc_stays[[\"stay_id\",\"atc\"]] \\\n",
-    "        .assign(value=1),\n",
+    "    atc_stays[[\"stay_id\",\"atc\", \"n\"]],\n",
     "    columns=[\"atc\"],\n",
     "    index=[\"stay_id\"],\n",
-    "    values=\"value\"\n",
+    "    values=\"n\"\n",
     ").fillna(0).reset_index()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 40,
    "metadata": {},
    "outputs": [],
    "source": [
     "## Le code ATC 2 (Anatomique et Thérapeutique)\n",
     "\n",
     "atc_stays_pivoted_2 = pd.pivot_table(\n",
-    "    atc_stays[[\"stay_id\",\"atc_2\"]] \\\n",
-    "        .drop_duplicates() \\\n",
-    "        .rename(columns={\"atc_2\":\"atc\"}) \\\n",
-    "        .assign(value=1),\n",
+    "    atc_stays[[\"stay_id\",\"atc_2\", \"n\"]] \\\n",
+    "        .groupby([\"stay_id\",\"atc_2\"])[\"n\"].sum() \\\n",
+    "        .reset_index() \\\n",
+    "        .rename(columns={\"atc_2\":\"atc\"}),\n",
     "    columns=[\"atc\"],\n",
     "    index=[\"stay_id\"],\n",
-    "    values=\"value\"\n",
+    "    values=\"n\"\n",
+    ").fillna(0).reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Les codes ETC\n",
+    "\n",
+    "etc_pivoted = pd.pivot_table(\n",
+    "    drugs[[\"stay_id\",\"etccode\", \"n\"]].dropna() \\\n",
+    "        .assign(etccode = lambda x: x[\"etccode\"].astype(\"int\").astype(\"str\")) \\\n",
+    "        .groupby([\"stay_id\",\"etccode\"])[\"n\"].sum() \\\n",
+    "        .reset_index() \\\n",
+    "        .rename(columns={\"etccode\":\"atc\"}),\n",
+    "    columns=[\"atc\"],\n",
+    "    index=[\"stay_id\"],\n",
+    "    values=\"n\"\n",
     ").fillna(0).reset_index()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -210,13 +230,22 @@
     "    how=\"left\"\n",
     ")\n",
     "\n",
+    "stays_etc = pd.merge(\n",
+    "    stays,\n",
+    "    etc_pivoted,\n",
+    "    left_on=\"stay_id\",\n",
+    "    right_on=\"stay_id\",\n",
+    "    how=\"left\"\n",
+    ")\n",
+    "\n",
     "stays_atc_4[atc_stays_pivoted_4.columns[1:]] = stays_atc_4[atc_stays_pivoted_4.columns[1:]].fillna(0)\n",
-    "stays_atc_2[atc_stays_pivoted_2.columns[1:]] = stays_atc_2[atc_stays_pivoted_2.columns[1:]].fillna(0)"
+    "stays_atc_2[atc_stays_pivoted_2.columns[1:]] = stays_atc_2[atc_stays_pivoted_2.columns[1:]].fillna(0)\n",
+    "stays_etc[etc_pivoted.columns[1:]] = stays_etc[etc_pivoted.columns[1:]].fillna(0)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 44,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -224,7 +253,8 @@
     "# On écrit en parquet pour optimiser le stockage et les temps d'io\n",
     "\n",
     "stays_atc_2.sort_values(\"stay_id\").reset_index(drop=True).to_parquet(\"./data/features_atc2.parquet\", engine=\"pyarrow\", index=False)\n",
-    "stays_atc_4.sort_values(\"stay_id\").reset_index(drop=True).to_parquet(\"./data/features_atc4.parquet\", engine=\"pyarrow\", index=False)"
+    "stays_atc_4.sort_values(\"stay_id\").reset_index(drop=True).to_parquet(\"./data/features_atc4.parquet\", engine=\"pyarrow\", index=False)\n",
+    "stays_etc.sort_values(\"stay_id\").reset_index(drop=True).to_parquet(\"./data/features_etc.parquet\", engine=\"pyarrow\", index=False)"
    ]
   },
   {
@@ -236,9 +266,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'pd' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Input \u001b[0;32mIn [1]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m labs \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241m.\u001b[39mread_sql(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;124m    SELECT \u001b[39m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;124m        le.stay_id,\u001b[39m\n\u001b[1;32m      4\u001b[0m \u001b[38;5;124m        le.itemid item_id\u001b[39m\n\u001b[1;32m      5\u001b[0m \u001b[38;5;124m    FROM labevents le\u001b[39m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;124m    WHERE le.itemid IN (\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(items_list)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m)\u001b[39m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;124m    GROUP BY\u001b[39m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;124m        le.stay_id,\u001b[39m\n\u001b[1;32m      9\u001b[0m \u001b[38;5;124m        le.itemid\u001b[39m\n\u001b[1;32m     10\u001b[0m \u001b[38;5;124m\"\"\"\u001b[39m, conn)\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
+     ]
+    }
+   ],
    "source": [
     "labs = pd.read_sql(f\"\"\"\n",
     "    SELECT \n",
@@ -254,7 +296,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -270,7 +312,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -284,7 +326,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -296,7 +338,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -306,10 +348,10 @@
  ],
  "metadata": {
   "interpreter": {
-   "hash": "28b293e0c0671e44c7281dde6399c7c7419d3faca031d22494da8635907ada72"
+   "hash": "c304935560631f5a20c1bdabb506947800ccd82d813704000c078f0735b9b818"
   },
   "kernelspec": {
-   "display_name": "Python 3.9.7 ('base')",
+   "display_name": "R",
    "language": "python",
    "name": "python3"
   },
@@ -323,7 +365,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.7"
+   "version": "3.9.9"
   },
   "orig_nbformat": 4
  },

File diff suppressed because it is too large
+ 15 - 17
1_Prototype_MLP.ipynb


Some files were not shown because too many files changed in this diff