Browse Source

Sorting X and Y

Ali Bellamine 3 years ago
parent
commit
34b20d3cd0
1 changed file with 32 additions and 20 deletions
  1. 32 20
      0_Preprocessing.ipynb

+ 32 - 20
0_Preprocessing.ipynb

@@ -39,7 +39,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -49,7 +49,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -67,7 +67,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -84,7 +84,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 301,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -112,7 +112,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 302,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -123,7 +123,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 284,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -136,7 +136,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 285,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -153,7 +153,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 286,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -171,7 +171,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 287,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -190,7 +190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 304,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -213,15 +213,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 306,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'stays_atc_2' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_10104/535576780.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;31m# On écrit en parquet pour optimiser le stockage et les temps d'io\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mstays_atc_2\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"stay_id\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdrop\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_parquet\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"./data/features_atc2.parquet\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"pyarrow\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      5\u001b[0m \u001b[0mstays_atc_4\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"stay_id\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreset_index\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdrop\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_parquet\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"./data/features_atc4.parquet\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"pyarrow\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'stays_atc_2' is not defined"
+     ]
+    }
+   ],
    "source": [
     "# Ecriture du features dataset\n",
     "# On écrit en parquet pour optimiser le stockage et les temps d'io\n",
     "\n",
-    "stays_atc_2.to_parquet(\"./data/features_atc2.parquet\", engine=\"pyarrow\", index=False)\n",
-    "stays_atc_4.to_parquet(\"./data/features_atc4.parquet\", engine=\"pyarrow\", index=False)"
+    "stays_atc_2.sort_values(\"stay_id\").reset_index(drop=True).to_parquet(\"./data/features_atc2.parquet\", engine=\"pyarrow\", index=False)\n",
+    "stays_atc_4.sort_values(\"stay_id\").reset_index(drop=True).to_parquet(\"./data/features_atc4.parquet\", engine=\"pyarrow\", index=False)"
    ]
   },
   {
@@ -233,7 +245,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 312,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -251,7 +263,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 315,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -267,7 +279,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 316,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -281,7 +293,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 317,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -293,11 +305,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 319,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "labs_deduplicate_pivot_final.to_parquet(\"./data/labels.parquet\", index=False)"
+    "labs_deduplicate_pivot_final.sort_values(\"stay_id\").reset_index(drop=True).to_parquet(\"./data/labels.parquet\", index=False)"
    ]
   }
  ],