Browse Source

Start preprocessing

Ali Bellamine 3 years ago
parent
commit
b1c98e8f48
3 changed files with 208 additions and 71 deletions
  1. 83 69
      Project_Report.ipynb
  2. 57 2
      scripts/preprocessing.py
  3. 68 0
      scripts/visualisation.py

File diff suppressed because it is too large
+ 83 - 69
Project_Report.ipynb


+ 57 - 2
scripts/preprocessing.py

@@ -1,10 +1,33 @@
 """
-    This script the preprocessing functions
+    This script contains the preprocessing functions
 """
 
 import sqlite3
 import pandas as pd
 
+def get_Xy_df (X, y):
+    """
+        Merge the X and y dataframes on their shared stay_id column
+
+        Parameters
+        ----------
+        X: pandas dataframe of features
+        y: pandas dataframe of labels
+
+        Output
+        ------
+        Xy : merged pandas dataframe
+    """
+
+    Xy = pd.merge(
+        X,
+        y,
+        left_on="stay_id",
+        right_on="stay_id"
+    )
+
+    return Xy
+
 def generate_features_dataset(database, get_drugs=True, get_diseases=True):
 
     """
@@ -185,4 +208,36 @@ def generate_labels_dataset(database, lab_dictionnary):
 
     labels = labs_deduplicate_pivot_final.sort_values("stay_id").reset_index(drop=True)
 
-    return labels
+    return labels
+
+def remove_outliers (X, variables_ranges):
+    """
+        This function removes the outliers and replaces them with NA according to the variables_ranges rules
+
+        Parameters
+        ----------
+        X: pandas Dataframe
+        variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionary containing, for each variable, the lower and upper bounds of its accepted range
+
+        Outputs
+        -------
+        Tuple containing :
+        - The processed dataframe
+        - A Dataframe containing the number and percentage of processed outliers per variable
+    """
+
+    outliers = {}
+    X_copy = X.copy()
+
+    for key, value in variables_ranges.items():
+        outliers_mask = ((X[key] < value[0]) | (X[key] > value[1]))
+        outliers[key] = outliers_mask.sum() # Storing the number of outliers
+        X_copy.loc[outliers_mask, key] = pd.NA # Setting outliers to NA
+
+    outlier_report = pd.DataFrame.from_dict(outliers, orient="index") \
+        .rename(columns={0:"n"}) \
+        .assign(total=X[outliers.keys()].count().values,
+                pourcentage=lambda x: (x["n"]/x["total"])*100
+        )
+
+    return X_copy, outlier_report

+ 68 - 0
scripts/visualisation.py

@@ -0,0 +1,68 @@
+from matplotlib import pyplot as plt
+import seaborn as sns
+import pandas as pd
+from .preprocessing import get_Xy_df
+
+def plot_all_scatter (X, variables, ncols=3, figsize=(20,10)):
+    """
+        This function produces one scatter plot per listed variable, showing the value counts of that variable in the dataset
+
+        Parameters
+        ----------
+        X: Pandas Dataframe
+        variables: [str], list of variable names
+        ncols: int, number of columns in the plot grid
+        figsize: (int, int), tuple of the figure size
+    """
+
+    # Getting nrows
+    nrows = (len(variables) // ncols) + 1*((len(variables) % ncols) != 0)
+
+    figs, axs = plt.subplots(nrows, ncols, figsize=figsize)
+    axs = axs.flatten()
+
+    for i in range(len(variables)):
+        variable = variables[i]
+        sns.scatterplot(
+            data=X[variable].value_counts(),
+            ax = axs[i]
+        )
+
+        axs[i].ticklabel_format(style='scientific', axis='x', scilimits=(0, 4))
+        axs[i].set_xlabel("Valeur")
+        axs[i].set_ylabel("Nombre d'occurences")
+        axs[i].set_title(variable)
+
+    plt.tight_layout()
+
+def plot_missing_outcome(X, y, features, labels, figsize=(20,10)):
+    """
+        This function produces a line plot of the mean of each outcome according to the number of missing feature values per row
+
+        Parameters
+        ----------
+        X: Pandas Dataframe of features
+        y: Pandas Dataframe of labels
+        features: [str], list of feature names
+        labels: [str], list of outcome names
+        figsize: (int, int), tuple of the figure size
+    """
+
+    Xy = get_Xy_df(X, y)
+    data = Xy[labels].join(
+            pd.DataFrame(Xy[features].isna().astype("int").sum(axis=1))
+        ).rename(columns={0:"n_NA"}) \
+        .groupby("n_NA") \
+        .agg(lambda x: x.sum()/x.count())
+
+    fig,ax = plt.subplots(1, 1, figsize=(20,10))
+    sns.lineplot(
+        data=pd.melt(data.reset_index(), id_vars="n_NA",value_vars=data.columns),
+        hue="variable",
+        x="n_NA",
+        y="value",
+        ax=ax
+    )
+
+    ax.set_xlabel("Nombre de valeurs manquantes")
+    ax.set_ylabel("Pourcentage d'examen prescrit")

Some files were not shown because too many files changed in this diff