3 years ago · b1c98e8f48
--- a/Project_Report.ipynb
+++ b/Project_Report.ipynb
--- a/scripts/preprocessing.py
+++ b/scripts/preprocessing.py
@@ -1,10 +1,33 @@
 
				 """
			
 
				-    This script the preprocessing functions
			
 
				+    This script contains the preprocessing functions
			
 
				 """
			
 
				 
			
 
				 import sqlite3
			
 
				 import pandas as pd
			
 
				 
			
 
				+def get_Xy_df (X, y):
			
 
				+    """
			
 
				+        Merge together the X and y dataframe on stay_id basis
			
 
				+
			
 
				+        Parameters
			
 
				+        ----------
			
 
				+        X: pandas dataframe of features
			
 
				+        y: pandas dataframe of labeles
			
 
				+
			
 
				+        Output
			
 
				+        ------
			
 
				+        Xy : merged pandas dataframe
			
 
				+    """
			
 
				+
			
 
				+    Xy = pd.merge(
			
 
				+        X,
			
 
				+        y,
			
 
				+        left_on="stay_id",
			
 
				+        right_on="stay_id"
			
 
				+    )
			
 
				+
			
 
				+    return Xy
			
 
				+
			
 
				 def generate_features_dataset(database, get_drugs=True, get_diseases=True):
			
 
				 
			
 
				     """
			
@@ -185,4 +208,36 @@ def generate_labels_dataset(database, lab_dictionnary):
 
				 
			
 
				     labels = labs_deduplicate_pivot_final.sort_values("stay_id").reset_index(drop=True)
			
 
				 
			
 
				-    return labels
			
 
				+    return labels
			
 
				+
			
 
				+def remove_outliers (X, variables_ranges):
			
 
				+    """
			
 
				+        This function remove the outliers and replace them by an NA according to the variable_ranges rules
			
 
				+
			
 
				+        Parameters
			
 
				+        ----------
			
 
				+        X: pandas Dataframe
			
 
				+        variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionnary containing for each variable the inferior and superior range
			
 
				+
			
 
				+        Outputs
			
 
				+        -------
			
 
				+        Tuple containing :
			
 
				+        - Processing dataframe
			
 
				+        - A Dataframe containing the number and percentage of processed outliers per variable
			
 
				+    """
			
 
				+
			
 
				+    outliers = {}
			
 
				+    X_copy = X.copy()
			
 
				+
			
 
				+    for key, value in variables_ranges.items():
			
 
				+        outliers_mask = ((X[key] < value[0]) | (X[key] > value[1]))
			
 
				+        outliers[key] = outliers_mask.sum() # Storing the number of outliers
			
 
				+        X_copy.loc[outliers_mask, key] = pd.NA # Setting outliers to NA
			
 
				+
			
 
				+    outlier_report = pd.DataFrame.from_dict(outliers, orient="index") \
			
 
				+        .rename(columns={0:"n"}) \
			
 
				+        .assign(total=X[outliers.keys()].count().values,
			
 
				+                pourcentage=lambda x: (x["n"]/x["total"])*100
			
 
				+        )
			
 
				+
			
 
				+    return X_copy, outlier_report
			
--- a/scripts/visualisation.py
+++ b/scripts/visualisation.py
@@ -0,0 +1,68 @@
 
				+from matplotlib import pyplot as plt
			
 
				+import seaborn as sns
			
 
				+import pandas as pd
			
 
				+from .preprocessing import get_Xy_df
			
 
				+
			
 
				+def plot_all_scatter (X, variables, ncols=3, figsize=(20,10)):
			
 
				+    """
			
 
				+        This function produce a scatter view of all the variables from a dataset
			
 
				+
			
 
				+        Parameters
			
 
				+        ----------
			
 
				+        X: Pandas Dataframe
			
 
				+        variables: [str], list of variables name
			
 
				+        n_cols: int, number of columns in the plot
			
 
				+        figsize: (int, int), tuple of the figure size
			
 
				+    """
			
 
				+
			
 
				+    # Getting nrows
			
 
				+    nrows = (len(variables) // ncols) + 1*((len(variables) % ncols) != 0)
			
 
				+
			
 
				+    figs, axs = plt.subplots(nrows, ncols, figsize=figsize)
			
 
				+    axs = axs.flatten()
			
 
				+
			
 
				+    for i in range(len(variables)):
			
 
				+        variable = variables[i]
			
 
				+        sns.scatterplot(
			
 
				+            data=X[variable].value_counts(),
			
 
				+            ax = axs[i]
			
 
				+        )
			
 
				+
			
 
				+        axs[i].ticklabel_format(style='scientific', axis='x', scilimits=(0, 4))
			
 
				+        axs[i].set_xlabel("Valeur")
			
 
				+        axs[i].set_ylabel("Nombre d'occurences")
			
 
				+        axs[i].set_title(variable)
			
 
				+
			
 
				+    plt.tight_layout()
			
 
				+
			
 
				+def plot_missing_outcome(X, y, features, labels, figsize=(20,10)):
			
 
				+    """
			
 
				+        This function produce a line plot of all the missings values according to the outcomes values
			
 
				+
			
 
				+        Parameters
			
 
				+        ----------
			
 
				+        X: Pandas Dataframe of features
			
 
				+        y: Pandas Dataframe of labels
			
 
				+        features: [str], list of variables name
			
 
				+        labels: [str], list of output name
			
 
				+        figsize: (int, int), tuple of the figure size
			
 
				+    """
			
 
				+
			
 
				+    Xy = get_Xy_df(X, y)
			
 
				+    data = Xy[labels].join(
			
 
				+            pd.DataFrame(Xy[features].isna().astype("int").sum(axis=1))
			
 
				+        ).rename(columns={0:"n_NA"}) \
			
 
				+        .groupby("n_NA") \
			
 
				+        .agg(lambda x: x.sum()/x.count())
			
 
				+
			
 
				+    fig,ax = plt.subplots(1, 1, figsize=(20,10))
			
 
				+    sns.lineplot(
			
 
				+        data=pd.melt(data.reset_index(), id_vars="n_NA",value_vars=data.columns),
			
 
				+        hue="variable",
			
 
				+        x="n_NA",
			
 
				+        y="value",
			
 
				+        ax=ax
			
 
				+    )
			
 
				+
			
 
				+    ax.set_xlabel("Nombre de valeurs manquantes")
			
 
				+    ax.set_ylabel("Pourcentage d'examen prescrit")