|
@@ -0,0 +1,68 @@
|
|
|
+from matplotlib import pyplot as plt
|
|
|
+import seaborn as sns
|
|
|
+import pandas as pd
|
|
|
+from .preprocessing import get_Xy_df
|
|
|
+
|
|
|
+def plot_all_scatter (X, variables, ncols=3, figsize=(20,10)):
|
|
|
+ """
|
|
|
+ This function produce a scatter view of all the variables from a dataset
|
|
|
+
|
|
|
+ Parameters
|
|
|
+ ----------
|
|
|
+ X: Pandas Dataframe
|
|
|
+ variables: [str], list of variables name
|
|
|
+ n_cols: int, number of columns in the plot
|
|
|
+ figsize: (int, int), tuple of the figure size
|
|
|
+ """
|
|
|
+
|
|
|
+ # Getting nrows
|
|
|
+ nrows = (len(variables) // ncols) + 1*((len(variables) % ncols) != 0)
|
|
|
+
|
|
|
+ figs, axs = plt.subplots(nrows, ncols, figsize=figsize)
|
|
|
+ axs = axs.flatten()
|
|
|
+
|
|
|
+ for i in range(len(variables)):
|
|
|
+ variable = variables[i]
|
|
|
+ sns.scatterplot(
|
|
|
+ data=X[variable].value_counts(),
|
|
|
+ ax = axs[i]
|
|
|
+ )
|
|
|
+
|
|
|
+ axs[i].ticklabel_format(style='scientific', axis='x', scilimits=(0, 4))
|
|
|
+ axs[i].set_xlabel("Valeur")
|
|
|
+ axs[i].set_ylabel("Nombre d'occurences")
|
|
|
+ axs[i].set_title(variable)
|
|
|
+
|
|
|
+ plt.tight_layout()
|
|
|
+
|
|
|
+def plot_missing_outcome(X, y, features, labels, figsize=(20,10)):
|
|
|
+ """
|
|
|
+ This function produce a line plot of all the missings values according to the outcomes values
|
|
|
+
|
|
|
+ Parameters
|
|
|
+ ----------
|
|
|
+ X: Pandas Dataframe of features
|
|
|
+ y: Pandas Dataframe of labels
|
|
|
+ features: [str], list of variables name
|
|
|
+ labels: [str], list of output name
|
|
|
+ figsize: (int, int), tuple of the figure size
|
|
|
+ """
|
|
|
+
|
|
|
+ Xy = get_Xy_df(X, y)
|
|
|
+ data = Xy[labels].join(
|
|
|
+ pd.DataFrame(Xy[features].isna().astype("int").sum(axis=1))
|
|
|
+ ).rename(columns={0:"n_NA"}) \
|
|
|
+ .groupby("n_NA") \
|
|
|
+ .agg(lambda x: x.sum()/x.count())
|
|
|
+
|
|
|
+ fig,ax = plt.subplots(1, 1, figsize=(20,10))
|
|
|
+ sns.lineplot(
|
|
|
+ data=pd.melt(data.reset_index(), id_vars="n_NA",value_vars=data.columns),
|
|
|
+ hue="variable",
|
|
|
+ x="n_NA",
|
|
|
+ y="value",
|
|
|
+ ax=ax
|
|
|
+ )
|
|
|
+
|
|
|
+ ax.set_xlabel("Nombre de valeurs manquantes")
|
|
|
+ ax.set_ylabel("Pourcentage d'examen prescrit")
|