123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384 |
- from matplotlib import pyplot as plt
- import seaborn as sns
- import pandas as pd
- from .preprocessing import get_Xy_df
def plot_all_scatter(X, variables, ncols=3, figsize=(20, 10)):
    """
    Draw one scatter plot per variable, arranged on a grid of subplots.

    For each variable, the value counts of ``X[variable]`` are passed to
    seaborn's scatterplot, so each panel shows how many times each distinct
    value occurs. Grid cells left over after the last variable are hidden.

    Parameters
    ----------
    X: Pandas Dataframe
    variables: [str], list of variables name
    ncols: int, number of columns in the plot
    figsize: (int, int), tuple of the figure size
    """
    # Ceiling division: just enough rows to hold every variable.
    nrows = (len(variables) + ncols - 1) // ncols

    # squeeze=False guarantees a 2-D array of Axes, so flatten() also works
    # when the grid collapses to a single row, column, or cell.
    _, axs = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    axs = axs.flatten()

    for ax, variable in zip(axs, variables):
        sns.scatterplot(
            data=X[variable].value_counts(),
            ax=ax,
        )
        # NOTE(review): ticklabel_format only applies to a ScalarFormatter,
        # so this presumably expects numeric variables - confirm with callers.
        ax.ticklabel_format(style='scientific', axis='x', scilimits=(0, 4))
        ax.set_xlabel("Valeur")
        ax.set_ylabel("Nombre d'occurences")
        ax.set_title(variable)

    # Hide the trailing cells of the grid that received no variable.
    for unused_ax in axs[len(variables):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
def plot_missing_outcome(X, y, features, labels, figsize=(20,10)):
    """
    Draw a line plot relating each label to the number of missing features.

    Rows are grouped by how many of their ``features`` are missing; for every
    group and every label the plotted value is the label's sum divided by its
    count of non-missing entries (presumably the share of positive outcomes
    when labels are 0/1 - confirm against the data).

    Parameters
    ----------
    X: Pandas Dataframe of features
    y: Pandas Dataframe of labels
    features: [str], list of variables name
    labels: [str], list of output name
    figsize: (int, int), tuple of the figure size
    """
    # get_Xy_df is a project helper; presumably it merges X and y into a
    # single frame holding both feature and label columns - verify.
    merged = get_Xy_df(X, y)

    # Number of missing feature values per row, as a one-column frame
    # (the column is named 0 until it is renamed below).
    na_per_row = pd.DataFrame(
        merged[features].isna().astype("int").sum(axis=1)
    )

    def _sum_over_count(column):
        # Sum of the values divided by the number of non-missing values.
        return column.sum() / column.count()

    labelled = merged[labels].join(na_per_row)
    labelled = labelled.rename(columns={0: "n_NA"})
    rates = labelled.groupby("n_NA").agg(_sum_over_count)

    # Long format: one row per (n_NA, label) pair, as expected by lineplot.
    long_rates = pd.melt(
        rates.reset_index(),
        id_vars="n_NA",
        value_vars=rates.columns,
    )

    _, axis = plt.subplots(1, 1, figsize=figsize)
    sns.lineplot(
        data=long_rates,
        x="n_NA",
        y="value",
        hue="variable",
        ax=axis,
    )

    axis.set_xlabel("Nombre de valeurs manquantes")
    axis.set_ylabel("Pourcentage d'examen prescrit")
    axis.set_title("% de prescription de bilans en fonction du nombre de variables manquantes")
def plot_missing_bar(X, features, figsize=(15, 10)):
    """
    Draw a bar plot of the share of missing values for each feature.

    Parameters
    ----------
    X: Pandas Dataframe
    features: [str], list of variables name
    figsize: (int, int), tuple of the figure size
    """
    # Mean of the boolean NA mask = fraction of missing values per feature.
    # NOTE: the plotted values are fractions in [0, 1], although the labels
    # below say "%".
    # Both columns are named explicitly: a bare reset_index() would name the
    # first column after X.columns.name when that is set, instead of "index".
    missing_rate = (
        X[features]
        .isna()
        .mean()
        .rename_axis("variable")
        .rename("missing_rate")
        .reset_index()
    )

    # Create the figure only once the data is ready, so a bad `features`
    # list does not leave an empty figure behind.
    _, ax = plt.subplots(1, 1, figsize=figsize)
    sns.barplot(
        data=missing_rate,
        x="variable",
        y="missing_rate",
        ax=ax,
    )

    ax.set_title("% de valeurs manquantes par variable")
    ax.set_xlabel("Variable")
    ax.set_ylabel("% de valeurs manquantes")
|