|
@@ -1,8 +1,13 @@
|
|
from matplotlib import pyplot as plt
|
|
from matplotlib import pyplot as plt
|
|
|
|
+from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
+from wordcloud import WordCloud
|
|
import seaborn as sns
|
|
import seaborn as sns
|
|
import pandas as pd
|
|
import pandas as pd
|
|
|
|
+import numpy as np
|
|
|
|
+import random
|
|
from .preprocessing import get_Xy_df
|
|
from .preprocessing import get_Xy_df
|
|
|
|
|
|
|
|
+
|
|
def plot_all_scatter (X, variables, ncols=3, figsize=(20,10)):
|
|
def plot_all_scatter (X, variables, ncols=3, figsize=(20,10)):
|
|
"""
|
|
"""
|
|
This function produce a scatter view of all the variables from a dataset
|
|
This function produce a scatter view of all the variables from a dataset
|
|
@@ -69,6 +74,16 @@ def plot_missing_outcome(X, y, features, labels, figsize=(20,10)):
|
|
ax.set_title("% de prescription de bilans en fonction du nombre de variables manquantes")
|
|
ax.set_title("% de prescription de bilans en fonction du nombre de variables manquantes")
|
|
|
|
|
|
def plot_missing_bar(X, features, figsize=(15,10)):
|
|
def plot_missing_bar(X, features, figsize=(15,10)):
|
|
|
|
+ """
|
|
|
|
+ This function produce a bar plot of all the missings values
|
|
|
|
+
|
|
|
|
+ Parameters
|
|
|
|
+ ----------
|
|
|
|
+ X: Pandas Dataframe of features
|
|
|
|
+ features: [str], list of variables name
|
|
|
|
+ figsize: (int, int), tuple of the figure size
|
|
|
|
+ """
|
|
|
|
+
|
|
fig, ax = plt.subplots(1,1, figsize=figsize)
|
|
fig, ax = plt.subplots(1,1, figsize=figsize)
|
|
|
|
|
|
data = (X[features].isna()*1).mean().reset_index()
|
|
data = (X[features].isna()*1).mean().reset_index()
|
|
@@ -81,4 +96,169 @@ def plot_missing_bar(X, features, figsize=(15,10)):
|
|
|
|
|
|
ax.set_title("% de valeurs manquantes par variable")
|
|
ax.set_title("% de valeurs manquantes par variable")
|
|
ax.set_xlabel("Variable")
|
|
ax.set_xlabel("Variable")
|
|
- ax.set_ylabel("% de valeurs manquantes")
|
|
|
|
|
|
+ ax.set_ylabel("% de valeurs manquantes")
|
|
|
|
+
|
|
|
|
+def plot_correlation(X, features, figsize=(10,6)):
|
|
|
|
+ """
|
|
|
|
+ This function produce a heatmap plot of all variables correlation values
|
|
|
|
+
|
|
|
|
+ Parameters
|
|
|
|
+ ----------
|
|
|
|
+ X: Pandas Dataframe of features
|
|
|
|
+ features: [str], list of variables name
|
|
|
|
+ figsize: (int, int), tuple of the figure size
|
|
|
|
+ """
|
|
|
|
+
|
|
|
|
+ fig, ax = plt.subplots(figsize = figsize)
|
|
|
|
+
|
|
|
|
+ correlation_matrix = X[features].corr()
|
|
|
|
+ sns.heatmap(
|
|
|
|
+ correlation_matrix,
|
|
|
|
+ cmap='YlGn',
|
|
|
|
+ ax=ax
|
|
|
|
+ )
|
|
|
|
+
|
|
|
|
+ ax.set_title('Corrélations entre les features');
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def plot_labels_frequencies_and_correlation(y, labels, figsize=(30,10)):
|
|
|
|
+ """
|
|
|
|
+ This function produce a bar of label proportion and heatmap plot of all labels correlation values
|
|
|
|
+
|
|
|
|
+ Parameters
|
|
|
|
+ ----------
|
|
|
|
+ y: Pandas Dataframe of labels
|
|
|
|
+ labels: [str], list of labels name
|
|
|
|
+ figsize: (int, int), tuple of the figure size
|
|
|
|
+ """
|
|
|
|
+
|
|
|
|
+ fig, axs = plt.subplots(1, 2, figsize=figsize)
|
|
|
|
+ axs = axs.flatten()
|
|
|
|
+
|
|
|
|
+ # Plotting labels proportion
|
|
|
|
+ labels_data = ((y[labels].sum()/y.shape[0])*100).reset_index().round(2)
|
|
|
|
+ sns.barplot(
|
|
|
|
+ data=labels_data,
|
|
|
|
+ x="index",
|
|
|
|
+ y=0,
|
|
|
|
+ ax=axs[0]
|
|
|
|
+ )
|
|
|
|
+ axs[0].tick_params(labelrotation=45)
|
|
|
|
+ axs[0].set_ylim(0,100)
|
|
|
|
+ axs[0].set_title("Proportion d'examens biologiques réalisés")
|
|
|
|
+ axs[0].set_xlabel("Examens biologiques")
|
|
|
|
+ axs[0].set_ylabel("% d'examens réalisés")
|
|
|
|
+
|
|
|
|
+ # Plotting correlation
|
|
|
|
+
|
|
|
|
+ correlation_data = y[labels].corr()
|
|
|
|
+ sns.heatmap(correlation_data, ax=axs[1], cmap='YlGn')
|
|
|
|
+ axs[1].set_title('Correlations entre les labels');
|
|
|
|
+
|
|
|
|
+def plot_box_variable_label_distribution(X, y, features, labels):
|
|
|
|
+ """
|
|
|
|
+ This function produce a box plot of the features distribution according to the variable status
|
|
|
|
+
|
|
|
|
+ Parameters
|
|
|
|
+ ----------
|
|
|
|
+ X: Pandas Dataframe of features
|
|
|
|
+ y: Pandas Dataframe of labels
|
|
|
|
+ features: [str], list of variables name
|
|
|
|
+ labels: [str], list of output name
|
|
|
|
+ """
|
|
|
|
+
|
|
|
|
+ # Generating colormap
|
|
|
|
+ colors = sns.color_palette("muted", 2*len(features))
|
|
|
|
+
|
|
|
|
+ # Getting Xy dataframe
|
|
|
|
+ Xy = get_Xy_df(X, y)
|
|
|
|
+
|
|
|
|
+ fig = plt.figure(constrained_layout=True, figsize=(5*len(labels),2*len(features)))
|
|
|
|
+ figs = fig.subfigures(len(labels), 1)
|
|
|
|
+ axs = [x.subplots(1, len(features)) for x in figs]
|
|
|
|
+
|
|
|
|
+ for i in range(len(labels)):
|
|
|
|
+ figs[i].suptitle(f"Distribution des variables selon le statut {labels[i]} (réalisé (1) ou non (0))")
|
|
|
|
+ for j in range(len(features)):
|
|
|
|
+ feature_name, variable_name = features[j], labels[i]
|
|
|
|
+ axs[i][j].set_title(feature_name)
|
|
|
|
+ axs[i][j].set_xlabel(variable_name)
|
|
|
|
+ axs[i][j].set_ylabel(feature_name)
|
|
|
|
+ sns.boxplot(
|
|
|
|
+ data=Xy,
|
|
|
|
+ ax=axs[i][j],
|
|
|
|
+ x=variable_name,
|
|
|
|
+ y=feature_name,
|
|
|
|
+ showfliers=False,
|
|
|
|
+ palette=colors[j*2:(j+1)*2]
|
|
|
|
+ )
|
|
|
|
+
|
|
|
|
+ fig.suptitle("Distribution des features en fonction du label")
|
|
|
|
+ plt.show()
|
|
|
|
+
|
|
|
|
+def plot_odd_word_wc(X, y, text_column, labels, min_occurrence=3, ncols=5):
|
|
|
|
+ """
|
|
|
|
+ This function produce a word cloud of words odd-ratio (odd-ratio of seing the word given the label)
|
|
|
|
+
|
|
|
|
+ Parameters
|
|
|
|
+ ----------
|
|
|
|
+ X: Pandas Dataframe of features
|
|
|
|
+ y: Pandas Dataframe of labels
|
|
|
|
+ text_column: str, name of the column containing the text
|
|
|
|
+ labels: [str], list of output name
|
|
|
|
+ min_occurrence: int, minimum number of ocurrence of the word
|
|
|
|
+ ncols: int, number of columns in the output plot
|
|
|
|
+ """
|
|
|
|
+
|
|
|
|
+ # Computing nrows an getting the structure
|
|
|
|
+ nrows = len(labels)//ncols + 1*((len(labels)%ncols) != 0)
|
|
|
|
+ fig = plt.figure(constrained_layout=True, figsize=(4*ncols, 5*nrows))
|
|
|
|
+ figs = fig.subfigures(nrows, ncols)
|
|
|
|
+ figs = figs.flatten()
|
|
|
|
+ axs = [x.subplots(2, 1) for x in figs]
|
|
|
|
+
|
|
|
|
+ def rand_color_label0(*args, **kwargs):
|
|
|
|
+ return "rgb(0, 100, {})".format(random.randint(200, 455))
|
|
|
|
+
|
|
|
|
+ def rand_color_label1(*args, **kwargs):
|
|
|
|
+ return "rgb({}, 0, 100)".format(random.randint(200, 455))
|
|
|
|
+
|
|
|
|
+ color_fn = [rand_color_label0, rand_color_label1]
|
|
|
|
+
|
|
|
|
+ # Getting Xy
|
|
|
|
+ Xy = get_Xy_df(X, y)
|
|
|
|
+
|
|
|
|
+ # Text preprocessing
|
|
|
|
+ Xy = Xy.dropna(subset=[text_column])
|
|
|
|
+ Xy["text_preprocessed"] = Xy[text_column] \
|
|
|
|
+ .replace(",", " ").str.lower()
|
|
|
|
+
|
|
|
|
+ # Generating the plots
|
|
|
|
+ for i in range(len(labels)):
|
|
|
|
+ label = labels[i]
|
|
|
|
+ figs[i].suptitle(label)
|
|
|
|
+
|
|
|
|
+ # Filtering text data
|
|
|
|
+ text_data = Xy[[label, "chiefcomplaint"]].dropna().groupby(label).agg(lambda x: " ".join(x))["chiefcomplaint"]
|
|
|
|
+
|
|
|
|
+ # Training countvectorizer model then counting the odd
|
|
|
|
+ cv = CountVectorizer().fit(text_data)
|
|
|
|
+ text_data_array = (cv.transform(text_data).toarray()+1) # Smoothing count
|
|
|
|
+ text_data_array[:,np.where(text_data_array <= (min_occurrence+1))[1]] = 1 # Set the odds to neutral odd
|
|
|
|
+ text_data_array = text_data_array/text_data_array.sum(axis=1).reshape(2, -1)
|
|
|
|
+
|
|
|
|
+ for j, text in text_data.items():
|
|
|
|
+ values = (text_data_array[j,:]/(text_data_array[1-j,:])).tolist()
|
|
|
|
+
|
|
|
|
+ axs[i][j].imshow(
|
|
|
|
+ WordCloud(background_color = "white", relative_scaling=0.2, max_words = 25, color_func=color_fn[j]).generate_from_frequencies(
|
|
|
|
+ frequencies=dict(zip(
|
|
|
|
+ cv.get_feature_names(),
|
|
|
|
+ values
|
|
|
|
+ ))
|
|
|
|
+ )
|
|
|
|
+ )
|
|
|
|
+ axs[i][j].set_xlabel(f"{j}")
|
|
|
|
+
|
|
|
|
+ fig.suptitle("WordCloud selon le label")
|
|
|
|
+ plt.show()
|