3 years ago · 0cffdcf66e
--- a/Project_Report.ipynb
+++ b/Project_Report.ipynb
--- a/bop_scripts/visualisation.py
+++ b/bop_scripts/visualisation.py
@@ -1,8 +1,13 @@
 
															 from matplotlib import pyplot as plt
														
 
															+from sklearn.feature_extraction.text import CountVectorizer
														
 
															+from wordcloud import WordCloud
														
 
															 import seaborn as sns
														
 
															 import pandas as pd
														
 
															+import numpy as np
														
 
															+import random
														
 
															 from .preprocessing import get_Xy_df
														
 
															+
														
 
															 def plot_all_scatter (X, variables, ncols=3, figsize=(20,10)):
														
 
															     """
														
 
															         This function produce a scatter view of all the variables from a dataset
														
@@ -69,6 +74,16 @@ def plot_missing_outcome(X, y, features, labels, figsize=(20,10)):
 
															     ax.set_title("% de prescription de bilans en fonction du nombre de variables manquantes")
														
 
															 def plot_missing_bar(X, features, figsize=(15,10)):
														
 
															+    """
														
 
															+        This function produce a bar plot of all the missings values
														
 
															+
														
 
															+        Parameters
														
 
															+        ----------
														
 
															+        X: Pandas Dataframe of features
														
 
															+        features: [str], list of variables name
														
 
															+        figsize: (int, int), tuple of the figure size
														
 
															+    """
														
 
															+
														
 
															     fig, ax = plt.subplots(1,1, figsize=figsize)
														
 
															     data = (X[features].isna()*1).mean().reset_index()
														
@@ -81,4 +96,169 @@ def plot_missing_bar(X, features, figsize=(15,10)):
 
															     ax.set_title("% de valeurs manquantes par variable")
														
 
															     ax.set_xlabel("Variable")
														
 
															-    ax.set_ylabel("% de valeurs manquantes")
														
 
															+    ax.set_ylabel("% de valeurs manquantes")
														
 
															+
														
 
															def plot_correlation(X, features, figsize=(10,6)):
															    """
															        Draw a heatmap of the pairwise correlations between the selected features

															        Parameters
															        ----------
															        X: Pandas Dataframe of features
															        features: [str], names of the columns of X to correlate
															        figsize: (int, int), tuple of the figure size
															    """

															    # The figure handle is not needed afterwards, only the axes
															    _, heatmap_ax = plt.subplots(figsize=figsize)

															    # Pairwise correlations computed with the DataFrame.corr defaults
															    pairwise_corr = X[features].corr()

															    sns.heatmap(pairwise_corr, cmap='YlGn', ax=heatmap_ax)
															    heatmap_ax.set_title('Corrélations entre les features')
														
 
															+
														
 
															+
														
 
															def plot_labels_frequencies_and_correlation(y, labels, figsize=(30,10)):
															    """
															        Draw, side by side, a bar plot of the percentage of each label
															        and a heatmap of the pairwise correlations between the labels

															        Parameters
															        ----------
															        y: Pandas Dataframe of labels
															        labels: [str], names of the columns of y to plot
															        figsize: (int, int), tuple of the figure size
															    """

															    # One row, two panels: frequencies on the left, correlations on the right
															    _, (freq_ax, corr_ax) = plt.subplots(1, 2, figsize=figsize)

															    selected = y[labels]

															    # Left panel: column sums divided by the number of rows, as a
															    # percentage. reset_index() yields the columns "index" (label name)
															    # and 0 (value), which are the names used by the barplot below.
															    percentages = (selected.sum() / y.shape[0]) * 100
															    freq_table = percentages.reset_index().round(2)
															    sns.barplot(data=freq_table, x="index", y=0, ax=freq_ax)

															    freq_ax.tick_params(labelrotation=45)
															    freq_ax.set_ylim(0, 100)
															    freq_ax.set_title("Proportion d'examens biologiques réalisés")
															    freq_ax.set_xlabel("Examens biologiques")
															    freq_ax.set_ylabel("% d'examens réalisés")

															    # Right panel: pairwise correlations between the labels
															    sns.heatmap(selected.corr(), ax=corr_ax, cmap='YlGn')
															    corr_ax.set_title('Correlations entre les labels')
														
 
															+
														
 
															def plot_box_variable_label_distribution(X, y, features, labels):
															    """
															        Draw box plots of each feature split by the value of each label

															        The figure holds one sub-figure (row) per label and, inside it,
															        one box plot (column) per feature. The figure is shown with
															        plt.show().

															        Parameters
															        ----------
															        X: Pandas Dataframe of features
															        y: Pandas Dataframe of labels
															        features: [str], list of variables name
															        labels: [str], list of output name
															    """

															    # Two colors per feature, one for each box of the plot
															    colors = sns.color_palette("muted", 2*len(features))

															    # Getting Xy dataframe
															    Xy = get_Xy_df(X, y)

															    # NOTE(review): the width scales with the number of labels and the
															    # height with the number of features, although labels are laid out
															    # as rows and features as columns -- confirm this is intended.
															    fig = plt.figure(constrained_layout=True, figsize=(5*len(labels),2*len(features)))

															    # squeeze=False always returns a 2D array. With the default, a single
															    # label (or a single feature) yields a bare object that can neither
															    # be iterated nor indexed.
															    label_figs = fig.subfigures(len(labels), 1, squeeze=False)[:, 0]

															    for variable_name, label_fig in zip(labels, label_figs):
															        label_fig.suptitle(f"Distribution des variables selon le statut {variable_name} (réalisé (1) ou non (0))")
															        feature_axes = label_fig.subplots(1, len(features), squeeze=False)[0]

															        for j, (feature_name, ax) in enumerate(zip(features, feature_axes)):
															            ax.set_title(feature_name)
															            ax.set_xlabel(variable_name)
															            ax.set_ylabel(feature_name)
															            sns.boxplot(
															                data=Xy,
															                ax=ax,
															                x=variable_name,
															                y=feature_name,
															                showfliers=False,
															                palette=colors[j*2:(j+1)*2]
															            )

															    fig.suptitle("Distribution des features en fonction du label")
															    plt.show()
														
 
															+
														
 
															def plot_odd_word_wc(X, y, text_column, labels, min_occurrence=3, ncols=5):
															    """
															        Draw, for each label, two word clouds weighted by word odds ratios

															        For every label the texts are grouped by label value, words are
															        counted per group, and each word is weighted by the ratio of its
															        smoothed relative frequency in one group over the other group.
															        The figure is shown with plt.show().

															        Parameters
															        ----------
															        X: Pandas Dataframe of features
															        y: Pandas Dataframe of labels
															        text_column: str, name of the column containing the text
															        labels: [str], list of output name
															        min_occurrence: int, words counted at most this many times in
															            either group get a neutral ratio of 1
															        ncols: int, number of columns in the output plot

															        Raises
															        ------
															        ValueError: if a label does not take exactly two distinct values
															    """

															    # Number of rows needed to place one sub-figure per label (ceil division)
															    nrows = -(-len(labels) // ncols)
															    fig = plt.figure(constrained_layout=True, figsize=(4*ncols, 5*nrows))

															    # squeeze=False keeps an array even for a 1x1 grid, so flatten() works
															    figs = fig.subfigures(nrows, ncols, squeeze=False).flatten()
															    axs = [x.subplots(2, 1) for x in figs]

															    # The original draw range (200-455) is kept, but the channel is
															    # capped at 255, the largest valid rgb() component.
															    def rand_color_label0(*args, **kwargs):
															        return "rgb(0, 100, {})".format(min(255, random.randint(200, 455)))

															    def rand_color_label1(*args, **kwargs):
															        return "rgb({}, 0, 100)".format(min(255, random.randint(200, 455)))

															    color_fn = [rand_color_label0, rand_color_label1]

															    # Getting Xy; copy() so adding a column does not write into a slice
															    Xy = get_Xy_df(X, y).dropna(subset=[text_column]).copy()

															    # Text preprocessing: .str.replace works inside each string, whereas
															    # Series.replace only matches cells equal to "," as a whole
															    Xy["text_preprocessed"] = Xy[text_column] \
															        .str.replace(",", " ", regex=False).str.lower()

															    # Generating the plots
															    for i, label in enumerate(labels):
															        figs[i].suptitle(label)

															        # One concatenated document per label value, built from the
															        # preprocessed text of the requested column
															        text_data = Xy[[label, "text_preprocessed"]].dropna() \
															            .groupby(label)["text_preprocessed"].agg(" ".join)

															        # The odds ratio compares one group against the other one
															        if len(text_data) != 2:
															            raise ValueError(
															                f"Label '{label}' must take exactly two distinct values, "
															                f"found {len(text_data)}"
															            )

															        # Training countvectorizer model then counting the odd
															        cv = CountVectorizer().fit(text_data)
															        counts = cv.transform(text_data).toarray() + 1  # Smoothing count

															        # Words that are too rare in at least one of the two groups
															        rare_words = (counts <= (min_occurrence + 1)).any(axis=0)
															        counts[:, rare_words] = 1
															        frequencies = counts / counts.sum(axis=1, keepdims=True)

															        # get_feature_names was removed in scikit-learn 1.2
															        if hasattr(cv, "get_feature_names_out"):
															            words = cv.get_feature_names_out()
															        else:
															            words = cv.get_feature_names()

															        # Rows follow the sorted group keys, so the position (and not the
															        # label value, which may be a float or a bool) indexes the arrays
															        for position, label_value in enumerate(text_data.index):
															            ratios = frequencies[position] / frequencies[1 - position]

															            # Set the odds of rare words to the neutral odd. This is done
															            # after the normalization, which would otherwise turn them
															            # into the ratio of the two row totals.
															            ratios[rare_words] = 1.0

															            word_cloud = WordCloud(
															                background_color="white",
															                relative_scaling=0.2,
															                max_words=25,
															                color_func=color_fn[position]
															            ).generate_from_frequencies(
															                frequencies=dict(zip(words, ratios.tolist()))
															            )

															            axs[i][position].imshow(word_cloud)
															            axs[i][position].set_xlabel(f"{label_value}")

															    fig.suptitle("WordCloud selon le label")
															    plt.show()
														
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,7 @@
 
															 progress
														
 
															-pyarrow
														
 
															 pandas
														
 
															-sqlite3
														
 
															+numpy
														
 
															+seaborn
														
 
															+fasttext
														
 
															+wordcloud
														
 
															+scikit-learn