Ali 2 rokov pred
rodič
commit
0cffdcf66e
3 zmenil súbory, kde vykonal 197 pridanie a 9 odobranie
  1. 11 6
      Project_Report.ipynb
  2. 181 1
      bop_scripts/visualisation.py
  3. 5 2
      requirements.txt

Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 11 - 6
Project_Report.ipynb


+ 181 - 1
bop_scripts/visualisation.py

@@ -1,8 +1,13 @@
 from matplotlib import pyplot as plt
+from sklearn.feature_extraction.text import CountVectorizer
+from wordcloud import WordCloud
 import seaborn as sns
 import pandas as pd
+import numpy as np
+import random
 from .preprocessing import get_Xy_df
 
+
 def plot_all_scatter (X, variables, ncols=3, figsize=(20,10)):
     """
         This function produce a scatter view of all the variables from a dataset
@@ -69,6 +74,16 @@ def plot_missing_outcome(X, y, features, labels, figsize=(20,10)):
     ax.set_title("% de prescription de bilans en fonction du nombre de variables manquantes")
 
 def plot_missing_bar(X, features, figsize=(15,10)):
+    """
+        This function produces a bar plot of the missing values
+
+        Parameters
+        ----------
+        X: Pandas Dataframe of features
+        features: [str], list of variables name
+        figsize: (int, int), tuple of the figure size
+    """
+
     fig, ax = plt.subplots(1,1, figsize=figsize)
 
     data = (X[features].isna()*1).mean().reset_index()
@@ -81,4 +96,169 @@ def plot_missing_bar(X, features, figsize=(15,10)):
 
     ax.set_title("% de valeurs manquantes par variable")
     ax.set_xlabel("Variable")
-    ax.set_ylabel("% de valeurs manquantes")
+    ax.set_ylabel("% de valeurs manquantes")
+
+def plot_correlation(X, features, figsize=(10,6)):
+    """
+        Draw a heatmap of the pairwise correlations between the selected features.
+
+        Parameters
+        ----------
+        X: Pandas Dataframe of features
+        features: [str], names of the columns of X to correlate
+        figsize: (int, int), tuple of the figure size
+    """
+
+    _, heatmap_ax = plt.subplots(figsize=figsize)
+
+    # X[features].corr() is the feature-by-feature correlation matrix
+    sns.heatmap(X[features].corr(), ax=heatmap_ax, cmap='YlGn')
+    heatmap_ax.set_title('Corrélations entre les features')
+
+
+def plot_labels_frequencies_and_correlation(y, labels, figsize=(30,10)):
+    """
+        Draw two panels side by side: a bar plot of each label's column sum
+        expressed as a percentage of the number of rows, and a heatmap of the
+        correlations between the labels.
+
+        Parameters
+        ----------
+        y: Pandas Dataframe of labels
+        labels: [str], names of the columns of y to plot
+        figsize: (int, int), tuple of the figure size
+    """
+
+    _, (freq_ax, corr_ax) = plt.subplots(1, 2, figsize=figsize)
+
+    # Left panel: per-label percentage (column sum over the total row count)
+    n_rows = y.shape[0]
+    label_share = (y[labels].sum() / n_rows) * 100
+    # reset_index() exposes the label names as column "index" and the values as column 0
+    label_share = label_share.reset_index().round(2)
+    sns.barplot(data=label_share, x="index", y=0, ax=freq_ax)
+    freq_ax.tick_params(labelrotation=45)
+    freq_ax.set_ylim(0, 100)
+    freq_ax.set_title("Proportion d'examens biologiques réalisés")
+    freq_ax.set_xlabel("Examens biologiques")
+    freq_ax.set_ylabel("% d'examens réalisés")
+
+    # Right panel: label-by-label correlation matrix
+    label_corr = y[labels].corr()
+    sns.heatmap(label_corr, cmap='YlGn', ax=corr_ax)
+    corr_ax.set_title('Correlations entre les labels')
+
+def plot_box_variable_label_distribution(X, y, features, labels):
+    """
+        Draw, for every label, one row of box plots showing the distribution of
+        each feature grouped by the value of that label.
+
+        The figure is shown with plt.show(); nothing is returned.
+
+        Parameters
+        ----------
+        X: Pandas Dataframe of features
+        y: Pandas Dataframe of labels
+        features: [str], list of variables name (one column of plots each)
+        labels: [str], list of output name (one row of plots each)
+    """
+
+    # A distinct pair of colors for each feature column
+    colors = sns.color_palette("muted", 2*len(features))
+
+    # Getting Xy dataframe
+    Xy = get_Xy_df(X, y)
+
+    # NOTE(review): width scales with len(labels) and height with len(features),
+    # although labels are laid out as rows and features as columns - confirm intent
+    fig = plt.figure(constrained_layout=True, figsize=(5*len(labels),2*len(features)))
+
+    # squeeze=False always returns arrays; with the default, a single label or a
+    # single feature yields a bare SubFigure/Axes and the iteration below fails
+    figs = fig.subfigures(len(labels), 1, squeeze=False)[:, 0]
+
+    for subfig, variable_name in zip(figs, labels):
+        subfig.suptitle(f"Distribution des variables selon le statut {variable_name} (réalisé (1) ou non (0))")
+        axs = subfig.subplots(1, len(features), squeeze=False)[0]
+        for j, (ax, feature_name) in enumerate(zip(axs, features)):
+            ax.set_title(feature_name)
+            ax.set_xlabel(variable_name)
+            ax.set_ylabel(feature_name)
+            sns.boxplot(
+                data=Xy,
+                ax=ax,
+                x=variable_name,
+                y=feature_name,
+                showfliers=False,
+                palette=colors[j*2:(j+1)*2]
+            )
+
+    fig.suptitle("Distribution des features en fonction du label")
+    plt.show()
+
+def plot_odd_word_wc(X, y, text_column, labels, min_occurrence=3, ncols=5):
+    """
+        Draw, for every label, two word clouds (one per label value) in which
+        each word is weighted by its odds ratio: the word's smoothed relative
+        frequency in the texts of that label value divided by its smoothed
+        relative frequency in the texts of the other value.
+
+        The figure is shown with plt.show(); nothing is returned.
+
+        Parameters
+        ----------
+        X: Pandas Dataframe of features
+        y: Pandas Dataframe of labels
+        text_column: str, name of the column containing the text
+        labels: [str], list of output name
+        min_occurrence: int, words counted at most this many times for either
+            label value get a neutral weight
+        ncols: int, number of columns in the output plot
+
+        Raises
+        ------
+        ValueError: when a label does not take exactly two distinct values
+            among the rows that have text
+    """
+
+    # Computing nrows and getting the structure
+    nrows = len(labels)//ncols + 1*((len(labels)%ncols) != 0)
+    fig = plt.figure(constrained_layout=True, figsize=(4*ncols, 5*nrows))
+    # squeeze=False: a 1x1 grid would otherwise come back as a bare SubFigure,
+    # which has no flatten()
+    figs = fig.subfigures(nrows, ncols, squeeze=False).flatten()
+    axs = [x.subplots(2, 1) for x in figs]
+
+    # Random shade per word; the varying channel stays within the valid 0-255 range
+    def rand_color_label0(*args, **kwargs):
+        return "rgb(0, 100, {})".format(random.randint(200, 255))
+
+    def rand_color_label1(*args, **kwargs):
+        return "rgb({}, 0, 100)".format(random.randint(200, 255))
+
+    color_fn = [rand_color_label0, rand_color_label1]
+
+    # Getting Xy
+    Xy = get_Xy_df(X, y)
+
+    # Text preprocessing; copy() so the new column is not written onto a view of Xy.
+    # .str.replace works on substrings (Series.replace only matches whole cell values)
+    Xy = Xy.dropna(subset=[text_column]).copy()
+    Xy["text_preprocessed"] = Xy[text_column] \
+        .str.replace(",", " ", regex=False).str.lower()
+
+    # Generating the plots
+    for i, label in enumerate(labels):
+        figs[i].suptitle(label)
+
+        # One concatenated document per label value (groupby sorts the values)
+        text_data = Xy[[label, "text_preprocessed"]].dropna() \
+            .groupby(label)["text_preprocessed"].agg(" ".join)
+
+        if len(text_data) != 2:
+            raise ValueError(
+                f"Label {label!r} must take exactly two distinct values to compute odds ratios, found {len(text_data)}"
+            )
+
+        # Training countvectorizer model then counting the odd
+        cv = CountVectorizer().fit(text_data)
+        counts = cv.transform(text_data).toarray() + 1 # Smoothing count
+        # A word that is rare for either label value is neutralised for both
+        rare_words = (counts <= (min_occurrence+1)).any(axis=0)
+        counts[:, rare_words] = 1 # Set the odds to neutral odd
+        frequencies = counts/counts.sum(axis=1, keepdims=True)
+
+        # get_feature_names() was removed in scikit-learn 1.2
+        if hasattr(cv, "get_feature_names_out"):
+            vocabulary = list(cv.get_feature_names_out())
+        else:
+            vocabulary = cv.get_feature_names()
+
+        # Index rows by position, not by label value, so float labels (0.0/1.0) work too
+        for row, (label_value, _) in enumerate(text_data.items()):
+            values = (frequencies[row,:]/frequencies[1-row,:]).tolist()
+
+            axs[i][row].imshow(
+                WordCloud(background_color = "white", relative_scaling=0.2, max_words = 25, color_func=color_fn[row]).generate_from_frequencies(
+                    frequencies=dict(zip(
+                        vocabulary,
+                        values
+                    ))
+                )
+            )
+            axs[i][row].set_xlabel(f"{label_value}")
+
+    fig.suptitle("WordCloud selon le label")
+    plt.show()

+ 5 - 2
requirements.txt

@@ -1,4 +1,7 @@
 progress
-pyarrow
 pandas
-sqlite3
+numpy
+seaborn
+fasttext
+wordcloud
+scikit-learn

Niektoré súbory nie sú zobrazené, pretože je v týchto rozdielových dátach zmenené mnoho súborov