123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394 |
- from matplotlib import pyplot as plt
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.metrics import roc_curve, roc_auc_score, precision_score, recall_score, accuracy_score, ConfusionMatrixDisplay, f1_score, confusion_matrix
- from wordcloud import WordCloud
- import seaborn as sns
- import itertools
- import pandas as pd
- import numpy as np
- import random
- from .preprocessing import get_Xy_df
- def plot_all_scatter (X, variables, ncols=3, figsize=(20,10)):
- """
- This function produce a scatter view of all the variables from a dataset
- Parameters
- ----------
- X: Pandas Dataframe
- variables: [str], list of variables name
- n_cols: int, number of columns in the plot
- figsize: (int, int), tuple of the figure size
- """
- # Getting nrows
- nrows = (len(variables) // ncols) + 1*((len(variables) % ncols) != 0)
- figs, axs = plt.subplots(nrows, ncols, figsize=figsize)
- axs = axs.flatten()
- for i in range(len(variables)):
- variable = variables[i]
- sns.scatterplot(
- data=X[variable].value_counts(),
- ax = axs[i]
- )
- axs[i].ticklabel_format(style='scientific', axis='x', scilimits=(0, 4))
- axs[i].set_xlabel("Valeur")
- axs[i].set_ylabel("Nombre d'occurences")
- axs[i].set_title(variable)
- plt.tight_layout()
- def plot_missing_outcome(X, y, features, labels, figsize=(20,10)):
- """
- This function produce a line plot of all the missings values according to the outcomes values
- Parameters
- ----------
- X: Pandas Dataframe of features
- y: Pandas Dataframe of labels
- features: [str], list of variables name
- labels: [str], list of output name
- figsize: (int, int), tuple of the figure size
- """
- Xy = get_Xy_df(X, y)
- data = Xy[labels].join(
- pd.DataFrame(Xy[features].isna().astype("int").sum(axis=1))
- ).rename(columns={0:"n_NA"}) \
- .groupby("n_NA") \
- .agg(lambda x: x.sum()/x.count())*100
- fig,ax = plt.subplots(1, 1, figsize=figsize)
- sns.lineplot(
- data=pd.melt(data.reset_index(), id_vars="n_NA",value_vars=data.columns),
- hue="variable",
- x="n_NA",
- y="value",
- ax=ax
- )
- ax.set_xlabel("Nombre de valeurs manquantes")
- ax.set_ylabel("Pourcentage d'examen prescrit")
- ax.set_ylim(0,100)
- ax.set_title("% de prescription de bilans en fonction du nombre de variables manquantes")
- def plot_missing_bar(X, features, figsize=(15,10)):
- """
- This function produce a bar plot of all the missings values
- Parameters
- ----------
- X: Pandas Dataframe of features
- features: [str], list of variables name
- figsize: (int, int), tuple of the figure size
- """
- fig, ax = plt.subplots(1,1, figsize=figsize)
- data = ((X[features].isna()*1).mean()*100).reset_index()
- sns.barplot(
- data=data,
- x="index",
- y=0,
- ax=ax
- )
- ax.set_title("% de valeurs manquantes par variable")
- ax.set_xlabel("Variable")
- ax.set_ylim(0,100)
- ax.set_ylabel("% de valeurs manquantes")
- def plot_correlation(X, features, figsize=(10,6)):
- """
- This function produce a heatmap plot of all variables correlation values
- Parameters
- ----------
- X: Pandas Dataframe of features
- features: [str], list of variables name
- figsize: (int, int), tuple of the figure size
- """
- fig, ax = plt.subplots(figsize = figsize)
- correlation_matrix = X[features].corr()
- sns.heatmap(
- correlation_matrix,
- cmap='YlGn',
- ax=ax
- )
-
- ax.set_title('Corrélations entre les features');
- def plot_labels_frequencies_and_correlation(y, labels, figsize=(30,10)):
- """
- This function produce a bar of label proportion and heatmap plot of all labels correlation values
- Parameters
- ----------
- y: Pandas Dataframe of labels
- labels: [str], list of labels name
- figsize: (int, int), tuple of the figure size
- """
- fig, axs = plt.subplots(1, 2, figsize=figsize)
- axs = axs.flatten()
- # Plotting labels proportion
- labels_data = ((y[labels].sum()/y.shape[0])*100).reset_index().round(2)
- sns.barplot(
- data=labels_data,
- x="index",
- y=0,
- ax=axs[0]
- )
- axs[0].tick_params(labelrotation=45)
- axs[0].set_ylim(0,100)
- axs[0].set_title("Proportion d'examens biologiques réalisés")
- axs[0].set_xlabel("Examens biologiques")
- axs[0].set_ylabel("% d'examens réalisés")
- # Plotting correlation
- correlation_data = y[labels].corr()
- sns.heatmap(correlation_data, ax=axs[1], cmap='YlGn')
- axs[1].set_title('Correlations entre les labels');
- def plot_box_variable_label_distribution(X, y, features, labels):
- """
- This function produce a box plot of the features distribution according to the variable status
- Parameters
- ----------
- X: Pandas Dataframe of features
- y: Pandas Dataframe of labels
- features: [str], list of variables name
- labels: [str], list of output name
- """
- # Generating colormap
- colors = sns.color_palette("muted", 2*len(features))
- # Getting Xy dataframe
- Xy = get_Xy_df(X, y)
- fig = plt.figure(constrained_layout=True, figsize=(5*len(labels),2*len(features)))
- figs = fig.subfigures(len(labels), 1)
- axs = [x.subplots(1, len(features)) for x in figs]
- for i in range(len(labels)):
- figs[i].suptitle(f"Distribution des variables selon le statut {labels[i]} (réalisé (1) ou non (0))")
- for j in range(len(features)):
- feature_name, variable_name = features[j], labels[i]
- axs[i][j].set_title(feature_name)
- axs[i][j].set_xlabel(variable_name)
- axs[i][j].set_ylabel(feature_name)
- sns.boxplot(
- data=Xy,
- ax=axs[i][j],
- x=variable_name,
- y=feature_name,
- showfliers=False,
- palette=colors[j*2:(j+1)*2]
- )
- fig.suptitle("Distribution des features en fonction du label")
- plt.show()
- def plot_odd_word_wc(X, y, text_column, labels, min_occurrence=3, ncols=5):
- """
- This function produce a word cloud of words odd-ratio (odd-ratio of seing the word given the label)
- Parameters
- ----------
- X: Pandas Dataframe of features
- y: Pandas Dataframe of labels
- text_column: str, name of the column containing the text
- labels: [str], list of output name
- min_occurrence: int, minimum number of ocurrence of the word
- ncols: int, number of columns in the output plot
- """
- # Computing nrows an getting the structure
- nrows = len(labels)//ncols + 1*((len(labels)%ncols) != 0)
- fig = plt.figure(constrained_layout=True, figsize=(4*ncols, 5*nrows))
- figs = fig.subfigures(nrows, ncols)
- figs = figs.flatten()
- axs = [x.subplots(2, 1) for x in figs]
- def rand_color_label0(*args, **kwargs):
- return "rgb(0, 100, {})".format(random.randint(200, 455))
- def rand_color_label1(*args, **kwargs):
- return "rgb({}, 0, 100)".format(random.randint(200, 455))
- color_fn = [rand_color_label0, rand_color_label1]
- # Getting Xy
- Xy = get_Xy_df(X, y)
- # Text preprocessing
- Xy = Xy.dropna(subset=[text_column])
- Xy["text_preprocessed"] = Xy[text_column] \
- .replace(",", " ").str.lower()
- # Generating the plots
- for i in range(len(labels)):
- label = labels[i]
- figs[i].suptitle(label)
- # Filtering text data
- text_data = Xy[[label, "chiefcomplaint"]].dropna().groupby(label).agg(lambda x: " ".join(x))["chiefcomplaint"]
- # Training countvectorizer model then counting the odd
- cv = CountVectorizer().fit(text_data)
- text_data_array = (cv.transform(text_data).toarray()+1) # Smoothing count
- text_data_array[:,np.where(text_data_array <= (min_occurrence+1))[1]] = 1 # Set the odds to neutral odd
- text_data_array = text_data_array/text_data_array.sum(axis=1).reshape(2, -1)
- for j, text in text_data.items():
- values = (text_data_array[j,:]/(text_data_array[1-j,:])).tolist()
- axs[i][j].imshow(
- WordCloud(background_color = "white", relative_scaling=0.2, max_words = 25, color_func=color_fn[j]).generate_from_frequencies(
- frequencies=dict(zip(
- cv.get_feature_names(),
- values
- ))
- )
- )
- axs[i][j].set_xlabel(f"{j}")
- fig.suptitle("WordCloud selon le label")
- plt.show()
- def vizualize_features_selection (scores, score_name, f_precision=2, n_score_max=5, ncols=3):
- """
- This function produce an heatmap of metrics score according to each variables combination
- Parameters
- ----------
- scores: Dictionnary containing a list of combination and associated score for each label produced by the .models.get_features_selection function
- score_name: str, Name of the score
- f_precision: int, floating point precision is the number of decimal to keep
- n_score_max: int, maximum number of scores to display
- ncols: int, number of columns in the output plot
- """
- # Creating a dataframe containing the scores
- scores_df = []
- for key, value in scores.items():
- scores_df_temp = pd.DataFrame(
- [dict(zip(x[0], [x[1] for i in range(len(x[0]))])) for x in value]
- ).assign(score=lambda x: x.max(axis=1))
- scores_df_temp.iloc[:,:-1] = (scores_df_temp.iloc[:,:-1].fillna("")*0).astype("str").replace("0.0", "x")
- scores_df_temp["name"] = key
- scores_df.append(scores_df_temp.sort_values("score", ascending=False))
- scores_df = pd.concat(scores_df).reset_index(drop=True)
- scores_df["n_features"] = (scores_df == "x").sum(axis=1)
- scores_df[score_name] = scores_df["score"].round(f_precision)
- scores_df = scores_df.sort_values(["name", "roc_auc", score_name], ascending=[True, False, True]).drop_duplicates(["name", score_name])
- # Plotting the dataframe
- scores_list = scores_df["name"].drop_duplicates().values.tolist()
- ncols = 3
- nrows = len(scores_list)//ncols + (len(scores_list)%ncols != 0)*1
- fig, axs = plt.subplots(nrows, ncols, figsize=(5*ncols,4*nrows))
- axs = axs.flatten()
- for i in range(len(scores_list)):
- score = scores_list[i]
- sns.heatmap(
- (scores_df.query(f"name == '{score}'").set_index("roc_auc").head(n_score_max).iloc[:, :-3] == 'x')*1,
- ax=axs[i]
- )
- axs[i].set_title(score)
- fig.suptitle(f"{score_name} according to features included in the model")
- plt.tight_layout()
- def display_model_performances(classifier, X_test, y_test, algorithm_name="", threshold=0.5, ncols=1):
- """
- This function produce a vizualization of the model performances
- Parameters
- ----------
- classifier: python object which should contains a predict and a predict_proba method, if many labels a dict in the format {label:classifier,...} is expected
- X_test: pandas dataframe of the features
- y_test: pandas dataframe of the labels
- algorithm_name: str, name of the algorithm
- threshold: float, threshold for classification
- ncols: int, number of columns
- """
- # Checking type of y_test
- if isinstance(y_test, pd.Series):
- y_test = pd.DataFrame(y_test)
- # Checking if one or many labels
- if len(y_test.shape) > 1 and y_test.shape[1] > 1:
- if isinstance(classifier, dict) == False or len(classifier.keys()) != y_test.shape[1]:
- raise ValueError("You should provide as many classifier than labels")
- else:
- if isinstance(classifier, dict) == False:
- classifier = {y_test.columns[0]:classifier}
- labels = y_test.columns.tolist()
- # Construction of the pyplot object
- nrows = (len(labels)//ncols) + ((len(labels)%ncols)!=0)*1
- fig = plt.figure(constrained_layout=True, figsize=(15*ncols,7*nrows))
- figs = fig.subfigures(nrows, ncols)
- figs = figs.flatten()
- if (ncols+nrows) == 1:
- figs = [figs]
- axs = [x.subplots(1, 2) for x in figs]
- # For each label :
- for i in range(len(labels)):
- label = labels[i]
- label_classifier = classifier[label]
- figs[i].suptitle(label)
- y_test_true = y_test[label].values
- y_test_hat_proba = label_classifier.predict_proba(X_test)[:,1]
- y_test_hat = (y_test_hat_proba >= threshold)*1
- # Computation of metrics
- f1_score_, accuracy_score_, recall_score_, precision_score_ = [x(y_test_true, y_test_hat) for x in [f1_score, accuracy_score, recall_score, precision_score]]
- auc_score_ = roc_auc_score(y_test_true, y_test_hat_proba)
- confusion_matrix_ = confusion_matrix(y_test_true, y_test_hat)
- # Plotting
- ## Confusion matrix
- ConfusionMatrixDisplay(
- confusion_matrix_,
- display_labels=[0, 1]
- ).plot(
- ax=axs[i][0]
- )
- ## ROC curve
- fpr, tpr, thresholds = roc_curve(
- y_test_true,
- y_test_hat_proba
- )
- axs[i][1].plot(
- fpr,
- tpr,
- label=f"AUC: {auc_score_:.2f}\nF1-Score: {f1_score_:.2f}\nRecall: {recall_score_:.2f}\nPrecision: {precision_score_:.2f}\nAccuracy: {accuracy_score_:.2f}"
- )
- axs[i][1].legend(loc=4, fontsize="x-large")
- axs[i][1].set_ylabel('Taux de vrai positifs')
- axs[i][1].set_xlabel('Taux de faux positifs')
- fig.suptitle(f"Performance de l'algorithme {algorithm_name} avec un threshold de {threshold}")
|