visualisation.py

from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import (
    roc_curve, roc_auc_score, precision_score, recall_score, accuracy_score,
    ConfusionMatrixDisplay, f1_score, confusion_matrix
)
from wordcloud import WordCloud
import seaborn as sns
import itertools
import pandas as pd
import numpy as np
import random

from .preprocessing import get_Xy_df

def plot_all_scatter(X, variables, ncols=3, figsize=(20, 10)):
    """
    This function produces a scatter view of the value counts of each variable of a dataset.

    Parameters
    ----------
    X: pandas DataFrame of features
    variables: [str], list of variable names
    ncols: int, number of columns in the plot
    figsize: (int, int), tuple giving the figure size
    """

    # Number of rows needed to fit all variables with ncols columns
    nrows = (len(variables) // ncols) + 1 * ((len(variables) % ncols) != 0)

    fig, axs = plt.subplots(nrows, ncols, figsize=figsize)
    axs = axs.flatten()

    for i in range(len(variables)):
        variable = variables[i]
        # One scatter plot of the value counts per variable
        sns.scatterplot(
            data=X[variable].value_counts(),
            ax=axs[i]
        )
        axs[i].ticklabel_format(style='scientific', axis='x', scilimits=(0, 4))
        axs[i].set_xlabel("Valeur")
        axs[i].set_ylabel("Nombre d'occurrences")
        axs[i].set_title(variable)

    plt.tight_layout()

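# A minimal usage sketch (hypothetical DataFrame and column names):
#
#   df = pd.DataFrame({"age": np.random.randint(18, 90, 500),
#                      "heartrate": np.random.randint(40, 180, 500)})
#   plot_all_scatter(df, variables=["age", "heartrate"], ncols=2)
#   plt.show()
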
def plot_missing_outcome(X, y, features, labels, figsize=(20, 10)):
    """
    This function produces a line plot of the outcome rates according to the number of missing values.

    Parameters
    ----------
    X: pandas DataFrame of features
    y: pandas DataFrame of labels
    features: [str], list of feature names
    labels: [str], list of label names
    figsize: (int, int), tuple giving the figure size
    """

    Xy = get_Xy_df(X, y)

    # For each number of missing features (n_NA), compute the share of positive labels (in %)
    data = Xy[labels].join(
        pd.DataFrame(Xy[features].isna().astype("int").sum(axis=1))
    ).rename(columns={0: "n_NA"}) \
        .groupby("n_NA") \
        .agg(lambda x: x.sum() / x.count()) * 100

    fig, ax = plt.subplots(1, 1, figsize=figsize)
    sns.lineplot(
        data=pd.melt(data.reset_index(), id_vars="n_NA", value_vars=data.columns),
        hue="variable",
        x="n_NA",
        y="value",
        ax=ax
    )
    ax.set_xlabel("Nombre de valeurs manquantes")
    ax.set_ylabel("Pourcentage d'examens prescrits")
    ax.set_ylim(0, 100)
    ax.set_title("% de prescription de bilans en fonction du nombre de variables manquantes")

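# A minimal usage sketch (hypothetical feature and label names; X and y are the
# aligned DataFrames expected by .preprocessing.get_Xy_df):
#
#   plot_missing_outcome(X, y,
#                        features=["age", "heartrate"],
#                        labels=["cbc", "lipase"],
#                        figsize=(12, 6))
#   plt.show()
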
def plot_missing_bar(X, features, figsize=(15, 10)):
    """
    This function produces a bar plot of the share of missing values per feature.

    Parameters
    ----------
    X: pandas DataFrame of features
    features: [str], list of feature names
    figsize: (int, int), tuple giving the figure size
    """

    fig, ax = plt.subplots(1, 1, figsize=figsize)

    # Percentage of missing values per feature
    data = ((X[features].isna() * 1).mean() * 100).reset_index()
    sns.barplot(
        data=data,
        x="index",
        y=0,
        ax=ax
    )

    ax.set_title("% de valeurs manquantes par variable")
    ax.set_xlabel("Variable")
    ax.set_ylim(0, 100)
    ax.set_ylabel("% de valeurs manquantes")

def plot_correlation(X, features, figsize=(10, 6)):
    """
    This function produces a heatmap of the correlations between features.

    Parameters
    ----------
    X: pandas DataFrame of features
    features: [str], list of feature names
    figsize: (int, int), tuple giving the figure size
    """

    fig, ax = plt.subplots(figsize=figsize)

    correlation_matrix = X[features].corr()
    sns.heatmap(
        correlation_matrix,
        cmap='YlGn',
        ax=ax
    )
    ax.set_title('Corrélations entre les features')

def plot_labels_frequencies_and_correlation(y, labels, figsize=(30, 10)):
    """
    This function produces a bar plot of the label proportions and a heatmap of the correlations between labels.

    Parameters
    ----------
    y: pandas DataFrame of labels
    labels: [str], list of label names
    figsize: (int, int), tuple giving the figure size
    """

    fig, axs = plt.subplots(1, 2, figsize=figsize)
    axs = axs.flatten()

    # Plotting the label proportions (% of positive samples per label)
    labels_data = ((y[labels].sum() / y.shape[0]) * 100).reset_index().round(2)
    sns.barplot(
        data=labels_data,
        x="index",
        y=0,
        ax=axs[0]
    )
    axs[0].tick_params(labelrotation=45)
    axs[0].set_ylim(0, 100)
    axs[0].set_title("Proportion d'examens biologiques réalisés")
    axs[0].set_xlabel("Examens biologiques")
    axs[0].set_ylabel("% d'examens réalisés")

    # Plotting the label correlations
    correlation_data = y[labels].corr()
    sns.heatmap(correlation_data, ax=axs[1], cmap='YlGn')
    axs[1].set_title('Corrélations entre les labels')

def plot_box_variable_label_distribution(X, y, features, labels):
    """
    This function produces box plots of the feature distributions according to the label status.

    Parameters
    ----------
    X: pandas DataFrame of features
    y: pandas DataFrame of labels
    features: [str], list of feature names
    labels: [str], list of label names
    """

    # Generating the colormap: two colors (label 0 / label 1) per feature
    colors = sns.color_palette("muted", 2 * len(features))

    # Getting the Xy dataframe
    Xy = get_Xy_df(X, y)

    # One subfigure per label, one subplot per feature
    fig = plt.figure(constrained_layout=True, figsize=(5 * len(labels), 2 * len(features)))
    figs = fig.subfigures(len(labels), 1)
    axs = [x.subplots(1, len(features)) for x in figs]

    for i in range(len(labels)):
        figs[i].suptitle(f"Distribution des variables selon le statut {labels[i]} (réalisé (1) ou non (0))")
        for j in range(len(features)):
            feature_name, variable_name = features[j], labels[i]
            axs[i][j].set_title(feature_name)
            axs[i][j].set_xlabel(variable_name)
            axs[i][j].set_ylabel(feature_name)
            sns.boxplot(
                data=Xy,
                ax=axs[i][j],
                x=variable_name,
                y=feature_name,
                showfliers=False,
                palette=colors[j * 2:(j + 1) * 2]
            )

    fig.suptitle("Distribution des features en fonction du label")
    plt.show()

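# A minimal usage sketch (hypothetical feature and label names):
#
#   plot_box_variable_label_distribution(X, y,
#                                        features=["age", "heartrate"],
#                                        labels=["cbc", "lipase"])
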
def plot_odd_word_wc(X, y, text_column, labels, min_occurrence=3, ncols=5):
    """
    This function produces a word cloud of the word odds ratios (odds of seeing the word given the label).

    Parameters
    ----------
    X: pandas DataFrame of features
    y: pandas DataFrame of labels
    text_column: str, name of the column containing the text
    labels: [str], list of label names
    min_occurrence: int, minimum number of occurrences of a word for it to be kept
    ncols: int, number of columns in the output plot
    """

    # Computing nrows and getting the figure structure:
    # one subfigure per label, with one word cloud per label value (0 / 1)
    nrows = len(labels) // ncols + 1 * ((len(labels) % ncols) != 0)
    fig = plt.figure(constrained_layout=True, figsize=(4 * ncols, 5 * nrows))
    # np.atleast_1d guards against a bare SubFigure being returned for a single subfigure
    figs = np.atleast_1d(fig.subfigures(nrows, ncols)).flatten()
    axs = [x.subplots(2, 1) for x in figs]

    # Random color generators for each label value (RGB components must stay within [0, 255])
    def rand_color_label0(*args, **kwargs):
        return "rgb(0, 100, {})".format(random.randint(200, 255))

    def rand_color_label1(*args, **kwargs):
        return "rgb({}, 0, 100)".format(random.randint(200, 255))

    color_fn = [rand_color_label0, rand_color_label1]

    # Getting Xy
    Xy = get_Xy_df(X, y)

    # Text preprocessing: dropping missing texts, replacing commas with spaces and lowercasing
    Xy = Xy.dropna(subset=[text_column])
    Xy["text_preprocessed"] = Xy[text_column] \
        .str.replace(",", " ").str.lower()

    # Generating the plots
    for i in range(len(labels)):
        label = labels[i]
        figs[i].suptitle(label)

        # Concatenating the texts of each label value (0 / 1)
        text_data = Xy[[label, "text_preprocessed"]].dropna().groupby(label).agg(lambda x: " ".join(x))["text_preprocessed"]

        # Training the CountVectorizer model then computing the odds
        cv = CountVectorizer().fit(text_data)
        text_data_array = (cv.transform(text_data).toarray() + 1)  # Smoothing the counts
        text_data_array[:, np.where(text_data_array <= (min_occurrence + 1))[1]] = 1  # Setting rare words to a neutral odd
        text_data_array = text_data_array / text_data_array.sum(axis=1).reshape(2, -1)  # Word frequencies per label value

        for j, text in text_data.items():
            # Odds ratio of each word for label value j against the other label value
            values = (text_data_array[j, :] / (text_data_array[1 - j, :])).tolist()
            axs[i][j].imshow(
                WordCloud(background_color="white", relative_scaling=0.2, max_words=25, color_func=color_fn[j]).generate_from_frequencies(
                    frequencies=dict(zip(
                        cv.get_feature_names_out(),  # get_feature_names() in older scikit-learn versions
                        values
                    ))
                )
            )
            axs[i][j].set_xlabel(f"{j}")

    fig.suptitle("WordCloud selon le label")
    plt.show()

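# A minimal usage sketch (hypothetical label names; "chiefcomplaint" stands in for the
# free-text column of the dataset):
#
#   plot_odd_word_wc(X, y, text_column="chiefcomplaint",
#                    labels=["cbc", "lipase"], min_occurrence=3, ncols=2)
#   plt.show()
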
def vizualize_features_selection(scores, score_name, f_precision=2, n_score_max=5, ncols=3):
    """
    This function produces a heatmap of the metric score according to each variable combination.

    Parameters
    ----------
    scores: dict containing, for each label, a list of (variable combination, score) pairs as produced by the .models.get_features_selection function
    score_name: str, name of the score
    f_precision: int, floating point precision (number of decimals to keep)
    n_score_max: int, maximum number of scores to display
    ncols: int, number of columns in the output plot
    """

    # Creating a dataframe containing the scores: one row per combination,
    # one column per feature, marked with "x" when the feature belongs to the combination
    scores_df = []
    for key, value in scores.items():
        scores_df_temp = pd.DataFrame(
            [dict(zip(x[0], [x[1] for i in range(len(x[0]))])) for x in value]
        ).assign(score=lambda x: x.max(axis=1))
        scores_df_temp.iloc[:, :-1] = (scores_df_temp.iloc[:, :-1].fillna("") * 0).astype("str").replace("0.0", "x")
        scores_df_temp["name"] = key
        scores_df.append(scores_df_temp.sort_values("score", ascending=False))

    scores_df = pd.concat(scores_df).reset_index(drop=True)
    scores_df["n_features"] = (scores_df == "x").sum(axis=1)
    scores_df[score_name] = scores_df["score"].round(f_precision)
    # Keeping, for each (label, rounded score) pair, the combination with the best raw score
    scores_df = scores_df.sort_values(["name", "score", score_name], ascending=[True, False, True]).drop_duplicates(["name", score_name])

    # Plotting the dataframe: one heatmap per label
    scores_list = scores_df["name"].drop_duplicates().values.tolist()
    nrows = len(scores_list) // ncols + (len(scores_list) % ncols != 0) * 1

    fig, axs = plt.subplots(nrows, ncols, figsize=(5 * ncols, 4 * nrows))
    axs = axs.flatten()

    for i in range(len(scores_list)):
        score = scores_list[i]
        sns.heatmap(
            (scores_df.query(f"name == '{score}'").set_index(score_name).head(n_score_max).iloc[:, :-3] == 'x') * 1,
            ax=axs[i]
        )
        axs[i].set_title(score)

    fig.suptitle(f"{score_name} according to features included in the model")
    plt.tight_layout()

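# A sketch of the expected `scores` layout (hypothetical feature and label names),
# assuming each entry is a (feature combination, score) pair as produced by
# .models.get_features_selection:
#
#   scores = {
#       "cbc": [(["age"], 0.71), (["age", "heartrate"], 0.78)],
#       "lipase": [(["age"], 0.65), (["age", "heartrate"], 0.69)],
#   }
#   vizualize_features_selection(scores, score_name="roc_auc", n_score_max=5)
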
def display_model_performances(classifier, X_test, y_test, algorithm_name="", threshold=0.5, ncols=1):
    """
    This function produces a visualization of the model performances.

    Parameters
    ----------
    classifier: python object which should have a predict and a predict_proba method; if there are several labels, a dict in the format {label: classifier, ...} is expected
    X_test: pandas DataFrame of the features
    y_test: pandas DataFrame of the labels
    algorithm_name: str, name of the algorithm
    threshold: float, classification threshold
    ncols: int, number of columns
    """

    # Checking the type of y_test
    if isinstance(y_test, pd.Series):
        y_test = pd.DataFrame(y_test)

    # Checking whether there is one label or many
    if len(y_test.shape) > 1 and y_test.shape[1] > 1:
        if not isinstance(classifier, dict) or len(classifier.keys()) != y_test.shape[1]:
            raise ValueError("You should provide as many classifiers as labels")
    else:
        if not isinstance(classifier, dict):
            classifier = {y_test.columns[0]: classifier}

    labels = y_test.columns.tolist()

    # Construction of the pyplot object: one subfigure per label,
    # each containing a confusion matrix and a ROC curve
    nrows = (len(labels) // ncols) + ((len(labels) % ncols) != 0) * 1
    fig = plt.figure(constrained_layout=True, figsize=(15 * ncols, 7 * nrows))
    # np.atleast_1d guards against a bare SubFigure being returned for a single subfigure
    figs = np.atleast_1d(fig.subfigures(nrows, ncols)).flatten()
    axs = [x.subplots(1, 2) for x in figs]

    # For each label:
    for i in range(len(labels)):
        label = labels[i]
        label_classifier = classifier[label]
        figs[i].suptitle(label)

        y_test_true = y_test[label].values
        y_test_hat_proba = label_classifier.predict_proba(X_test)[:, 1]
        y_test_hat = (y_test_hat_proba >= threshold) * 1

        # Computation of the metrics
        f1_score_, accuracy_score_, recall_score_, precision_score_ = [x(y_test_true, y_test_hat) for x in [f1_score, accuracy_score, recall_score, precision_score]]
        auc_score_ = roc_auc_score(y_test_true, y_test_hat_proba)
        confusion_matrix_ = confusion_matrix(y_test_true, y_test_hat)

        # Plotting
        ## Confusion matrix
        ConfusionMatrixDisplay(
            confusion_matrix_,
            display_labels=[0, 1]
        ).plot(
            ax=axs[i][0]
        )

        ## ROC curve
        fpr, tpr, thresholds = roc_curve(
            y_test_true,
            y_test_hat_proba
        )
        axs[i][1].plot(
            fpr,
            tpr,
            label=f"AUC: {auc_score_:.2f}\nF1-Score: {f1_score_:.2f}\nRecall: {recall_score_:.2f}\nPrecision: {precision_score_:.2f}\nAccuracy: {accuracy_score_:.2f}"
        )
        axs[i][1].legend(loc=4, fontsize="x-large")
        axs[i][1].set_ylabel('Taux de vrais positifs')
        axs[i][1].set_xlabel('Taux de faux positifs')

    fig.suptitle(f"Performance de l'algorithme {algorithm_name} avec un threshold de {threshold}")
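

# A minimal usage sketch (hypothetical split and model; any estimator exposing
# predict_proba works):
#
#   from sklearn.linear_model import LogisticRegression
#   clf = {"cbc": LogisticRegression(max_iter=1000).fit(X_train, y_train["cbc"])}
#   display_model_performances(clf, X_test, y_test[["cbc"]],
#                              algorithm_name="Logistic regression", threshold=0.5)
#   plt.show()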