
Adding models

Ali committed 2 years ago
parent
commit
6e3f54c8d3

The file diff has been suppressed because it is too large
+ 3 - 3
Project_Report.ipynb


+ 185 - 0
bop_scripts/models.py

@@ -0,0 +1,185 @@
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline, FeatureUnion
+from sklearn.preprocessing import OrdinalEncoder, StandardScaler
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.impute import SimpleImputer, MissingIndicator
+from sklearn.model_selection import cross_val_score
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.base import clone
+from .preprocessing import OutlierRemover, TextPreprocessing
+from warnings import simplefilter
+import itertools
+
+def generate_model (classifier, categorical_variables, continuous_variables, text_variable=None, missing_indicator=True, OrdinalEncoder_kwargs={}, StandardScaler_kwargs={}, CountVectorizer_kwargs={}, SimpleImputer_kwargs={}, MissingIndicator_kwargs={}, remove_outliers=False, outliers_variables_ranges=None):
+    """
+        Generate a model Pipeline containing features pre-processing 
+
+        Parameters
+        ----------
+        classifier: sklearn classifier object with fit and predict methods
+        categorical_variables: [str], list of categorical variables
+        continuous_variables: [str], list of continuous variables
+        text_variable: str, name of the text variable, None if there is none
+        missing_indicator: boolean, if True a missing indicator is added to the Pipeline
+        OrdinalEncoder_kwargs: dict, arguments passed to the ordinal encoder
+        StandardScaler_kwargs: dict, arguments passed to the standard scaler
+        CountVectorizer_kwargs: dict, arguments passed to the count vectorizer
+        SimpleImputer_kwargs: dict, arguments passed to the simple imputer
+        MissingIndicator_kwargs: dict, arguments passed to the missing indicator
+        remove_outliers: boolean, if True the outliers are set to NaN
+        outliers_variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionary mapping each variable to its lower and upper bounds
+    """
+
+    variables = categorical_variables+continuous_variables
+    if text_variable is not None:
+        variables += [text_variable]
+
+    # Features pre-processing
+    features_preprocessing_list = []
+
+    ## Outliers removal
+    if remove_outliers and outliers_variables_ranges is not None:
+        # Creating the range list
+        outliers_variables_range_clean = {x: y for x, y in outliers_variables_ranges.items() if x in variables}
+        features_preprocessing_list.append(("outliers", OutlierRemover(variables_ranges=outliers_variables_range_clean), list(outliers_variables_range_clean.keys())))
+
+    if len(categorical_variables) > 0:
+        features_preprocessing_list.append(("binary_encoder", OrdinalEncoder(**OrdinalEncoder_kwargs), categorical_variables))
+    if len(continuous_variables) > 0:
+        features_preprocessing_list.append(("continuous_scaling", StandardScaler(**StandardScaler_kwargs), continuous_variables))
+    if text_variable is not None:
+        # Text pre-processing then count vectorizer
+        text_preprocessing_pipeline = Pipeline([
+            ("text_preprocessing", TextPreprocessing()),
+            ("text_countvectorizer", CountVectorizer(**CountVectorizer_kwargs))
+        ])
+        
+        features_preprocessing_list.append(("text_encoding", text_preprocessing_pipeline, text_variable))
+        
+    # Imputation methods
+    imputation_list = []
+    imputation_list.append(("missing_imputer", SimpleImputer(**SimpleImputer_kwargs)))
+    if missing_indicator:
+        imputation_list.append(
+            ("missing_indicator", MissingIndicator(**MissingIndicator_kwargs))
+        )
+
+    # Creating the pipeline
+
+    features_preprocessing = ColumnTransformer(features_preprocessing_list)
+    full_preprocessing = Pipeline([
+        ("features", features_preprocessing),
+        ("impute_and_store_missing", FeatureUnion(imputation_list)),
+    ])
+
+    pipeline = Pipeline([
+        ("preprocessing", full_preprocessing),
+        ("lr", classifier)
+    ])
+    
+    return pipeline
+
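Note — a minimal usage sketch of generate_model; the classifier choice and column names below are hypothetical, not part of this commit:

    from sklearn.linear_model import LogisticRegression

    pipeline = generate_model(
        LogisticRegression(max_iter=1000),
        categorical_variables=["sex"],              # hypothetical column names
        continuous_variables=["age", "weight"],
        text_variable="clinical_notes",
        CountVectorizer_kwargs={"ngram_range": (1, 1), "max_features": 200},
    )
    # pipeline.fit(X_train, y_train["some_label"]) then pipeline.predict(X_test)
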
+def get_features_selection (X, y, classifier, categorical_variables, continuous_variables, text_variable=None, min_features=1, cv=3, metric_score="roc_auc"):
+    """
+        This function returns the metric score for each combination of variables
+
+        Parameters
+        ----------
+        X: Pandas Dataframe of features
+        y: Pandas Dataframe of labels
+        classifier: sklearn classifier object with fit and predict methods
+        categorical_variables: [str], list of categorical variables
+        continuous_variables: [str], list of continuous variables
+        text_variable: str, name of the text variable, None if there is none
+        min_features: int, minimum number of features to include in the model
+        cv: int, cross-validation splitting strategy according to the cross_val_score documentation
+        metric_score: str, metric score to evaluate the model, according to sklearn.metrics.SCORERS.keys()
+
+        Output
+        ------
+        Dictionary mapping each label to a list of [combination, score] pairs
+    """
+
+    # Getting labels list
+    labels = y.columns.tolist()
+
+    # Getting the combinations
+    variables = categorical_variables + continuous_variables
+    if text_variable is not None:
+        variables += [text_variable]
+
+    variables_combinations = []
+    for i in range(min_features, len(variables)+1):
+        variables_combinations += itertools.combinations(variables, i)
+
+    # Getting global model
+    global_pipeline = generate_model(
+            classifier,
+            categorical_variables,
+            continuous_variables,
+            text_variable,
+            CountVectorizer_kwargs={"ngram_range":(1,1), "max_features":200}
+    )
+
+    # Preprocessing the data: we accept mixing train/eval in feature scaling here to reduce execution time
+    X_transformed = global_pipeline.steps[0][1].steps[0][1].fit_transform(X) # Fitting the features ColumnTransformer only
+
+    # Scores dictionary: one empty list per label
+    scores = dict(zip(
+        labels,
+        [[] for x in labels]
+    ))
+
+    # Getting the scores
+    for variable_combination in variables_combinations:
+        combination_categorical_variables = [x for x in categorical_variables if x in variable_combination]
+        combination_continuous_variables = [x for x in continuous_variables if x in variable_combination]
+        combination_text_variable = text_variable if (text_variable is not None and text_variable in variable_combination) else None
+
+        pipeline = generate_model(
+            classifier,
+            combination_categorical_variables,
+            combination_continuous_variables,
+            combination_text_variable,
+            CountVectorizer_kwargs={"ngram_range":(1,1), "max_features":200}
+        )
+
+        # Get X index
+        if text_variable is not None:
+            variables_index = [variables.index(x) for x in variable_combination if x != text_variable]
+            if text_variable in variable_combination:
+                variables_index += list(range(X_transformed.shape[1]-200, X_transformed.shape[1]))
+        else:
+            variables_index = [variables.index(x) for x in variable_combination]
+
+        pipeline.steps[0][1].steps.pop(0) # Removing the features preprocessing step: the data are already transformed
+
+        for label in labels:
+            score = cross_val_score(pipeline, X_transformed[:,variables_index], y[label], cv=cv, scoring=metric_score).mean()
+            scores[label].append([variable_combination, score])
+
+    return scores
+
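Note — a hypothetical call of get_features_selection (the dataframes and column names are assumed):

    from sklearn.linear_model import LogisticRegression

    # X, y: pandas DataFrames of features and labels
    scores = get_features_selection(
        X, y,
        LogisticRegression(max_iter=1000),
        categorical_variables=["sex"],
        continuous_variables=["age", "weight"],
        text_variable="clinical_notes",
        min_features=1, cv=3, metric_score="roc_auc",
    )
    # scores["some_label"] is a list of [combination, score] pairs,
    # e.g. [[("age",), 0.71], [("age", "sex"), 0.74], ...]
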
+def fit_all_classifiers(classifier, X_train, y_train, hide_warnings=True):
+    """
+        This function fits one classifier per label.
+
+        Parameters:
+        ----------
+        classifier: classifier with a fit method
+        X_train: Pandas Dataframe of features
+        y_train: Pandas Dataframe of labels
+        hide_warnings: boolean, if True the warnings will be hidden
+
+        Output:
+        -------
+        Dictionary containing one fitted classifier per label
+    """
+
+    if hide_warnings:
+        simplefilter("ignore", category=ConvergenceWarning)
+
+    labels = y_train.columns.tolist()
+    classifiers = {}
+    for label in labels:
+        classifiers[label] = clone(classifier).fit(X_train, y_train[label]) # clone: each label gets its own fitted model instead of all sharing one instance
+
+    return classifiers
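Note — a usage sketch (names are hypothetical), relying on the clone call above so that each label gets an independent fitted model:

    from sklearn.linear_model import LogisticRegression

    classifiers = fit_all_classifiers(LogisticRegression(max_iter=1000), X_train, y_train)
    y_hat = classifiers["some_label"].predict(X_test)   # one fitted model per label column
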

+ 218 - 0
bop_scripts/nn_models.py

@@ -0,0 +1,218 @@
+from random import sample
+from sklearn.base import BaseEstimator
+from sklearn.metrics import SCORERS
+from sklearn.model_selection import train_test_split
+from sklearn.utils import check_X_y, check_array
+import torch
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from scipy.sparse import issparse
+import numpy as np
+
+class torchMLP (nn.Module):
+    """
+        Neural network model for 
+    """
+
+    def __init__(self, n_features, n_labels):
+        super().__init__()
+
+        self.network = nn.Sequential(*[
+            nn.Linear(n_features, 200),
+            nn.ReLU(),
+            nn.Linear(200, 50),
+            nn.ReLU(),
+            nn.Linear(50, n_labels),
+            nn.Sigmoid()
+        ])
+
+    def forward(self, x):
+        
+        y_hat = self.network(x)
+
+        return y_hat
+
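Note — a quick shape check of torchMLP on random data (the dimensions are arbitrary):

    import torch

    net = torchMLP(n_features=10, n_labels=2)
    y_hat = net(torch.rand(16, 10))   # -> torch.Size([16, 2]), values in (0, 1) due to the Sigmoid
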
+class torchMLPClassifier_sklearn (BaseEstimator):
+
+    """
+        Pytorch neural network with a sklearn-like API
+    """
+
+    def __init__ (self, model, n_epochs=50, early_stop=True, early_stop_metric="accuracy", early_stop_validations_size=0.1, batch_size=1024, learning_rate=1e-3, class_weight=None, device_train="cpu", device_predict="cpu"):
+        """
+            Parameters:
+            -----------
+            model: uninstantiated pytorch neural network class accepting n_features and n_labels parameters
+            n_epochs: int, number of epochs
+            early_stop: boolean, if True a validation dataset is split off and used to stop training early
+            early_stop_metric: str, metric score to evaluate the model, according to sklearn.metrics.SCORERS.keys()
+            early_stop_validations_size: int or float, if float the fraction of the train dataset used for validation, otherwise the number of samples to use
+            batch_size: int, size of the training batch
+            learning_rate: float, Adam optimizer learning rate
+            class_weight: dict or str, same as the sklearn API
+            device_train: str, device on which to train
+            device_predict: str, device on which to predict
+        """
+
+        self.model = model
+
+        self.n_epochs = n_epochs
+        if early_stop and (early_stop_metric is not None) and (early_stop_metric in SCORERS.keys()) and (isinstance(early_stop_validations_size, int) or isinstance(early_stop_validations_size, float)):
+            self.early_stop = early_stop
+            self.early_stop_metric = SCORERS[early_stop_metric]
+            self.early_stop_validations_size = early_stop_validations_size
+        else:
+            self.early_stop = False
+
+        self.class_weight = class_weight
+        self.learning_rate = learning_rate
+        self.device_train = device_train
+        self.device_predict = device_predict
+        self.batch_size = batch_size
+
+    def fit(self, X, y):
+        """
+            Training the model
+
+            Parameters:
+            -----------
+            X: pandas dataframe or array of features
+            y: pandas dataframe or array of labels
+        """
+
+        X, y = check_X_y(X, y, accept_sparse=True, multi_output=True)
+        if y.ndim == 1:
+            y = np.expand_dims(y, 1)
+
+        # Validation split if early stopping
+        if self.early_stop:
+            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.early_stop_validations_size)
+            if issparse(X_val): # To deal with the sparse matrix situations
+                X_val = X_val.toarray()
+        else:
+            X_train, y_train = X, y
+
+        n_samples = y_train.shape[0]
+        n_labels_values = len(np.unique(y_train))
+        n_labels = y_train.shape[1]
+        n_features = X.shape[1]
+
+        # Instantiating the network and moving it to the training device
+        self.network = self.model(n_features=n_features, n_labels=n_labels).to(self.device_train)
+        self.optimizer = optim.Adam(self.network.parameters(), lr=self.learning_rate)
+
+
+        # Creating dataloader for X_train, y_train
+        data_loader = DataLoader(range(X_train.shape[0]), shuffle=True, batch_size=self.batch_size)
+
+        # Initializing loss function
+        ## Getting weights
+        if self.class_weight is not None:
+            if self.class_weight == "balanced":
+                weights = n_samples/(n_labels_values*np.bincount(y_train[:,0].astype(int))) # class weights computed on the first label column
+                weights_dict = dict(zip(range(len(weights)), weights))
+            else:
+                weights_dict = self.class_weight
+        else:
+            weights_dict = None
+
+        criterion = nn.BCELoss()
+
+        # Running train
+        last_score = 0
+        for i in range(self.n_epochs):
+
+            # Starting an epoch
+            for indices in data_loader:
+                self.optimizer.zero_grad()
+
+                X_train_sample, y_train_sample = X_train[indices, :], y_train[indices, :]
+                if issparse(X_train_sample): # To deal with the sparse matrix situations
+                    X_train_sample = X_train_sample.toarray()
+                X_train_sample_tensor, y_train_sample_tensor = [torch.tensor(x, dtype=torch.float32).to(self.device_train) for x in [X_train_sample, y_train_sample]]
+
+                # Weighting the loss: BCELoss's parameter is named `weight` and must be
+                # a per-sample weight tensor, so the criterion is rebuilt for each batch
+                if self.class_weight is not None:
+                    sample_weights = y_train_sample.astype("float64")
+                    for class_value, class_weight in weights_dict.items():
+                        sample_weights[y_train_sample == class_value] = class_weight
+                    criterion = nn.BCELoss(weight=torch.tensor(sample_weights, dtype=torch.float32).to(self.device_train))
+
+                # Get prediction
+                y_train_sample_hat = self.network(X_train_sample_tensor)
+
+                loss = criterion(y_train_sample_hat, y_train_sample_tensor)
+                loss.backward()
+
+                self.optimizer.step()
+
+            # End of the epoch: evaluating the validation score, stopping as soon as it decreases
+            if self.early_stop:
+                score = self.early_stop_metric(self, X_val, y_val)
+
+                if score < last_score:
+                    return self
+                else:
+                    last_score = score
+
+        return self
+
+    def predict(self, X):
+        """
+            Getting the class predictions (fixed 0.5 threshold)
+
+            Parameters:
+            -----------
+            X: pandas dataframe or array of features
+        """
+
+        y_hat_proba = self.predict_raw_proba(X)
+        y_hat = (y_hat_proba >= 0.5)*1
+
+        return y_hat
+
+    def predict_raw_proba(self, X):
+        """
+            Getting the raw prediction probabilities, one column per label
+
+            Parameters:
+            -----------
+            X: pandas dataframe or array of features
+        """
+
+        X = check_array(X, accept_sparse=True)
+        if issparse(X): # To deal with the sparse matrix situations
+            X = X.toarray()
+
+        with torch.no_grad():
+            model_predict = self.network.to(self.device_predict)
+            model_predict.eval()
+
+            # Create a tensor from X
+            X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device_predict)
+            
+            y_hat_proba_torch = model_predict(X_tensor)
+            y_hat_proba_torch = y_hat_proba_torch.detach().cpu().numpy()
+
+        return y_hat_proba_torch
+
+    def predict_proba(self, X):
+        """
+            Getting the prediction probabilities in sklearn format: one row per sample, columns [P(y=0), P(y=1)]
+            Only meaningful for a single-label network (one output column)
+
+            Parameters:
+            -----------
+            X: pandas dataframe or array of features
+        """
+
+        y_hat_proba_torch = self.predict_raw_proba(X)
+        y_hat_proba = np.concatenate([
+            1-y_hat_proba_torch,
+            y_hat_proba_torch
+        ], axis=1) # axis=1: concatenating along columns to get shape (n_samples, 2)
+
+        return y_hat_proba
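Note — an end-to-end sketch on synthetic data (sizes and hyperparameters are arbitrary); the model class is passed uninstantiated, as the docstring requires:

    import numpy as np

    X = np.random.rand(500, 10).astype("float32")
    y = (X[:, 0] > 0.5).astype("float32")          # synthetic single-label target

    clf = torchMLPClassifier_sklearn(torchMLP, n_epochs=20, batch_size=64,
                                     early_stop=True, early_stop_metric="accuracy")
    clf.fit(X, y)
    proba = clf.predict_proba(X)                   # shape (500, 2): [P(y=0), P(y=1)]
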

+ 58 - 2
bop_scripts/preprocessing.py

@@ -5,6 +5,7 @@
 import sqlite3
 import pandas as pd
 import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
 
 def get_Xy_df (X, y):
     """
@@ -144,7 +145,6 @@ def generate_features_dataset(database, get_drugs=True, get_diseases=True):
     
 
 def generate_labels_dataset(database, lab_dictionnary):
-
     """
         Generate features dataset according to the data
 
@@ -241,4 +241,60 @@ def remove_outliers (X, variables_ranges):
                 pourcentage=lambda x: (x["n"]/x["total"])*100
         )
 
-    return X_copy, outlier_report
+    return X_copy, outlier_report
+
+class OutlierRemover(BaseEstimator, TransformerMixin):
+    """
+        Sklearn-like class for removing outliers
+        To be included in the pipeline
+    """
+
+    def __init__ (self, variables_ranges):
+        """
+            Parameters:
+            ----------
+            variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionary mapping each variable to its lower and upper bounds
+        """
+
+        super().__init__()
+
+        # Storing ranges rules
+        self.variables_ranges = variables_ranges
+
+    def fit(self, X=None, y=None):
+        return self
+
+    def transform(self, X):
+        X_copy, _ = remove_outliers(X, self.variables_ranges)
+        return X_copy
+
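Note — a small sketch of OutlierRemover; the column names and ranges are hypothetical, and per the docstring above out-of-range values are replaced by NaN:

    import pandas as pd

    df = pd.DataFrame({"age": [25, 300], "temp": [37.2, 12.0]})
    remover = OutlierRemover(variables_ranges={"age": [0, 120], "temp": [30, 45]})
    df_clean = remover.fit_transform(df)    # 300 and 12.0 become NaN
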
+class TextPreprocessing(BaseEstimator, TransformerMixin):
+    """
+        Sklearn-like transformer for text preprocessing
+        To be included in the pipeline
+
+        What it does:
+        - fills NaN with an empty string
+        - lowercases the strings
+        - replaces commas with spaces
+    """
+
+    def __init__ (self):
+        super().__init__()
+
+    def fit(self, X=None, y=None):
+        return self
+
+    def transform(self, X):
+        # X is expected to be a pandas Series of raw text
+        # str.replace is needed: Series.replace only matches whole cell values
+        X = X \
+            .fillna("") \
+            .str.replace(",", " ") \
+            .str.lower()
+
+        return X
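Note — a minimal example of TextPreprocessing on a pandas Series (given the str.replace fix above):

    import pandas as pd

    texts = pd.Series(["Aspirin, Paracetamol", None, "IBUPROFEN"])
    TextPreprocessing().fit_transform(texts).tolist()
    # -> ['aspirin  paracetamol', '', 'ibuprofen']
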

+ 128 - 1
bop_scripts/visualisation.py

@@ -1,7 +1,9 @@
 from matplotlib import pyplot as plt
 from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics import roc_curve, roc_auc_score, precision_score, recall_score, accuracy_score, ConfusionMatrixDisplay, f1_score, confusion_matrix
 from wordcloud import WordCloud
 import seaborn as sns
+import itertools
 import pandas as pd
 import numpy as np
 import random
@@ -261,4 +263,129 @@ def plot_odd_word_wc(X, y, text_column, labels, min_occurrence=3, ncols=5):
             axs[i][j].set_xlabel(f"{j}")
 
     fig.suptitle("WordCloud selon le label")
-    plt.show()
+    plt.show()
+
+def vizualize_features_selection (scores, score_name, f_precision=2, n_score_max=5, ncols=3):
+    """
+        This function produce an heatmap of metrics score according to each variables combination
+
+        Parameters
+        ----------
+        scores: Dictionnary containing a list of combination and associated score for each label produced by the .models.get_features_selection function
+        score_name: str, Name of the score
+        f_precision: int, floating point precision is the number of decimal to keep
+        n_score_max: int, maximum number of scores to display
+        ncols: int, number of columns in the output plot
+    """
+
+    # Creating a dataframe containing the scores
+    scores_df = []
+    for key, value in scores.items():
+        scores_df_temp = pd.DataFrame(
+            [dict(zip(x[0], [x[1] for i in range(len(x[0]))])) for x in value]
+        ).assign(score=lambda x: x.max(axis=1))
+        scores_df_temp.iloc[:,:-1] = (scores_df_temp.iloc[:,:-1].fillna("")*0).astype("str").replace("0.0", "x")
+        scores_df_temp["name"] = key
+        scores_df.append(scores_df_temp.sort_values("score", ascending=False))
+
+    scores_df = pd.concat(scores_df).reset_index(drop=True)
+    scores_df["n_features"] = (scores_df == "x").sum(axis=1)
+    scores_df[score_name] = scores_df["score"].round(f_precision)
+    scores_df = scores_df.sort_values(["name", "score", score_name], ascending=[True, False, True]).drop_duplicates(["name", score_name]) # sorting on "score" rather than a hardcoded "roc_auc" column
+
+    # Plotting the dataframe
+    scores_list = scores_df["name"].drop_duplicates().values.tolist()
+    nrows = len(scores_list)//ncols + (len(scores_list)%ncols != 0)*1
+
+    fig, axs = plt.subplots(nrows, ncols, figsize=(5*ncols,4*nrows))
+    axs = axs.flatten()
+
+    for i in range(len(scores_list)):
+        score = scores_list[i]
+        sns.heatmap(
+            (scores_df.query(f"name == '{score}'").set_index(score_name).head(n_score_max).iloc[:, :-3] == 'x')*1,
+            ax=axs[i]
+        )
+        axs[i].set_title(score)
+
+    fig.suptitle(f"{score_name} according to features included in the model")
+    plt.tight_layout()
+
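Note — a hypothetical call, using the output of models.get_features_selection:

    from matplotlib import pyplot as plt

    # scores as returned by models.get_features_selection
    vizualize_features_selection(scores, score_name="roc_auc", n_score_max=5, ncols=3)
    plt.show()
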
+def display_model_performances(classifier, X_test, y_test, algorithm_name="", threshold=0.5, ncols=1):
+    """
+        This function produces a visualization of the model performances
+
+        Parameters
+        ----------
+        classifier: python object with a predict and a predict_proba method; if there are several labels, a dict in the format {label: classifier, ...} is expected
+        X_test: pandas dataframe of the features
+        y_test: pandas dataframe of the labels
+        algorithm_name: str, name of the algorithm
+        threshold: float, threshold for classification
+        ncols: int, number of columns
+    """
+
+    # Checking type of y_test
+    if isinstance(y_test, pd.Series):
+        y_test = pd.DataFrame(y_test)
+
+    # Checking if one or many labels
+    if len(y_test.shape) > 1 and y_test.shape[1] > 1:
+        if not isinstance(classifier, dict) or len(classifier.keys()) != y_test.shape[1]:
+            raise ValueError("You should provide as many classifiers as labels")
+    else:
+        if not isinstance(classifier, dict):
+            classifier = {y_test.columns[0]:classifier}
+
+    labels = y_test.columns.tolist()
+
+    # Construction of the pyplot object
+    nrows = (len(labels)//ncols) + ((len(labels)%ncols)!=0)*1
+    fig = plt.figure(constrained_layout=True, figsize=(15*ncols,7*nrows))
+    figs = fig.subfigures(nrows, ncols)
+    if len(labels) == 1:
+        figs = np.array([figs]) # subfigures returns a single SubFigure when nrows == ncols == 1
+    figs = figs.flatten()
+    axs = [x.subplots(1, 2) for x in figs]
+
+    # For each label :
+    for i in range(len(labels)):
+        label = labels[i]
+        label_classifier = classifier[label]
+        figs[i].suptitle(label)
+
+        y_test_true = y_test[label].values
+        y_test_hat_proba = label_classifier.predict_proba(X_test)[:,1]
+        y_test_hat = (y_test_hat_proba >= threshold)*1
+
+        # Computation of metrics
+        f1_score_, accuracy_score_, recall_score_, precision_score_ = [x(y_test_true, y_test_hat) for x in [f1_score, accuracy_score, recall_score, precision_score]]
+        auc_score_ = roc_auc_score(y_test_true, y_test_hat_proba)
+        confusion_matrix_ = confusion_matrix(y_test_true, y_test_hat)
+
+        # Plotting
+        ## Confusion matrix
+        ConfusionMatrixDisplay(
+            confusion_matrix_,
+            display_labels=[0, 1]
+        ).plot(
+            ax=axs[i][0]
+        )
+
+        ## ROC curve
+        fpr, tpr, thresholds = roc_curve(
+            y_test_true,
+            y_test_hat_proba
+        )
+
+        axs[i][1].plot(
+            fpr,
+            tpr,
+            label=f"AUC: {auc_score_:.2f}\nF1-Score: {f1_score_:.2f}\nRecall: {recall_score_:.2f}\nPrecision: {precision_score_:.2f}\nAccuracy: {accuracy_score_:.2f}"
+        )
+        axs[i][1].legend(loc=4, fontsize="x-large")
+        axs[i][1].set_ylabel('True positive rate')
+        axs[i][1].set_xlabel('False positive rate')
+
+    fig.suptitle(f"Performance de l'algorithme {algorithm_name} avec un threshold de {threshold}")

+ 3 - 1
requirements.txt

@@ -4,4 +4,6 @@ numpy
 seaborn
 fasttext
 wordcloud
-sklearn
+scikit-learn
+torch
+scipy

Some files were not shown because too many files changed in this diff