@@ -0,0 +1,185 @@
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline, FeatureUnion
+from sklearn.preprocessing import OrdinalEncoder, StandardScaler
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.impute import SimpleImputer, MissingIndicator
+from sklearn.model_selection import cross_val_score
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.base import clone
+from .preprocessing import OutlierRemover, TextPreprocessing
+from warnings import simplefilter
+import itertools
+
+def generate_model(classifier, categorical_variables, continuous_variables, text_variable=None, missing_indicator=True, OrdinalEncoder_kwargs=None, StandardScaler_kwargs=None, CountVectorizer_kwargs=None, SimpleImputer_kwargs=None, MissingIndicator_kwargs=None, remove_outliers=False, outliers_variables_ranges=None):
+    """
+    Generate a model Pipeline containing the features pre-processing and a classifier
+
+    Parameters
+    ----------
+    classifier: sklearn classifier object with fit and predict methods
+    categorical_variables: [str], list of categorical variables
+    continuous_variables: [str], list of continuous variables
+    text_variable: str, name of the text variable, None if there is none
+    missing_indicator: boolean, if True a missing indicator is added to the Pipeline
+    OrdinalEncoder_kwargs: dict, arguments passed to the ordinal encoder
+    StandardScaler_kwargs: dict, arguments passed to the standard scaler
+    CountVectorizer_kwargs: dict, arguments passed to the count vectorizer
+    SimpleImputer_kwargs: dict, arguments passed to the simple imputer
+    MissingIndicator_kwargs: dict, arguments passed to the missing indicator
+    remove_outliers: boolean, if True the outliers are set to NaN
+    outliers_variables_ranges: dict {variable: [range_inf, range_sup], ...}, lower and upper bound allowed for each variable
+
+    Output
+    ------
+    sklearn Pipeline chaining the pre-processing steps and the classifier
+    """
+
+    # The kwargs dicts default to None to avoid mutable default arguments
+    OrdinalEncoder_kwargs = OrdinalEncoder_kwargs or {}
+    StandardScaler_kwargs = StandardScaler_kwargs or {}
+    CountVectorizer_kwargs = CountVectorizer_kwargs or {}
+    SimpleImputer_kwargs = SimpleImputer_kwargs or {}
+    MissingIndicator_kwargs = MissingIndicator_kwargs or {}
+
+    variables = categorical_variables + continuous_variables
+    if text_variable is not None:
+        variables += [text_variable]
+
+    # Features pre-processing
+    features_preprocessing_list = []
+
+    ## Outliers removal
+    if remove_outliers and outliers_variables_ranges is not None:
+        # Keeping only the ranges whose variables are actually used
+        outliers_variables_range_clean = {x: y for x, y in outliers_variables_ranges.items() if x in variables}
+        features_preprocessing_list.append(("outliers", OutlierRemover(variables_ranges=outliers_variables_range_clean), list(outliers_variables_range_clean.keys())))
+
+    if len(categorical_variables) > 0:
+        features_preprocessing_list.append(("categorical_encoder", OrdinalEncoder(**OrdinalEncoder_kwargs), categorical_variables))
+    if len(continuous_variables) > 0:
+        features_preprocessing_list.append(("continuous_scaling", StandardScaler(**StandardScaler_kwargs), continuous_variables))
+    if text_variable is not None:
+        # Text pre-processing followed by a count vectorizer
+        text_preprocessing_pipeline = Pipeline([
+            ("text_preprocessing", TextPreprocessing()),
+            ("text_countvectorizer", CountVectorizer(**CountVectorizer_kwargs))
+        ])
+
+        # A bare column name (not a list) is passed so that CountVectorizer receives a 1D array
+        features_preprocessing_list.append(("text_encoding", text_preprocessing_pipeline, text_variable))
+
+    # Imputation methods
+    imputation_list = []
+    imputation_list.append(("missing_imputer", SimpleImputer(**SimpleImputer_kwargs)))
+    if missing_indicator:
+        # The indicator columns are appended after the imputed features
+        imputation_list.append(
+            ("missing_indicator", MissingIndicator(**MissingIndicator_kwargs))
+        )
+
+    # Creating the pipeline
+    features_preprocessing = ColumnTransformer(features_preprocessing_list)
+    full_preprocessing = Pipeline([
+        ("features", features_preprocessing),
+        ("impute_and_store_missing", FeatureUnion(imputation_list)),
+    ])
+
+    pipeline = Pipeline([
+        ("preprocessing", full_preprocessing),
+        ("classifier", classifier)
+    ])
+
+    return pipeline
+
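+# Usage sketch (illustrative, not part of the original code): builds a pipeline for a
+# hypothetical dataset; the column names and classifier choice below are assumptions.
+#
+#     from sklearn.linear_model import LogisticRegression
+#
+#     model = generate_model(
+#         LogisticRegression(max_iter=1000),
+#         categorical_variables=["sex", "smoker"],
+#         continuous_variables=["age", "bmi"],
+#         text_variable="clinical_note",
+#         CountVectorizer_kwargs={"max_features": 100},
+#     )
+#     model.fit(X_train, y_train)            # y_train: a single 1D label vector
+#     predictions = model.predict(X_test)
+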
+def get_features_selection(X, y, classifier, categorical_variables, continuous_variables, text_variable=None, min_features=1, cv=3, metric_score="roc_auc"):
+    """
+    Return the metric score of each combination of variables
+
+    Parameters
+    ----------
+    X: Pandas Dataframe of features
+    y: Pandas Dataframe of labels
+    classifier: sklearn classifier object with fit and predict methods
+    categorical_variables: [str], list of categorical variables
+    continuous_variables: [str], list of continuous variables
+    text_variable: str, name of the text variable, None if there is none
+    min_features: int, minimum number of features to include in the model
+    cv: int, cross-validation splitting strategy according to the cross_val_score documentation
+    metric_score: str, metric used to evaluate the model, one of sklearn.metrics.SCORERS.keys()
+
+    Output
+    ------
+    Dictionary containing, for each label, a list of variable combinations and their associated scores
+    """
+
+    # Getting the labels list
+    labels = y.columns.tolist()
+
+    # Building every combination of at least min_features variables
+    variables = categorical_variables + continuous_variables
+    if text_variable is not None:
+        variables += [text_variable]
+
+    variables_combinations = []
+    for i in range(min_features, len(variables) + 1):
+        variables_combinations += itertools.combinations(variables, i)
+
+    # Getting the global model
+    n_text_features = 200  # Number of text columns produced by the CountVectorizer below
+    global_pipeline = generate_model(
+        classifier,
+        categorical_variables,
+        continuous_variables,
+        text_variable,
+        CountVectorizer_kwargs={"ngram_range": (1, 1), "max_features": n_text_features}
+    )
+
+    # Pre-processing the data once: we accept mixing train/eval data in the feature
+    # scaling here in order to reduce the execution time
+    X_transformed = global_pipeline.steps[0][1].steps[0][1].fit_transform(X)
+
+    # Dictionary storing, for each label, the scores of every combination
+    scores = {label: [] for label in labels}
+
+    # Getting the scores
+    for variable_combination in variables_combinations:
+        combination_categorical_variables = [x for x in categorical_variables if x in variable_combination]
+        combination_continuous_variables = [x for x in continuous_variables if x in variable_combination]
+        combination_text_variable = text_variable if (text_variable is not None and text_variable in variable_combination) else None
+
+        pipeline = generate_model(
+            classifier,
+            combination_categorical_variables,
+            combination_continuous_variables,
+            combination_text_variable,
+            CountVectorizer_kwargs={"ngram_range": (1, 1), "max_features": n_text_features}
+        )
+
+        # Getting the column indices in X_transformed: the ColumnTransformer outputs the
+        # categorical and continuous columns in the order of `variables`, followed by the
+        # n_text_features text columns (assuming the vocabulary is at least that large)
+        if text_variable is not None:
+            variables_index = [variables.index(x) for x in variable_combination if x != text_variable]
+            if text_variable in variable_combination:
+                variables_index += list(range(X_transformed.shape[1] - n_text_features, X_transformed.shape[1]))
+        else:
+            variables_index = [variables.index(x) for x in variable_combination]
+
+        pipeline.steps[0][1].steps.pop(0)  # Removing the features step, already applied above
+
+        for label in labels:
+            score = cross_val_score(pipeline, X_transformed[:, variables_index], y[label], cv=cv, scoring=metric_score).mean()
+            scores[label].append([variable_combination, score])
+
+    return scores
+
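+# Usage sketch (illustrative): scores every combination of at least two variables for a
+# hypothetical dataset; X holds the columns named below, y one column per label.
+#
+#     scores = get_features_selection(
+#         X, y,
+#         LogisticRegression(max_iter=1000),
+#         categorical_variables=["sex", "smoker"],
+#         continuous_variables=["age", "bmi"],
+#         text_variable="clinical_note",
+#         min_features=2,
+#     )
+#     # Best scoring combination for a given label:
+#     best_combination, best_score = max(scores["label_1"], key=lambda pair: pair[1])
+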
+def fit_all_classifiers(classifier, X_train, y_train, hide_warnings=True):
+    """
+    Fit one classifier per label
+
+    Parameters
+    ----------
+    classifier: sklearn classifier (or Pipeline) with a fit method
+    X_train: Pandas Dataframe of features
+    y_train: Pandas Dataframe of labels
+    hide_warnings: boolean, if True the convergence warnings are hidden
+
+    Output
+    ------
+    Dictionary containing one fitted classifier per label
+    """
+
+    if hide_warnings:
+        simplefilter("ignore", category=ConvergenceWarning)
+
+    labels = y_train.columns.tolist()
+    classifiers = {}
+    for label in labels:
+        # fit() returns self: without clone(), every dictionary entry would point to
+        # the same estimator, fitted only on the last label
+        classifiers[label] = clone(classifier).fit(X_train, y_train[label])
+
+    return classifiers
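+# Usage sketch (illustrative): reuses the pipeline from generate_model and fits one copy
+# per label column of y_train; the variable names are assumptions.
+#
+#     pipeline = generate_model(LogisticRegression(max_iter=1000), ["sex", "smoker"], ["age", "bmi"])
+#     classifiers = fit_all_classifiers(pipeline, X_train, y_train)
+#     y_pred = {label: clf.predict(X_test) for label, clf in classifiers.items()}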