
Adding models

Ali committed 2 years ago
parent
commit
6e3f54c8d3

The file diff has been suppressed because it is too large
+ 3 - 3
Project_Report.ipynb


+ 185 - 0
bop_scripts/models.py

@@ -0,0 +1,185 @@
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline, FeatureUnion
+from sklearn.preprocessing import OrdinalEncoder, StandardScaler
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.impute import SimpleImputer, MissingIndicator
+from sklearn.model_selection import cross_val_score
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.base import clone
+from .preprocessing import OutlierRemover, TextPreprocessing
+from warnings import simplefilter
+import itertools
+
+def generate_model (classifier, categorical_variables, continuous_variables, text_variable=None, missing_indicator=True, OrdinalEncoder_kwargs={}, StandardScaler_kwargs={}, CountVectorizer_kwargs={}, SimpleImputer_kwargs={}, MissingIndicator_kwargs={}, remove_outliers=False, outliers_variables_ranges=None):
+    """
+        Generate a model Pipeline containing features pre-processing 
+
+        Parameters
+        ----------
+        classifier: sklearn classifier object with fit and predict methods
+        categorical_variables: [str], list of categorical variables
+        continuous_variables: [str], list of continuous variables
+        text_variable: str, name of the text variable, None if there is none
+        missing_indicator: boolean, if True a missing indicator is added to the Pipeline
+        OrdinalEncoder_kwargs: dict, arguments passed to the ordinal encoder
+        StandardScaler_kwargs: dict, arguments passed to the standard scaler
+        CountVectorizer_kwargs: dict, arguments passed to the count vectorizer
+        SimpleImputer_kwargs: dict, arguments passed to the simple imputer
+        MissingIndicator_kwargs: dict, arguments passed to the missing indicator
+        remove_outliers: boolean, if True the outliers are set to NaN
+        outliers_variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionary mapping each variable to its lower and upper bounds
+    """
+
+    variables = categorical_variables+continuous_variables
+    if text_variable is not None:
+        variables += [text_variable]
+
+    # Features pre-processing
+    features_preprocessing_list = []
+
+    ## Outliers removal
+    if remove_outliers and outliers_variables_ranges is not None:
+        # Creating the range list
+        outliers_variables_range_clean = {x: y for x, y in outliers_variables_ranges.items() if x in variables}
+        features_preprocessing_list.append(("outliers", OutlierRemover(variables_ranges=outliers_variables_range_clean), list(outliers_variables_range_clean.keys())))
+
+    if len(categorical_variables) > 0:
+        features_preprocessing_list.append(("binary_encoder", OrdinalEncoder(**OrdinalEncoder_kwargs), categorical_variables))
+    if len(continuous_variables) > 0:
+        features_preprocessing_list.append(("continuous_scaling", StandardScaler(**StandardScaler_kwargs), continuous_variables))
+    if text_variable is not None:
+        # Text pre-processing then count vectorizer
+        text_preprocessing_pipeline = Pipeline([
+            ("text_preprocessing", TextPreprocessing()),
+            ("text_countvectorizer", CountVectorizer(**CountVectorizer_kwargs))
+        ])
+        
+        features_preprocessing_list.append(("text_encoding", text_preprocessing_pipeline, text_variable))
+        
+    # Imputation methods
+    imputation_list = []
+    imputation_list.append(("missing_imputer", SimpleImputer(**SimpleImputer_kwargs)))
+    if missing_indicator:
+        imputation_list.append(
+            ("missing_indicator", MissingIndicator(**MissingIndicator_kwargs))
+        )
+
+    # Creating the pipeline
+
+    features_preprocessing = ColumnTransformer(features_preprocessing_list)
+    full_preprocessing = Pipeline([
+        ("features", features_preprocessing),
+        ("impute_and_store_missing", FeatureUnion(imputation_list)),
+    ])
+
+    pipeline = Pipeline([
+        ("preprocessing", full_preprocessing),
+        ("lr", classifier)
+    ])
+    
+    return pipeline
+
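Note — a minimal usage sketch of generate_model; the classifier choice and column names below are hypothetical, not part of this commit:

    from sklearn.linear_model import LogisticRegression

    pipeline = generate_model(
        LogisticRegression(max_iter=1000),
        categorical_variables=["sex"],              # hypothetical column names
        continuous_variables=["age", "weight"],
        text_variable="clinical_notes",
        CountVectorizer_kwargs={"ngram_range": (1, 1), "max_features": 200},
    )
    # pipeline.fit(X_train, y_train["some_label"]) then pipeline.predict(X_test)
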
+def get_features_selection (X, y, classifier, categorical_variables, continuous_variables, text_variable=None, min_features=1, cv=3, metric_score="roc_auc"):
+    """
+        This function returns the metric score for each combination of variables
+
+        Parameters
+        ----------
+        X: Pandas Dataframe of features
+        y: Pandas Dataframe of labels
+        classifier: sklearn classifier object with fit and predict methods
+        categorical_variables: [str], list of categorical variables
+        continuous_variables: [str], list of continuous variables
+        text_variable: str, name of the text variable, None if there is none
+        min_features: int, minimum number of features to include in the model
+        cv: int, cross-validation splitting strategy according to the cross_val_score documentation
+        metric_score: str, metric score to evaluate the model, according to sklearn.metrics.SCORERS.keys()
+
+        Output
+        ------
+        Dictionary mapping each label to a list of [combination, score] pairs
+    """
+
+    # Getting labels list
+    labels = y.columns.tolist()
+
+    # Getting the combinations
+    variables = categorical_variables + continuous_variables
+    if text_variable is not None:
+        variables += [text_variable]
+
+    variables_combinations = []
+    for i in range(min_features, len(variables)+1):
+        variables_combinations += itertools.combinations(variables, i)
+
+    # Getting global model
+    global_pipeline = generate_model(
+            classifier,
+            categorical_variables,
+            continuous_variables,
+            text_variable,
+            CountVectorizer_kwargs={"ngram_range":(1,1), "max_features":200}
+    )
+
+    # Preprocessing the data: we accept mixing train/eval in feature scaling here to reduce execution time
+    X_transformed = global_pipeline.steps[0][1].steps[0][1].fit_transform(X) # Fitting the features ColumnTransformer only
+
+    # Scores dictionary: one empty list per label
+    scores = dict(zip(
+        labels,
+        [[] for x in labels]
+    ))
+
+    # Getting the scores
+    for variable_combination in variables_combinations:
+        combination_categorical_variables = [x for x in categorical_variables if x in variable_combination]
+        combination_continuous_variables = [x for x in continuous_variables if x in variable_combination]
+        combination_text_variable = text_variable if (text_variable is not None and text_variable in variable_combination) else None
+
+        pipeline = generate_model(
+            classifier,
+            combination_categorical_variables,
+            combination_continuous_variables,
+            combination_text_variable,
+            CountVectorizer_kwargs={"ngram_range":(1,1), "max_features":200}
+        )
+
+        # Get X index
+        if text_variable is not None:
+            variables_index = [variables.index(x) for x in variable_combination if x != text_variable]
+            if text_variable in variable_combination:
+                variables_index += list(range(X_transformed.shape[1]-200, X_transformed.shape[1]))
+        else:
+            variables_index = [variables.index(x) for x in variable_combination]
+
+        pipeline.steps[0][1].steps.pop(0) # Removing the features preprocessing step: the data are already transformed
+
+        for label in labels:
+            score = cross_val_score(pipeline, X_transformed[:,variables_index], y[label], cv=cv, scoring=metric_score).mean()
+            scores[label].append([variable_combination, score])
+
+    return scores
+
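Note — a hypothetical call of get_features_selection (the dataframes and column names are assumed):

    from sklearn.linear_model import LogisticRegression

    # X, y: pandas DataFrames of features and labels
    scores = get_features_selection(
        X, y,
        LogisticRegression(max_iter=1000),
        categorical_variables=["sex"],
        continuous_variables=["age", "weight"],
        text_variable="clinical_notes",
        min_features=1, cv=3, metric_score="roc_auc",
    )
    # scores["some_label"] is a list of [combination, score] pairs,
    # e.g. [[("age",), 0.71], [("age", "sex"), 0.74], ...]
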
+def fit_all_classifiers(classifier, X_train, y_train, hide_warnings=True):
+    """
+        This function fits one classifier per label.
+
+        Parameters:
+        ----------
+        classifier: classifier with a fit method
+        X_train: Pandas Dataframe of features
+        y_train: Pandas Dataframe of labels
+        hide_warnings: boolean, if True the warnings will be hidden
+
+        Output:
+        -------
+        Dictionary containing one fitted classifier per label
+    """
+
+    if hide_warnings:
+        simplefilter("ignore", category=ConvergenceWarning)
+
+    labels = y_train.columns.tolist()
+    classifiers = {}
+    for label in labels:
+        classifiers[label] = clone(classifier).fit(X_train, y_train[label]) # clone: each label gets its own fitted model instead of all sharing one instance
+
+    return classifiers
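Note — a usage sketch (names are hypothetical), relying on the clone call above so that each label gets an independent fitted model:

    from sklearn.linear_model import LogisticRegression

    classifiers = fit_all_classifiers(LogisticRegression(max_iter=1000), X_train, y_train)
    y_hat = classifiers["some_label"].predict(X_test)   # one fitted model per label column
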

+ 218 - 0
bop_scripts/nn_models.py

@@ -0,0 +1,218 @@
+from random import sample
+from sklearn.base import BaseEstimator
+from sklearn.metrics import SCORERS
+from sklearn.model_selection import train_test_split
+from sklearn.utils import check_X_y, check_array
+import torch
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from scipy.sparse import issparse
+import numpy as np
+
+class torchMLP (nn.Module):
+    """
+        Neural network model for 
+    """
+
+    def __init__(self, n_features, n_labels):
+        super().__init__()
+
+        self.network = nn.Sequential(*[
+            nn.Linear(n_features, 200),
+            nn.ReLU(),
+            nn.Linear(200, 50),
+            nn.ReLU(),
+            nn.Linear(50, n_labels),
+            nn.Sigmoid()
+        ])
+
+    def forward(self, x):
+        
+        y_hat = self.network(x)
+
+        return y_hat
+
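Note — a quick shape check of torchMLP on random data (the dimensions are arbitrary):

    import torch

    net = torchMLP(n_features=10, n_labels=2)
    y_hat = net(torch.rand(16, 10))   # -> torch.Size([16, 2]), values in (0, 1) due to the Sigmoid
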
+class torchMLPClassifier_sklearn (BaseEstimator):
+
+    """
+        Pytorch neural network with a sklearn-like API
+    """
+
+    def __init__ (self, model, n_epochs=50, early_stop=True, early_stop_metric="accuracy", early_stop_validations_size=0.1, batch_size=1024, learning_rate=1e-3, class_weight=None, device_train="cpu", device_predict="cpu"):
+        """
+            Parameters:
+            -----------
+            model: uninstantiated pytorch neural network class accepting n_features and n_labels parameters
+            n_epochs: int, number of epochs
+            early_stop: boolean, if True a validation dataset is split off and used to stop training early
+            early_stop_metric: str, metric score to evaluate the model, according to sklearn.metrics.SCORERS.keys()
+            early_stop_validations_size: int or float, if float the fraction of the train dataset used for validation, otherwise the number of samples to use
+            batch_size: int, size of the training batch
+            learning_rate: float, Adam optimizer learning rate
+            class_weight: dict or str, same as the sklearn API
+            device_train: str, device on which to train
+            device_predict: str, device on which to predict
+        """
+
+        self.model = model
+
+        self.n_epochs = n_epochs
+        if early_stop and (early_stop_metric is not None) and (early_stop_metric in SCORERS.keys()) and (isinstance(early_stop_validations_size, int) or isinstance(early_stop_validations_size, float)):
+            self.early_stop = early_stop
+            self.early_stop_metric = SCORERS[early_stop_metric]
+            self.early_stop_validations_size = early_stop_validations_size
+        else:
+            self.early_stop = False
+
+        self.class_weight = class_weight
+        self.learning_rate = learning_rate
+        self.device_train = device_train
+        self.device_predict = device_predict
+        self.batch_size = batch_size
+
+    def fit(self, X, y):
+        """
+            Training the model
+
+            Parameters:
+            -----------
+            X: pandas dataframe or array of features
+            y: pandas dataframe or array of labels
+        """
+
+        X, y = check_X_y(X, y, accept_sparse=True, multi_output=True)
+        if y.ndim == 1:
+            y = np.expand_dims(y, 1)
+
+        # Validation split if early stopping
+        if self.early_stop:
+            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.early_stop_validations_size)
+            if issparse(X_val): # To deal with the sparse matrix situations
+                X_val = X_val.toarray()
+        else:
+            X_train, y_train = X, y
+
+        n_samples = y_train.shape[0]
+        n_labels_values = len(np.unique(y_train))
+        n_labels = y_train.shape[1]
+        n_features = X.shape[1]
+
+        # Instantiating the network and moving it to the training device
+        self.network = self.model(n_features=n_features, n_labels=n_labels).to(self.device_train)
+        self.optimizer = optim.Adam(self.network.parameters(), lr=self.learning_rate)
+
+
+        # Creating dataloader for X_train, y_train
+        data_loader = DataLoader(range(X_train.shape[0]), shuffle=True, batch_size=self.batch_size)
+
+        # Initializing loss function
+        ## Getting weights
+        if self.class_weight is not None:
+            if self.class_weight == "balanced":
+                weights = n_samples/(n_labels_values*np.bincount(y_train[:,0].astype(int))) # class weights computed on the first label column
+                weights_dict = dict(zip(range(len(weights)), weights))
+            else:
+                weights_dict = self.class_weight
+        else:
+            weights_dict = None
+
+        criterion = nn.BCELoss()
+
+        # Running train
+        last_score = 0
+        for i in range(self.n_epochs):
+
+            # Starting an epoch
+            for indices in data_loader:
+                self.optimizer.zero_grad()
+
+                X_train_sample, y_train_sample = X_train[indices, :], y_train[indices, :]
+                if issparse(X_train_sample): # To deal with the sparse matrix situations
+                    X_train_sample = X_train_sample.toarray()
+                X_train_sample_tensor, y_train_sample_tensor = [torch.tensor(x, dtype=torch.float32).to(self.device_train) for x in [X_train_sample, y_train_sample]]
+
+                # Weighting the loss: BCELoss's parameter is named `weight` and must be
+                # a per-sample weight tensor, so the criterion is rebuilt for each batch
+                if self.class_weight is not None:
+                    sample_weights = y_train_sample.astype("float64")
+                    for class_value, class_weight in weights_dict.items():
+                        sample_weights[y_train_sample == class_value] = class_weight
+                    criterion = nn.BCELoss(weight=torch.tensor(sample_weights, dtype=torch.float32).to(self.device_train))
+
+                # Get prediction
+                y_train_sample_hat = self.network(X_train_sample_tensor)
+
+                loss = criterion(y_train_sample_hat, y_train_sample_tensor)
+                loss.backward()
+
+                self.optimizer.step()
+
+            # End of the epoch: evaluating the validation score, stopping as soon as it decreases
+            if self.early_stop:
+                score = self.early_stop_metric(self, X_val, y_val)
+
+                if score < last_score:
+                    return self
+                else:
+                    last_score = score
+
+        return self
+
+    def predict(self, X):
+        """
+            Getting the class predictions (fixed 0.5 threshold)
+
+            Parameters:
+            -----------
+            X: pandas dataframe or array of features
+        """
+
+        y_hat_proba = self.predict_raw_proba(X)
+        y_hat = (y_hat_proba >= 0.5)*1
+
+        return y_hat
+
+    def predict_raw_proba(self, X):
+        """
+            Getting the raw prediction probabilities, one column per label
+
+            Parameters:
+            -----------
+            X: pandas dataframe or array of features
+        """
+
+        X = check_array(X, accept_sparse=True)
+        if issparse(X): # To deal with the sparse matrix situations
+            X = X.toarray()
+
+        with torch.no_grad():
+            model_predict = self.network.to(self.device_predict)
+            model_predict.eval()
+
+            # Create a tensor from X
+            X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device_predict)
+            
+            y_hat_proba_torch = model_predict(X_tensor)
+            y_hat_proba_torch = y_hat_proba_torch.detach().cpu().numpy()
+
+        return y_hat_proba_torch
+
+    def predict_proba(self, X):
+        """
+            Getting the prediction probabilities in sklearn format: one row per sample, columns [P(y=0), P(y=1)]
+            Only meaningful for a single-label network (one output column)
+
+            Parameters:
+            -----------
+            X: pandas dataframe or array of features
+        """
+
+        y_hat_proba_torch = self.predict_raw_proba(X)
+        y_hat_proba = np.concatenate([
+            1-y_hat_proba_torch,
+            y_hat_proba_torch
+        ], axis=1) # axis=1: concatenating along columns to get shape (n_samples, 2)
+
+        return y_hat_proba
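Note — an end-to-end sketch on synthetic data (sizes and hyperparameters are arbitrary); the model class is passed uninstantiated, as the docstring requires:

    import numpy as np

    X = np.random.rand(500, 10).astype("float32")
    y = (X[:, 0] > 0.5).astype("float32")          # synthetic single-label target

    clf = torchMLPClassifier_sklearn(torchMLP, n_epochs=20, batch_size=64,
                                     early_stop=True, early_stop_metric="accuracy")
    clf.fit(X, y)
    proba = clf.predict_proba(X)                   # shape (500, 2): [P(y=0), P(y=1)]
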

+ 58 - 2
bop_scripts/preprocessing.py

@@ -5,6 +5,7 @@
 import sqlite3
 import pandas as pd
 import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
 
 def get_Xy_df (X, y):
     """
@@ -144,7 +145,6 @@ def generate_features_dataset(database, get_drugs=True, get_diseases=True):
     
 
 def generate_labels_dataset(database, lab_dictionnary):
-
     """
         Generate features dataset according to the data
 
@@ -241,4 +241,60 @@ def remove_outliers (X, variables_ranges):
                 pourcentage=lambda x: (x["n"]/x["total"])*100
         )
 
-    return X_copy, outlier_report
+    return X_copy, outlier_report
+
+class OutlierRemover(BaseEstimator, TransformerMixin):
+    """
+        Sklearn-like class for removing outliers
+        To be included in the pipeline
+    """
+
+    def __init__ (self, variables_ranges):
+        """
+            Parameters:
+            ----------
+            variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionary mapping each variable to its lower and upper bounds
+        """
+
+        super().__init__()
+
+        # Storing ranges rules
+        self.variables_ranges = variables_ranges
+
+    def fit(self, X=None, y=None):
+        return self
+
+    def transform(self, X):
+        X_copy, _ = remove_outliers(X, self.variables_ranges)
+        return X_copy
+
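Note — a small sketch of OutlierRemover; the column names and ranges are hypothetical, and per the docstring above out-of-range values are replaced by NaN:

    import pandas as pd

    df = pd.DataFrame({"age": [25, 300], "temp": [37.2, 12.0]})
    remover = OutlierRemover(variables_ranges={"age": [0, 120], "temp": [30, 45]})
    df_clean = remover.fit_transform(df)    # 300 and 12.0 become NaN
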
+class TextPreprocessing(BaseEstimator, TransformerMixin):
+    """
+        Sklearn-like transformer for text preprocessing
+        To be included in the pipeline
+
+        What it does:
+        - fills NaN with an empty string
+        - lowercases the strings
+        - replaces commas with spaces
+    """
+
+    def __init__ (self):
+        super().__init__()
+
+    def fit(self, X=None, y=None):
+        return self
+
+    def transform(self, X):
+        # X is expected to be a pandas Series of raw text
+        # str.replace is needed: Series.replace only matches whole cell values
+        X = X \
+            .fillna("") \
+            .str.replace(",", " ") \
+            .str.lower()
+
+        return X
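Note — a minimal example of TextPreprocessing on a pandas Series (given the str.replace fix above):

    import pandas as pd

    texts = pd.Series(["Aspirin, Paracetamol", None, "IBUPROFEN"])
    TextPreprocessing().fit_transform(texts).tolist()
    # -> ['aspirin  paracetamol', '', 'ibuprofen']
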

+ 128 - 1
bop_scripts/visualisation.py

@@ -1,7 +1,9 @@
 from matplotlib import pyplot as plt
 from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics import roc_curve, roc_auc_score, precision_score, recall_score, accuracy_score, ConfusionMatrixDisplay, f1_score, confusion_matrix
 from wordcloud import WordCloud
 import seaborn as sns
+import itertools
 import pandas as pd
 import numpy as np
 import random
@@ -261,4 +263,129 @@ def plot_odd_word_wc(X, y, text_column, labels, min_occurrence=3, ncols=5):
             axs[i][j].set_xlabel(f"{j}")
 
     fig.suptitle("WordCloud selon le label")
-    plt.show()
+    plt.show()
+
+def vizualize_features_selection (scores, score_name, f_precision=2, n_score_max=5, ncols=3):
+    """
+        This function produce an heatmap of metrics score according to each variables combination
+
+        Parameters
+        ----------
+        scores: Dictionnary containing a list of combination and associated score for each label produced by the .models.get_features_selection function
+        score_name: str, Name of the score
+        f_precision: int, floating point precision is the number of decimal to keep
+        n_score_max: int, maximum number of scores to display
+        ncols: int, number of columns in the output plot
+    """
+
+    # Creating a dataframe containing the scores
+    scores_df = []
+    for key, value in scores.items():
+        scores_df_temp = pd.DataFrame(
+            [dict(zip(x[0], [x[1] for i in range(len(x[0]))])) for x in value]
+        ).assign(score=lambda x: x.max(axis=1))
+        scores_df_temp.iloc[:,:-1] = (scores_df_temp.iloc[:,:-1].fillna("")*0).astype("str").replace("0.0", "x")
+        scores_df_temp["name"] = key
+        scores_df.append(scores_df_temp.sort_values("score", ascending=False))
+
+    scores_df = pd.concat(scores_df).reset_index(drop=True)
+    scores_df["n_features"] = (scores_df == "x").sum(axis=1)
+    scores_df[score_name] = scores_df["score"].round(f_precision)
+    scores_df = scores_df.sort_values(["name", "score", score_name], ascending=[True, False, True]).drop_duplicates(["name", score_name]) # sorting on "score" rather than a hardcoded "roc_auc" column
+
+    # Plotting the dataframe
+    scores_list = scores_df["name"].drop_duplicates().values.tolist()
+    nrows = len(scores_list)//ncols + (len(scores_list)%ncols != 0)*1
+
+    fig, axs = plt.subplots(nrows, ncols, figsize=(5*ncols,4*nrows))
+    axs = axs.flatten()
+
+    for i in range(len(scores_list)):
+        score = scores_list[i]
+        sns.heatmap(
+            (scores_df.query(f"name == '{score}'").set_index(score_name).head(n_score_max).iloc[:, :-3] == 'x')*1,
+            ax=axs[i]
+        )
+        axs[i].set_title(score)
+
+    fig.suptitle(f"{score_name} according to features included in the model")
+    plt.tight_layout()
+
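Note — a hypothetical call, using the output of models.get_features_selection:

    from matplotlib import pyplot as plt

    # scores as returned by models.get_features_selection
    vizualize_features_selection(scores, score_name="roc_auc", n_score_max=5, ncols=3)
    plt.show()
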
+def display_model_performances(classifier, X_test, y_test, algorithm_name="", threshold=0.5, ncols=1):
+    """
+        This function produces a visualization of the model performances
+
+        Parameters
+        ----------
+        classifier: python object with a predict and a predict_proba method; if there are several labels, a dict in the format {label: classifier, ...} is expected
+        X_test: pandas dataframe of the features
+        y_test: pandas dataframe of the labels
+        algorithm_name: str, name of the algorithm
+        threshold: float, threshold for classification
+        ncols: int, number of columns
+    """
+
+    # Checking type of y_test
+    if isinstance(y_test, pd.Series):
+        y_test = pd.DataFrame(y_test)
+
+    # Checking if one or many labels
+    if len(y_test.shape) > 1 and y_test.shape[1] > 1:
+        if not isinstance(classifier, dict) or len(classifier.keys()) != y_test.shape[1]:
+            raise ValueError("You should provide as many classifiers as labels")
+    else:
+        if not isinstance(classifier, dict):
+            classifier = {y_test.columns[0]:classifier}
+
+    labels = y_test.columns.tolist()
+
+    # Construction of the pyplot object
+    nrows = (len(labels)//ncols) + ((len(labels)%ncols)!=0)*1
+    fig = plt.figure(constrained_layout=True, figsize=(15*ncols,7*nrows))
+    figs = fig.subfigures(nrows, ncols)
+    if len(labels) == 1:
+        figs = np.array([figs]) # subfigures returns a single SubFigure when nrows == ncols == 1
+    figs = figs.flatten()
+    axs = [x.subplots(1, 2) for x in figs]
+
+    # For each label :
+    for i in range(len(labels)):
+        label = labels[i]
+        label_classifier = classifier[label]
+        figs[i].suptitle(label)
+
+        y_test_true = y_test[label].values
+        y_test_hat_proba = label_classifier.predict_proba(X_test)[:,1]
+        y_test_hat = (y_test_hat_proba >= threshold)*1
+
+        # Computation of metrics
+        f1_score_, accuracy_score_, recall_score_, precision_score_ = [x(y_test_true, y_test_hat) for x in [f1_score, accuracy_score, recall_score, precision_score]]
+        auc_score_ = roc_auc_score(y_test_true, y_test_hat_proba)
+        confusion_matrix_ = confusion_matrix(y_test_true, y_test_hat)
+
+        # Plotting
+        ## Confusion matrix
+        ConfusionMatrixDisplay(
+            confusion_matrix_,
+            display_labels=[0, 1]
+        ).plot(
+            ax=axs[i][0]
+        )
+
+        ## ROC curve
+        fpr, tpr, thresholds = roc_curve(
+            y_test_true,
+            y_test_hat_proba
+        )
+
+        axs[i][1].plot(
+            fpr,
+            tpr,
+            label=f"AUC: {auc_score_:.2f}\nF1-Score: {f1_score_:.2f}\nRecall: {recall_score_:.2f}\nPrecision: {precision_score_:.2f}\nAccuracy: {accuracy_score_:.2f}"
+        )
+        axs[i][1].legend(loc=4, fontsize="x-large")
+        axs[i][1].set_ylabel('True positive rate')
+        axs[i][1].set_xlabel('False positive rate')
+
+    fig.suptitle(f"Performance de l'algorithme {algorithm_name} avec un threshold de {threshold}")

+ 3 - 1
requirements.txt

@@ -4,4 +4,6 @@ numpy
 seaborn
 fasttext
 wordcloud
-sklearn
+scikit-learn
+torch
+scipy

Some files were not shown because too many files changed in this diff