models.py

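"""Model generation and selection helpers.

Builds sklearn Pipelines combining feature pre-processing (ordinal encoding,
scaling, text vectorization, optional outlier removal, imputation with a
missing indicator) with a classifier, scores variable combinations by
cross-validation, and fits one classifier per label.
"""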
import itertools
from warnings import simplefilter

from sklearn.compose import ColumnTransformer
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

from .preprocessing import OutlierRemover, TextPreprocessing


def generate_model(classifier, categorical_variables, continuous_variables,
                   text_variable=None, missing_indicator=True,
                   OrdinalEncoder_kwargs=None, StandardScaler_kwargs=None,
                   CountVectorizer_kwargs=None, SimpleImputer_kwargs=None,
                   MissingIndicator_kwargs=None, remove_outliers=False,
                   outliers_variables_ranges=None):
    """
    Generate a model Pipeline containing features pre-processing.

    Parameters
    ----------
    classifier: sklearn classifier object with fit and predict methods
    categorical_variables: [str], list of categorical variables
    continuous_variables: [str], list of continuous variables
    text_variable: str, name of the text variable, None if missing
    missing_indicator: boolean, if True a missing indicator is added to the Pipeline
    OrdinalEncoder_kwargs: dict, arguments passed to the ordinal encoder
    StandardScaler_kwargs: dict, arguments passed to the standard scaler
    CountVectorizer_kwargs: dict, arguments passed to the count vectorizer
    SimpleImputer_kwargs: dict, arguments passed to the simple imputer
    MissingIndicator_kwargs: dict, arguments passed to the missing indicator
    remove_outliers: boolean, if True the outliers are set to NaN
    outliers_variables_ranges: dict {variable: [range_inf, range_sup], ...},
        inferior and superior bounds for each variable
    """
    # Default to empty kwargs dicts (avoids mutable default arguments)
    OrdinalEncoder_kwargs = OrdinalEncoder_kwargs or {}
    StandardScaler_kwargs = StandardScaler_kwargs or {}
    CountVectorizer_kwargs = CountVectorizer_kwargs or {}
    SimpleImputer_kwargs = SimpleImputer_kwargs or {}
    MissingIndicator_kwargs = MissingIndicator_kwargs or {}
    variables = categorical_variables + continuous_variables
    if text_variable is not None:
        variables += [text_variable]

    # Features pre-processing
    features_preprocessing_list = []

    # Outlier removal: out-of-range values are set to NaN by OutlierRemover
    if remove_outliers and outliers_variables_ranges is not None:
        # Keep only the ranges that concern variables used by the model
        outliers_variables_ranges_clean = {
            variable: variable_range
            for variable, variable_range in outliers_variables_ranges.items()
            if variable in variables
        }
        features_preprocessing_list.append((
            "outliers",
            OutlierRemover(variables_ranges=outliers_variables_ranges_clean),
            list(outliers_variables_ranges_clean.keys())
        ))

    if len(categorical_variables) > 0:
        features_preprocessing_list.append(
            ("binary_encoder", OrdinalEncoder(**OrdinalEncoder_kwargs), categorical_variables)
        )
    if len(continuous_variables) > 0:
        features_preprocessing_list.append(
            ("continuous_scaling", StandardScaler(**StandardScaler_kwargs), continuous_variables)
        )
    if text_variable is not None:
        # Text pre-processing, then count vectorizer
        text_preprocessing_pipeline = Pipeline([
            ("text_preprocessing", TextPreprocessing()),
            ("text_countvectorizer", CountVectorizer(**CountVectorizer_kwargs))
        ])
        features_preprocessing_list.append(
            ("text_encoding", text_preprocessing_pipeline, text_variable)
        )

    # Imputation methods
    imputation_list = [("missing_imputer", SimpleImputer(**SimpleImputer_kwargs))]
    if missing_indicator:
        imputation_list.append(
            ("missing_indicator", MissingIndicator(**MissingIndicator_kwargs))
        )

    # Creating the pipeline
    features_preprocessing = ColumnTransformer(features_preprocessing_list)
    full_preprocessing = Pipeline([
        ("features", features_preprocessing),
        ("impute_and_store_missing", FeatureUnion(imputation_list)),
    ])
    pipeline = Pipeline([
        ("preprocessing", full_preprocessing),
        ("classifier", classifier)
    ])
    return pipeline
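
# Usage sketch (illustrative only): the classifier, column names and ranges
# below are assumptions for the example, not part of this module.
#
#     from sklearn.linear_model import LogisticRegression
#
#     pipeline = generate_model(
#         LogisticRegression(max_iter=1000),
#         categorical_variables=["sex"],
#         continuous_variables=["age", "bmi"],
#         text_variable="clinical_notes",
#         remove_outliers=True,
#         outliers_variables_ranges={"age": [0, 120]},
#     )
#     pipeline.fit(X_train, y_train["some_label"])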


def get_features_selection(X, y, classifier, categorical_variables,
                           continuous_variables, text_variable=None,
                           min_features=1, cv=3, metric_score="roc_auc"):
    """
    Compute the cross-validated metric score for each combination of variables.

    Parameters
    ----------
    X: Pandas DataFrame of features
    y: Pandas DataFrame of labels
    classifier: sklearn classifier object with fit and predict methods
    categorical_variables: [str], list of categorical variables
    continuous_variables: [str], list of continuous variables
    text_variable: str, name of the text variable, None if missing
    min_features: int, minimum number of features to include in the model
    cv: int, cross-validation splitting strategy, as in the cross_val_score documentation
    metric_score: str, metric used to evaluate the model, one of sklearn.metrics.SCORERS.keys()

    Output
    ------
    Dictionary containing, for each label, a list of combinations and their associated scores
    """
    # Getting the labels list
    labels = y.columns.tolist()

    # Getting the variable combinations
    variables = categorical_variables + continuous_variables
    if text_variable is not None:
        variables += [text_variable]
    variables_combinations = []
    for i in range(min_features, len(variables) + 1):
        variables_combinations += itertools.combinations(variables, i)

    # Getting the global model
    n_text_features = 200  # CountVectorizer feature count, reused below to index the text columns
    global_pipeline = generate_model(
        classifier,
        categorical_variables,
        continuous_variables,
        text_variable,
        CountVectorizer_kwargs={"ngram_range": (1, 1), "max_features": n_text_features}
    )

    # Pre-processing the data once: we accept mixing train/eval in feature
    # scaling here to reduce the execution time
    features_step = global_pipeline.named_steps["preprocessing"].named_steps["features"]
    X_transformed = features_step.fit_transform(X)

    # Scores dictionary, one result list per label
    scores = {label: [] for label in labels}

    # Getting the scores
    for variable_combination in variables_combinations:
        combination_categorical_variables = [
            x for x in categorical_variables if x in variable_combination
        ]
        combination_continuous_variables = [
            x for x in continuous_variables if x in variable_combination
        ]
        combination_text_variable = (
            text_variable
            if (text_variable is not None and text_variable in variable_combination)
            else None
        )
        pipeline = generate_model(
            classifier,
            combination_categorical_variables,
            combination_continuous_variables,
            combination_text_variable,
            CountVectorizer_kwargs={"ngram_range": (1, 1), "max_features": n_text_features}
        )

        # Getting the column indices of the combination in X_transformed
        if text_variable is not None:
            variables_index = [
                variables.index(x) for x in variable_combination if x != text_variable
            ]
            if text_variable in variable_combination:
                # The text features are the last n_text_features columns
                variables_index += list(
                    range(X_transformed.shape[1] - n_text_features, X_transformed.shape[1])
                )
        else:
            variables_index = [variables.index(x) for x in variable_combination]

        # The data is already pre-processed: drop the ColumnTransformer step
        # and keep only the imputation step
        pipeline.named_steps["preprocessing"].steps.pop(0)

        for label in labels:
            score = cross_val_score(
                pipeline, X_transformed[:, variables_index], y[label],
                cv=cv, scoring=metric_score
            ).mean()
            scores[label].append([variable_combination, score])

    return scores
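
# Usage sketch (illustrative; the classifier and variable names are
# assumptions): scores[label] is a list of [variable_combination, score]
# pairs, so the best combination for a label can be read off with:
#
#     scores = get_features_selection(
#         X, y, LogisticRegression(max_iter=1000),
#         categorical_variables=["sex"],
#         continuous_variables=["age", "bmi"],
#         text_variable="clinical_notes",
#         min_features=2,
#     )
#     best_combination, best_score = max(scores["some_label"], key=lambda r: r[1])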


def fit_all_classifiers(classifier_fn, X_train, y_train, hide_warnings=True, verbose=False):
    """
    Fit one model per label.

    Parameters
    ----------
    classifier_fn: function returning a new classifier with a fit method
    X_train: Pandas DataFrame of features
    y_train: Pandas DataFrame of labels
    hide_warnings: boolean, if True the convergence warnings are hidden
    verbose: boolean, if True the name of each trained model is printed

    Output
    ------
    Dictionary containing one fitted classifier per label
    """
    if hide_warnings:
        simplefilter("ignore", category=ConvergenceWarning)
    labels = y_train.columns.tolist()
    classifiers = {}
    for label in labels:
        if verbose:
            print(f"Training model {label}")
        classifier = classifier_fn()
        classifiers[label] = classifier.fit(X_train, y_train[label])
    return classifiers
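
# Usage sketch (illustrative): classifier_fn must build a fresh estimator on
# each call so that every label gets an independent model, e.g.
#
#     classifiers = fit_all_classifiers(
#         lambda: LogisticRegression(max_iter=1000),
#         X_train, y_train, verbose=True,
#     )
#     predictions = {label: clf.predict(X_test) for label, clf in classifiers.items()}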