#%%
import sys
import os
sys.path.append(os.path.abspath(os.path.dirname(__file__)))  # Dirty, but it works
import rampwf as rw
from rampwf.prediction_types.detection import Predictions as DetectionPredictions
from rampwf.utils.importing import import_module_from_source
import pandas as pd
import numpy as np
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn.metrics import roc_auc_score
from bop_scripts import preprocessing
import itertools
# %%
# Parameters
problem_title = 'Biology Order Prescription'
data = "./data/mimic-iv.sqlite"
# Mapping from lab item_id to its label, stored in the column named "3"
lab_dictionnary = pd.read_csv("./config/lab_items.csv").set_index("item_id")["3"].to_dict()
get_drugs, get_diseases = True, True
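# For orientation, `lab_items.csv` is expected to look like the following
# (rows are hypothetical, for illustration only):
#
#     item_id,3
#     50912,Creatinine
#     51222,Hemoglobin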
# %%
# Getting data
if os.path.exists("./data/X.csv"):
    X = pd.read_csv("./data/X.csv")
else:
    print("Creating X dataset (first run)")
    X = preprocessing.generate_features_dataset(
        database="./data/mimic-iv.sqlite",
        get_drugs=get_drugs,
        get_diseases=get_diseases
    )
    X["last_7"] = X["last_7"].fillna(0)
    X["last_30"] = X["last_30"].fillna(0)
    X.to_csv("./data/X.csv", header=True, index=False)

if os.path.exists("./data/y.csv"):
    y = pd.read_csv("./data/y.csv")
else:
    print("Creating y dataset (first run)")
    y = preprocessing.generate_labels_dataset(
        database="./data/mimic-iv.sqlite",
        lab_dictionnary=lab_dictionnary,
    )
    y.to_csv("./data/y.csv", header=True, index=False)
# Creating train and test
if not os.path.exists("./data/train.csv") or not os.path.exists("./data/test.csv"):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=10000, random_state=42
    )
    train = pd.merge(
        X_train,
        y_train,
        on="stay_id"
    ).reset_index(drop=True)
    train.to_csv("./data/train.csv", header=True, index=False)
    test = pd.merge(
        X_test,
        y_test,
        on="stay_id"
    ).reset_index(drop=True)
    test.to_csv("./data/test.csv", header=True, index=False)
# %%
# Get rampwf evaluation
class make_detection_fixed(DetectionPredictions):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def set_valid_in_train(self, predictions, test_is):
        self.y_pred = np.repeat(
            self.y_pred.reshape(-1, 1), predictions.y_pred.shape[1], axis=1)
        self.y_pred[test_is] = predictions.y_pred

    def set_slice(self, valid_indexes):
        if isinstance(valid_indexes, list):
            self.y_pred = self.y_pred[valid_indexes]

    @classmethod
    def combine(cls, predictions_list, index_list=None, greedy=False):
        if index_list is None:  # we combine the full list
            index_list = range(len(predictions_list))
        y_comb_list = [predictions_list[i].y_pred for i in index_list]
        n_preds = y_comb_list[0].shape[0]
        n_labels = y_comb_list[0].shape[1]
        y_preds_combined = np.empty((n_preds, n_labels), dtype=object)
        for i in range(n_preds):
            preds_list = [preds[i, :] for preds in y_comb_list
                          if preds[i, 0] is not None]
            if len(preds_list) == 1:
                # no overlap in the different prediction sets -> simply take
                # the single one that is not None
                preds_combined = preds_list[0]
            elif len(preds_list) > 1:
                preds_combined, _ = combine_predictions(
                    preds_list, cls.iou_threshold, greedy=greedy)
            if len(preds_list) > 0:
                y_preds_combined[i, :] = preds_combined
        combined_predictions = cls(y_pred=y_preds_combined)
        return combined_predictions
#%%
def combine_predictions(preds_list, iou_threshold, greedy=False):
    """Combine the prediction sets of several models for a single sample
    by averaging them. `iou_threshold` and `greedy` are unused here; they
    are kept for interface compatibility with the detection API.
    """
    combined_prediction = np.array(preds_list).mean(axis=0)
    return combined_prediction, None
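
# A quick illustration of the averaging above, with hypothetical numbers:
# two models predicting three labels for the same sample.
#
#     >>> a = np.array([0.2, 0.8, 0.5])
#     >>> b = np.array([0.4, 0.6, 0.5])
#     >>> combine_predictions([a, b], iou_threshold=None)
#     (array([0.3, 0.7, 0.5]), None)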
#%%
# Skip the leading stay_id column in both features and labels
_features_name = X.columns.tolist()[1:]
_prediction_label_names = y.columns.tolist()[1:]
prediction_type = make_detection_fixed
# %%
class ROCAUC_fixed(rw.score_types.base.BaseScoreType):
    is_lower_the_better = False
    minimum = 0.0
    maximum = 1.0

    def __init__(self, index, name='roc_auc', precision=2):
        self.name = name
        self.precision = precision
        self.index = index

    def score_function(self, ground_truths, predictions):
        """A hybrid score.

        It tests the predicted _probability_ of the second class
        against the true _label index_ (which is 0 if the first label is
        the ground truth and 1 if it is not; in other words, it is the
        true probability of the second class). Thus we have to override
        the `Base` function here.
        """
        y_proba = predictions.y_pred[:, self.index]
        y_true_proba = ground_truths.y_pred[:, self.index]
        # Elementwise comparison on an object array: keep only the rows
        # for which a prediction exists.
        mask = y_proba != None  # noqa: E711
        y_proba, y_true_proba = y_proba[mask], y_true_proba[mask]
        return self.__call__(y_true_proba, y_proba)

    def __call__(self, y_true_proba, y_proba):
        return roc_auc_score(y_true_proba, y_proba)
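
# Sanity check of the masking above, with toy, hypothetical values: on an
# object array, `!= None` yields an elementwise boolean mask, so rows
# without a prediction are dropped before scoring.
#
#     >>> col = np.array([0.9, None, 0.2], dtype=object)
#     >>> truth = np.array([1, None, 0], dtype=object)
#     >>> m = col != None
#     >>> roc_auc_score(truth[m].astype(int), col[m].astype(float))
#     1.0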
# %%
class customClassifier(rw.workflows.Classifier):
    def train_submission(self, module_path, X, y_array, train_is=None,
                         prev_trained_model=None):
        if train_is is None:
            train_is = slice(None, None, None)
        classifier = import_module_from_source(
            os.path.join(module_path, self.element_names[0] + '.py'),
            self.element_names[0],
            sanitize=True
        )
        clf = classifier.Classifier()
        if prev_trained_model is None:
            clf.fit(X.iloc[train_is, :], y_array[train_is])
        else:
            clf.fit(
                X.iloc[train_is, :], y_array[train_is], prev_trained_model)
        return clf
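
# For reference, a submission is expected to ship
# submissions/<name>/classifier.py defining a `Classifier` class whose
# `fit` is called above and whose `predict_proba` is called at test time
# by the parent rampwf Classifier workflow. A minimal sketch, assuming a
# scikit-learn-style multi-label setup (the estimator choice is
# illustrative, not prescribed by this file):
#
#     import numpy as np
#     from sklearn.ensemble import RandomForestClassifier
#     from sklearn.multioutput import MultiOutputClassifier
#
#     class Classifier:
#         def __init__(self):
#             self.clf = MultiOutputClassifier(RandomForestClassifier())
#
#         def fit(self, X, y):
#             self.clf.fit(X, y)
#
#         def predict_proba(self, X):
#             # one column of positive-class probabilities per label
#             return np.stack(
#                 [p[:, 1] for p in self.clf.predict_proba(X)], axis=1)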
workflow = customClassifier()
score_types = [
    ROCAUC_fixed(name=f"AUC {_prediction_label_names[i]}", index=i)
    for i in range(len(_prediction_label_names))
]
Predictions = prediction_type
# %%
def get_cv(X, y):
    cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
    return cv.split(X, y)


def _read_data(path, f_name):
    df = pd.read_csv(os.path.join(path, 'data', f_name))
    X = df[_features_name]
    y = df[_prediction_label_names].astype("int").values
    return X, y


def get_train_data(path='.'):
    f_name = 'train.csv'
    return _read_data(path, f_name)


def get_test_data(path='.'):
    f_name = 'test.csv'
    return _read_data(path, f_name)
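
# %%
# With the hooks above (problem_title, Predictions, workflow, score_types,
# get_cv, get_train_data, get_test_data) defined, the kit can be checked
# locally with ramp-workflow's test runner; the submission name below is
# illustrative:
#
#     ramp-test --submission starting_kit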