@@ -0,0 +1,201 @@
+#%%
+import sys
+import os
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)))  # Dirty, but it makes bop_scripts importable when RAMP loads this file
+
+import rampwf as rw
+from rampwf.prediction_types.detection import Predictions as DetectionPredictions
+from rampwf.utils.importing import import_module_from_source
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import ShuffleSplit, train_test_split
+from sklearn.metrics import roc_auc_score
+from bop_scripts import preprocessing
+# %%
+# Parameters
+problem_title = 'Biology Order Prescription'
+data = "./data/mimic-iv.sqlite"
+# Mapping from lab item_id to its category label (stored in the column named "3")
+lab_dictionnary = pd.read_csv("./config/lab_items.csv").set_index("item_id")["3"].to_dict()
+get_drugs, get_diseases = True, True
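+# Illustration only (hypothetical entries, not the real config file):
+# lab_dictionnary == {50912: "Renal function", 51222: "Blood count", ...}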
+# %%
+# Getting data
+if os.path.exists("./data/X.csv"):
+    X = pd.read_csv("./data/X.csv")
+else:
+    print("Creating X dataset (first run)")
+    X = preprocessing.generate_features_dataset(
+        database="./data/mimic-iv.sqlite",
+        get_drugs=get_drugs,
+        get_diseases=get_diseases
+    )
+    X["last_7"] = X["last_7"].fillna(0)
+    X["last_30"] = X["last_30"].fillna(0)
+    X.to_csv("./data/X.csv", header=True, index=False)
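+# X is expected to carry one row per stay: stay_id first, then the feature
+# columns (drug/disease indicators plus counters such as last_7 / last_30);
+# the leading stay_id column is stripped below when building _features_name.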
+
+if os.path.exists("./data/y.csv"):
+    y = pd.read_csv("./data/y.csv")
+else:
+    print("Creating y dataset (first run)")
+    y = preprocessing.generate_labels_dataset(
+        database="./data/mimic-iv.sqlite",
+        lab_dictionnary=lab_dictionnary,
+    )
+    y.to_csv("./data/y.csv", header=True, index=False)
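+# y mirrors X's layout: stay_id first, then one 0/1 column per lab category
+# from lab_dictionnary (cast to int in _read_data below).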
+
+# Creating train and test
+if not (os.path.exists("./data/train.csv") and os.path.exists("./data/test.csv")):
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=10000, random_state=42
+    )
+
+    train = pd.merge(X_train, y_train, on="stay_id").reset_index(drop=True)
+    train.to_csv("./data/train.csv", header=True, index=False)
+
+    test = pd.merge(X_test, y_test, on="stay_id").reset_index(drop=True)
+    test.to_csv("./data/test.csv", header=True, index=False)
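+# The merges above assume X and y hold exactly one row per stay_id; any stay
+# missing from either side is silently dropped by the inner join.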
+
+# %%
+# Get rampwf evaluation
+class make_detection_fixed(DetectionPredictions):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def set_valid_in_train(self, predictions, test_is):
+        # Broadcast the 1D train-sized array to (n_samples, n_labels) before
+        # writing the validation-fold predictions back into it.
+        self.y_pred = np.repeat(self.y_pred.reshape(-1, 1), predictions.y_pred.shape[1], axis=1)
+        self.y_pred[test_is] = predictions.y_pred
+
+    def set_slice(self, valid_indexes):
+        if isinstance(valid_indexes, list):
+            self.y_pred = self.y_pred[valid_indexes]
+
+    @classmethod
+    def combine(cls, predictions_list, index_list=None, greedy=False):
+        if index_list is None:  # we combine the full list
+            index_list = range(len(predictions_list))
+        y_comb_list = [predictions_list[i].y_pred for i in index_list]
+
+        n_preds = y_comb_list[0].shape[0]
+        n_labels = y_comb_list[0].shape[1]
+
+        y_preds_combined = np.empty((n_preds, n_labels), dtype=object)
+
+        for i in range(n_preds):
+            # keep only the prediction sets that actually predicted sample i
+            preds_list = [preds[i, :] for preds in y_comb_list
+                          if preds[i, 0] is not None]
+
+            if len(preds_list) == 1:
+                # no overlap in the different prediction sets -> simply take
+                # the single one that is not None
+                preds_combined = preds_list[0]
+            elif len(preds_list) > 1:
+                preds_combined, _ = combine_predictions(
+                    preds_list, cls.iou_threshold, greedy=greedy)
+
+            if len(preds_list) > 0:
+                y_preds_combined[i, :] = preds_combined
+
+        combined_predictions = cls(y_pred=y_preds_combined)
+
+        return combined_predictions
+
+#%%
+def combine_predictions(preds_list, iou_threshold, greedy=False):
+    """Combine the prediction sets of several models for a single sample
+    by averaging them.
+
+    `iou_threshold` and `greedy` are kept for signature compatibility with
+    the detection base class but are not used here.
+    """
+    combined_prediction = np.array(preds_list).mean(axis=0)
+    return combined_prediction, None
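+# Minimal sketch of how `combine` uses this (illustrative values only):
+#     a = np.array([[0.2, 0.8], [None, None]], dtype=object)
+#     b = np.array([[0.4, 0.6], [0.1, 0.9]], dtype=object)
+# Combining two prediction sets holding a and b averages row 0 to
+# [0.3, 0.7]; a's row 1 starts with None, so b's row 1 is kept as-is.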
+
+#%%
+_features_name = X.columns.tolist()[1:]  # all columns except the leading stay_id
+_prediction_label_names = y.columns.tolist()[1:]
+prediction_type = make_detection_fixed
+# %%
+class ROCAUC_fixed(rw.score_types.base.BaseScoreType):
+    is_lower_the_better = False
+    minimum = 0.0
+    maximum = 1.0
+
+    def __init__(self, index, name='roc_auc', precision=2):
+        self.name = name
+        self.precision = precision
+        self.index = index
+
+    def score_function(self, ground_truths, predictions):
+        """A hybrid score.
+
+        It tests the predicted _probability_ of the second class against
+        the true _label index_ (0 if the first label is the ground truth,
+        1 if it is not; in other words, the true probability of the second
+        class), so we have to override the base implementation here.
+        """
+        y_proba = predictions.y_pred[:, self.index]
+        y_true_proba = ground_truths.y_pred[:, self.index]
+
+        # Elementwise comparison on an object array: keep only the rows for
+        # which a prediction was actually made.
+        mask = y_proba != None  # noqa: E711
+        y_proba, y_true_proba = y_proba[mask], y_true_proba[mask]
+
+        return self.__call__(y_true_proba, y_proba)
+
+    def __call__(self, y_true_proba, y_proba):
+        return roc_auc_score(y_true_proba, y_proba)
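+# Usage sketch: each ROCAUC_fixed scores one label column. With a column
+# [0.9, None, 0.2] against truths [1, 0, 0], the None row is masked out and
+# the AUC is computed on rows 0 and 2 only (values are illustrative).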
+
+# %%
+class customClassifier(rw.workflows.Classifier):
+    def train_submission(self, module_path, X, y_array, train_is=None, prev_trained_model=None):
+        if train_is is None:
+            train_is = slice(None, None, None)
+
+        # Import the submission's classifier module and fit it on the
+        # training indices.
+        classifier = import_module_from_source(
+            os.path.join(module_path, self.element_names[0] + '.py'),
+            self.element_names[0],
+            sanitize=True
+        )
+        clf = classifier.Classifier()
+        if prev_trained_model is None:
+            clf.fit(X.iloc[train_is, :], y_array[train_is])
+        else:
+            clf.fit(X.iloc[train_is, :], y_array[train_is], prev_trained_model)
+
+        return clf
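+# Minimal submission sketch, assuming the workflow element is the standard
+# rampwf "classifier" (i.e. submissions provide a classifier.py). This is
+# illustrative only, not the reference solution:
+#
+#     import numpy as np
+#     from sklearn.ensemble import RandomForestClassifier
+#     from sklearn.multioutput import MultiOutputClassifier
+#
+#     class Classifier:
+#         def __init__(self):
+#             self.clf = MultiOutputClassifier(RandomForestClassifier())
+#
+#         def fit(self, X, y):
+#             self.clf.fit(X, y)
+#
+#         def predict_proba(self, X):
+#             # stack P(label == 1) of each output into (n_samples, n_labels)
+#             return np.stack(
+#                 [p[:, 1] for p in self.clf.predict_proba(X)], axis=1)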
+
+
+workflow = customClassifier()
+score_types = [
+    ROCAUC_fixed(name=f"AUC {label}", index=i)
+    for i, label in enumerate(_prediction_label_names)
+]
+Predictions = prediction_type
+# %%
+def get_cv(X, y):
+    cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
+    return cv.split(X, y)
+
+
+def _read_data(path, f_name):
+    df = pd.read_csv(os.path.join(path, 'data', f_name))
+    X = df[_features_name]
+    y = df[_prediction_label_names].astype("int").values
+
+    return X, y
+
+def get_train_data(path='.'):
+    f_name = 'train.csv'
+    return _read_data(path, f_name)
+
+def get_test_data(path='.'):
+    f_name = 'test.csv'
+    return _read_data(path, f_name)
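+# Typical local check, assuming the standard RAMP kit layout
+# (submissions/starting_kit/classifier.py):
+#     ramp-test --submission starting_kit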