# preprocessing.py
"""
This script contains the preprocessing functions.
"""
import sqlite3

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
  8. def get_Xy_df (X, y):
  9. """
  10. Merge together the X and y dataframe on stay_id basis
  11. Parameters
  12. ----------
  13. X: pandas dataframe of features
  14. y: pandas dataframe of labeles
  15. Output
  16. ------
  17. Xy : merged pandas dataframe
  18. """
  19. Xy = pd.merge(
  20. X,
  21. y,
  22. left_on="stay_id",
  23. right_on="stay_id"
  24. )
  25. return Xy
  26. def generate_features_dataset(database, get_drugs=True, get_diseases=True):
  27. """
  28. Generate features dataset according to the data
  29. Parameters
  30. ----------
  31. database: str, path of the database sqlite file
  32. get_drugs: boolean, if true the drug history is returned,
  33. get_diseases: boolean, if true the disease history is returned
  34. """
  35. to_merge = []
  36. # Sqlite connection
  37. conn = sqlite3.connect("./data/mimic-iv.sqlite")
  38. ## Getting the features
  39. features = pd.read_sql(f"""
  40. SELECT
  41. s.stay_id,
  42. s.intime intime,
  43. p.gender gender,
  44. p.anchor_age age,
  45. t.temperature,
  46. t.heartrate,
  47. t.resprate,
  48. t.o2sat,
  49. t.sbp,
  50. t.dbp,
  51. t.pain,
  52. t.chiefcomplaint
  53. FROM edstays s
  54. LEFT JOIN patients p
  55. ON p.subject_id = s.subject_id
  56. LEFT Join triage t
  57. ON t.stay_id = s.stay_id
  58. """, conn)
  59. ## Additional features
  60. ### Last visit
  61. last_visit = pd.read_sql(f"""
  62. SELECT DISTINCT
  63. s1.stay_id,
  64. CAST(MAX((julianday(s1.intime)-julianday(s2.intime))) <= 7 AS INT) last_7,
  65. CAST(MAX((julianday(s1.intime)-julianday(s2.intime))) <= 30 AS INT) last_30
  66. FROM edstays s1
  67. INNER JOIN edstays s2
  68. ON s1.subject_id = s2.subject_id
  69. AND s1.stay_id != s2.stay_id
  70. AND s1.intime >= s2.intime
  71. WHERE (julianday(s1.intime)-julianday(s2.intime)) <= 30
  72. GROUP BY s1.stay_id
  73. """, conn)
  74. to_merge.append(last_visit)
  75. ### Past diagnosis
  76. if get_diseases:
  77. past_diagnosis = pd.read_sql(f"""
  78. SELECT
  79. s1.stay_id,
  80. d.icd_code,
  81. d.icd_version,
  82. COUNT(1) n
  83. FROM edstays s1
  84. INNER JOIN diagnosis d
  85. ON d.subject_id = s1.subject_id
  86. INNER JOIN edstays s2
  87. ON d.stay_id = s2.stay_id
  88. WHERE
  89. s1.intime >= s2.intime
  90. AND s1.stay_id != s2.stay_id
  91. GROUP BY
  92. s1.stay_id,
  93. d.icd_code,
  94. d.icd_version
  95. """, conn)
  96. past_diagnosis = pd.pivot_table(
  97. past_diagnosis.groupby(["stay_id","icd_version"])["icd_code"].agg(lambda x: x.tolist()) \
  98. .reset_index(),
  99. index="stay_id",
  100. columns="icd_version",
  101. values="icd_code",
  102. aggfunc=lambda x: x
  103. ).reset_index().rename(columns={
  104. 9:"icd9",
  105. 10:"icd10"
  106. })
  107. to_merge.append(past_diagnosis)
  108. ### Drugs
  109. if get_drugs:
  110. drugs = pd.read_sql(f"""
  111. SELECT stay_id, gsn, 1 n
  112. FROM medrecon
  113. """, conn)
  114. drugs = drugs.groupby("stay_id")["gsn"].agg(lambda x: x.tolist()).reset_index()
  115. to_merge.append(drugs)
  116. ### Merging all together
  117. for df_to_merge in to_merge:
  118. features = pd.merge(
  119. features,
  120. df_to_merge,
  121. left_on="stay_id",
  122. right_on="stay_id",
  123. how="left"
  124. )
  125. features = features.sort_values("stay_id").reset_index(drop=True)
  126. return features
  127. def generate_labels_dataset(database, lab_dictionnary):
  128. """
  129. Generate features dataset according to the data
  130. Parameters
  131. ----------
  132. database: str, path of the database sqlite file
  133. lab_dictionnary: dictionnary containing the id (keys) and label (value) of the biological exams to predict
  134. """
  135. to_merge = []
  136. # Sqlite connection
  137. conn = sqlite3.connect("./data/mimic-iv.sqlite")
  138. # Getting biological values
  139. lab_dictionnary_pd = pd.DataFrame.from_dict(lab_dictionnary, orient="index").reset_index()
  140. lab_dictionnary_list = [str(x) for x in lab_dictionnary.keys()]
  141. ## Let's create an index to speed up queries
  142. conn.execute("CREATE INDEX IF NOT EXISTS biological_index ON labevents (stay_id, itemid)")
  143. # 1. Generating features
  144. ## Getting list of stay_id
  145. stays = pd.read_sql(
  146. "SELECT DISTINCT stay_id FROM edstays",
  147. conn
  148. )
  149. ## Getting the features
  150. labs = pd.read_sql(f"""
  151. SELECT
  152. le.stay_id,
  153. le.itemid item_id
  154. FROM labevents le
  155. WHERE le.itemid IN ('{"','".join(lab_dictionnary_list)}')
  156. GROUP BY
  157. le.stay_id,
  158. le.itemid
  159. """, conn)
  160. labs_deduplicate = pd.merge(
  161. lab_dictionnary_pd.rename(columns={0:"label"}),
  162. labs,
  163. left_on="index",
  164. right_on="item_id"
  165. ) \
  166. .drop_duplicates(["stay_id", "label"])[["stay_id","label"]] \
  167. .reset_index(drop=True)
  168. labs_deduplicate_pivot = pd.pivot_table(
  169. labs_deduplicate.assign(value=1),
  170. index="stay_id",
  171. columns="label",
  172. values="value"
  173. ).fillna(0)
  174. labs_deduplicate_pivot_final = labs_deduplicate_pivot.join(
  175. stays[["stay_id"]].set_index("stay_id"),
  176. how="right"
  177. ).fillna(0).astype("int8").reset_index()
  178. labels = labs_deduplicate_pivot_final.sort_values("stay_id").reset_index(drop=True)
  179. return labels
  180. def remove_outliers (X, variables_ranges):
  181. """
  182. This function remove the outliers and replace them by an NA according to the variable_ranges rules
  183. Parameters
  184. ----------
  185. X: pandas Dataframe
  186. variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionnary containing for each variable the inferior and superior range
  187. Outputs
  188. -------
  189. Tuple containing :
  190. - Processing dataframe
  191. - A Dataframe containing the number and percentage of processed outliers per variable
  192. """
  193. outliers = {}
  194. X_copy = X.copy()
  195. for key, value in variables_ranges.items():
  196. outliers_mask = ((X[key] < value[0]) | (X[key] > value[1]))
  197. outliers[key] = outliers_mask.sum() # Storing the number of outliers
  198. X_copy.loc[outliers_mask, key] = np.NaN # Setting outliers to NA
  199. outlier_report = pd.DataFrame.from_dict(outliers, orient="index") \
  200. .rename(columns={0:"n"}) \
  201. .assign(total=X[outliers.keys()].count().values,
  202. pourcentage=lambda x: (x["n"]/x["total"])*100
  203. )
  204. return X_copy, outlier_report
  205. class OutlierRemover(BaseEstimator, TransformerMixin):
  206. """
  207. Sklearn-like class for removing outliers
  208. To be included in the pipeline
  209. """
  210. def __init__ (self, variables_ranges):
  211. """
  212. Parameters:
  213. ----------
  214. variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionnary containing for each variable the inferior and superior range
  215. """
  216. super().__init__()
  217. # Storing ranges rules
  218. self.variables_ranges = variables_ranges
  219. def fit(self, X=None, y=None):
  220. return self
  221. def transform(self, X):
  222. X_copy, _ = remove_outliers(X, self.variables_ranges)
  223. return X_copy
  224. class TextPreprocessing(BaseEstimator, TransformerMixin):
  225. """
  226. Sklearn-like class for text preprocessing
  227. To be included in the pipeline
  228. What does it do :
  229. - It fills na with empty string
  230. - It lower string
  231. - It replace comma by space
  232. """
  233. def __init__ (self):
  234. """
  235. Parameters:
  236. ----------
  237. """
  238. super().__init__()
  239. def fit(self, X=None, y=None):
  240. return self
  241. def transform(self, X):
  242. colname = X.name
  243. X = X \
  244. .fillna("") \
  245. .replace(",", " ").str.lower()
  246. return X