
![Urgences - Image CC0 - pexels.com](img/pexels-pixabay-263402.jpg "Urgences")
# Challenge - [ED Lab Prediction] _Name TBD_
## Objective
This notebook performs the preprocessing of the data. It uses the data stored in the SQLite database, downloaded via `download_data.py`, and exports them as Parquet files for data visualization and for training the algorithm.
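Before anything else, a quick prerequisite check can fail fast if the download step has not been run yet; a small sketch using the paths from this notebook:
from pathlib import Path

# Fail fast if the SQLite database has not been downloaded yet
assert Path("./data/mimic-iv.sqlite").exists(), \
    "Run download_data.py first to create ./data/mimic-iv.sqlite"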
## Loading the data
import sqlite3
import pandas as pd
# SQLite connection
conn = sqlite3.connect("./data/mimic-iv.sqlite")

# Classification of the biology lab items (rows with missing values are dropped)
items = pd.read_csv("./config/lab_items.csv").dropna()
items_list = items["item_id"].astype("str").tolist()

# ATC classification of the drugs
drugs_rules = pd.read_csv("./config/atc_items.csv")
drugs_rules_list = drugs_rules["gsn"].drop_duplicates().astype("str").tolist()
# Create an index to speed up the lab queries
conn.execute("CREATE INDEX IF NOT EXISTS biological_index ON labevents (stay_id, itemid)");
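To check that the query planner can actually use this index, `EXPLAIN QUERY PLAN` is handy; a minimal sketch (the exact plan wording varies across SQLite versions):
# Ask SQLite how it would execute a grouped scan over (stay_id, itemid);
# the plan should mention biological_index if the index is usable.
plan = conn.execute(
    "EXPLAIN QUERY PLAN "
    "SELECT stay_id, itemid FROM labevents GROUP BY stay_id, itemid"
).fetchall()
print(plan)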
## Stays dataset
stays = pd.read_sql("""
    SELECT 
        s.stay_id,
        s.intime intime,
        p.gender gender,
        p.anchor_age age,
        t.temperature,
        t.heartrate,
        t.resprate,
        t.o2sat,
        t.sbp,
        t.dbp,
        t.pain,
        t.chiefcomplaint
    FROM edstays s
    LEFT JOIN patients p
        ON p.subject_id = s.subject_id
    LEFT JOIN triage t
        ON t.stay_id = s.stay_id
""", conn)
stays["intime"] = pd.to_datetime(stays["intime"])
stays["gender"] = stays["gender"].astype("string") # Pas de valeurs manquantes en gender
stays["chiefcomplaint"] = stays["chiefcomplaint"].fillna("").astype("string") # ¨Chiefcomplaint manquant = chiefcomplaint vide
drugs = pd.read_sql(f"""
    SELECT stay_id, gsn
    FROM medrecon
    WHERE gsn IN ({','.join(drugs_rules_list)})
""", conn)
# List of ATC codes for each stay
atc_stays = pd.merge(
    drugs,
    drugs_rules,
    on="gsn"
).drop_duplicates(["stay_id", "atc"])

atc_stays["atc_2"] = atc_stays["atc"].str.slice(0, 3)
# We consider two levels of granularity
## The full ATC code (Anatomical, Therapeutic and Pharmacological), ATC IV

# One-hot: one column per full ATC code, 1 when the class is present for the stay
atc_stays_pivoted_4 = pd.pivot_table(
    atc_stays[["stay_id","atc"]] \
        .assign(value=1),
    columns=["atc"],
    index=["stay_id"],
    values="value"
).fillna(0).reset_index()
## The ATC 2 code (Anatomical and Therapeutic)

atc_stays_pivoted_2 = pd.pivot_table(
    atc_stays[["stay_id","atc_2"]] \
        .drop_duplicates() \
        .rename(columns={"atc_2":"atc"}) \
        .assign(value=1),
    columns=["atc"],
    index=["stay_id"],
    values="value"
).fillna(0).reset_index()
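Note that `pd.pivot_table` defaults to `aggfunc="mean"`, which only works here because `value` is the constant 1. An equivalent, perhaps more explicit formulation (a sketch; column order may differ):
# Equivalent one-hot table via crosstab; counts are clipped to 1
alt = pd.crosstab(atc_stays["stay_id"], atc_stays["atc"]).clip(upper=1).reset_index()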
stays_atc_4 = pd.merge(
    stays,
    atc_stays_pivoted_4,
    on="stay_id",
    how="left"
)

stays_atc_2 = pd.merge(
    stays,
    atc_stays_pivoted_2,
    on="stay_id",
    how="left"
)

# Stays with no matching medication get 0 in every ATC column
stays_atc_4[atc_stays_pivoted_4.columns[1:]] = stays_atc_4[atc_stays_pivoted_4.columns[1:]].fillna(0)
stays_atc_2[atc_stays_pivoted_2.columns[1:]] = stays_atc_2[atc_stays_pivoted_2.columns[1:]].fillna(0)
# Write the features dataset
# We write to Parquet to optimize storage and I/O times

stays_atc_2.sort_values("stay_id").reset_index(drop=True).to_parquet("./data/features_atc2.parquet", engine="pyarrow", index=False)
stays_atc_4.sort_values("stay_id").reset_index(drop=True).to_parquet("./data/features_atc4.parquet", engine="pyarrow", index=False)
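A quick read-back serves as a sanity check that the export round-trips (assuming the files were just written by the cell above):
# Reload one features file to verify the export
features_check = pd.read_parquet("./data/features_atc2.parquet")
print(features_check.shape)
features_check.head()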
## Lab dataset
labs = pd.read_sql(f"""
    SELECT 
        le.stay_id,
        le.itemid item_id
    FROM labevents le
    WHERE le.itemid IN ('{"','".join(items_list)}')
    GROUP BY
        le.stay_id,
        le.itemid
""", conn)
labs_deduplicate = pd.merge(
    items[["item_id", "3"]].rename(columns={"3": "label"}),  # column "3" of lab_items.csv holds the lab label
    labs,
    on="item_id"
) \
 .drop_duplicates(["stay_id", "label"])[["stay_id", "label"]] \
 .reset_index(drop=True)
labs_deduplicate_pivot = pd.pivot_table(
    labs_deduplicate.assign(value=1),
    index=["stay_id"],
    columns=["label"],
    values="value"
).fillna(0)
labs_deduplicate_pivot_final = labs_deduplicate_pivot.join(
    stays[["stay_id"]].set_index("stay_id"),
    how="right"  # keep every stay, including those without any lab result
).fillna(0).astype("int8").reset_index()
labs_deduplicate_pivot_final.sort_values("stay_id").reset_index(drop=True).to_parquet("./data/labels.parquet", index=False)
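Finally, a minimal alignment check between features and labels (a sketch; both files are sorted by `stay_id` above, so rows should match one-to-one):
# Features and labels should line up row-for-row on stay_id
features = pd.read_parquet("./data/features_atc2.parquet")
labels = pd.read_parquet("./data/labels.parquet")
assert features["stay_id"].equals(labels["stay_id"])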