Name to be determined
This notebook performs the data pre-processing.
It uses the data stored in the SQLite database, downloaded via download_data.py,
and exports them as parquet files for the training of the algorithm and for the data visualisation.
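Before anything runs, a quick pre-flight check of the inputs can save a confusing failure later. A minimal sketch, assuming the paths used by the calls below:
from pathlib import Path
for p in ["./data/mimic-iv.sqlite", "./config/lab_items.csv", "./config/atc_items.csv"]:
    assert Path(p).exists(), f"Missing input file: {p}"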
import sqlite3
import pandas as pd
# SQLite connection
conn = sqlite3.connect("./data/mimic-iv.sqlite")
# Classification of the biology (lab) items
items = pd.read_csv("./config/lab_items.csv").dropna()
items_list = items["item_id"].astype("str").tolist()
# ATC classification of the drugs
drugs_rules = pd.read_csv("./config/atc_items.csv")
drugs_rules_list = drugs_rules["gsn"].drop_duplicates().astype("str").tolist()
# Create an index to speed up the queries
conn.execute("CREATE INDEX IF NOT EXISTS biological_index ON labevents (stay_id, itemid)");
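# The statement above is idempotent (IF NOT EXISTS). As a sanity check, SQLite
# lists every index in sqlite_master:
print(pd.read_sql("SELECT name FROM sqlite_master WHERE type = 'index'", conn))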
stays = pd.read_sql("""
SELECT
s.stay_id,
s.intime intime,
p.gender gender,
p.anchor_age age,
t.temperature,
t.heartrate,
t.resprate,
t.o2sat,
t.sbp,
t.dbp,
t.pain,
t.chiefcomplaint
FROM edstays s
LEFT JOIN patients p
ON p.subject_id = s.subject_id
LEFT JOIN triage t
ON t.stay_id = s.stay_id
""", conn)
# Prior ED visits within the last 7 and 30 days
derniers_passages = pd.read_sql("""
SELECT DISTINCT
s1.stay_id,
-- MIN, not MAX: the flag should be raised if ANY prior visit falls in the window
CAST(MIN(julianday(s1.intime) - julianday(s2.intime)) <= 7 AS INT) last_7,
CAST(MIN(julianday(s1.intime) - julianday(s2.intime)) <= 30 AS INT) last_30
FROM edstays s1
INNER JOIN edstays s2
ON s1.subject_id = s2.subject_id
AND s1.stay_id != s2.stay_id
AND s1.intime >= s2.intime
WHERE (julianday(s1.intime)-julianday(s2.intime)) <= 30
GROUP BY s1.stay_id
""", conn)
# Diagnoses from previous stays
from icdcodex import icd2vec, hierarchy
import numpy as np
dernier_diag = pd.read_sql("""
SELECT
s1.stay_id,
d.icd_code,
d.icd_version,
COUNT(1) n
FROM edstays s1
INNER JOIN diagnosis d
ON d.subject_id = s1.subject_id
INNER JOIN edstays s2
ON d.stay_id = s2.stay_id
WHERE
s1.intime >= s2.intime
AND s1.stay_id != s2.stay_id
GROUP BY
s1.stay_id,
d.icd_code,
d.icd_version
""", conn)
embedder_icd9 = icd2vec.Icd2Vec(num_embedding_dimensions=10, workers=-1)
embedder_icd9.fit(*hierarchy.icd9())
icd_9 = dernier_diag.query("icd_version == 9")["icd_code"]
# Hotfix: remap a code to the spelling used by the icdcodex ICD-9 hierarchy
icd_9 = icd_9.replace("E119","E0119")
icd_9 = icd_9[icd_9.isin(hierarchy.icd9()[1])].drop_duplicates()
icd_9_embedding = embedder_icd9.to_vec(icd_9)
embedder_icd10 = icd2vec.Icd2Vec(num_embedding_dimensions=10, workers=-1)
embedder_icd10.fit(*hierarchy.icd10cm(version="2020"))
icd_10 = dernier_diag.query("icd_version == 10")["icd_code"]
icd_10 = icd_10.apply(lambda x: x[0:3]+"."+x[3:] if len(x) > 3 else x)
icd_10 = icd_10[icd_10.isin(hierarchy.icd10cm(version="2020")[1])].drop_duplicates()
icd_10_embedding = embedder_icd10.to_vec(icd_10)
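# MIMIC stores ICD-10 codes without the dot, while the icdcodex hierarchy uses
# the dotted ICD-10-CM spelling, hence the re-dotting lambda above. A quick
# illustration on toy codes:
redot = lambda x: x[0:3] + "." + x[3:] if len(x) > 3 else x
print(redot("A4189"))  # -> "A41.89"
print(redot("E11"))    # 3-character codes are left unchanged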
icd_series = pd.concat([
"ICD9_"+icd_9,
"ICD10_"+icd_10
]).reset_index(drop=True)
dernier_diag_list = dernier_diag.assign(
icd_str = lambda x: "ICD"+x["icd_version"].astype("str")+"_"+x["icd_code"],
)["icd_str"]
icd_embeddings_matrix = np.concatenate([
icd_9_embedding,
icd_10_embedding
], axis=0)
icd_to_idx = icd_series.reset_index().set_index("icd_code").join(
dernier_diag_list.drop_duplicates().reset_index().set_index("icd_str").drop(columns="index"),
how="right"
).fillna(pd.NA).astype(pd.Int64Dtype())["index"].to_dict()
dernier_diag["icd_idx"] = dernier_diag_list.apply(lambda x: icd_to_idx[x])
dernier_diag_idx = dernier_diag.dropna().groupby("stay_id")["icd_idx"].agg(lambda x: x.tolist())
from torch.nn import Embedding
from torch.nn.utils.rnn import pad_sequence
import torch
icd_embeddings_matrix_with_pad = np.concatenate([
icd_embeddings_matrix,
np.zeros((1, icd_embeddings_matrix.shape[1]))
])
torch_embedding = Embedding(
icd_embeddings_matrix_with_pad.shape[0],
embedding_dim=10,
_weight=torch.tensor(icd_embeddings_matrix_with_pad)
)
torch_embedding.weight.requires_grad = False  # freeze the pre-trained weights
dernier_diag_idx_tensor = pad_sequence([torch.tensor(x) for x in dernier_diag_idx.tolist()],
batch_first=True,
padding_value=icd_embeddings_matrix_with_pad.shape[0]-1
)
dernier_diag_idx_tensor_mask = (dernier_diag_idx_tensor != icd_embeddings_matrix_with_pad.shape[0]-1).unsqueeze(2)*1
dernier_diag_idx_tensor_embeddings = torch_embedding(dernier_diag_idx_tensor).sum(axis=1)/(dernier_diag_idx_tensor_mask.sum(axis=1)+1e-8)
dernier_diag_idx_tensor_embeddings = dernier_diag_idx_tensor_embeddings.detach().numpy()
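# The padding row of the matrix is all zeros, so padded positions add nothing
# to the sum, and dividing by the mask total yields a mean over the real codes
# only. A minimal NumPy sketch of the same masked mean pooling (toy values):
emb = np.array([[1.0, 2.0], [3.0, 4.0], [0.0, 0.0]])  # last row = padding
idx = np.array([[0, 1], [0, 2]])                      # 2 is the padding index
mask = (idx != 2)
print(emb[idx].sum(axis=1) / (mask.sum(axis=1, keepdims=True) + 1e-8))
# -> [[2., 3.], [1., 2.]]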
stays = stays.join(
pd.DataFrame(dernier_diag_idx_tensor_embeddings, index=dernier_diag_idx.index, columns=["diag_"+str(x) for x in range(10)]),
on = "stay_id",
how="left"
)
stays = stays \
.join(derniers_passages.set_index("stay_id"), on="stay_id")
stays["last_7"] = stays["last_7"].fillna(0)
stays["last_30"] = stays["last_30"].fillna(0)
stays["intime"] = pd.to_datetime(stays["intime"])
stays["gender"] = stays["gender"].astype("string") # Pas de valeurs manquantes en gender
stays["chiefcomplaint"] = stays["chiefcomplaint"].fillna("").astype("string") # ¨Chiefcomplaint manquant = chiefcomplaint vide
drugs = pd.read_sql(f"""
SELECT stay_id, gsn, etccode, 1 n
FROM medrecon
WHERE gsn IN ({','.join(drugs_rules_list)})
""", conn)
# List of ATC codes for each stay
atc_stays = pd.merge(
drugs,
drugs_rules,
left_on="gsn",
right_on="gsn"
).groupby(["stay_id","atc"])["n"].sum() \
.reset_index()
atc_stays["atc_2"] = atc_stays["atc"].str.slice(0, 3)
# We consider two levels of granularity
## The full ATC code (Anatomical, Therapeutic and Pharmacological), ATC level 4
atc_stays_pivoted_4 = pd.pivot_table(
atc_stays[["stay_id","atc", "n"]],
columns=["atc"],
index=["stay_id"],
values="n"
).fillna(0).reset_index()
## The ATC level 2 code (Anatomical and Therapeutic)
atc_stays_pivoted_2 = pd.pivot_table(
atc_stays[["stay_id","atc_2", "n"]] \
.groupby(["stay_id","atc_2"])["n"].sum() \
.reset_index() \
.rename(columns={"atc_2":"atc"}),
columns=["atc"],
index=["stay_id"],
values="n"
).fillna(0).reset_index()
## The ETC codes
etc_pivoted = pd.pivot_table(
drugs[["stay_id","etccode", "n"]].dropna() \
.assign(etccode = lambda x: x["etccode"].astype("int").astype("str")) \
.groupby(["stay_id","etccode"])["n"].sum() \
.reset_index() \
.rename(columns={"etccode":"atc"}),
columns=["atc"],
index=["stay_id"],
values="n"
).fillna(0).reset_index()
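# Each pivot yields one row per stay_id and one column per code, with drug
# counts as values; absent stay/code pairs become 0. A toy illustration
# (made-up values):
toy = pd.DataFrame({"stay_id": [1, 1, 2], "atc": ["A10", "N02", "N02"], "n": [2, 1, 1]})
print(pd.pivot_table(toy, index=["stay_id"], columns=["atc"], values="n").fillna(0))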
stays_atc_4 = pd.merge(
stays,
atc_stays_pivoted_4,
left_on="stay_id",
right_on="stay_id",
how="left"
)
stays_atc_2 = pd.merge(
stays,
atc_stays_pivoted_2,
left_on="stay_id",
right_on="stay_id",
how="left"
)
stays_etc = pd.merge(
stays,
etc_pivoted,
left_on="stay_id",
right_on="stay_id",
how="left"
)
stays_atc_4[atc_stays_pivoted_4.columns[1:]] = stays_atc_4[atc_stays_pivoted_4.columns[1:]].fillna(0)
stays_atc_2[atc_stays_pivoted_2.columns[1:]] = stays_atc_2[atc_stays_pivoted_2.columns[1:]].fillna(0)
stays_etc[etc_pivoted.columns[1:]] = stays_etc[etc_pivoted.columns[1:]].fillna(0)
# Writing the features datasets
# We write parquet files to optimize storage and I/O times
stays_atc_2.sort_values("stay_id").reset_index(drop=True).to_parquet("./data/features_atc2.parquet", engine="pyarrow", index=False)
stays_atc_4.sort_values("stay_id").reset_index(drop=True).to_parquet("./data/features_atc4.parquet", engine="pyarrow", index=False)
stays_etc.sort_values("stay_id").reset_index(drop=True).to_parquet("./data/features_etc.parquet", engine="pyarrow", index=False)
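# The training notebook can reload these files directly; a quick round-trip
# check (same path as above):
features_atc2 = pd.read_parquet("./data/features_atc2.parquet")
print(features_atc2.shape)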
labs = pd.read_sql(f"""
SELECT
le.stay_id,
le.itemid item_id
FROM labevents le
WHERE le.itemid IN ('{"','".join(items_list)}')
GROUP BY
le.stay_id,
le.itemid
""", conn)
labs_deduplicate = pd.merge(
items[["item_id","3"]].rename(columns={"3":"label"}), # column "3" of the config file holds the lab label
labs,
left_on="item_id",
right_on="item_id"
) \
.drop_duplicates(["stay_id", "label"])[["stay_id","label"]] \
.reset_index(drop=True)
labs_deduplicate_pivot = pd.pivot_table(
labs_deduplicate.assign(value=1),
index=["stay_id"],
columns=["label"],
values="value"
).fillna(0)
labs_deduplicate_pivot_final = labs_deduplicate_pivot.join(
stays[["stay_id"]].set_index("stay_id"),
how="right"
).fillna(0).astype("int8").reset_index()
labs_deduplicate_pivot_final.sort_values("stay_id").reset_index(drop=True).to_parquet("./data/labels.parquet", index=False)
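# Features and labels are both sorted by stay_id before writing, so row i of
# each file refers to the same stay. A minimal alignment check, assuming the
# paths used above:
features = pd.read_parquet("./data/features_atc2.parquet")
labels = pd.read_parquet("./data/labels.parquet")
assert (features["stay_id"].to_numpy() == labels["stay_id"].to_numpy()).all()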