import sqlite3
import pandas as pd
import numpy as np
# SQLite connection
conn = sqlite3.connect("./data/mimic-iv.sqlite")
# ATC classification of the drugs
drugs_rules = pd.read_csv("./config/atc_items.csv")
drugs_rules_list = drugs_rules["gsn"].drop_duplicates().astype("str").tolist()
# Retrieve the matching medication codes from medrecon
drugs = pd.read_sql(f"""
SELECT stay_id, gsn, etccode, 1 n
FROM medrecon
WHERE gsn IN ({','.join(drugs_rules_list)})
""", conn)
# List of the codes associated with each stay
stays_code = pd.merge(
    drugs,
    drugs_rules,
    on="gsn"
).reset_index(drop=True)
stays_code["ATC_4"] = stays_code["atc"]
stays_code["ATC_2"] = stays_code["atc"].str.slice(0,3)
stays_code["ETC"] = stays_code["etccode"]
variable = "ETC"
from sklearn.preprocessing import OrdinalEncoder
# Keep one row per (stay, code) pair, dropping missing codes
stays_code_dropped = stays_code.dropna(subset=[variable]) \
    .drop_duplicates(["stay_id", variable]) \
    .reset_index(drop=True)
stays_code_dropped = stays_code_dropped[["stay_id", "gsn", variable]] \
    .rename(columns={variable: "code"})
stays_code_dropped["code"] = stays_code_dropped["code"].astype("int").astype("str")
# Create the encoder
encoder = OrdinalEncoder().fit(stays_code_dropped[["code"]])
# Embedding training: map each code to its integer id
stays_code_dropped["code_id"] = encoder.transform(stays_code_dropped[["code"]]).astype("int32")
# All pairs of distinct drugs co-occurring within the same stay
pair_matrix = pd.merge(
    stays_code_dropped[["stay_id", "gsn", "code_id"]],
    stays_code_dropped[["stay_id", "gsn", "code_id"]],
    on="stay_id"
).query("gsn_x != gsn_y")[["code_id_x", "code_id_y"]]
# Empirical co-occurrence probability of code_id_y given code_id_x
pair_matrix_probability = pair_matrix.assign(n=1).groupby(["code_id_x", "code_id_y"]).sum() \
    .reset_index() \
    .join(
        pair_matrix.assign(n_total=1).groupby("code_id_x")["n_total"].sum(),
        on="code_id_x"
    ) \
    .assign(prob=lambda x: x["n"] / x["n_total"])[["code_id_x", "code_id_y", "prob"]] \
    .values
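# Sanity check (an illustrative sketch, not part of the original pipeline; the _check name
# is only used here): for each left-hand code, the conditional probabilities sum to 1.
_check = pd.DataFrame(pair_matrix_probability, columns=["code_id_x", "code_id_y", "prob"])
assert np.allclose(_check.groupby("code_id_x")["prob"].sum(), 1.0)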
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
class embeddingTrainer(nn.Module):
    def __init__(self, embedding_size=100):
        super().__init__()
        # The last index corresponds to the pad token
        self.embeddings = nn.Embedding(
            num_embeddings=encoder.categories_[0].shape[0] + 1,
            embedding_dim=embedding_size
        )
        # Shared projection applied to each code embedding
        self.network = nn.Sequential(
            nn.Linear(embedding_size, 50),
            nn.ReLU(),
            nn.Linear(50, 200),
            nn.ReLU()
        )
        # Head predicting the co-occurrence probability from the two projected embeddings
        self.proba = nn.Sequential(
            nn.Linear(400, 200),
            nn.ReLU(),
            nn.Linear(200, 50),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.ReLU(),
            nn.Linear(10, 1),
            nn.Sigmoid()
        )
        self.loss = nn.BCELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=5e-5)

    def forward(self, x):
        word_1 = x[:, 0]
        word_2 = x[:, 1]
        embedding_1 = self.network(self.embeddings(word_1))
        embedding_2 = self.network(self.embeddings(word_2))
        merged_data = torch.cat([embedding_1, embedding_2], dim=1)
        y_hat = self.proba(merged_data)
        return y_hat

    def fit(self, x, y):
        # One optimization step on a single batch
        self.train()
        self.optimizer.zero_grad()
        y_hat = self.forward(x)
        loss = self.loss(y_hat, y)
        loss.backward()
        self.optimizer.step()
        return loss.detach().cpu()

    def predict(self, x):
        self.eval()
        with torch.no_grad():
            y_hat = self.forward(x)
        return y_hat
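# Minimal smoke test (an illustrative sketch, not part of the original pipeline): a batch of
# (code_id_x, code_id_y) pairs yields one predicted co-occurrence probability per pair.
_demo_model = embeddingTrainer(embedding_size=100)
_demo_x = torch.randint(0, encoder.categories_[0].shape[0], (4, 2))
print(_demo_model.predict(_demo_x).shape)  # expected: torch.Size([4, 1])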
loader = DataLoader(pair_matrix_probability, shuffle=True, batch_size=1000)
# Fall back to CPU when no GPU is available
device = "cuda:0" if torch.cuda.is_available() else "cpu"
embedding_trainer = embeddingTrainer(embedding_size=100).to(device)
n_epoch = 10
n_print_epoch = 10
n_print_batch = 1000
for i in range(n_epoch):
    losses = []
    for j, x in enumerate(loader):
        # Each batch row is (code_id_x, code_id_y, co-occurrence probability)
        x_batch = x[:, [0, 1]].int().to(device)
        y_batch = x[:, 2].float().unsqueeze(dim=1).to(device)
        loss = embedding_trainer.fit(x_batch, y_batch)
        losses.append(loss)
        if j % n_print_batch == 0:
            print(f"Epoch {i} - Batch {j} - Loss : {np.array(losses).mean()}")
    if i % n_print_epoch == 0:
        print(f"Epoch {i} - Loss : {np.array(losses).mean()}")
Epoch 0 - Batch 0 - Loss : 0.7970905303955078
Epoch 0 - Loss : 0.45808276534080505
Epoch 1 - Batch 0 - Loss : 0.0421447679400444
Epoch 2 - Batch 0 - Loss : 0.025797121226787567
Epoch 3 - Batch 0 - Loss : 0.027662230655550957
Epoch 4 - Batch 0 - Loss : 0.02129991166293621
Epoch 5 - Batch 0 - Loss : 0.02649623528122902
Epoch 6 - Batch 0 - Loss : 0.025592397898435593
Epoch 7 - Batch 0 - Loss : 0.02580280229449272
Epoch 8 - Batch 0 - Loss : 0.0239135529845953
Epoch 9 - Batch 0 - Loss : 0.025206178426742554
import pickle
# Persist the fitted encoder and the trained embedding layer
with open(f"./models/{variable}_encoder.model", "wb") as f:
    pickle.dump(encoder, f)
with open(f"./models/{variable}_embedding.model", "wb") as f:
    torch.save(embedding_trainer.embeddings, f)
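# Example usage (an illustrative sketch, not part of the original pipeline; the example_*
# names are only used here): compare two codes through the cosine similarity of their
# trained embedding vectors.
example_codes = encoder.categories_[0][:2]  # two arbitrary codes known to the encoder
example_ids = torch.tensor(
    encoder.transform(pd.DataFrame({"code": example_codes})).astype("int64")
).squeeze(1).to(device)
example_vectors = embedding_trainer.embeddings(example_ids)
print(example_codes,
      torch.nn.functional.cosine_similarity(example_vectors[0], example_vectors[1], dim=0).item())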