0_1_Drug_Embedding.ipynb

import sqlite3
import pandas as pd
import numpy as np
# SQLite connection
conn = sqlite3.connect("./data/mimic-iv.sqlite")

# ATC classification of the drugs
drugs_rules = pd.read_csv("./config/atc_items.csv")
drugs_rules_list = drugs_rules["gsn"].drop_duplicates().astype("str").tolist()
# Retrieve the medication codes for the selected GSNs

drugs = pd.read_sql(f"""
    SELECT stay_id, gsn, etccode, 1 n
    FROM medrecon
    WHERE gsn IN ({','.join(drugs_rules_list)})
""", conn)

# List of the codes for each stay
stays_code = pd.merge(
    drugs,
    drugs_rules,
    on="gsn"
).reset_index()
stays_code["ATC_4"] = stays_code["atc"]
stays_code["ATC_2"] = stays_code["atc"].str.slice(0,3)
stays_code["ETC"] = stays_code["etccode"]

Creating the encoder and the embeddings

variable = "ETC"
from sklearn.preprocessing import OrdinalEncoder
stays_code_dropped = stays_code.dropna(subset=[variable]).drop_duplicates(["stay_id", variable]).reset_index(drop=True)
stays_code_dropped = stays_code_dropped[["stay_id", "gsn", variable]] \
    .rename(columns={variable:"code"})
stays_code_dropped["code"] = stays_code_dropped["code"].astype("int").astype("str")
# Create the encoder
encoder = OrdinalEncoder().fit(stays_code_dropped[["code"]])
# Encode the codes as integer ids used by the embedding layer
stays_code_dropped["code_id"] = encoder.transform(stays_code_dropped[["code"]]).astype("int32")
# All ordered pairs of distinct drugs recorded within the same stay
pair_matrix = pd.merge(
    stays_code_dropped[["stay_id", "gsn", "code_id"]],
    stays_code_dropped[["stay_id", "gsn", "code_id"]],
    on="stay_id"
).query("gsn_x != gsn_y")[["code_id_x", "code_id_y"]]

# Empirical co-occurrence probability: for each code_id_x, the share of its
# within-stay pairs that involve code_id_y (used as the training target)
pair_matrix_probability = pair_matrix.assign(n=1).groupby(["code_id_x", "code_id_y"]).sum() \
           .reset_index() \
           .join(
               pair_matrix.assign(n_total=1).groupby("code_id_x")["n_total"].sum(),
               on="code_id_x"
           ) \
           .assign(prob=lambda x: x["n"]/x["n_total"])[["code_id_x", "code_id_y", "prob"]] \
           .values
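Since n_total counts all pairs for a given source code, the target probabilities over its co-occurring codes should sum to one. A minimal check (not in the original notebook):

# Hypothetical sanity check: per-source probabilities should sum to ~1
check = pd.DataFrame(pair_matrix_probability, columns=["code_id_x", "code_id_y", "prob"])
print(check.groupby("code_id_x")["prob"].sum().describe())  # expect values ~= 1.0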
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
class EmbeddingTrainer(nn.Module):
    def __init__(self, embedding_size=100):
        super().__init__()

        # The last index is reserved for the padding token
        self.embeddings = nn.Embedding(num_embeddings=encoder.categories_[0].shape[0]+1, embedding_dim=embedding_size)


        self.network = nn.Sequential(*[
            nn.Linear(embedding_size, 50),
            nn.ReLU(),
            nn.Linear(50, 200),
            nn.ReLU()
        ])

        self.proba = nn.Sequential(*[
            nn.Linear(400, 200),
            nn.ReLU(),
            nn.Linear(200,50),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.ReLU(),
            nn.Linear(10,1),
            nn.Sigmoid()
        ])

        self.loss = nn.BCELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=5e-5)

    def forward(self, x):

        word_1 = x[:,0]
        word_2 = x[:,1]

        embedding_1 = self.network(self.embeddings(word_1))
        embedding_2 = self.network(self.embeddings(word_2))

        merged_data = torch.cat([embedding_1, embedding_2], dim=1)

        y_hat = self.proba(merged_data)

        return y_hat
    
    def fit(self, x, y):

        self.train()

        self.optimizer.zero_grad()

        y_hat = self.forward(x)
        loss = self.loss(y_hat, y)

        loss.backward()

        self.optimizer.step()

        loss_detach = loss.detach().cpu()

        return loss_detach
    
    def predict(self, x):

        self.eval()
        with torch.no_grad():

            y_hat = self.forward(x)

        return y_hat
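Before training, a quick shape check on a random dummy batch can be run on CPU (a minimal sketch, not part of the original notebook):

# Hypothetical shape check: two code ids in, one probability out per row
dummy = torch.randint(0, encoder.categories_[0].shape[0], (4, 2))
print(EmbeddingTrainer(embedding_size=100)(dummy).shape)  # expected: torch.Size([4, 1])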
# The probability matrix (a numpy array) is iterated directly by the DataLoader;
# each batch is a float64 tensor of shape (batch_size, 3): code_id_x, code_id_y, prob
loader = DataLoader(pair_matrix_probability, shuffle=True, batch_size=1000)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
embedding_trainer = EmbeddingTrainer(embedding_size=100)
embedding_trainer = embedding_trainer.to(device)
n_epoch = 10

n_print_epoch = 10
n_print_batch = 1000

for i in range(n_epoch):
    losses = []

    j = 0
    for x in loader:
        # Each batch row is (code_id_x, code_id_y, prob)
        x_batch = x[:, [0, 1]].int().to(device)
        y_batch = x[:, 2].float().unsqueeze(dim=1).to(device)

        loss = embedding_trainer.fit(x_batch, y_batch)
        losses.append(loss)

        if j%n_print_batch == 0:
            loss_mean = np.array(losses).mean()
            print(f"Epoch {i} - Batch {j} - Loss : {loss_mean}")

        j += 1

    if i%n_print_epoch == 0:
        loss_mean = np.array(losses).mean()
        print(f"Epoch {i} - Loss : {loss_mean}")
Epoch 0 - Batch 0 - Loss : 0.7970905303955078
Epoch 0 - Loss : 0.45808276534080505
Epoch 1 - Batch 0 - Loss : 0.0421447679400444
Epoch 2 - Batch 0 - Loss : 0.025797121226787567
Epoch 3 - Batch 0 - Loss : 0.027662230655550957
Epoch 4 - Batch 0 - Loss : 0.02129991166293621
Epoch 5 - Batch 0 - Loss : 0.02649623528122902
Epoch 6 - Batch 0 - Loss : 0.025592397898435593
Epoch 7 - Batch 0 - Loss : 0.02580280229449272
Epoch 8 - Batch 0 - Loss : 0.0239135529845953
Epoch 9 - Batch 0 - Loss : 0.025206178426742554
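Once trained, the learned vectors can be inspected, for instance by looking at the nearest neighbours of a code in cosine similarity (a minimal sketch, not part of the original notebook; the last embedding row is the padding slot and is dropped):

# Hypothetical inspection of the learned embedding space
weights = embedding_trainer.embeddings.weight.detach().cpu()[:-1]   # drop the pad row
weights = torch.nn.functional.normalize(weights, dim=1)
sims = weights @ weights.T
code_index = 0                                       # any encoded code id
top = sims[code_index].topk(6).indices[1:]           # 5 nearest neighbours, skipping the code itself
print(encoder.categories_[0][top.numpy()])           # back to the original codes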

Export

import pickle

Encoder

with open(f"./models/{variable}_encoder.model","wb") as f:
    pickle.dump(encoder, f)

Embedding model

with open(f"./models/{variable}_embedding.model","wb") as f:
    torch.save(embedding_trainer.embeddings, f)
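
To reuse the artifacts later (a minimal sketch, assuming the same ./models paths), the encoder and the embedding layer can be reloaded and applied to new codes:

# Hypothetical reload of the exported artifacts
with open(f"./models/{variable}_encoder.model", "rb") as f:
    loaded_encoder = pickle.load(f)
with open(f"./models/{variable}_embedding.model", "rb") as f:
    # recent PyTorch versions may also require weights_only=False here
    loaded_embeddings = torch.load(f, map_location="cpu")

new_codes = stays_code_dropped[["code"]].head(3)             # any dataframe with a "code" column
ids = torch.tensor(loaded_encoder.transform(new_codes)).long().squeeze(1)
vectors = loaded_embeddings(ids)                             # shape: (3, embedding_size)
print(vectors.shape)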