preprocessing.py

  1. """
  2. This script the preprocessing functions
  3. """
  4. import sqlite3
  5. import pandas as pd
  6. def generate_features_dataset(database, get_drugs=True, get_diseases=True):
  7. """
  8. Generate features dataset according to the data
  9. Parameters
  10. ----------
  11. database: str, path of the database sqlite file
  12. get_drugs: boolean, if true the drug history is returned,
  13. get_diseases: boolean, if true the disease history is returned
  14. """
    to_merge = []

    # Sqlite connection
    conn = sqlite3.connect(database)

    ## Getting the features
  19. features = pd.read_sql(f"""
  20. SELECT
  21. s.stay_id,
  22. s.intime intime,
  23. p.gender gender,
  24. p.anchor_age age,
  25. t.temperature,
  26. t.heartrate,
  27. t.resprate,
  28. t.o2sat,
  29. t.sbp,
  30. t.dbp,
  31. t.pain,
  32. t.chiefcomplaint
  33. FROM edstays s
  34. LEFT JOIN patients p
  35. ON p.subject_id = s.subject_id
  36. LEFT Join triage t
  37. ON t.stay_id = s.stay_id
  38. """, conn)

    ## Additional features
    ### Last visit
    last_visit = pd.read_sql("""
        SELECT DISTINCT
            s1.stay_id,
            CAST(MAX((julianday(s1.intime) - julianday(s2.intime))) <= 7 AS INT) last_7,
            CAST(MAX((julianday(s1.intime) - julianday(s2.intime))) <= 30 AS INT) last_30
        FROM edstays s1
        INNER JOIN edstays s2
            ON s1.subject_id = s2.subject_id
            AND s1.stay_id != s2.stay_id
            AND s1.intime >= s2.intime
        WHERE (julianday(s1.intime) - julianday(s2.intime)) <= 30
        GROUP BY s1.stay_id
    """, conn)
    to_merge.append(last_visit)

    ### Past diagnosis
    if get_diseases:
        past_diagnosis = pd.read_sql("""
            SELECT
                s1.stay_id,
                d.icd_code,
                d.icd_version,
                COUNT(1) n
            FROM edstays s1
            INNER JOIN diagnosis d
                ON d.subject_id = s1.subject_id
            INNER JOIN edstays s2
                ON d.stay_id = s2.stay_id
            WHERE
                s1.intime >= s2.intime
                AND s1.stay_id != s2.stay_id
            GROUP BY
                s1.stay_id,
                d.icd_code,
                d.icd_version
        """, conn)

        # One list of past ICD codes per stay, split into icd9 / icd10 columns
        past_diagnosis = pd.pivot_table(
            past_diagnosis.groupby(["stay_id", "icd_version"])["icd_code"]
                          .agg(lambda x: x.tolist())
                          .reset_index(),
            index="stay_id",
            columns="icd_version",
            values="icd_code",
            aggfunc=lambda x: x
        ).reset_index().rename(columns={
            9: "icd9",
            10: "icd10"
        })
        to_merge.append(past_diagnosis)

    ### Drugs
    if get_drugs:
        drugs = pd.read_sql("""
            SELECT stay_id, gsn, 1 n
            FROM medrecon
        """, conn)
        # One list of GSN drug codes per stay
        drugs = drugs.groupby("stay_id")["gsn"].agg(lambda x: x.tolist()).reset_index()
        to_merge.append(drugs)

    ### Merging all together
    for df_to_merge in to_merge:
        features = pd.merge(
            features,
            df_to_merge,
            on="stay_id",
            how="left"
        )

    features = features.sort_values("stay_id").reset_index(drop=True)

    return features
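
# A minimal usage sketch for generate_features_dataset, kept as a comment so the
# module stays import-safe. It assumes the MIMIC-IV sqlite extract lives at
# ./data/mimic-iv.sqlite; adjust the path to your own setup.
#
#     features = generate_features_dataset("./data/mimic-iv.sqlite",
#                                           get_drugs=True, get_diseases=True)
#     # -> one row per ED stay: triage vitals, demographics, last_7/last_30 visit
#     #    flags, plus list-valued icd9/icd10 and gsn columns when history is kept.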


def generate_labels_dataset(database, lab_dictionnary):
    """
    Generate the labels dataset from the database.

    Parameters
    ----------
    database: str, path of the sqlite database file
    lab_dictionnary: dict mapping the item id (key) to the label (value) of each biological exam to predict
    """
    to_merge = []

    # Sqlite connection
    conn = sqlite3.connect(database)

    # Getting biological values
    lab_dictionnary_pd = pd.DataFrame.from_dict(lab_dictionnary, orient="index").reset_index()
    lab_dictionnary_list = [str(x) for x in lab_dictionnary.keys()]

    ## Let's create an index to speed up queries
    conn.execute("CREATE INDEX IF NOT EXISTS biological_index ON labevents (stay_id, itemid)")

    # 1. Generating the labels
    ## Getting the list of stay_id
    stays = pd.read_sql(
        "SELECT DISTINCT stay_id FROM edstays",
        conn
    )

    ## Getting the lab events of interest
    labs = pd.read_sql(f"""
        SELECT
            le.stay_id,
            le.itemid item_id
        FROM labevents le
        WHERE le.itemid IN ('{"','".join(lab_dictionnary_list)}')
        GROUP BY
            le.stay_id,
            le.itemid
    """, conn)

    ## Mapping each itemid to its label, one row per (stay_id, label)
    labs_deduplicate = (
        pd.merge(
            lab_dictionnary_pd.rename(columns={0: "label"}),
            labs,
            left_on="index",
            right_on="item_id"
        )
        .drop_duplicates(["stay_id", "label"])[["stay_id", "label"]]
        .reset_index(drop=True)
    )

    ## One binary column per label
    labs_deduplicate_pivot = pd.pivot_table(
        labs_deduplicate.assign(value=1),
        index="stay_id",
        columns="label",
        values="value"
    ).fillna(0)

    ## Keeping every stay, with 0 when none of the exams of interest was measured
    labs_deduplicate_pivot_final = labs_deduplicate_pivot.join(
        stays[["stay_id"]].set_index("stay_id"),
        how="right"
    ).fillna(0).astype("int8").reset_index()

    labels = labs_deduplicate_pivot_final.sort_values("stay_id").reset_index(drop=True)

    return labels
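

if __name__ == "__main__":
    # Minimal end-to-end sketch, not part of the original pipeline. It assumes
    # the MIMIC-IV sqlite extract sits at ./data/mimic-iv.sqlite, and the two
    # itemids below are placeholders: pick the real ones from the d_labitems
    # table of your own extract.
    db_path = "./data/mimic-iv.sqlite"

    # itemid -> human-readable label of each biological exam to predict
    example_lab_dictionnary = {
        51221: "hematocrit",   # placeholder itemid
        50912: "creatinine",   # placeholder itemid
    }

    features = generate_features_dataset(db_path, get_drugs=True, get_diseases=True)
    labels = generate_labels_dataset(db_path, example_lab_dictionnary)

    # Both frames are sorted by stay_id, so they can be merged on stay_id
    # before training a model.
    dataset = features.merge(labels, on="stay_id", how="inner")
    print(dataset.shape)
    print(dataset.head())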