# preprocessing.py
  1. """
  2. This script contains the preprocessing functions
  3. """
  4. import sqlite3
  5. import pandas as pd
  6. def get_Xy_df (X, y):
  7. """
  8. Merge together the X and y dataframe on stay_id basis
  9. Parameters
  10. ----------
  11. X: pandas dataframe of features
  12. y: pandas dataframe of labeles
  13. Output
  14. ------
  15. Xy : merged pandas dataframe
  16. """
  17. Xy = pd.merge(
  18. X,
  19. y,
  20. left_on="stay_id",
  21. right_on="stay_id"
  22. )
  23. return Xy
  24. def generate_features_dataset(database, get_drugs=True, get_diseases=True):
  25. """
  26. Generate features dataset according to the data
  27. Parameters
  28. ----------
  29. database: str, path of the database sqlite file
  30. get_drugs: boolean, if true the drug history is returned,
  31. get_diseases: boolean, if true the disease history is returned
  32. """
  33. to_merge = []
  34. # Sqlite connection
  35. conn = sqlite3.connect("./data/mimic-iv.sqlite")
  36. ## Getting the features
  37. features = pd.read_sql(f"""
  38. SELECT
  39. s.stay_id,
  40. s.intime intime,
  41. p.gender gender,
  42. p.anchor_age age,
  43. t.temperature,
  44. t.heartrate,
  45. t.resprate,
  46. t.o2sat,
  47. t.sbp,
  48. t.dbp,
  49. t.pain,
  50. t.chiefcomplaint
  51. FROM edstays s
  52. LEFT JOIN patients p
  53. ON p.subject_id = s.subject_id
  54. LEFT Join triage t
  55. ON t.stay_id = s.stay_id
  56. """, conn)
  57. ## Additional features
  58. ### Last visit
  59. last_visit = pd.read_sql(f"""
  60. SELECT DISTINCT
  61. s1.stay_id,
  62. CAST(MAX((julianday(s1.intime)-julianday(s2.intime))) <= 7 AS INT) last_7,
  63. CAST(MAX((julianday(s1.intime)-julianday(s2.intime))) <= 30 AS INT) last_30
  64. FROM edstays s1
  65. INNER JOIN edstays s2
  66. ON s1.subject_id = s2.subject_id
  67. AND s1.stay_id != s2.stay_id
  68. AND s1.intime >= s2.intime
  69. WHERE (julianday(s1.intime)-julianday(s2.intime)) <= 30
  70. GROUP BY s1.stay_id
  71. """, conn)
  72. to_merge.append(last_visit)
  73. ### Past diagnosis
  74. if get_diseases:
  75. past_diagnosis = pd.read_sql(f"""
  76. SELECT
  77. s1.stay_id,
  78. d.icd_code,
  79. d.icd_version,
  80. COUNT(1) n
  81. FROM edstays s1
  82. INNER JOIN diagnosis d
  83. ON d.subject_id = s1.subject_id
  84. INNER JOIN edstays s2
  85. ON d.stay_id = s2.stay_id
  86. WHERE
  87. s1.intime >= s2.intime
  88. AND s1.stay_id != s2.stay_id
  89. GROUP BY
  90. s1.stay_id,
  91. d.icd_code,
  92. d.icd_version
  93. """, conn)
  94. past_diagnosis = pd.pivot_table(
  95. past_diagnosis.groupby(["stay_id","icd_version"])["icd_code"].agg(lambda x: x.tolist()) \
  96. .reset_index(),
  97. index="stay_id",
  98. columns="icd_version",
  99. values="icd_code",
  100. aggfunc=lambda x: x
  101. ).reset_index().rename(columns={
  102. 9:"icd9",
  103. 10:"icd10"
  104. })
  105. to_merge.append(past_diagnosis)
  106. ### Drugs
  107. if get_drugs:
  108. drugs = pd.read_sql(f"""
  109. SELECT stay_id, gsn, 1 n
  110. FROM medrecon
  111. """, conn)
  112. drugs = drugs.groupby("stay_id")["gsn"].agg(lambda x: x.tolist()).reset_index()
  113. to_merge.append(drugs)
  114. ### Merging all together
  115. for df_to_merge in to_merge:
  116. features = pd.merge(
  117. features,
  118. df_to_merge,
  119. left_on="stay_id",
  120. right_on="stay_id",
  121. how="left"
  122. )
  123. features = features.sort_values("stay_id").reset_index(drop=True)
  124. return features
  125. def generate_labels_dataset(database, lab_dictionnary):
  126. """
  127. Generate features dataset according to the data
  128. Parameters
  129. ----------
  130. database: str, path of the database sqlite file
  131. lab_dictionnary: dictionnary containing the id (keys) and label (value) of the biological exams to predict
  132. """
  133. to_merge = []
  134. # Sqlite connection
  135. conn = sqlite3.connect("./data/mimic-iv.sqlite")
  136. # Getting biological values
  137. lab_dictionnary_pd = pd.DataFrame.from_dict(lab_dictionnary, orient="index").reset_index()
  138. lab_dictionnary_list = [str(x) for x in lab_dictionnary.keys()]
  139. ## Let's create an index to speed up queries
  140. conn.execute("CREATE INDEX IF NOT EXISTS biological_index ON labevents (stay_id, itemid)")
  141. # 1. Generating features
  142. ## Getting list of stay_id
  143. stays = pd.read_sql(
  144. "SELECT DISTINCT stay_id FROM edstays",
  145. conn
  146. )
  147. ## Getting the features
  148. labs = pd.read_sql(f"""
  149. SELECT
  150. le.stay_id,
  151. le.itemid item_id
  152. FROM labevents le
  153. WHERE le.itemid IN ('{"','".join(lab_dictionnary_list)}')
  154. GROUP BY
  155. le.stay_id,
  156. le.itemid
  157. """, conn)
  158. labs_deduplicate = pd.merge(
  159. lab_dictionnary_pd.rename(columns={0:"label"}),
  160. labs,
  161. left_on="index",
  162. right_on="item_id"
  163. ) \
  164. .drop_duplicates(["stay_id", "label"])[["stay_id","label"]] \
  165. .reset_index(drop=True)
  166. labs_deduplicate_pivot = pd.pivot_table(
  167. labs_deduplicate.assign(value=1),
  168. index="stay_id",
  169. columns="label",
  170. values="value"
  171. ).fillna(0)
  172. labs_deduplicate_pivot_final = labs_deduplicate_pivot.join(
  173. stays[["stay_id"]].set_index("stay_id"),
  174. how="right"
  175. ).fillna(0).astype("int8").reset_index()
  176. labels = labs_deduplicate_pivot_final.sort_values("stay_id").reset_index(drop=True)
  177. return labels
  178. def remove_outliers (X, variables_ranges):
  179. """
  180. This function remove the outliers and replace them by an NA according to the variable_ranges rules
  181. Parameters
  182. ----------
  183. X: pandas Dataframe
  184. variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionnary containing for each variable the inferior and superior range
  185. Outputs
  186. -------
  187. Tuple containing :
  188. - Processing dataframe
  189. - A Dataframe containing the number and percentage of processed outliers per variable
  190. """
  191. outliers = {}
  192. X_copy = X.copy()
  193. for key, value in variables_ranges.items():
  194. outliers_mask = ((X[key] < value[0]) | (X[key] > value[1]))
  195. outliers[key] = outliers_mask.sum() # Storing the number of outliers
  196. X_copy.loc[outliers_mask, key] = pd.NA # Setting outliers to NA
  197. outlier_report = pd.DataFrame.from_dict(outliers, orient="index") \
  198. .rename(columns={0:"n"}) \
  199. .assign(total=X[outliers.keys()].count().values,
  200. pourcentage=lambda x: (x["n"]/x["total"])*100
  201. )
  202. return X_copy, outlier_report