"""
This script contains the preprocessing functions
"""
import sqlite3
import pandas as pd
import numpy as np
  7. def get_Xy_df (X, y):
  8. """
  9. Merge together the X and y dataframe on stay_id basis
  10. Parameters
  11. ----------
  12. X: pandas dataframe of features
  13. y: pandas dataframe of labeles
  14. Output
  15. ------
  16. Xy : merged pandas dataframe
  17. """
  18. Xy = pd.merge(
  19. X,
  20. y,
  21. left_on="stay_id",
  22. right_on="stay_id"
  23. )
  24. return Xy
  25. def generate_features_dataset(database, get_drugs=True, get_diseases=True):
  26. """
  27. Generate features dataset according to the data
  28. Parameters
  29. ----------
  30. database: str, path of the database sqlite file
  31. get_drugs: boolean, if true the drug history is returned,
  32. get_diseases: boolean, if true the disease history is returned
  33. """
  34. to_merge = []
  35. # Sqlite connection
  36. conn = sqlite3.connect("./data/mimic-iv.sqlite")
  37. ## Getting the features
  38. features = pd.read_sql(f"""
  39. SELECT
  40. s.stay_id,
  41. s.intime intime,
  42. p.gender gender,
  43. p.anchor_age age,
  44. t.temperature,
  45. t.heartrate,
  46. t.resprate,
  47. t.o2sat,
  48. t.sbp,
  49. t.dbp,
  50. t.pain,
  51. t.chiefcomplaint
  52. FROM edstays s
  53. LEFT JOIN patients p
  54. ON p.subject_id = s.subject_id
  55. LEFT Join triage t
  56. ON t.stay_id = s.stay_id
  57. """, conn)
  58. ## Additional features
  59. ### Last visit
  60. last_visit = pd.read_sql(f"""
  61. SELECT DISTINCT
  62. s1.stay_id,
  63. CAST(MAX((julianday(s1.intime)-julianday(s2.intime))) <= 7 AS INT) last_7,
  64. CAST(MAX((julianday(s1.intime)-julianday(s2.intime))) <= 30 AS INT) last_30
  65. FROM edstays s1
  66. INNER JOIN edstays s2
  67. ON s1.subject_id = s2.subject_id
  68. AND s1.stay_id != s2.stay_id
  69. AND s1.intime >= s2.intime
  70. WHERE (julianday(s1.intime)-julianday(s2.intime)) <= 30
  71. GROUP BY s1.stay_id
  72. """, conn)
  73. to_merge.append(last_visit)
  74. ### Past diagnosis
  75. if get_diseases:
  76. past_diagnosis = pd.read_sql(f"""
  77. SELECT
  78. s1.stay_id,
  79. d.icd_code,
  80. d.icd_version,
  81. COUNT(1) n
  82. FROM edstays s1
  83. INNER JOIN diagnosis d
  84. ON d.subject_id = s1.subject_id
  85. INNER JOIN edstays s2
  86. ON d.stay_id = s2.stay_id
  87. WHERE
  88. s1.intime >= s2.intime
  89. AND s1.stay_id != s2.stay_id
  90. GROUP BY
  91. s1.stay_id,
  92. d.icd_code,
  93. d.icd_version
  94. """, conn)
  95. past_diagnosis = pd.pivot_table(
  96. past_diagnosis.groupby(["stay_id","icd_version"])["icd_code"].agg(lambda x: x.tolist()) \
  97. .reset_index(),
  98. index="stay_id",
  99. columns="icd_version",
  100. values="icd_code",
  101. aggfunc=lambda x: x
  102. ).reset_index().rename(columns={
  103. 9:"icd9",
  104. 10:"icd10"
  105. })
  106. to_merge.append(past_diagnosis)
  107. ### Drugs
  108. if get_drugs:
  109. drugs = pd.read_sql(f"""
  110. SELECT stay_id, gsn, 1 n
  111. FROM medrecon
  112. """, conn)
  113. drugs = drugs.groupby("stay_id")["gsn"].agg(lambda x: x.tolist()).reset_index()
  114. to_merge.append(drugs)
  115. ### Merging all together
  116. for df_to_merge in to_merge:
  117. features = pd.merge(
  118. features,
  119. df_to_merge,
  120. left_on="stay_id",
  121. right_on="stay_id",
  122. how="left"
  123. )
  124. features = features.sort_values("stay_id").reset_index(drop=True)
  125. return features
  126. def generate_labels_dataset(database, lab_dictionnary):
  127. """
  128. Generate features dataset according to the data
  129. Parameters
  130. ----------
  131. database: str, path of the database sqlite file
  132. lab_dictionnary: dictionnary containing the id (keys) and label (value) of the biological exams to predict
  133. """
  134. to_merge = []
  135. # Sqlite connection
  136. conn = sqlite3.connect("./data/mimic-iv.sqlite")
  137. # Getting biological values
  138. lab_dictionnary_pd = pd.DataFrame.from_dict(lab_dictionnary, orient="index").reset_index()
  139. lab_dictionnary_list = [str(x) for x in lab_dictionnary.keys()]
  140. ## Let's create an index to speed up queries
  141. conn.execute("CREATE INDEX IF NOT EXISTS biological_index ON labevents (stay_id, itemid)")
  142. # 1. Generating features
  143. ## Getting list of stay_id
  144. stays = pd.read_sql(
  145. "SELECT DISTINCT stay_id FROM edstays",
  146. conn
  147. )
  148. ## Getting the features
  149. labs = pd.read_sql(f"""
  150. SELECT
  151. le.stay_id,
  152. le.itemid item_id
  153. FROM labevents le
  154. WHERE le.itemid IN ('{"','".join(lab_dictionnary_list)}')
  155. GROUP BY
  156. le.stay_id,
  157. le.itemid
  158. """, conn)
  159. labs_deduplicate = pd.merge(
  160. lab_dictionnary_pd.rename(columns={0:"label"}),
  161. labs,
  162. left_on="index",
  163. right_on="item_id"
  164. ) \
  165. .drop_duplicates(["stay_id", "label"])[["stay_id","label"]] \
  166. .reset_index(drop=True)
  167. labs_deduplicate_pivot = pd.pivot_table(
  168. labs_deduplicate.assign(value=1),
  169. index="stay_id",
  170. columns="label",
  171. values="value"
  172. ).fillna(0)
  173. labs_deduplicate_pivot_final = labs_deduplicate_pivot.join(
  174. stays[["stay_id"]].set_index("stay_id"),
  175. how="right"
  176. ).fillna(0).astype("int8").reset_index()
  177. labels = labs_deduplicate_pivot_final.sort_values("stay_id").reset_index(drop=True)
  178. return labels
  179. def remove_outliers (X, variables_ranges):
  180. """
  181. This function remove the outliers and replace them by an NA according to the variable_ranges rules
  182. Parameters
  183. ----------
  184. X: pandas Dataframe
  185. variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionnary containing for each variable the inferior and superior range
  186. Outputs
  187. -------
  188. Tuple containing :
  189. - Processing dataframe
  190. - A Dataframe containing the number and percentage of processed outliers per variable
  191. """
  192. outliers = {}
  193. X_copy = X.copy()
  194. for key, value in variables_ranges.items():
  195. outliers_mask = ((X[key] < value[0]) | (X[key] > value[1]))
  196. outliers[key] = outliers_mask.sum() # Storing the number of outliers
  197. X_copy.loc[outliers_mask, key] = np.NaN # Setting outliers to NA
  198. outlier_report = pd.DataFrame.from_dict(outliers, orient="index") \
  199. .rename(columns={0:"n"}) \
  200. .assign(total=X[outliers.keys()].count().values,
  201. pourcentage=lambda x: (x["n"]/x["total"])*100
  202. )
  203. return X_copy, outlier_report