# preprocessing.py
"""
This script contains the preprocessing functions.
"""
import sqlite3

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
  8. def get_Xy_df (X, y):
  9. """
  10. Merge together the X and y dataframe on stay_id basis
  11. Parameters
  12. ----------
  13. X: pandas dataframe of features
  14. y: pandas dataframe of labeles
  15. Output
  16. ------
  17. Xy : merged pandas dataframe
  18. """
  19. Xy = pd.merge(
  20. X,
  21. y,
  22. left_on="stay_id",
  23. right_on="stay_id"
  24. )
  25. return Xy
  26. def generate_features_dataset(database, get_drugs=True, get_diseases=True):
  27. """
  28. Generate features dataset according to the data
  29. Parameters
  30. ----------
  31. database: str, path of the database sqlite file
  32. get_drugs: boolean, if true the drug history is returned,
  33. get_diseases: boolean, if true the disease history is returned
  34. """
  35. to_merge = []
  36. # Sqlite connection
  37. conn = sqlite3.connect("./data/mimic-iv.sqlite")
  38. ## Getting the features
  39. features = pd.read_sql(f"""
  40. SELECT
  41. s.stay_id,
  42. s.intime intime,
  43. p.gender gender,
  44. p.anchor_age age,
  45. t.temperature,
  46. t.heartrate,
  47. t.resprate,
  48. t.o2sat,
  49. t.sbp,
  50. t.dbp,
  51. t.pain,
  52. t.chiefcomplaint
  53. FROM edstays s
  54. LEFT JOIN patients p
  55. ON p.subject_id = s.subject_id
  56. LEFT Join triage t
  57. ON t.stay_id = s.stay_id
  58. """, conn)
  59. ## Additional features
  60. ### Last visit
  61. last_visit = pd.read_sql(f"""
  62. SELECT DISTINCT
  63. s1.stay_id,
  64. CAST(MAX((julianday(s1.intime)-julianday(s2.intime))) <= 7 AS INT) last_7,
  65. CAST(MAX((julianday(s1.intime)-julianday(s2.intime))) <= 30 AS INT) last_30
  66. FROM edstays s1
  67. INNER JOIN edstays s2
  68. ON s1.subject_id = s2.subject_id
  69. AND s1.stay_id != s2.stay_id
  70. AND s1.intime >= s2.intime
  71. WHERE (julianday(s1.intime)-julianday(s2.intime)) <= 30
  72. GROUP BY s1.stay_id
  73. """, conn)
  74. to_merge.append(last_visit)
  75. ### Past diagnosis
  76. if get_diseases:
  77. past_diagnosis = pd.read_sql(f"""
  78. SELECT
  79. s1.stay_id,
  80. d.icd_code,
  81. d.icd_version,
  82. COUNT(1) n
  83. FROM edstays s1
  84. INNER JOIN diagnosis d
  85. ON d.subject_id = s1.subject_id
  86. INNER JOIN edstays s2
  87. ON d.stay_id = s2.stay_id
  88. WHERE
  89. s1.intime >= s2.intime
  90. AND s1.stay_id != s2.stay_id
  91. GROUP BY
  92. s1.stay_id,
  93. d.icd_code,
  94. d.icd_version
  95. """, conn)
  96. past_diagnosis = pd.pivot_table(
  97. past_diagnosis.groupby(["stay_id","icd_version"])["icd_code"].agg(lambda x: x.tolist()) \
  98. .reset_index(),
  99. index="stay_id",
  100. columns="icd_version",
  101. values="icd_code",
  102. aggfunc=lambda x: x
  103. ).reset_index().rename(columns={
  104. 9:"icd9",
  105. 10:"icd10"
  106. })
  107. to_merge.append(past_diagnosis)
  108. ### Drugs
  109. if get_drugs:
  110. drugs = pd.read_sql(f"""
  111. SELECT stay_id, gsn, 1 n
  112. FROM medrecon
  113. """, conn)
  114. drugs = drugs.groupby("stay_id")["gsn"].agg(lambda x: x.tolist()).reset_index()
  115. to_merge.append(drugs)
  116. ### Merging all together
  117. for df_to_merge in to_merge:
  118. features = pd.merge(
  119. features,
  120. df_to_merge,
  121. left_on="stay_id",
  122. right_on="stay_id",
  123. how="left"
  124. )
  125. features = features.sort_values("stay_id").reset_index(drop=True)
  126. return features
  127. def generate_labels_dataset(database, lab_dictionnary):
  128. """
  129. Generate features dataset according to the data
  130. Parameters
  131. ----------
  132. database: str, path of the database sqlite file
  133. lab_dictionnary: dictionnary containing the id (keys) and label (value) of the biological exams to predict
  134. """
  135. to_merge = []
  136. # Sqlite connection
  137. conn = sqlite3.connect("./data/mimic-iv.sqlite")
  138. # Getting biological values
  139. lab_dictionnary_pd = pd.DataFrame.from_dict(lab_dictionnary, orient="index").reset_index()
  140. lab_dictionnary_list = [str(x) for x in lab_dictionnary.keys()]
  141. ## Let's create an index to speed up queries
  142. conn.execute("CREATE INDEX IF NOT EXISTS biological_index ON labevents (stay_id, itemid)")
  143. # 1. Generating features
  144. ## Getting list of stay_id
  145. stays = pd.read_sql(
  146. "SELECT DISTINCT stay_id FROM edstays",
  147. conn
  148. )
  149. ## Getting the features
  150. labs = pd.read_sql(f"""
  151. SELECT
  152. le.stay_id,
  153. le.itemid item_id
  154. FROM labevents le
  155. WHERE le.itemid IN ('{"','".join(lab_dictionnary_list)}')
  156. GROUP BY
  157. le.stay_id,
  158. le.itemid
  159. """, conn)
  160. labs_deduplicate = pd.merge(
  161. lab_dictionnary_pd.rename(columns={0:"label"}),
  162. labs,
  163. left_on="index",
  164. right_on="item_id"
  165. ) \
  166. .drop_duplicates(["stay_id", "label"])[["stay_id","label"]] \
  167. .reset_index(drop=True)
  168. labs_deduplicate_pivot = pd.pivot_table(
  169. labs_deduplicate.assign(value=1),
  170. index="stay_id",
  171. columns="label",
  172. values="value"
  173. ).fillna(0)
  174. labs_deduplicate_pivot_final = labs_deduplicate_pivot.join(
  175. stays[["stay_id"]].set_index("stay_id"),
  176. how="right"
  177. ).fillna(0).astype("int8").reset_index()
  178. labels = labs_deduplicate_pivot_final.sort_values("stay_id").reset_index(drop=True)
  179. return labels
  180. def remove_outliers (X, variables_ranges):
  181. """
  182. This function remove the outliers and replace them by an NA according to the variable_ranges rules
  183. Parameters
  184. ----------
  185. X: pandas Dataframe
  186. variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionnary containing for each variable the inferior and superior range
  187. Outputs
  188. -------
  189. Tuple containing :
  190. - Processing dataframe
  191. - A Dataframe containing the number and percentage of processed outliers per variable
  192. """
  193. outliers = {}
  194. X_copy = X.copy()
  195. for key, value in variables_ranges.items():
  196. outliers_mask = ((X[key] < value[0]) | (X[key] > value[1]))
  197. outliers[key] = outliers_mask.sum() # Storing the number of outliers
  198. X_copy.loc[outliers_mask, key] = np.NaN # Setting outliers to NA
  199. outlier_report = pd.DataFrame.from_dict(outliers, orient="index") \
  200. .rename(columns={0:"n"}) \
  201. .assign(total=X[outliers.keys()].count().values,
  202. pourcentage=lambda x: (x["n"]/x["total"])*100
  203. )
  204. return X_copy, outlier_report
  205. class OutlierRemover(BaseEstimator, TransformerMixin):
  206. """
  207. Sklearn-like class for removing outliers
  208. To be included in the pipeline
  209. """
  210. def __init__ (self, variables_ranges):
  211. """
  212. Parameters:
  213. ----------
  214. variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionnary containing for each variable the inferior and superior range
  215. """
  216. super().__init__()
  217. # Storing ranges rules
  218. self.variables_ranges = variables_ranges
  219. def fit(self, X=None, y=None):
  220. return self
  221. def transform(self, X):
  222. X_copy, _ = remove_outliers(X, self.variables_ranges)
  223. return X_copy
  224. class TextPreprocessing(BaseEstimator, TransformerMixin):
  225. """
  226. Sklearn-like class for text preprocessing
  227. To be included in the pipeline
  228. What does it do :
  229. - It fills na with empty string
  230. - It lower string
  231. - It replace comma by space
  232. """
  233. def __init__ (self):
  234. """
  235. Parameters:
  236. ----------
  237. """
  238. super().__init__()
  239. def fit(self, X=None, y=None):
  240. return self
  241. def transform(self, X):
  242. colname = X.name
  243. X = X \
  244. .fillna("") \
  245. .replace(",", " ").str.lower()
  246. return X