preprocessing.py 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. """
  2. This script contains the preprocessing functions
  3. """
  4. import sqlite3
  5. import pandas as pd
  6. import numpy as np
  7. from sklearn.base import BaseEstimator, TransformerMixin
  8. def get_Xy_df (X, y):
  9. """
  10. Merge together the X and y dataframe on stay_id basis
  11. Parameters
  12. ----------
  13. X: pandas dataframe of features
  14. y: pandas dataframe of labeles
  15. Output
  16. ------
  17. Xy : merged pandas dataframe
  18. """
  19. Xy = pd.merge(
  20. X,
  21. y,
  22. left_on="stay_id",
  23. right_on="stay_id"
  24. )
  25. return Xy
  26. def generate_features_dataset(database, get_drugs=True, get_diseases=True):
  27. """
  28. Generate features dataset according to the data
  29. Parameters
  30. ----------
  31. database: str, path of the database sqlite file
  32. get_drugs: boolean, if true the drug history is returned,
  33. get_diseases: boolean, if true the disease history is returned
  34. """
  35. to_merge = []
  36. # Sqlite connection
  37. conn = sqlite3.connect("./data/mimic-iv.sqlite")
  38. ## Getting the features
  39. features = pd.read_sql(f"""
  40. SELECT
  41. s.stay_id,
  42. s.intime intime,
  43. p.gender gender,
  44. p.anchor_age age,
  45. t.temperature,
  46. t.heartrate,
  47. t.resprate,
  48. t.o2sat,
  49. t.sbp,
  50. t.dbp,
  51. t.pain,
  52. t.chiefcomplaint
  53. FROM edstays s
  54. LEFT JOIN patients p
  55. ON p.subject_id = s.subject_id
  56. LEFT Join triage t
  57. ON t.stay_id = s.stay_id
  58. """, conn)
  59. ## Additional features
  60. ### Last visit
  61. last_visit = pd.read_sql(f"""
  62. SELECT DISTINCT
  63. s1.stay_id,
  64. CAST(MAX((julianday(s1.intime)-julianday(s2.intime))) <= 7 AS INT) last_7,
  65. CAST(MAX((julianday(s1.intime)-julianday(s2.intime))) <= 30 AS INT) last_30
  66. FROM edstays s1
  67. INNER JOIN edstays s2
  68. ON s1.subject_id = s2.subject_id
  69. AND s1.stay_id != s2.stay_id
  70. AND s1.intime >= s2.intime
  71. WHERE (julianday(s1.intime)-julianday(s2.intime)) <= 30
  72. GROUP BY s1.stay_id
  73. """, conn)
  74. to_merge.append(last_visit)
  75. ### Past diagnosis
  76. if get_diseases:
  77. past_diagnosis = pd.read_sql(f"""
  78. SELECT
  79. s1.stay_id,
  80. d.icd_code,
  81. d.icd_version,
  82. COUNT(1) n
  83. FROM edstays s1
  84. INNER JOIN diagnosis d
  85. ON d.subject_id = s1.subject_id
  86. INNER JOIN edstays s2
  87. ON d.stay_id = s2.stay_id
  88. WHERE
  89. s1.intime >= s2.intime
  90. AND s1.stay_id != s2.stay_id
  91. GROUP BY
  92. s1.stay_id,
  93. d.icd_code,
  94. d.icd_version
  95. """, conn)
  96. past_diagnosis = pd.pivot_table(
  97. past_diagnosis.groupby(["stay_id","icd_version"])["icd_code"].agg(lambda x: x.tolist()) \
  98. .reset_index(),
  99. index="stay_id",
  100. columns="icd_version",
  101. values="icd_code",
  102. aggfunc=lambda x: x
  103. ).reset_index().rename(columns={
  104. 9:"icd9",
  105. 10:"icd10"
  106. })
  107. to_merge.append(past_diagnosis)
  108. ### Drugs
  109. if get_drugs:
  110. drugs = pd.read_sql(f"""
  111. SELECT stay_id, gsn, 1 n
  112. FROM medrecon
  113. """, conn)
  114. drugs = drugs.groupby("stay_id")["gsn"].agg(lambda x: x.tolist()).reset_index()
  115. to_merge.append(drugs)
  116. ### Merging all together
  117. for df_to_merge in to_merge:
  118. features = pd.merge(
  119. features,
  120. df_to_merge,
  121. left_on="stay_id",
  122. right_on="stay_id",
  123. how="left"
  124. )
  125. features = features.sort_values("stay_id").reset_index(drop=True)
  126. return features
  127. def generate_labels_dataset(database, lab_dictionnary):
  128. """
  129. Generate features dataset according to the data
  130. Parameters
  131. ----------
  132. database: str, path of the database sqlite file
  133. lab_dictionnary: dictionnary containing the id (keys) and label (value) of the biological exams to predict
  134. """
  135. to_merge = []
  136. # Sqlite connection
  137. conn = sqlite3.connect("./data/mimic-iv.sqlite")
  138. # Getting biological values
  139. lab_dictionnary_pd = pd.DataFrame.from_dict(lab_dictionnary, orient="index").reset_index()
  140. lab_dictionnary_list = [str(x) for x in lab_dictionnary.keys()]
  141. ## Let's create an index to speed up queries
  142. curs = conn.cursor()
  143. curs.execute("CREATE INDEX IF NOT EXISTS biological_index ON labevents (stay_id, itemid)")
  144. curs.close()
  145. # 1. Generating features
  146. ## Getting list of stay_id
  147. stays = pd.read_sql(
  148. "SELECT DISTINCT stay_id FROM edstays",
  149. conn
  150. )
  151. ## Getting the features
  152. labs = pd.read_sql(f"""
  153. SELECT
  154. le.stay_id,
  155. le.itemid item_id
  156. FROM labevents le
  157. WHERE le.itemid IN ('{"','".join(lab_dictionnary_list)}')
  158. GROUP BY
  159. le.stay_id,
  160. le.itemid
  161. """, conn)
  162. labs_deduplicate = pd.merge(
  163. lab_dictionnary_pd.rename(columns={0:"label"}),
  164. labs,
  165. left_on="index",
  166. right_on="item_id"
  167. ) \
  168. .drop_duplicates(["stay_id", "label"])[["stay_id","label"]] \
  169. .reset_index(drop=True)
  170. labs_deduplicate_pivot = pd.pivot_table(
  171. labs_deduplicate.assign(value=1),
  172. index="stay_id",
  173. columns="label",
  174. values="value"
  175. ).fillna(0)
  176. labs_deduplicate_pivot_final = labs_deduplicate_pivot.join(
  177. stays[["stay_id"]].set_index("stay_id"),
  178. how="right"
  179. ).fillna(0).astype("int8").reset_index()
  180. labels = labs_deduplicate_pivot_final.sort_values("stay_id").reset_index(drop=True)
  181. return labels
  182. def remove_outliers (X, variables_ranges):
  183. """
  184. This function remove the outliers and replace them by an NA according to the variable_ranges rules
  185. Parameters
  186. ----------
  187. X: pandas Dataframe
  188. variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionnary containing for each variable the inferior and superior range
  189. Outputs
  190. -------
  191. Tuple containing :
  192. - Processing dataframe
  193. - A Dataframe containing the number and percentage of processed outliers per variable
  194. """
  195. outliers = {}
  196. X_copy = X.copy()
  197. for key, value in variables_ranges.items():
  198. outliers_mask = ((X[key] < value[0]) | (X[key] > value[1]))
  199. outliers[key] = outliers_mask.sum() # Storing the number of outliers
  200. X_copy.loc[outliers_mask, key] = np.NaN # Setting outliers to NA
  201. outlier_report = pd.DataFrame.from_dict(outliers, orient="index") \
  202. .rename(columns={0:"n"}) \
  203. .assign(total=X[outliers.keys()].count().values,
  204. pourcentage=lambda x: (x["n"]/x["total"])*100
  205. )
  206. return X_copy, outlier_report
  207. class OutlierRemover(BaseEstimator, TransformerMixin):
  208. """
  209. Sklearn-like class for removing outliers
  210. To be included in the pipeline
  211. """
  212. def __init__ (self, variables_ranges):
  213. """
  214. Parameters:
  215. ----------
  216. variables_ranges: Dict(variable:[range_inf, range_sup], ...), dictionnary containing for each variable the inferior and superior range
  217. """
  218. super().__init__()
  219. # Storing ranges rules
  220. self.variables_ranges = variables_ranges
  221. def fit(self, X=None, y=None):
  222. return self
  223. def transform(self, X):
  224. X_copy, _ = remove_outliers(X, self.variables_ranges)
  225. return X_copy
  226. class TextPreprocessing(BaseEstimator, TransformerMixin):
  227. """
  228. Sklearn-like class for text preprocessing
  229. To be included in the pipeline
  230. What does it do :
  231. - It fills na with empty string
  232. - It lower string
  233. - It replace comma by space
  234. """
  235. def __init__ (self):
  236. """
  237. Parameters:
  238. ----------
  239. """
  240. super().__init__()
  241. def fit(self, X=None, y=None):
  242. return self
  243. def transform(self, X):
  244. colname = X.name
  245. X = X \
  246. .fillna("") \
  247. .replace(",", " ").str.lower()
  248. return X