# pandasToBrat.py
  1. import re
  2. import os
  3. import pandas as pd
  4. import numpy as np
  5. def _getDictionnaryKeys(dictionnary):
  6. """
  7. Function that get keys from a dict object and flatten sub dict.
  8. """
  9. keys_array = []
  10. for key in dictionnary.keys():
  11. keys_array.append(key)
  12. if (type(dictionnary[key]) == type({})):
  13. keys_array = keys_array+_getDictionnaryKeys(dictionnary[key])
  14. return(keys_array)
  15. class pandasToBrat:
  16. """
  17. Class for Pandas brat folder management.
  18. For each brat folder, there is an instance of pandasToBrat.
  19. It supports importation and exportation of configurations for relations and entities.
  20. Documents importation and exportation.
  21. Annotations and entities importation and exportation.
  22. Inputs :
  23. folder, str : path of brat folder
  24. """
  25. def __init__(self, folder):
  26. self.folder = folder
  27. self.conf_file = 'annotation.conf'
  28. self.emptyDFCols = {
  29. "annotations":["id","type_id", "word", "label", "start", "end"],
  30. "relations":["id","type_id","relation","Arg1","Arg2"]
  31. }
  32. # Adding '/' to folder path if missing
  33. if(self.folder[-1] != '/'):
  34. self.folder += '/'
  35. # Creating folder if do not exist
  36. if (os.path.isdir(self.folder)) == False:
  37. os.mkdir(self.folder)
  38. # Loading conf file if exists | creating empty conf file if not
  39. self.read_conf()
  40. def _emptyData(self):
  41. fileList = self._getFileList()
  42. nb_files = fileList.shape[0]
  43. confirmation = input("Deleting all data ({} files), press y to confirm :".format(nb_files))
  44. if confirmation == 'y':
  45. fileList["filename"].apply(lambda x: os.remove(self.folder+x))
  46. print("{} files deleted.".format(nb_files))
  47. def _generateEntitiesStr (self, conf, data = '', level = 0):
  48. if (type(conf) != type({})):
  49. return data
  50. # Parsing keys
  51. for key in conf.keys():
  52. value = conf[key]
  53. if value == True:
  54. data += '\n'+level*'\t'+key
  55. elif value == False:
  56. data += '\n'+level*'\t'+'!'+key
  57. elif type(value) == type({}):
  58. data += '\n'+level*'\t'+key
  59. data = self._generateEntitiesStr(value, data, level+1)
  60. return data
  61. def _writeEntitiesLevel (self, conf, data, last_n = -1):
  62. for n in range(last_n,len(conf)):
  63. # If empty : pass, if not the last line : pass
  64. if (conf[n] != '' and n > last_n):
  65. level = len(conf[n].split("\t"))-1
  66. if (n+1 <= len(conf)): # Level of next item
  67. next_level = len(conf[n+1].split("\t"))-1
  68. else:
  69. next_level = level
  70. splitted_str = conf[n].split("\t")
  71. str_clean = splitted_str[len(splitted_str)-1]
  72. if (level >= next_level): # On écrit les lignes de même niveau
  73. if (str_clean[0] == '!'):
  74. data[str_clean[1:]] = False
  75. else:
  76. data[str_clean] = True
  77. if (level > next_level):
  78. # On casse la boucle
  79. break
  80. elif (level < next_level): # On écrit les lignes inférieurs par récurence
  81. splitted_str = conf[n].split("\t")
  82. last_n, data[str_clean] = self._writeEntitiesLevel(conf, {}, n)
  83. return(n, data)
  84. def _readRelations(self, relations, entities = []):
  85. data = {}
  86. for relation in relations.split("\n"):
  87. if relation != '':
  88. relation_data = relation.split("\t")[0]
  89. args = list(map(lambda x: x.split(":")[1], relation.split("\t")[1].split(", ")))
  90. args_valid = list(filter(lambda x: x in entities, args))
  91. if (len(args_valid) > 0):
  92. data[relation_data] = {"args":args_valid}
  93. return data
  94. def _writeRelations(self, relations, entities = []):
  95. data = ''
  96. for relation in relations:
  97. args_array = list(filter(lambda x: x in entities, relations[relation]["args"]))
  98. if (len(args_array) > 0):
  99. data += '\n'+relation+'\t'
  100. for n in range(0, len(args_array)):
  101. data += int(bool(n))*', '+'Arg'+str(n+1)+':'+args_array[n]
  102. return data
  103. def read_conf (self):
  104. """
  105. Get the current Brat configuration.
  106. Output :
  107. Dict containing "entities" and "relations" configurations.
  108. """
  109. if (os.path.isfile(self.folder+self.conf_file)):
  110. # Reading file
  111. file = open(self.folder+self.conf_file)
  112. conf_str = file.read()
  113. file.close()
  114. # Splitting conf_str
  115. conf_data = re.split(re.compile(r"\[[a-zA-Z]+\]", re.DOTALL), conf_str)[1:]
  116. data = {}
  117. # Reading enteties
  118. data["entities"] = self._writeEntitiesLevel(conf_data[0].split("\n"), {})[1]
  119. # Reading relations
  120. entitiesKeys = _getDictionnaryKeys(data["entities"])
  121. data["relations"] = self._readRelations(conf_data[1], entitiesKeys)
  122. return(data)
  123. else:
  124. self.write_conf()
  125. self.read_conf()
  126. def write_conf(self, entities = {}, relations = {}, events = {}, attributes = {}):
  127. """
  128. Write or overwrite configuration file.
  129. It actually doesn't suppport events and attributes configuration data.
  130. inputs :
  131. entities, dict : dict containing the entities. If an entities do have children, his value is an other dict, otherwise, it is set as True.
  132. relations, dict : dict containing the relations between entities, each key is a relation name, the value is a dict with a "args" key containing the list of related entities.
  133. """
  134. # TODO : Add events and attributes support.
  135. conf_str = ''
  136. # Entities
  137. conf_str += '\n\n[entities]'
  138. conf_str += self._generateEntitiesStr(entities)
  139. # relations
  140. conf_str += '\n\n[relations]'
  141. entitiesKeys = _getDictionnaryKeys(entities)
  142. conf_str += self._writeRelations(relations, entitiesKeys)
  143. # attributes
  144. conf_str += '\n\n[attributes]'
  145. # events
  146. conf_str += '\n\n[events]'
  147. # Write conf file
  148. file = open(self.folder+self.conf_file,'w')
  149. file.write(conf_str)
  150. file.close()
  151. def _getFileList(self):
  152. # Listing files
  153. filesDF = pd.DataFrame({'filename':pd.Series(os.listdir(self.folder))})
  154. filesDFSplitted = filesDF["filename"].str.split(".", expand = True)
  155. filesDF["id"] = filesDFSplitted[0]
  156. filesDF["filetype"] = filesDFSplitted[1]
  157. filesDF = filesDF[filesDF["filetype"].isin(["txt","ann"])]
  158. return(filesDF)
  159. def _parseData(self):
  160. # Listing files
  161. filesDF = self._getFileList()
  162. # Getting data from txt and ann
  163. filesDF_txt = filesDF.rename(columns = {"filename":"text_data"}).loc[filesDF["filetype"] == "txt", ["id","text_data"]]
  164. filesDF_ann = filesDF.rename(columns = {"filename":"annotation"}).loc[filesDF["filetype"] == "ann", ["id","annotation"]]
  165. dataDF = filesDF_txt.join(filesDF_ann.set_index("id"), on = "id")
  166. dataDF["text_data"] = dataDF["text_data"].apply(lambda x: open(self.folder+x).read())
  167. dataDF["annotation"] = dataDF["annotation"].apply(lambda x: open(self.folder+x).read())
  168. return(dataDF)
  169. def read_text(self):
  170. """
  171. read_text
  172. Get a pandas DataFrame containing the brat documents.
  173. Input : None
  174. Output : Pandas dataframe
  175. """
  176. dataDF = self._parseData()
  177. return(dataDF[["id","text_data"]])
  178. def read_annotation(self, ids = []):
  179. """
  180. read_annotation
  181. Get annotations from the brat folder.
  182. You can get specific annotation by filtering by id.
  183. input :
  184. ids, list (optionnal) : list of id for which you want the annotation data, if empty all annotations are returned.
  185. output :
  186. dict containing an annotations and relations data.
  187. """
  188. data = {}
  189. data["annotations"] = pd.DataFrame(columns=self.emptyDFCols["annotations"])
  190. data["relations"] = pd.DataFrame(columns=self.emptyDFCols["relations"])
  191. dataDF = self._parseData()[["id","annotation"]]
  192. dataDF = dataDF[(dataDF["annotation"].isna() == False) & (dataDF["annotation"] != '')] # Removing empty annotation
  193. # Filtering by ids
  194. if (len(ids) > 0):
  195. dataDF = dataDF[dataDF["id"].isin(pd.Series(ids).astype(str))]
  196. if (dataDF.shape[0] > 0):
  197. # Ann data to pandas
  198. dataDF = dataDF.join(dataDF["annotation"].str.split("\n").apply(pd.Series).stack().reset_index(level = 0).set_index("level_0")).reset_index(drop = True).drop("annotation", axis = 1).rename(columns = {0: "annotation"})
  199. dataDF = dataDF[dataDF["annotation"].str.len() > 0].reset_index(drop = True)
  200. dataDF = dataDF.join(dataDF["annotation"].str.split("\t", expand = True).rename(columns = {0: 'type_id', 1: 'data', 2: 'word'})).drop("annotation", axis = 1)
  201. dataDF["type"] = dataDF["type_id"].str.slice(0,1)
  202. ## Annotations
  203. data["annotations"] = dataDF[dataDF["type"] == 'T']
  204. if (data["annotations"].shape[0] > 0):
  205. data["annotations"] = data["annotations"].join(data["annotations"]["data"].str.split(" ", expand = True).rename(columns = {0: "label", 1: "start", 2: "end"})).drop(columns = ["data","type"])
  206. ## Relations
  207. data["relations"] = dataDF[dataDF["type"] == 'R']
  208. if (data["relations"].shape[0] > 0):
  209. tmp_splitted = data["relations"]["data"].str.split(" ", expand = True).rename(columns = {0: "relation"})
  210. ### Col names
  211. rename_dict = dict(zip(list(tmp_splitted.columns.values[1:]), list("Arg"+tmp_splitted.columns.values[1:].astype(str).astype(object))))
  212. tmp_splitted = tmp_splitted.rename(columns = rename_dict)
  213. ### Merging data
  214. tmp_splitted = tmp_splitted[["relation"]].join(tmp_splitted.loc[:,tmp_splitted.columns[tmp_splitted.columns != 'relation']].applymap(lambda x: x.split(":")[1]))
  215. data["relations"] = data["relations"].join(tmp_splitted).drop(columns = ["data","type","word"])
  216. return(data)
  217. def _write_function(self, x, filetype = "txt", overwrite = False):
  218. filenames = []
  219. if (filetype == 'txt' or filetype == 'both'):
  220. filenames.append(self.folder+str(x["filename"])+'.txt')
  221. if (filetype == 'ann' or filetype == 'both'):
  222. filenames.append(self.folder+str(x["filename"])+'.ann')
  223. for filename in filenames:
  224. try:
  225. open(str(filename), "r")
  226. is_file = True
  227. except FileNotFoundError:
  228. is_file = False
  229. if ((is_file == False) or (overwrite == True)):
  230. file = open(str(filename), "w")
  231. file.write(x["content"])
  232. file.close()
  233. def write_text(self, text_id, text, empty = False, overWriteAnnotations = False):
  234. """
  235. write_text
  236. Send text data from the brat folder.
  237. input :
  238. text_id, pd.Series : pandas series containing documents ids
  239. text, pd.Series : pandas series containing documents text in the same order as text_id
  240. empty, boolean : if True the brat folder is emptyied of all but configuration data (text and ann files) before writting
  241. overwriteAnnotations, boolean : if True, the current annotation files are replaced by blank one
  242. """
  243. if overWriteAnnotations == True: # On controle la façon dont la variable est écrite
  244. overwriteAnn = True
  245. else:
  246. overwriteAnn = False
  247. if (type(text) == type(pd.Series()) and type(text_id) == type(pd.Series()) and text.shape[0] == text_id.shape[0]):
  248. # ID check : check should be smaller than text : check if not inverted
  249. if (text_id.astype(str).str.len().max() < text.astype(str).str.len().max()):
  250. # empty : option to erase existing data
  251. if (empty):
  252. self._emptyData()
  253. # Writting data
  254. print("Writting data")
  255. df_text = pd.DataFrame({"filename":text_id, "content":text})
  256. df_ann = pd.DataFrame({"filename":text_id, "content":""})
  257. df_text.apply(lambda x: self._write_function(x, filetype = "txt", overwrite = True), axis = 1)
  258. df_ann.apply(lambda x: self._write_function(x, filetype = "ann", overwrite = overwriteAnn), axis = 1)
  259. print("data written.")
  260. else:
  261. raise ValueError('ID is larger than text, maybe you inverted them.')
  262. else:
  263. raise ValueError('Incorrect variable type, expected two Pandas Series of same shape.')
  264. def write_annotations(self, df, text_id, word, label, start, end, overwrite = False):
  265. """
  266. write_annotations
  267. Send annotation data from the brat folder. Useful to pre-anotate some data.
  268. input :
  269. df, pd.Dataframe : dataframe containing annotations data, should contains the text id, the annotated word, the annotated label, the start and end offset.
  270. text_id, str : name of the column in df which contains the document id
  271. word, str : name of the column in df which contains the annotated word
  272. label, str : name of the column in df which contains the label of the annotated word
  273. start, str : name of the column in df which contains the start offset
  274. end, str : name of the column in df which contains the end offset
  275. overwrite, boolean : if True, the current annotation files are replaced by new data, otherwise, the new annotations are merged with existing one
  276. """
  277. # Checking data types
  278. if (type(df) == type(pd.DataFrame())):
  279. # Loading df
  280. df = df.rename(columns = {text_id:"id",word:"word",label:"label",start:"start",end:"end"})
  281. df["type_id"] = df.groupby("id").cumcount()+1
  282. # List of ids
  283. ids = df["id"].unique()
  284. # Loading current data
  285. current_annotation = self.read_annotation(ids)
  286. current_annotations = current_annotation["annotations"]
  287. tmaxDFAnnotations = current_annotations.set_index(["id"])["type_id"].str.slice(1,).astype(int).reset_index().groupby("id").max().rename(columns = {"type_id":"Tmax"})
  288. if (overwrite == True):
  289. df["type_id"] = "T"+df["type_id"].astype(str)
  290. new_annotations = df
  291. else:
  292. df = df.join(tmaxDFAnnotations, on = "id").fillna(0)
  293. df["type_id"] = "T"+(df["type_id"]+df["Tmax"]).astype(int).astype(str)
  294. df = df.drop(columns = ["Tmax"])
  295. new_annotations = pd.concat((current_annotations, df[self.emptyDFCols["annotations"]])).reset_index(drop = True)
  296. new_annotations.drop_duplicates() ## Removing duplicates
  297. # Injecting new annotations
  298. current_annotation["annotations"] = new_annotations
  299. # Calling write function
  300. self._write_annotation(current_annotation["annotations"], current_annotation["relations"])
  301. else:
  302. raise ValueError('Incorrect variable type, expected a Pandas DF.')
  303. def write_relations(self, df, text_id, relation, overwrite = False):
  304. # Checking data types
  305. if (type(df) == type(pd.DataFrame())):
  306. # Loading df
  307. df = df.rename(columns = {text_id:"id",relation:"relation"})
  308. df["type_id"] = df.groupby("id").cumcount()+1 # type_id
  309. # Columns names
  310. old_columns = df.columns[np.isin(df.columns, ["id", "relation","type_id"]) == False]
  311. new_columns = "Arg"+np.array(list(range(1,len(old_columns)+1))).astype(str).astype(object)
  312. df = df.rename(columns = dict(zip(old_columns, new_columns)))
  313. # List of ids
  314. ids = df["id"].unique()
  315. # Loading current data
  316. current_annotation = self.read_annotation(ids)
  317. current_relations = current_annotation["relations"]
  318. rmaxDFrelations = current_relations.set_index(["id"])["type_id"].str.slice(1,).astype(int).reset_index().groupby("id").max().rename(columns = {"type_id":"Rmax"})
  319. if (overwrite == True):
  320. df["type_id"] = "R"+df["type_id"].astype(str)
  321. new_relations = df
  322. else:
  323. df = df.join(rmaxDFrelations, on = "id").fillna(0)
  324. df["type_id"] = "R"+(df["type_id"]+df["Rmax"]).astype(int).astype(str)
  325. df = df.drop(columns = ["Rmax"])
  326. # Adding missing columns
  327. if (len(df.columns) > len(current_relations.columns)):
  328. for column in df.columns[np.isin(df.columns, current_relations.columns) == False]:
  329. current_relations[column] = np.nan
  330. else:
  331. for column in current_relations.columns[np.isin(current_relations.columns, df.columns) == False]:
  332. df[column] = np.nan
  333. new_relations = pd.concat((current_relations, df[current_relations.columns])).reset_index(drop = True)
  334. new_relations.drop_duplicates() ## Removing duplicates
  335. # Injecting new annotations
  336. current_annotation["relations"] = new_relations
  337. # Calling write function
  338. self._write_annotation(current_annotation["annotations"], current_annotation["relations"])
  339. else:
  340. raise ValueError('Incorrect variable type, expected a Pandas DF.')
  341. def _generate_annotations_str (self, annotations):
  342. annotations = annotations.reset_index(drop = True)
  343. annotations["label_span"] = annotations[["label","start","end"]].apply(lambda x: ' '.join(x.astype(str).values), axis = 1)
  344. annotations_str = '\n'.join(annotations[["type_id","label_span","word"]].apply(lambda x: '\t'.join(x.astype(str).values), axis = 1).values)
  345. return(annotations_str)
  346. def _generate_relations_str (self, relations):
  347. relations = relations.fillna('').applymap(lambda x: '' if x == 'nan' else x) #cleaning data
  348. columns = relations.columns[np.isin(relations.columns, ["id","type_id","relation"]) == False].values.tolist()
  349. boolmap = relations[columns].transpose().applymap(lambda x: int(x != ''))
  350. rct = relations[columns].transpose()
  351. temp_relations = (boolmap*(np.array(np.repeat(rct.index,rct.shape[1])).reshape(rct.shape)+':')+rct.astype(str)).transpose()
  352. relations_str = '\n'.join(relations[["type_id","relation"]].join(temp_relations[columns]).apply(lambda x: '\t'.join(x.values), axis = 1).values)
  353. return(relations_str)
  354. def _write_file(self, data):
  355. file = open(self.folder+str(data["id"])+".ann", "w")
  356. file.write(data["str_to_write"])
  357. file.close()
  358. def _write_annotation(self,annotations,relations):
  359. # Checking data types
  360. if (type(annotations) == type(pd.DataFrame()) and type(relations) == type(pd.DataFrame())):
  361. # Gerenating str
  362. data_annotations = annotations.groupby("id").agg(lambda x: self._generate_annotations_str(x)).iloc[:,0]
  363. data_relations = relations.groupby("id").agg(lambda x: self._generate_relations_str(x)).iloc[:,0]
  364. # Merging data
  365. data = pd.DataFrame({"annotations":data_annotations, "relations":data_relations}).fillna('')
  366. data["str_to_write"] = data.apply(lambda x : '\n'.join(x.values), axis = 1)
  367. data = data.reset_index().rename(columns = {"index":"id"})
  368. # Writting files
  369. data.apply(self._write_file, axis = 1)
  370. return(data)
  371. else:
  372. raise ValueError('Incorrect variable type, expected a Pandas DF.')