# pandasToBrat.py — Pandas-based management of a brat annotation folder.
import os
import re

import numpy as np
import pandas as pd
  5. def _getDictionnaryKeys(dictionnary):
  6. """
  7. Function that get keys from a dict object and flatten sub dict.
  8. """
  9. keys_array = []
  10. for key in dictionnary.keys():
  11. keys_array.append(key)
  12. if (type(dictionnary[key]) == type({})):
  13. keys_array = keys_array+_getDictionnaryKeys(dictionnary[key])
  14. return(keys_array)
  15. class pandasToBrat:
  16. """
  17. Class for Pandas brat folder management.
  18. For each brat folder, there is an instance of pandasToBrat.
  19. It supports importation and exportation of configurations for relations and entities.
  20. Documents importation and exportation.
  21. Annotations and entities importation and exportation.
  22. Inputs :
  23. folder, str : path of brat folder
  24. """
  25. def __init__(self, folder):
  26. self.folder = folder
  27. self.conf_file = 'annotation.conf'
  28. self.emptyDFCols = {
  29. "annotations":["id","type_id", "word", "label", "start", "end"],
  30. "relations":["id","type_id","relation","Arg1","Arg2"]
  31. }
  32. # Adding '/' to folder path if missing
  33. if(self.folder[-1] != '/'):
  34. self.folder += '/'
  35. # Creating folder if do not exist
  36. if (os.path.isdir(self.folder)) == False:
  37. os.mkdir(self.folder)
  38. # Loading conf file if exists | creating empty conf file if not
  39. self.read_conf()
  40. def _emptyData(self):
  41. fileList = self._getFileList()
  42. nb_files = fileList.shape[0]
  43. confirmation = input("Deleting all data ({} files), press y to confirm :".format(nb_files))
  44. if confirmation == 'y':
  45. fileList["filename"].apply(lambda x: os.remove(self.folder+x))
  46. print("{} files deleted.".format(nb_files))
  47. def _generateEntitiesStr (self, conf, data = '', level = 0):
  48. if (type(conf) != type({})):
  49. return data
  50. # Parsing keys
  51. for key in conf.keys():
  52. value = conf[key]
  53. if value == True:
  54. data += '\n'+level*'\t'+key
  55. elif value == False:
  56. data += '\n'+level*'\t'+'!'+key
  57. elif type(value) == type({}):
  58. data += '\n'+level*'\t'+key
  59. data = self._generateEntitiesStr(value, data, level+1)
  60. return data
  61. def _writeEntitiesLevel (self, conf, data, last_n = -1):
  62. for n in range(last_n,len(conf)):
  63. # If empty : pass, if not the last line : pass
  64. if (conf[n] != '' and n > last_n):
  65. level = len(conf[n].split("\t"))-1
  66. if (n+1 <= len(conf)): # Level of next item
  67. next_level = len(conf[n+1].split("\t"))-1
  68. else:
  69. next_level = level
  70. splitted_str = conf[n].split("\t")
  71. str_clean = splitted_str[len(splitted_str)-1]
  72. if (level >= next_level): # On écrit les lignes de même niveau
  73. if (str_clean[0] == '!'):
  74. data[str_clean[1:]] = False
  75. else:
  76. data[str_clean] = True
  77. if (level > next_level):
  78. # On casse la boucle
  79. break
  80. elif (level < next_level): # On écrit les lignes inférieurs par récurence
  81. splitted_str = conf[n].split("\t")
  82. last_n, data[str_clean] = self._writeEntitiesLevel(conf, {}, n)
  83. return(n, data)
  84. def _readRelations(self, relations, entities = []):
  85. data = {}
  86. for relation in relations.split("\n"):
  87. if relation != '':
  88. relation_data = relation.split("\t")[0]
  89. args = list(map(lambda x: x.split(":")[1], relation.split("\t")[1].split(", ")))
  90. args_valid = list(filter(lambda x: x in entities, args))
  91. if (len(args_valid) > 0):
  92. data[relation_data] = {"args":args_valid}
  93. return data
  94. def _writeRelations(self, relations, entities = []):
  95. data = ''
  96. for relation in relations:
  97. args_array = list(filter(lambda x: x in entities, relations[relation]["args"]))
  98. if (len(args_array) > 0):
  99. data += '\n'+relation+'\t'
  100. for n in range(0, len(args_array)):
  101. data += int(bool(n))*', '+'Arg'+str(n+1)+':'+args_array[n]
  102. return data
  103. def read_conf (self):
  104. """
  105. Get the current Brat configuration.
  106. Output :
  107. Dict containing "entities" and "relations" configurations.
  108. """
  109. if (os.path.isfile(self.folder+self.conf_file)):
  110. # Reading file
  111. file = open(self.folder+self.conf_file)
  112. conf_str = file.read()
  113. file.close()
  114. # Splitting conf_str
  115. conf_data = re.split(re.compile(r"\[[a-zA-Z]+\]", re.DOTALL), conf_str)[1:]
  116. data = {}
  117. # Reading enteties
  118. data["entities"] = self._writeEntitiesLevel(conf_data[0].split("\n"), {})[1]
  119. # Reading relations
  120. entitiesKeys = _getDictionnaryKeys(data["entities"])
  121. data["relations"] = self._readRelations(conf_data[1], entitiesKeys)
  122. return(data)
  123. else:
  124. self.write_conf()
  125. self.read_conf()
  126. def write_conf(self, entities = {}, relations = {}, events = {}, attributes = {}):
  127. """
  128. Write or overwrite configuration file.
  129. It actually doesn't suppport events and attributes configuration data.
  130. inputs :
  131. entities, dict : dict containing the entities. If an entities do have children, his value is an other dict, otherwise, it is set as True.
  132. relations, dict : dict containing the relations between entities, each key is a relation name, the value is a dict with a "args" key containing the list of related entities.
  133. """
  134. # TODO : Add events and attributes support.
  135. conf_str = ''
  136. # Entities
  137. conf_str += '\n\n[entities]'
  138. conf_str += self._generateEntitiesStr(entities)
  139. # relations
  140. conf_str += '\n\n[relations]'
  141. entitiesKeys = _getDictionnaryKeys(entities)
  142. conf_str += self._writeRelations(relations, entitiesKeys)
  143. # attributes
  144. conf_str += '\n\n[attributes]'
  145. # events
  146. conf_str += '\n\n[events]'
  147. # Write conf file
  148. file = open(self.folder+self.conf_file,'w')
  149. file.write(conf_str)
  150. file.close()
  151. def _getFileList(self):
  152. # Listing files
  153. filesDF = pd.DataFrame({'filename':pd.Series(os.listdir(self.folder))})
  154. filesDFSplitted = filesDF["filename"].str.split(".", expand = True)
  155. filesDF["id"] = filesDFSplitted[0]
  156. filesDF["filetype"] = filesDFSplitted[1]
  157. filesDF = filesDF[filesDF["filetype"].isin(["txt","ann"])]
  158. return(filesDF)
  159. def _parseData(self):
  160. # Listing files
  161. filesDF = self._getFileList()
  162. # Getting data from txt and ann
  163. filesDF_txt = filesDF.rename(columns = {"filename":"text_data"}).loc[filesDF["filetype"] == "txt", ["id","text_data"]]
  164. filesDF_ann = filesDF.rename(columns = {"filename":"annotation"}).loc[filesDF["filetype"] == "ann", ["id","annotation"]]
  165. dataDF = filesDF_txt.join(filesDF_ann.set_index("id"), on = "id")
  166. dataDF["text_data"] = dataDF["text_data"].apply(lambda x: open(self.folder+x).read())
  167. dataDF["annotation"] = dataDF["annotation"].apply(lambda x: open(self.folder+x).read())
  168. return(dataDF)
  169. def read_text(self):
  170. """
  171. read_text
  172. Get a pandas DataFrame containing the brat documents.
  173. Input : None
  174. Output : Pandas dataframe
  175. """
  176. dataDF = self._parseData()
  177. return(dataDF[["id","text_data"]])
  178. def read_annotation(self, ids = []):
  179. """
  180. read_annotation
  181. Get annotations from the brat folder.
  182. You can get specific annotation by filtering by id.
  183. input :
  184. ids, list (optionnal) : list of id for which you want the annotation data, if empty all annotations are returned.
  185. output :
  186. dict containing an annotations and relations data.
  187. """
  188. data = {}
  189. data["annotations"] = pd.DataFrame(columns=self.emptyDFCols["annotations"])
  190. data["relations"] = pd.DataFrame(columns=self.emptyDFCols["relations"])
  191. dataDF = self._parseData()[["id","annotation"]]
  192. dataDF = dataDF[(dataDF["annotation"].isna() == False) & (dataDF["annotation"] != '')] # Removing empty annotation
  193. # Filtering by ids
  194. if (len(ids) > 0):
  195. dataDF = dataDF[dataDF["id"].isin(pd.Series(ids).astype(str))]
  196. if (dataDF.shape[0] > 0):
  197. # Ann data to pandas
  198. dataDF = dataDF.join(dataDF["annotation"].str.split("\n").apply(pd.Series).stack().reset_index(level = 0).set_index("level_0")).reset_index(drop = True).drop("annotation", axis = 1).rename(columns = {0: "annotation"})
  199. dataDF = dataDF[dataDF["annotation"].str.len() > 0].reset_index(drop = True)
  200. dataDF = dataDF.join(dataDF["annotation"].str.split("\t", expand = True).rename(columns = {0: 'type_id', 1: 'data', 2: 'word'})).drop("annotation", axis = 1)
  201. dataDF["type"] = dataDF["type_id"].str.slice(0,1)
  202. ## Annotations
  203. data["annotations"] = dataDF[dataDF["type"] == 'T']
  204. if (data["annotations"].shape[0] > 0):
  205. data["annotations"] = data["annotations"].join(data["annotations"]["data"].str.split(" ", expand = True).rename(columns = {0: "label", 1: "start", 2: "end"})).drop(columns = ["data","type"])
  206. ## Relations
  207. data["relations"] = dataDF[dataDF["type"] == 'R']
  208. if (data["relations"].shape[0] > 0):
  209. tmp_splitted = data["relations"]["data"].str.split(" ", expand = True).rename(columns = {0: "relation"})
  210. ### Col names
  211. rename_dict = dict(zip(list(tmp_splitted.columns.values[1:]), list("Arg"+tmp_splitted.columns.values[1:].astype(str).astype(object))))
  212. tmp_splitted = tmp_splitted.rename(columns = rename_dict)
  213. ### Merging data
  214. tmp_splitted = tmp_splitted[["relation"]].join(tmp_splitted.loc[:,tmp_splitted.columns[tmp_splitted.columns != 'relation']].applymap(lambda x: x.split(":")[1]))
  215. data["relations"] = data["relations"].join(tmp_splitted).drop(columns = ["data","type","word"])
  216. return(data)
  217. def _write_function(self, x, filetype = "txt", overwrite = False):
  218. filenames = []
  219. if (filetype == 'txt' or filetype == 'both'):
  220. filenames.append(self.folder+str(x["filename"])+'.txt')
  221. if (filetype == 'ann' or filetype == 'both'):
  222. filenames.append(self.folder+str(x["filename"])+'.ann')
  223. for filename in filenames:
  224. try:
  225. open(str(filename), "r")
  226. is_file = True
  227. except FileNotFoundError:
  228. is_file = False
  229. if ((is_file == False) or (overwrite == True)):
  230. file = open(str(filename), "w")
  231. file.write(x["content"])
  232. file.close()
  233. def write_text(self, text_id, text, empty = False, overWriteAnnotations = False):
  234. """
  235. write_text
  236. Send text data from the brat folder.
  237. input :
  238. text_id, pd.Series : pandas series containing documents ids
  239. text, pd.Series : pandas series containing documents text in the same order as text_id
  240. empty, boolean : if True the brat folder is emptyied of all but configuration data (text and ann files) before writting
  241. overwriteAnnotations, boolean : if True, the current annotation files are replaced by blank one
  242. """
  243. if overWriteAnnotations == True: # On controle la façon dont la variable est écrite
  244. overwriteAnn = True
  245. else:
  246. overwriteAnn = False
  247. if (type(text) == type(pd.Series()) and type(text_id) == type(pd.Series()) and text.shape[0] == text_id.shape[0]):
  248. # ID check : check should be smaller than text : check if not inverted
  249. if (text_id.astype(str).str.len().max() < text.astype(str).str.len().max()):
  250. # empty : option to erase existing data
  251. if (empty):
  252. self._emptyData()
  253. # Writting data
  254. print("Writting data")
  255. df_text = pd.DataFrame({"filename":text_id, "content":text})
  256. df_ann = pd.DataFrame({"filename":text_id, "content":""})
  257. df_text.apply(lambda x: self._write_function(x, filetype = "txt", overwrite = True), axis = 1)
  258. df_ann.apply(lambda x: self._write_function(x, filetype = "ann", overwrite = overwriteAnn), axis = 1)
  259. print("data written.")
  260. else:
  261. raise ValueError('ID is larger than text, maybe you inverted them.')
  262. else:
  263. raise ValueError('Incorrect variable type, expected two Pandas Series of same shape.')
  264. def write_annotations(self, df, text_id, word, label, start, end, overwrite = False):
  265. """
  266. write_annotations
  267. Send annotation data from the brat folder. Useful to pre-anotate some data.
  268. input :
  269. df, pd.Dataframe : dataframe containing annotations data, should contains the text id, the annotated word, the annotated label, the start and end offset.
  270. text_id, str : name of the column in df which contains the document id
  271. word, str : name of the column in df which contains the annotated word
  272. label, str : name of the column in df which contains the label of the annotated word
  273. start, str : name of the column in df which contains the start offset
  274. end, str : name of the column in df which contains the end offset
  275. overwrite, boolean : if True, the current annotation files are replaced by new data, otherwise, the new annotations are merged with existing one
  276. """
  277. # Checking data types
  278. if (type(df) == type(pd.DataFrame())):
  279. # Loading df
  280. df = df.rename(columns = {text_id:"id",word:"word",label:"label",start:"start",end:"end"})
  281. df["type_id"] = df.groupby("id").cumcount()+1
  282. # List of ids
  283. ids = df["id"].unique()
  284. # Loading current data
  285. current_annotation = self.read_annotation(ids)
  286. current_annotations = current_annotation["annotations"]
  287. tmaxDFAnnotations = current_annotations.set_index(["id"])["type_id"].str.slice(1,).astype(int).reset_index().groupby("id").max().rename(columns = {"type_id":"Tmax"})
  288. if (overwrite == True):
  289. df["type_id"] = "T"+df["type_id"].astype(str)
  290. new_annotations = df
  291. else:
  292. df = df.join(tmaxDFAnnotations, on = "id").fillna(0)
  293. df["type_id"] = "T"+(df["type_id"]+df["Tmax"]).astype(int).astype(str)
  294. df = df.drop(columns = ["Tmax"])
  295. new_annotations = pd.concat((current_annotations, df[self.emptyDFCols["annotations"]])).reset_index(drop = True)
  296. new_annotations.drop_duplicates() ## Removing duplicates
  297. # Injecting new annotations
  298. current_annotation["annotations"] = new_annotations
  299. # Calling write function
  300. self._write_annotation(current_annotation["annotations"], current_annotation["relations"])
  301. else:
  302. raise ValueError('Incorrect variable type, expected a Pandas DF.')
  303. def write_relations(self, df, text_id, relation, overwrite = False):
  304. """
  305. write_relations
  306. Send relations data from the brat folder. Useful to pre-anotate some data.
  307. input :
  308. df, pd.Dataframe : dataframe containing relations data, should contains the text id, the relation name, the if of the linked annotations.
  309. text_id, str : name of the column in df which contains the document id
  310. relation, str : name of the column in df which contains the relation name
  311. overwrite, boolean : if True, the current annotation files are replaced by new data, otherwise, the new annotations are merged with existing one
  312. The other columns should contains the type_id of related entities, as outputed by the read_annotation method.
  313. """
  314. # Checking data types
  315. if (type(df) == type(pd.DataFrame())):
  316. # Loading df
  317. df = df.rename(columns = {text_id:"id",relation:"relation"})
  318. df["type_id"] = df.groupby("id").cumcount()+1 # type_id
  319. # Columns names
  320. old_columns = df.columns[np.isin(df.columns, ["id", "relation","type_id"]) == False]
  321. new_columns = "Arg"+np.array(list(range(1,len(old_columns)+1))).astype(str).astype(object)
  322. df = df.rename(columns = dict(zip(old_columns, new_columns)))
  323. # List of ids
  324. ids = df["id"].unique()
  325. # Loading current data
  326. current_annotation = self.read_annotation(ids)
  327. current_relations = current_annotation["relations"]
  328. rmaxDFrelations = current_relations.set_index(["id"])["type_id"].str.slice(1,).astype(int).reset_index().groupby("id").max().rename(columns = {"type_id":"Rmax"})
  329. if (overwrite == True):
  330. df["type_id"] = "R"+df["type_id"].astype(str)
  331. new_relations = df
  332. else:
  333. df = df.join(rmaxDFrelations, on = "id").fillna(0)
  334. df["type_id"] = "R"+(df["type_id"]+df["Rmax"]).astype(int).astype(str)
  335. df = df.drop(columns = ["Rmax"])
  336. # Adding missing columns
  337. if (len(df.columns) > len(current_relations.columns)):
  338. for column in df.columns[np.isin(df.columns, current_relations.columns) == False]:
  339. current_relations[column] = np.nan
  340. else:
  341. for column in current_relations.columns[np.isin(current_relations.columns, df.columns) == False]:
  342. df[column] = np.nan
  343. new_relations = pd.concat((current_relations, df[current_relations.columns])).reset_index(drop = True)
  344. new_relations.drop_duplicates() ## Removing duplicates
  345. # Injecting new annotations
  346. current_annotation["relations"] = new_relations
  347. # Calling write function
  348. self._write_annotation(current_annotation["annotations"], current_annotation["relations"])
  349. else:
  350. raise ValueError('Incorrect variable type, expected a Pandas DF.')
  351. def _generate_annotations_str (self, annotations):
  352. annotations = annotations.reset_index(drop = True)
  353. annotations["label_span"] = annotations[["label","start","end"]].apply(lambda x: ' '.join(x.astype(str).values), axis = 1)
  354. annotations_str = '\n'.join(annotations[["type_id","label_span","word"]].apply(lambda x: '\t'.join(x.astype(str).values), axis = 1).values)
  355. return(annotations_str)
  356. def _generate_relations_str (self, relations):
  357. relations = relations.fillna('').applymap(lambda x: '' if x == 'nan' else x) #cleaning data
  358. columns = relations.columns[np.isin(relations.columns, ["id","type_id","relation"]) == False].values.tolist()
  359. boolmap = relations[columns].transpose().applymap(lambda x: int(x != ''))
  360. rct = relations[columns].transpose()
  361. temp_relations = (boolmap*(np.array(np.repeat(rct.index,rct.shape[1])).reshape(rct.shape)+':')+rct.astype(str)).transpose()
  362. relations_str = '\n'.join(relations[["type_id","relation"]].join(temp_relations[columns]).apply(lambda x: '\t'.join(x.values), axis = 1).values)
  363. return(relations_str)
  364. def _write_file(self, data):
  365. file = open(self.folder+str(data["id"])+".ann", "w")
  366. file.write(data["str_to_write"])
  367. file.close()
  368. def _write_annotation(self,annotations,relations):
  369. # Checking data types
  370. if (type(annotations) == type(pd.DataFrame()) and type(relations) == type(pd.DataFrame())):
  371. # Gerenating str
  372. data_annotations = annotations.groupby("id").agg(lambda x: self._generate_annotations_str(x)).iloc[:,0]
  373. data_relations = relations.groupby("id").agg(lambda x: self._generate_relations_str(x)).iloc[:,0]
  374. # Merging data
  375. data = pd.DataFrame({"annotations":data_annotations, "relations":data_relations}).fillna('')
  376. data["str_to_write"] = data.apply(lambda x : '\n'.join(x.values), axis = 1)
  377. data = data.reset_index().rename(columns = {"index":"id"})
  378. # Writting files
  379. data.apply(self._write_file, axis = 1)
  380. return(data)
  381. else:
  382. raise ValueError('Incorrect variable type, expected a Pandas DF.')