import re
import os

import pandas as pd
import numpy as np

from pandasToBrat.extract_tools import default_tokenizer


def _getDictionnaryKeys(dictionnary):
    """
    Get the keys of a dict object, flattening the keys of any nested sub-dict.
    """
    keys_array = []
    for key in dictionnary.keys():
        keys_array.append(key)
        if isinstance(dictionnary[key], dict):
            keys_array = keys_array + _getDictionnaryKeys(dictionnary[key])
    return keys_array
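
# A small illustration (hypothetical dict) :
#   _getDictionnaryKeys({"a": {"b": True}, "c": True}) returns ["a", "b", "c"]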


class pandasToBrat:
    """
    Class for Pandas brat folder management.
    Each brat folder is managed by one instance of pandasToBrat.

    It supports :
        - import and export of entity and relation configurations,
        - import and export of documents,
        - import and export of annotations and relations.

    Input :
        folder, str : path of the brat folder
    """
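
    # A minimal usage sketch (the folder path is hypothetical) :
    #
    #   brat = pandasToBrat("data/my_brat_project/")
    #   texts = brat.read_text()
    #   annotations = brat.read_annotation()["annotations"]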

    def __init__(self, folder):
        self.folder = folder
        self.conf_file = 'annotation.conf'
        self.emptyDFCols = {
            "annotations": ["id", "type_id", "word", "label", "start", "end"],
            "relations": ["id", "type_id", "relation", "Arg1", "Arg2"]
        }

        # Adding '/' to the folder path if missing
        if self.folder[-1] != '/':
            self.folder += '/'

        # Creating the folder if it does not exist
        if not os.path.isdir(self.folder):
            os.mkdir(self.folder)

        # Loading the conf file if it exists | creating an empty conf file if not
        self.read_conf()

    def _emptyData(self):
        fileList = self._getFileList()
        nb_files = fileList.shape[0]

        confirmation = input("Deleting all data ({} files), press y to confirm :".format(nb_files))
        if confirmation == 'y':
            fileList["filename"].apply(lambda x: os.remove(self.folder + x))
            print("{} files deleted.".format(nb_files))

    def _generateEntitiesStr(self, conf, data='', level=0):
        if not isinstance(conf, dict):
            return data

        # Parsing keys
        for key in conf.keys():
            value = conf[key]
            if value == True:
                data += '\n' + level * '\t' + key
            elif value == False:
                data += '\n' + level * '\t' + '!' + key
            elif isinstance(value, dict):
                data += '\n' + level * '\t' + key
                data = self._generateEntitiesStr(value, data, level + 1)

        return data

    def _writeEntitiesLevel(self, conf, data, last_n=-1):
        for n in range(last_n, len(conf)):
            # Skipping empty lines and lines already processed
            if conf[n] != '' and n > last_n:
                level = len(conf[n].split("\t")) - 1
                if n + 1 < len(conf):  # Level of the next item
                    next_level = len(conf[n + 1].split("\t")) - 1
                else:
                    next_level = level

                splitted_str = conf[n].split("\t")
                str_clean = splitted_str[len(splitted_str) - 1]

                if level >= next_level:  # Writing lines of the same level
                    if str_clean[0] == '!':
                        data[str_clean[1:]] = False
                    else:
                        data[str_clean] = True
                    if level > next_level:
                        # Breaking the loop
                        break
                elif level < next_level:  # Writing deeper levels recursively
                    last_n, data[str_clean] = self._writeEntitiesLevel(conf, {}, n)

        return (n, data)

    def _readRelations(self, relations, entities=[]):
        data = {}
        for relation in relations.split("\n"):
            if relation != '':
                relation_data = relation.split("\t")[0]
                args = list(map(lambda x: x.split(":")[1], relation.split("\t")[1].split(", ")))
                args_valid = list(filter(lambda x: x in entities, args))
                if len(args_valid) > 0:
                    data[relation_data] = {"args": args_valid}
        return data

    def _writeRelations(self, relations, entities=[]):
        data = ''
        for relation in relations:
            args_array = list(filter(lambda x: x in entities, relations[relation]["args"]))
            if len(args_array) > 0:
                data += '\n' + relation + '\t'
                for n in range(0, len(args_array)):
                    data += int(bool(n)) * ', ' + 'Arg' + str(n + 1) + ':' + args_array[n]
        return data

    def read_conf(self):
        """
        Get the current Brat configuration.

        Output :
            Dict containing the "entities" and "relations" configurations.
        """
        if os.path.isfile(self.folder + self.conf_file):
            # Reading the file
            file = open(self.folder + self.conf_file)
            conf_str = file.read()
            file.close()

            # Splitting conf_str by section
            conf_data = re.split(re.compile(r"\[[a-zA-Z]+\]", re.DOTALL), conf_str)[1:]
            data = {}

            # Reading entities
            data["entities"] = self._writeEntitiesLevel(conf_data[0].split("\n"), {})[1]

            # Reading relations
            entitiesKeys = _getDictionnaryKeys(data["entities"])
            data["relations"] = self._readRelations(conf_data[1], entitiesKeys)

            return data
        else:
            self.write_conf()
            return self.read_conf()
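
    # Sketch of the returned structure (entity and relation names are hypothetical) :
    #
    #   {"entities": {"Person": True, "Location": {"City": True}},
    #    "relations": {"livesIn": {"args": ["Person", "City"]}}}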

    def write_conf(self, entities={}, relations={}, events={}, attributes={}):
        """
        Write or overwrite the configuration file.
        It does not currently support events and attributes configuration data.

        Inputs :
            entities, dict : dict containing the entities. If an entity has children, its value is another dict; otherwise, it is set to True.
            relations, dict : dict containing the relations between entities; each key is a relation name and its value is a dict with an "args" key containing the list of related entities.
        """
        # TODO : Add events and attributes support.
        conf_str = ''

        # Entities
        conf_str += '\n\n[entities]'
        conf_str += self._generateEntitiesStr(entities)

        # Relations
        conf_str += '\n\n[relations]'
        entitiesKeys = _getDictionnaryKeys(entities)
        conf_str += self._writeRelations(relations, entitiesKeys)

        # Attributes
        conf_str += '\n\n[attributes]'

        # Events
        conf_str += '\n\n[events]'

        # Writing the conf file
        file = open(self.folder + self.conf_file, 'w')
        file.write(conf_str)
        file.close()
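
    # A configuration sketch (entity and relation names are hypothetical) :
    #
    #   brat.write_conf(
    #       entities={"Person": True, "Location": {"City": True, "Country": True}},
    #       relations={"livesIn": {"args": ["Person", "City"]}},
    #   )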

    def _getFileList(self):
        # Listing files
        filesDF = pd.DataFrame({'filename': pd.Series(os.listdir(self.folder))})
        filesDFSplitted = filesDF["filename"].str.split(".", expand=True)
        filesDF["id"] = filesDFSplitted[0]
        filesDF["filetype"] = filesDFSplitted[1]
        filesDF = filesDF[filesDF["filetype"].isin(["txt", "ann"])]
        return filesDF

    def _parseData(self):
        # Listing files
        filesDF = self._getFileList()

        # Getting data from the txt and ann files
        filesDF_txt = filesDF.rename(columns={"filename": "text_data"}).loc[filesDF["filetype"] == "txt", ["id", "text_data"]]
        filesDF_ann = filesDF.rename(columns={"filename": "annotation"}).loc[filesDF["filetype"] == "ann", ["id", "annotation"]]
        dataDF = filesDF_txt.join(filesDF_ann.set_index("id"), on="id")
        dataDF["text_data"] = dataDF["text_data"].apply(lambda x: open(self.folder + x).read())
        dataDF["annotation"] = dataDF["annotation"].apply(lambda x: open(self.folder + x).read())
        return dataDF

    def read_text(self):
        """
        read_text
        Get a pandas DataFrame containing the brat documents.

        Input : None
        Output : Pandas DataFrame with an "id" and a "text_data" column.
        """
        dataDF = self._parseData()
        return dataDF[["id", "text_data"]]

    def read_annotation(self, ids=[]):
        """
        read_annotation
        Get annotations from the brat folder.
        You can get specific annotations by filtering by id.

        Input :
            ids, list (optional) : list of ids for which you want the annotation data; if empty, all annotations are returned.
        Output :
            Dict containing the annotations and relations data.
        """
        data = {}
        data["annotations"] = pd.DataFrame(columns=self.emptyDFCols["annotations"])
        data["relations"] = pd.DataFrame(columns=self.emptyDFCols["relations"])

        dataDF = self._parseData()[["id", "annotation"]]
        dataDF = dataDF[(dataDF["annotation"].isna() == False) & (dataDF["annotation"] != '')]  # Removing empty annotations

        # Filtering by ids
        if len(ids) > 0:
            dataDF = dataDF[dataDF["id"].isin(pd.Series(ids).astype(str))]

        if dataDF.shape[0] > 0:
            # Ann data to pandas
            dataDF = dataDF.join(dataDF["annotation"].str.split("\n").apply(pd.Series).stack().reset_index(level=0).set_index("level_0")).reset_index(drop=True).drop("annotation", axis=1).rename(columns={0: "annotation"})
            dataDF = dataDF[dataDF["annotation"].str.len() > 0].reset_index(drop=True)
            dataDF = dataDF.join(dataDF["annotation"].str.split("\t", expand=True).rename(columns={0: 'type_id', 1: 'data', 2: 'word'})).drop("annotation", axis=1)
            dataDF["type"] = dataDF["type_id"].str.slice(0, 1)

            ## Annotations (brat "T" lines)
            data["annotations"] = dataDF[dataDF["type"] == 'T']
            if data["annotations"].shape[0] > 0:
                data["annotations"] = data["annotations"].join(data["annotations"]["data"].str.split(" ", expand=True).rename(columns={0: "label", 1: "start", 2: "end"})).drop(columns=["data", "type"])

            ## Relations (brat "R" lines)
            data["relations"] = dataDF[dataDF["type"] == 'R']
            if data["relations"].shape[0] > 0:
                tmp_splitted = data["relations"]["data"].str.split(" ", expand=True).rename(columns={0: "relation"})

                ### Column names
                rename_dict = dict(zip(list(tmp_splitted.columns.values[1:]), list("Arg" + tmp_splitted.columns.values[1:].astype(str).astype(object))))
                tmp_splitted = tmp_splitted.rename(columns=rename_dict)

                ### Merging data
                tmp_splitted = tmp_splitted[["relation"]].join(tmp_splitted.loc[:, tmp_splitted.columns[tmp_splitted.columns != 'relation']].applymap(lambda x: x.split(":")[1]))
                data["relations"] = data["relations"].join(tmp_splitted).drop(columns=["data", "type", "word"])

        return data
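
    # Sketch of the output shape (values are hypothetical) :
    #
    #   data["annotations"] :  id  type_id  word   label   start  end
    #                          1   T1       Alice  Person  10     15
    #   data["relations"]   :  id  type_id  relation  Arg1  Arg2
    #                          1   R1       livesIn   T1    T2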

    def _write_function(self, x, filetype="txt", overwrite=False):
        filenames = []
        if filetype == 'txt' or filetype == 'both':
            filenames.append(self.folder + str(x["filename"]) + '.txt')
        if filetype == 'ann' or filetype == 'both':
            filenames.append(self.folder + str(x["filename"]) + '.ann')

        for filename in filenames:
            # Only writing when the file does not already exist, unless overwrite is set
            is_file = os.path.isfile(str(filename))
            if (is_file == False) or (overwrite == True):
                file = open(str(filename), "w")
                file.write(x["content"])
                file.close()

    def write_text(self, text_id, text, empty=False, overWriteAnnotations=False):
        """
        write_text
        Send text data to the brat folder.

        Input :
            text_id, pd.Series : pandas Series containing the document ids
            text, pd.Series : pandas Series containing the document texts, in the same order as text_id
            empty, boolean : if True, the brat folder is emptied of all but configuration data (txt and ann files) before writing
            overWriteAnnotations, boolean : if True, the current annotation files are replaced by blank ones
        """
        # Normalizing the overwrite flag
        if overWriteAnnotations == True:
            overwriteAnn = True
        else:
            overwriteAnn = False

        if isinstance(text, pd.Series) and isinstance(text_id, pd.Series) and text.shape[0] == text_id.shape[0]:
            # ID check : ids should be shorter than texts, checking that they are not inverted
            if text_id.astype(str).str.len().max() < text.astype(str).str.len().max():
                # empty : option to erase existing data
                if empty:
                    self._emptyData()

                # Writing data
                print("Writing data")
                df_text = pd.DataFrame({"filename": text_id, "content": text})
                df_ann = pd.DataFrame({"filename": text_id, "content": ""})
                df_text.apply(lambda x: self._write_function(x, filetype="txt", overwrite=True), axis=1)
                df_ann.apply(lambda x: self._write_function(x, filetype="ann", overwrite=overwriteAnn), axis=1)
                print("data written.")
            else:
                raise ValueError('ID is longer than text, maybe you inverted them.')
        else:
            raise ValueError('Incorrect variable type, expected two Pandas Series of the same shape.')
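
    # A usage sketch (the dataframe and column names are hypothetical) :
    #
    #   brat.write_text(text_id=df["doc_id"], text=df["raw_text"])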

    def write_annotations(self, df, text_id, word, label, start, end, overwrite=False):
        """
        write_annotations
        Send annotation data to the brat folder. Useful to pre-annotate some data.

        Input :
            df, pd.DataFrame : dataframe containing the annotation data; it should contain the text id, the annotated word, the annotated label, and the start and end offsets.
            text_id, str : name of the column in df which contains the document id
            word, str : name of the column in df which contains the annotated word
            label, str : name of the column in df which contains the label of the annotated word
            start, str : name of the column in df which contains the start offset
            end, str : name of the column in df which contains the end offset
            overwrite, boolean : if True, the current annotation files are replaced by the new data; otherwise, the new annotations are merged with the existing ones
        """
        # Checking data types
        if isinstance(df, pd.DataFrame):
            # Loading df
            df = df.rename(columns={text_id: "id", word: "word", label: "label", start: "start", end: "end"})
            df["type_id"] = df.groupby("id").cumcount() + 1

            # List of ids
            ids = df["id"].unique()

            # Loading current data
            current_annotation = self.read_annotation(ids)
            current_annotations = current_annotation["annotations"]
            tmaxDFAnnotations = current_annotations.set_index(["id"])["type_id"].str.slice(1,).astype(int).reset_index().groupby("id").max().rename(columns={"type_id": "Tmax"})

            if overwrite == True:
                df["type_id"] = "T" + df["type_id"].astype(str)
                new_annotations = df
            else:
                df = df.join(tmaxDFAnnotations, on="id").fillna(0)
                df["type_id"] = "T" + (df["type_id"] + df["Tmax"]).astype(int).astype(str)
                df = df.drop(columns=["Tmax"])
                new_annotations = pd.concat((current_annotations, df[self.emptyDFCols["annotations"]])).reset_index(drop=True)

            new_annotations = new_annotations.drop_duplicates()  # Removing duplicates

            # Injecting the new annotations
            current_annotation["annotations"] = new_annotations

            # Calling the write function
            self._write_annotation(current_annotation["annotations"], current_annotation["relations"])
        else:
            raise ValueError('Incorrect variable type, expected a Pandas DataFrame.')
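
    # A usage sketch (the dataframe and column names are hypothetical) :
    #
    #   brat.write_annotations(pre_annotations_df, text_id="doc_id", word="word",
    #                          label="label", start="start", end="end")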

    def write_relations(self, df, text_id, relation, overwrite=False):
        """
        write_relations
        Send relation data to the brat folder. Useful to pre-annotate some data.

        Input :
            df, pd.DataFrame : dataframe containing the relation data; it should contain the text id, the relation name, and the ids of the linked annotations.
            text_id, str : name of the column in df which contains the document id
            relation, str : name of the column in df which contains the relation name
            overwrite, boolean : if True, the current annotation files are replaced by the new data; otherwise, the new relations are merged with the existing ones
        The other columns should contain the type_id of the related entities, as output by the read_annotation method.
        """
        # Checking data types
        if isinstance(df, pd.DataFrame):
            # Loading df
            df = df.rename(columns={text_id: "id", relation: "relation"})
            df["type_id"] = df.groupby("id").cumcount() + 1  # type_id

            # Column names
            old_columns = df.columns[np.isin(df.columns, ["id", "relation", "type_id"]) == False]
            new_columns = "Arg" + np.array(list(range(1, len(old_columns) + 1))).astype(str).astype(object)
            df = df.rename(columns=dict(zip(old_columns, new_columns)))

            # List of ids
            ids = df["id"].unique()

            # Loading current data
            current_annotation = self.read_annotation(ids)
            current_relations = current_annotation["relations"]
            rmaxDFrelations = current_relations.set_index(["id"])["type_id"].str.slice(1,).astype(int).reset_index().groupby("id").max().rename(columns={"type_id": "Rmax"})

            if overwrite == True:
                df["type_id"] = "R" + df["type_id"].astype(str)
                new_relations = df
            else:
                df = df.join(rmaxDFrelations, on="id").fillna(0)
                df["type_id"] = "R" + (df["type_id"] + df["Rmax"]).astype(int).astype(str)
                df = df.drop(columns=["Rmax"])

                # Adding missing columns
                if len(df.columns) > len(current_relations.columns):
                    for column in df.columns[np.isin(df.columns, current_relations.columns) == False]:
                        current_relations[column] = np.nan
                else:
                    for column in current_relations.columns[np.isin(current_relations.columns, df.columns) == False]:
                        df[column] = np.nan

                new_relations = pd.concat((current_relations, df[current_relations.columns])).reset_index(drop=True)

            new_relations = new_relations.drop_duplicates()  # Removing duplicates

            # Injecting the new relations
            current_annotation["relations"] = new_relations

            # Calling the write function
            self._write_annotation(current_annotation["annotations"], current_annotation["relations"])
        else:
            raise ValueError('Incorrect variable type, expected a Pandas DataFrame.')
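
    # A usage sketch (the dataframe and column names are hypothetical); the extra
    # columns hold the type_ids of the related entities, e.g. "T1" and "T2" :
    #
    #   brat.write_relations(relations_df, text_id="doc_id", relation="relation")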

    def _generate_annotations_str(self, annotations):
        annotations = annotations.reset_index(drop=True)
        annotations["label_span"] = annotations[["label", "start", "end"]].apply(lambda x: ' '.join(x.astype(str).values), axis=1)
        annotations_str = '\n'.join(annotations[["type_id", "label_span", "word"]].apply(lambda x: '\t'.join(x.astype(str).values), axis=1).values)
        return annotations_str
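
    # The helper above produces lines in the brat .ann entity format,
    # e.g. (hypothetical values) : "T1\tPerson 10 15\tAlice"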

    def _generate_relations_str(self, relations):
        relations = relations.fillna('').applymap(lambda x: '' if x == 'nan' else x)  # Cleaning data
        columns = relations.columns[np.isin(relations.columns, ["id", "type_id", "relation"]) == False].values.tolist()
        boolmap = relations[columns].transpose().applymap(lambda x: int(x != ''))
        rct = relations[columns].transpose()
        temp_relations = (boolmap * (np.array(np.repeat(rct.index, rct.shape[1])).reshape(rct.shape) + ':') + rct.astype(str)).transpose()
        relations_str = '\n'.join(relations[["type_id", "relation"]].join(temp_relations[columns]).apply(lambda x: '\t'.join(x.values), axis=1).values)
        return relations_str
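
    # The helper above produces lines in the brat .ann relation format,
    # e.g. (hypothetical values) : "R1\tlivesIn Arg1:T1 Arg2:T2"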

    def _write_file(self, data):
        file = open(self.folder + str(data["id"]) + ".ann", "w")
        file.write(data["str_to_write"])
        file.close()

    def _write_annotation(self, annotations, relations):
        # Checking data types
        if isinstance(annotations, pd.DataFrame) and isinstance(relations, pd.DataFrame):
            # Generating the strings
            data_annotations = annotations.groupby("id").agg(lambda x: self._generate_annotations_str(x)).iloc[:, 0]
            data_relations = relations.groupby("id").agg(lambda x: self._generate_relations_str(x)).iloc[:, 0]

            # Merging data
            data = pd.DataFrame({"annotations": data_annotations, "relations": data_relations}).fillna('')
            data["str_to_write"] = data.apply(lambda x: '\n'.join(x.values), axis=1)
            data = data.reset_index().rename(columns={"index": "id"})

            # Writing files
            data.apply(self._write_file, axis=1)

            return data
        else:
            raise ValueError('Incorrect variable type, expected a Pandas DataFrame.')

    def _export_conll_2003(self, data):
        '''
        Internal function for export in CoNLL-2003 format.
        '''
        # Creating the i-label ("O" labels are kept as is, others are prefixed with the i-type)
        data["i-label"] = (data["label"] != "O").astype(int) * (data["i-type"] + '-') + data["label"]

        # Creating the output string
        data["str"] = data[["token", "pos", "chunks", "i-label"]].apply(lambda x: ' '.join(x), axis=1)
        conll_str = "-DOCSTART- -X- -X- O" + "\n\n" + "\n\n".join(
            data.groupby("id").agg(lambda x: "\n".join(x))["str"].values.tolist()
        )
        return conll_str
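
    # Sample of the produced output (hypothetical tokens; the pos column depends on the tokenizer) :
    #
    #   -DOCSTART- -X- -X- O
    #
    #   Alice NNP O B-Person
    #   lives VBZ O O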

    def _get_tokenized_data(self, text_data, annotations_data, tokenizer=default_tokenizer, keep_empty=False):
        '''
        Internal function that processes text and annotation data to compute the token, pos and chunks columns.

        Input :
            text_data : text data exported from the current class
            annotations_data : annotations data exported from the current class
            tokenizer : tokenizer function from extract_tools
            keep_empty : boolean, default False; if True, empty tokens are kept, otherwise they are removed
        Output :
            Aggregated data in a Pandas DataFrame.
        '''
        # Applying the tokenizer to the text
        text_data["tokens"] = text_data["text_data"].apply(tokenizer)

        # Exploding the dataframe by tokens and renaming the columns
        exploded_text_data = text_data[["id", "tokens"]].explode("tokens").reset_index(drop=True)
        exploded_text_data = exploded_text_data.join(
            exploded_text_data["tokens"]
            .apply(pd.Series)
            .rename(columns={0: 'token', 1: 'start_offset', 2: 'end_offset', 3: 'pos'})
        ).drop(columns=["tokens"])

        # Getting entities from annotations
        ## We merge by offset
        ### Creating a word id and an annotation id
        exploded_text_data = exploded_text_data \
            .reset_index(drop=True) \
            .reset_index() \
            .rename(columns={"index": "word_id"})
        annotations_data = annotations_data \
            .reset_index() \
            .rename(columns={"index": "ann_id"})

        ### Offsets of the words
        text_offsets = pd.DataFrame(exploded_text_data[["id", "word_id", "start_offset", "end_offset"]])
        text_offsets["start_offset"] = text_offsets["start_offset"].astype(int)
        text_offsets["end_offset"] = text_offsets["end_offset"].astype(int)
        text_offsets["offsets"] = text_offsets.apply(
            lambda x: list(range(x["start_offset"], x["end_offset"] + 1)), axis=1
        )
        text_offsets = text_offsets[["id", "word_id", "offsets"]].explode("offsets")

        ### Offsets of the annotations
        ann_offsets = pd.DataFrame(
            annotations_data[["id", "ann_id", "start", "end"]]
        )
        ann_offsets["start"] = ann_offsets["start"].astype(int)
        ann_offsets["end"] = ann_offsets["end"].astype(int)
        if ann_offsets.shape[0] > 0:
            ann_offsets["offsets"] = ann_offsets.apply(
                lambda x: list(range(x["start"], x["end"] + 1)), axis=1
            )
        else:
            ann_offsets["offsets"] = ''
        ann_offsets = ann_offsets[["id", "ann_id", "offsets"]].explode("offsets")

        # Merging words and annotations by offset
        text_offsets["uid"] = text_offsets["id"].astype(str) + text_offsets["offsets"].astype(str)
        ann_offsets["uid"] = ann_offsets["id"].astype(str) + ann_offsets["offsets"].astype(str)
        merged_id = text_offsets.join(
            ann_offsets[["ann_id", "uid"]].set_index("uid"),
            on="uid"
        ).dropna()
        merged_id["ann_id"] = merged_id["ann_id"].astype(int)
        merged_id = merged_id[["word_id", "ann_id"]] \
            .set_index("ann_id") \
            .drop_duplicates() \
            .join(annotations_data, on="ann_id")

        # Keeping the last entry when word_id is duplicated
        merged_id = merged_id.drop_duplicates("word_id", keep="last")

        # Joining annotations on word id
        output_df = exploded_text_data.join(
            merged_id[["label", "word_id"]].set_index("word_id"),
            on="word_id",
            how="left"
        ).fillna("O")[["id", "token", "label", "pos"]]

        # Creating the i-type : the first token of each document gets "B", the following ones "I";
        # tokens labeled "O" are then forced back to "B"
        output_df["i-type"] = output_df \
            .groupby("id").agg(lambda x: ["B"] + ["I"] * (len(x) - 1))["label"].explode().reset_index()["label"]
        output_df.loc[output_df["label"] == "O", "i-type"] = 'B'

        # Empty chunks
        output_df["chunks"] = 'O'

        # Post-processing
        if keep_empty == False:
            output_df = output_df[output_df["token"] != ''].reset_index(drop=True)

        return output_df

    def export(self, export_format="conll-2003", tokenizer=default_tokenizer, keep_empty=False, entities=None):
        '''
        Function that generates an export file.
        Supported export formats are :
            - conll-2003

        Input :
            export_format : name of the export format
            tokenizer : tokenizer function from extract_tools
            keep_empty : boolean, default False; if True, empty tokens are kept, otherwise they are removed
            entities : if None, all entities are sent to the export file (when annotations conflict, the most recent one is used); otherwise, only the listed entities are kept
        Output :
            str : output string in the selected export format
        '''
        supported_export_format = {
            "conll-2003": self._export_conll_2003
        }

        # Checking the export format
        if export_format not in supported_export_format.keys():
            raise Exception(str(export_format) + " format not supported. Export format should be one of these : {}".format(
                ", ".join(supported_export_format.keys())
            ))

        # Creating a dataframe of tokenized words associated with annotations
        ## Getting data from brat
        text_data = self.read_text()
        annotations_data = self.read_annotation()["annotations"]

        ## Filtering entities
        if entities is not None:
            if not isinstance(entities, list):
                raise Exception("entities should be of type list")
            annotations_data = annotations_data[annotations_data["label"].isin(entities)] \
                .reset_index(drop=True)

        ## Parsing data
        data = self._get_tokenized_data(tokenizer=tokenizer,
                                        text_data=text_data,
                                        annotations_data=annotations_data,
                                        keep_empty=keep_empty)

        # Executing the export function associated with the format
        data_str = supported_export_format[export_format](data=data)
        return data_str
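
# A minimal end-to-end sketch (paths and entity names are hypothetical) :
#
#   brat = pandasToBrat("data/my_brat_project/")
#   conll_str = brat.export(export_format="conll-2003", entities=["Person"])
#   with open("export.conll", "w") as f:
#       f.write(conll_str)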