import re
import os
import pandas as pd
import numpy as np

from pandasToBrat.extract_tools import default_tokenizer


def _getDictionnaryKeys(dictionnary):
    """
    Get the keys of a dict object, recursively including the keys of nested dicts (flattened into a single list).
    """

    keys_array = []
    for key in dictionnary.keys():
        keys_array.append(key)
        if isinstance(dictionnary[key], dict):
            keys_array = keys_array + _getDictionnaryKeys(dictionnary[key])
    return keys_array
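
# A minimal sketch of what _getDictionnaryKeys returns (the nested dict below is
# illustrative, not part of the library):
#
#   conf = {"Person": True, "Drug": {"Dose": True, "Frequency": False}}
#   _getDictionnaryKeys(conf)
#   # -> ['Person', 'Drug', 'Dose', 'Frequency']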


class pandasToBrat:
    """
    Class for managing a Brat folder with Pandas.
    Each Brat folder is handled by its own pandasToBrat instance.
    It supports :
        - import and export of entity and relation configurations,
        - import and export of documents,
        - import and export of annotations and relations.
    Inputs :
        folder, str : path of the Brat folder
    """

    def __init__(self, folder):
        self.folder = folder
        self.conf_file = 'annotation.conf'

        self.emptyDFCols = {
            "annotations": ["id", "type_id", "word", "label", "start", "end"],
            "relations": ["id", "type_id", "relation", "Arg1", "Arg2"]
        }

        # Adding '/' to the folder path if missing
        if self.folder[-1] != '/':
            self.folder += '/'

        # Creating the folder if it does not exist
        if not os.path.isdir(self.folder):
            os.mkdir(self.folder)

        # Loading the conf file if it exists | creating an empty conf file if not
        self.read_conf()
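
    # Usage sketch (illustrative; the folder name is hypothetical):
    #
    #   brat = pandasToBrat("my_brat_project")
    #
    # This creates "my_brat_project/" if needed, together with an empty
    # annotation.conf, and loads the existing configuration otherwise.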

    def _emptyData(self):
        fileList = self._getFileList()
        nb_files = fileList.shape[0]

        confirmation = input("Deleting all data ({} files), press y to confirm :".format(nb_files))
        if confirmation == 'y':
            fileList["filename"].apply(lambda x: os.remove(self.folder + x))
            print("{} files deleted.".format(nb_files))

    def _generateEntitiesStr(self, conf, data='', level=0):

        if not isinstance(conf, dict):
            return data

        # Parsing keys
        for key in conf.keys():
            value = conf[key]
            if value is True:
                data += '\n' + level * '\t' + key
            elif value is False:
                data += '\n' + level * '\t' + '!' + key
            elif isinstance(value, dict):
                data += '\n' + level * '\t' + key
                data = self._generateEntitiesStr(value, data, level + 1)
        return data

    def _writeEntitiesLevel(self, conf, data, last_n=-1):

        for n in range(last_n, len(conf)):
            # Skipping empty lines and lines already processed
            if conf[n] != '' and n > last_n:
                level = len(conf[n].split("\t")) - 1
                if n + 1 < len(conf):  # Level of the next item
                    next_level = len(conf[n + 1].split("\t")) - 1
                else:
                    next_level = level

                splitted_str = conf[n].split("\t")
                str_clean = splitted_str[-1]

                if level >= next_level:  # Writing the lines of the same level
                    if str_clean[0] == '!':
                        data[str_clean[1:]] = False
                    else:
                        data[str_clean] = True

                    if level > next_level:
                        # Breaking the loop
                        break
                elif level < next_level:  # Writing the lower levels recursively
                    last_n, data[str_clean] = self._writeEntitiesLevel(conf, {}, n)
        return (n, data)
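
    # A minimal sketch of the format handled by _generateEntitiesStr and
    # _writeEntitiesLevel (the labels are illustrative):
    #
    #   {"Person": True, "Drug": {"Dose": True}}
    #
    # serializes to the tab-indented [entities] block
    #
    #   Person
    #   Drug
    #   \tDose
    #
    # and _writeEntitiesLevel parses such a block back into the same dict.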

    def _readRelations(self, relations, entities=[]):
        data = {}
        for relation in relations.split("\n"):
            if relation != '':
                relation_data = relation.split("\t")[0]
                args = list(map(lambda x: x.split(":")[1], relation.split("\t")[1].split(", ")))
                args_valid = list(filter(lambda x: x in entities, args))
                if len(args_valid) > 0:
                    data[relation_data] = {"args": args_valid}

        return data

    def _writeRelations(self, relations, entities=[]):
        data = ''
        for relation in relations:
            args_array = list(filter(lambda x: x in entities, relations[relation]["args"]))
            if len(args_array) > 0:
                data += '\n' + relation + '\t'
                for n in range(0, len(args_array)):
                    data += int(bool(n)) * ', ' + 'Arg' + str(n + 1) + ':' + args_array[n]

        return data
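
    # Sketch of a serialized relation (names are illustrative):
    #
    #   {"Prescribed": {"args": ["Person", "Drug"]}}
    #
    # becomes, in the [relations] block :
    #
    #   Prescribed\tArg1:Person, Arg2:Drug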

    def read_conf(self):
        """
        Get the current Brat configuration.
        Output :
            Dict containing the "entities" and "relations" configurations.
        """

        if os.path.isfile(self.folder + self.conf_file):

            # Reading the file
            file = open(self.folder + self.conf_file)
            conf_str = file.read()
            file.close()

            # Splitting conf_str
            conf_data = re.split(re.compile(r"\[[a-zA-Z]+\]", re.DOTALL), conf_str)[1:]

            data = {}

            # Reading entities
            data["entities"] = self._writeEntitiesLevel(conf_data[0].split("\n"), {})[1]

            # Reading relations
            entitiesKeys = _getDictionnaryKeys(data["entities"])
            data["relations"] = self._readRelations(conf_data[1], entitiesKeys)

            return data

        else:
            # Creating an empty conf file, then reading it back
            self.write_conf()
            return self.read_conf()
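
    # Shape of the returned configuration (labels are illustrative) :
    #
    #   brat.read_conf()
    #   # -> {"entities": {"Person": True, "Drug": {"Dose": True}},
    #   #     "relations": {"Prescribed": {"args": ["Person", "Drug"]}}}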

    def write_conf(self, entities={}, relations={}, events={}, attributes={}):
        """
        Write or overwrite the configuration file.
        It does not currently support events and attributes configuration data.
        Inputs :
            entities, dict : dict containing the entities. If an entity has children, its value is another dict; otherwise it is set to True.
            relations, dict : dict containing the relations between entities; each key is a relation name and its value is a dict with an "args" key containing the list of related entities.
        """

        # TODO : Add events and attributes support.
        conf_str = ''

        # Entities
        conf_str += '\n\n[entities]'
        conf_str += self._generateEntitiesStr(entities)

        # Relations
        conf_str += '\n\n[relations]'
        entitiesKeys = _getDictionnaryKeys(entities)
        conf_str += self._writeRelations(relations, entitiesKeys)

        # Attributes
        conf_str += '\n\n[attributes]'
        # Events
        conf_str += '\n\n[events]'

        # Writing the conf file
        file = open(self.folder + self.conf_file, 'w')
        file.write(conf_str)
        file.close()
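
    # Usage sketch (labels are illustrative) :
    #
    #   brat.write_conf(
    #       entities={"Person": True, "Drug": {"Dose": True}},
    #       relations={"Prescribed": {"args": ["Person", "Drug"]}},
    #   )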

    def _getFileList(self):
        # Listing files
        filesDF = pd.DataFrame({'filename': pd.Series(os.listdir(self.folder))})
        filesDFSplitted = filesDF["filename"].str.split(".", expand=True)
        filesDF["id"] = filesDFSplitted[0]
        filesDF["filetype"] = filesDFSplitted[1]

        filesDF = filesDF[filesDF["filetype"].isin(["txt", "ann"])]

        return filesDF

    def _parseData(self):

        # Listing files
        filesDF = self._getFileList()

        # Getting data from txt and ann files
        filesDF_txt = filesDF.rename(columns={"filename": "text_data"}).loc[filesDF["filetype"] == "txt", ["id", "text_data"]]
        filesDF_ann = filesDF.rename(columns={"filename": "annotation"}).loc[filesDF["filetype"] == "ann", ["id", "annotation"]]
        dataDF = filesDF_txt.join(filesDF_ann.set_index("id"), on="id")
        dataDF["text_data"] = dataDF["text_data"].apply(lambda x: open(self.folder + x).read())
        dataDF["annotation"] = dataDF["annotation"].apply(lambda x: open(self.folder + x).read())

        return dataDF

    def read_text(self):
        """
        read_text
        Get a pandas DataFrame containing the Brat documents.
        Input : None
        Output : Pandas DataFrame
        """

        dataDF = self._parseData()

        return dataDF[["id", "text_data"]]
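
    # Usage sketch :
    #
    #   texts = brat.read_text()
    #   # DataFrame with columns ["id", "text_data"], one row per document.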

    def read_annotation(self, ids=[]):
        """
        read_annotation
        Get annotations from the Brat folder.
        You can get specific annotations by filtering by id.
        Input :
            ids, list (optional) : list of ids for which you want the annotation data; if empty, all annotations are returned.
        Output :
            dict containing the annotations and relations data.
        """

        data = {}
        data["annotations"] = pd.DataFrame(columns=self.emptyDFCols["annotations"])
        data["relations"] = pd.DataFrame(columns=self.emptyDFCols["relations"])

        dataDF = self._parseData()[["id", "annotation"]]
        dataDF = dataDF[(dataDF["annotation"].isna() == False) & (dataDF["annotation"] != '')]  # Removing empty annotations

        # Filtering by ids
        if len(ids) > 0:
            dataDF = dataDF[dataDF["id"].isin(pd.Series(ids).astype(str))]

        if dataDF.shape[0] > 0:

            # Ann data to pandas : one row per annotation line
            dataDF = dataDF.join(dataDF["annotation"].str.split("\n").apply(pd.Series).stack().reset_index(level=0).set_index("level_0")).reset_index(drop=True).drop("annotation", axis=1).rename(columns={0: "annotation"})
            dataDF = dataDF[dataDF["annotation"].str.len() > 0].reset_index(drop=True)
            dataDF = dataDF.join(dataDF["annotation"].str.split("\t", expand=True).rename(columns={0: 'type_id', 1: 'data', 2: 'word'})).drop("annotation", axis=1)
            dataDF["type"] = dataDF["type_id"].str.slice(0, 1)

            ## Annotations (T lines)
            data["annotations"] = dataDF[dataDF["type"] == 'T']
            if data["annotations"].shape[0] > 0:
                data["annotations"] = data["annotations"].join(data["annotations"]["data"].str.split(" ", expand=True).rename(columns={0: "label", 1: "start", 2: "end"})).drop(columns=["data", "type"])

            ## Relations (R lines)
            data["relations"] = dataDF[dataDF["type"] == 'R']
            if data["relations"].shape[0] > 0:
                tmp_splitted = data["relations"]["data"].str.split(" ", expand=True).rename(columns={0: "relation"})

                ### Column names
                rename_dict = dict(zip(list(tmp_splitted.columns.values[1:]), list("Arg" + tmp_splitted.columns.values[1:].astype(str).astype(object))))
                tmp_splitted = tmp_splitted.rename(columns=rename_dict)

                ### Merging data
                tmp_splitted = tmp_splitted[["relation"]].join(tmp_splitted.loc[:, tmp_splitted.columns[tmp_splitted.columns != 'relation']].applymap(lambda x: x.split(":")[1]))
                data["relations"] = data["relations"].join(tmp_splitted).drop(columns=["data", "type", "word"])

        return data
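
    # Usage sketch :
    #
    #   ann = brat.read_annotation(ids=["doc1"])
    #   ann["annotations"]  # columns : id, type_id, word, label, start, end
    #   ann["relations"]    # columns : id, type_id, relation, Arg1, Arg2, ...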

    def _write_function(self, x, filetype="txt", overwrite=False):

        filenames = []

        if filetype == 'txt' or filetype == 'both':
            filenames.append(self.folder + str(x["filename"]) + '.txt')

        if filetype == 'ann' or filetype == 'both':
            filenames.append(self.folder + str(x["filename"]) + '.ann')

        for filename in filenames:
            # Checking file existence without leaving an open file handle
            is_file = os.path.isfile(str(filename))

            if (is_file == False) or (overwrite == True):
                file = open(str(filename), "w")
                file.write(x["content"])
                file.close()

    def write_text(self, text_id, text, empty=False, overWriteAnnotations=False):
        """
        write_text
        Send text data to the Brat folder.
        Input :
            text_id, pd.Series : pandas Series containing the document ids
            text, pd.Series : pandas Series containing the document texts, in the same order as text_id
            empty, boolean : if True, the Brat folder is emptied of all but configuration data (txt and ann files) before writing
            overWriteAnnotations, boolean : if True, the current annotation files are replaced by blank ones
        """

        # Normalizing the overwrite flag to a strict boolean
        if overWriteAnnotations == True:
            overwriteAnn = True
        else:
            overwriteAnn = False

        if isinstance(text, pd.Series) and isinstance(text_id, pd.Series) and text.shape[0] == text_id.shape[0]:

            # ID check : ids should be shorter than texts; checking they were not inverted
            if text_id.astype(str).str.len().max() < text.astype(str).str.len().max():

                # empty : option to erase existing data
                if empty:
                    self._emptyData()

                # Writing data
                print("Writing data")
                df_text = pd.DataFrame({"filename": text_id, "content": text})
                df_ann = pd.DataFrame({"filename": text_id, "content": ""})

                df_text.apply(lambda x: self._write_function(x, filetype="txt", overwrite=True), axis=1)
                df_ann.apply(lambda x: self._write_function(x, filetype="ann", overwrite=overwriteAnn), axis=1)
                print("Data written.")

            else:
                raise ValueError('ID is larger than text, maybe you inverted them.')

        else:
            raise ValueError('Incorrect variable type, expected two Pandas Series of same shape.')
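
    # Usage sketch (hypothetical data) :
    #
    #   docs = pd.DataFrame({"id": ["doc1", "doc2"],
    #                        "text": ["Aspirin 500 mg.", "No treatment."]})
    #   brat.write_text(text_id=docs["id"], text=docs["text"])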

    def write_annotations(self, df, text_id, word, label, start, end, overwrite=False):
        """
        write_annotations
        Send annotation data to the Brat folder. Useful to pre-annotate some data.
        Input :
            df, pd.DataFrame : dataframe containing the annotation data; it should contain the text id, the annotated word, the annotated label, and the start and end offsets.
            text_id, str : name of the column in df which contains the document id
            word, str : name of the column in df which contains the annotated word
            label, str : name of the column in df which contains the label of the annotated word
            start, str : name of the column in df which contains the start offset
            end, str : name of the column in df which contains the end offset
            overwrite, boolean : if True, the current annotation files are replaced by the new data; otherwise, the new annotations are merged with the existing ones
        """

        # Checking data types
        if isinstance(df, pd.DataFrame):

            # Loading df
            df = df.rename(columns={text_id: "id", word: "word", label: "label", start: "start", end: "end"})
            df["type_id"] = df.groupby("id").cumcount() + 1

            # List of ids
            ids = df["id"].unique()

            # Loading current data
            current_annotation = self.read_annotation(ids)
            current_annotations = current_annotation["annotations"]
            tmaxDFAnnotations = current_annotations.set_index(["id"])["type_id"].str.slice(1,).astype(int).reset_index().groupby("id").max().rename(columns={"type_id": "Tmax"})

            if overwrite == True:
                df["type_id"] = "T" + df["type_id"].astype(str)
                new_annotations = df
            else:
                df = df.join(tmaxDFAnnotations, on="id").fillna(0)
                df["type_id"] = "T" + (df["type_id"] + df["Tmax"]).astype(int).astype(str)
                df = df.drop(columns=["Tmax"])

                new_annotations = pd.concat((current_annotations, df[self.emptyDFCols["annotations"]])).reset_index(drop=True)

            new_annotations = new_annotations.drop_duplicates()  # Removing duplicates

            # Injecting the new annotations
            current_annotation["annotations"] = new_annotations

            # Calling the write function
            self._write_annotation(current_annotation["annotations"], current_annotation["relations"])

        else:
            raise ValueError('Incorrect variable type, expected a Pandas DF.')
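
    # Usage sketch (hypothetical pre-annotations) :
    #
    #   ann_df = pd.DataFrame({"doc": ["doc1"], "token": ["Aspirin"],
    #                          "tag": ["Drug"], "begin": [0], "stop": [7]})
    #   brat.write_annotations(ann_df, text_id="doc", word="token",
    #                          label="tag", start="begin", end="stop")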

    def write_relations(self, df, text_id, relation, overwrite=False):
        """
        write_relations
        Send relation data to the Brat folder. Useful to pre-annotate some data.
        Input :
            df, pd.DataFrame : dataframe containing the relation data; it should contain the text id, the relation name and the ids of the linked annotations.
            text_id, str : name of the column in df which contains the document id
            relation, str : name of the column in df which contains the relation name
            overwrite, boolean : if True, the current annotation files are replaced by the new data; otherwise, the new relations are merged with the existing ones
        The other columns should contain the type_id of the related entities, as output by the read_annotation method.
        """

        # Checking data types
        if isinstance(df, pd.DataFrame):

            # Loading df
            df = df.rename(columns={text_id: "id", relation: "relation"})
            df["type_id"] = df.groupby("id").cumcount() + 1  # type_id

            # Column names
            old_columns = df.columns[np.isin(df.columns, ["id", "relation", "type_id"]) == False]
            new_columns = "Arg" + np.array(list(range(1, len(old_columns) + 1))).astype(str).astype(object)
            df = df.rename(columns=dict(zip(old_columns, new_columns)))

            # List of ids
            ids = df["id"].unique()

            # Loading current data
            current_annotation = self.read_annotation(ids)
            current_relations = current_annotation["relations"]
            rmaxDFrelations = current_relations.set_index(["id"])["type_id"].str.slice(1,).astype(int).reset_index().groupby("id").max().rename(columns={"type_id": "Rmax"})

            if overwrite == True:
                df["type_id"] = "R" + df["type_id"].astype(str)
                new_relations = df
            else:
                df = df.join(rmaxDFrelations, on="id").fillna(0)
                df["type_id"] = "R" + (df["type_id"] + df["Rmax"]).astype(int).astype(str)
                df = df.drop(columns=["Rmax"])

                # Adding missing columns
                if len(df.columns) > len(current_relations.columns):
                    for column in df.columns[np.isin(df.columns, current_relations.columns) == False]:
                        current_relations[column] = np.nan
                else:
                    for column in current_relations.columns[np.isin(current_relations.columns, df.columns) == False]:
                        df[column] = np.nan

                new_relations = pd.concat((current_relations, df[current_relations.columns])).reset_index(drop=True)

            new_relations = new_relations.drop_duplicates()  # Removing duplicates

            # Injecting the new relations
            current_annotation["relations"] = new_relations

            # Calling the write function
            self._write_annotation(current_annotation["annotations"], current_annotation["relations"])

        else:
            raise ValueError('Incorrect variable type, expected a Pandas DF.')
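
    # Usage sketch (type_ids as returned by read_annotation; values are hypothetical) :
    #
    #   rel_df = pd.DataFrame({"doc": ["doc1"], "rel": ["Prescribed"],
    #                          "arg1": ["T1"], "arg2": ["T2"]})
    #   brat.write_relations(rel_df, text_id="doc", relation="rel")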

    def _generate_annotations_str(self, annotations):

        annotations = annotations.reset_index(drop=True)
        annotations["label_span"] = annotations[["label", "start", "end"]].apply(lambda x: ' '.join(x.astype(str).values), axis=1)
        annotations_str = '\n'.join(annotations[["type_id", "label_span", "word"]].apply(lambda x: '\t'.join(x.astype(str).values), axis=1).values)

        return annotations_str
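
    # Shape of a generated annotation line (a Brat "T" line; values are illustrative) :
    #
    #   T1\tDrug 0 7\tAspirin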

    def _generate_relations_str(self, relations):

        relations = relations.fillna('').applymap(lambda x: '' if x == 'nan' else x)  # Cleaning data
        columns = relations.columns[np.isin(relations.columns, ["id", "type_id", "relation"]) == False].values.tolist()
        boolmap = relations[columns].transpose().applymap(lambda x: int(x != ''))
        rct = relations[columns].transpose()
        temp_relations = (boolmap * (np.array(np.repeat(rct.index, rct.shape[1])).reshape(rct.shape) + ':') + rct.astype(str)).transpose()
        relations_str = '\n'.join(relations[["type_id", "relation"]].join(temp_relations[columns]).apply(lambda x: '\t'.join(x.values), axis=1).values)

        return relations_str
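
    # Shape of a generated relation line (a Brat "R" line, here tab-separated as
    # produced by the join above; values are illustrative) :
    #
    #   R1\tPrescribed\tArg1:T1\tArg2:T2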

    def _write_file(self, data):
        file = open(self.folder + str(data["id"]) + ".ann", "w")
        file.write(data["str_to_write"])
        file.close()

    def _write_annotation(self, annotations, relations):

        # Checking data types
        if isinstance(annotations, pd.DataFrame) and isinstance(relations, pd.DataFrame):

            # Generating str
            data_annotations = annotations.groupby("id").agg(lambda x: self._generate_annotations_str(x)).iloc[:, 0]
            data_relations = relations.groupby("id").agg(lambda x: self._generate_relations_str(x)).iloc[:, 0]

            # Merging data
            data = pd.DataFrame({"annotations": data_annotations, "relations": data_relations}).fillna('')
            data["str_to_write"] = data.apply(lambda x: '\n'.join(x.values), axis=1)
            data = data.reset_index().rename(columns={"index": "id"})

            # Writing files
            data.apply(self._write_file, axis=1)

            return data

        else:
            raise ValueError('Incorrect variable type, expected a Pandas DF.')

    def _export_conll_2003(self, data):
        '''
        Internal function for export in CoNLL-2003 format.
        '''

        # Creating the B/I prefixed label
        data["i-label"] = (data["label"] != "O").astype(int) * (data["i-type"] + '-') + data["label"]

        # Creating the output string
        data["str"] = data[["token", "pos", "chunks", "i-label"]].apply(lambda x: ' '.join(x), axis=1)

        conll_str = "-DOCSTART- -X- -X- O" + "\n\n" + "\n\n".join(
            data.groupby("id").agg(lambda x: "\n".join(x))["str"].values.tolist()
        )

        return conll_str
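
    # Sketch of the generated CoNLL-2003 output (tokens and pos tags are
    # illustrative; chunk tags are always 'O' in this implementation) :
    #
    #   -DOCSTART- -X- -X- O
    #
    #   Aspirin NNP O B-Drug
    #   500 CD O I-Drug
    #   mg NN O I-Drug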

    def _get_tokenized_data(self, text_data, annotations_data, tokenizer=default_tokenizer, keep_empty=False):
        '''
        Internal function that processes text and annotation data to compute tokens, pos and chunks.

        Input :
            text_data : text data exported from the current class
            annotations_data : annotations data exported from the current class
            tokenizer : tokenizer function from extract_tools
            keep_empty, boolean : default False; if True, empty tokens are kept, otherwise they are removed
        Output :
            Aggregated data in a Pandas DataFrame.
        '''

        # Applying the tokenizer to the text
        text_data["tokens"] = text_data["text_data"].apply(tokenizer)

        # Exploding the dataframe by tokens and renaming the columns
        exploded_text_data = text_data[["id", "tokens"]].explode("tokens").reset_index(drop=True)
        exploded_text_data = exploded_text_data.join(
            exploded_text_data["tokens"]
            .apply(pd.Series)
            .rename(columns={
                0: 'token', 1: 'start_offset', 2: 'end_offset', 3: 'pos'
            })
        ).drop(columns=["tokens"])

        # Getting entities from the annotations

        ## We merge by offset

        ### Creating a word id and an annotation id
        exploded_text_data = exploded_text_data \
            .reset_index(drop=True) \
            .reset_index() \
            .rename(columns={"index": "word_id"})

        annotations_data = annotations_data \
            .reset_index() \
            .rename(columns={"index": "ann_id"})

        ### Character offsets covered by each token
        text_offsets = pd.DataFrame(exploded_text_data[["id", "word_id", "start_offset", "end_offset"]])
        text_offsets["start_offset"] = text_offsets["start_offset"].astype(int)
        text_offsets["end_offset"] = text_offsets["end_offset"].astype(int)
        text_offsets["offsets"] = text_offsets \
            .apply(
                lambda x: list(range(x["start_offset"], x["end_offset"] + 1)), axis=1
            )
        text_offsets = text_offsets[["id", "word_id", "offsets"]] \
            .explode("offsets")

        ### Character offsets covered by each annotation
        ann_offsets = pd.DataFrame(
            annotations_data[["id", "ann_id", "start", "end"]]
        )
        ann_offsets["start"] = ann_offsets["start"].astype(int)
        ann_offsets["end"] = ann_offsets["end"].astype(int)
        if ann_offsets.shape[0] > 0:
            ann_offsets["offsets"] = ann_offsets \
                .apply(
                    lambda x: list(range(x["start"], x["end"] + 1)), axis=1
                )
        else:
            ann_offsets["offsets"] = ''
        ann_offsets = ann_offsets[["id", "ann_id", "offsets"]] \
            .explode("offsets")

        # Merging tokens and annotations on a unique id + offset key
        text_offsets["uid"] = text_offsets["id"].astype(str) \
            + text_offsets["offsets"].astype(str)

        ann_offsets["uid"] = ann_offsets["id"].astype(str) \
            + ann_offsets["offsets"].astype(str)

        merged_id = text_offsets \
            .join(
                ann_offsets[["ann_id", "uid"]].set_index("uid"),
                on="uid"
            ) \
            .dropna()
        merged_id["ann_id"] = merged_id["ann_id"].astype(int)

        merged_id = merged_id[["word_id", "ann_id"]] \
            .set_index("ann_id") \
            .drop_duplicates() \
            .join(annotations_data, on="ann_id")

        # Keeping the last annotation when a word_id is duplicated
        merged_id = merged_id \
            .drop_duplicates("word_id", keep="last")

        # Joining the annotations with the word ids
        output_df = exploded_text_data \
            .join(merged_id[["label", "word_id"]] \
                .set_index("word_id"),
                on="word_id",
                how="left") \
            .fillna("O")[["id", "token", "label", "pos"]]

        # Creating the i-type : 'B' for the first token of a document, 'I' otherwise, and 'B' for 'O' labels
        output_df["i-type"] = output_df \
            .groupby("id").agg(lambda x: ["B"] + ["I"] * (len(x) - 1))["label"].explode().reset_index()["label"]
        output_df.loc[output_df["label"] == "O", "i-type"] = 'B'

        # Empty chunks
        output_df["chunks"] = 'O'

        # Post-processing
        if keep_empty == False:
            output_df = output_df[output_df["token"] != ''].reset_index(drop=True)

        return output_df
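
    # Sketch of the offset-key alignment used above (values are illustrative) :
    # a token "Aspirin" covering characters 0 to 6 of document "doc1" produces the
    # keys "doc10" ... "doc16"; an annotation spanning characters 0 to 7 produces
    # "doc10" ... "doc17", so they share keys and the token inherits the label.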

    def export(self, export_format="conll-2003", tokenizer=default_tokenizer, keep_empty=False, entities=None):
        '''
        Function that generates an export file.
        Supported export formats are :
            - conll-2003
        Input :
            export_format : name of the export format
            tokenizer : tokenizer function from extract_tools
            keep_empty, boolean : default False; if True, empty tokens are kept, otherwise they are removed
            entities : if None, all entities are sent to the export file (in case of conflict, the most recent one is used); otherwise, only the listed entities are kept
        Output :
            str : output string in the selected export format
        '''

        supported_export_format = {
            "conll-2003": self._export_conll_2003
        }

        # Checking the export format
        if export_format not in supported_export_format.keys():
            raise Exception(str(export_format) + " format not supported. Export format should be one of these : {}".format(
                ", ".join(supported_export_format.keys())
            ))

        # Creating a dataframe of tokenized words associated with annotations
        ## Getting data from Brat
        text_data = self.read_text()
        annotations_data = self.read_annotation()["annotations"]

        ## Filtering entities
        if entities is not None:
            if not isinstance(entities, list):
                raise Exception("entities should be of type list")
            annotations_data = annotations_data[annotations_data["label"].isin(entities)] \
                .reset_index(drop=True)

        ## Parsing data
        data = self._get_tokenized_data(tokenizer=tokenizer,
                                        text_data=text_data,
                                        annotations_data=annotations_data,
                                        keep_empty=keep_empty)

        # Executing the function associated with the export format
        data_str = supported_export_format[export_format](data=data)

        return data_str
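
    # End-to-end usage sketch (folder and labels are hypothetical) :
    #
    #   brat = pandasToBrat("my_brat_project")
    #   conll_output = brat.export(export_format="conll-2003", entities=["Drug"])
    #   with open("export.conll", "w") as f:
    #       f.write(conll_output)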