5 năm trước cách đây · 90726874f6
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,5 @@
 
				+1.1.1 (11-10-2020)
			
 
				+    Adding the option to filter export for specified entities.
			
 
				 1.1 (11-10-2020)
			
 
				     Adding : export feature to ConLL format
			
 
				     Adding : LICENSE
			
--- a/README.md
+++ b/README.md
@@ -229,7 +229,7 @@ The only currently supported format is ConLL-2003.
 
				 To export data, you can use the export method.
			
 
				 
			
 
				 ```
			
 
				-    bratData.export(export_format = EXPORT_FORMAT, tokenizer = TOKENIZER, keep_empty = KEEP_EMPTY_OPTION)
			
 
				+    bratData.export(export_format = EXPORT_FORMAT, tokenizer = TOKENIZER, entities = ENTITIES_OPTION, keep_empty = KEEP_EMPTY_OPTION)
			
 
				 ```
			
 
				 
			
 
				 The export_format parameter is used to specify the export format. The only supported format, which is also the default, is ConLL-2003.
			
@@ -246,5 +246,7 @@ You can also use Spacy tokenizer, in that case you should import the spacy_token
 
				     bratData.export(tokenizer = spacy_tokenizer_loaded)
			
 
				 ```
			
 
				 
			
 
				+You can restrict the export to a limited set of entities. To do so, specify the list of entities in the entities parameter. If set to None, which is the default value, all entities will be considered. If a word has several entities, the last one is kept.
			
 
				+
			
 
				 Finally, the keep_empty option is set to False by default. This means that all empty tokens will be removed from the exported data.
			
 
				 You can set it as True if you want to keep empty tokens.
			
--- a/__pycache__/extract_tools.cpython-37.pyc
+++ b/__pycache__/extract_tools.cpython-37.pyc
--- a/__pycache__/pandasToBrat.cpython-37.pyc
+++ b/__pycache__/pandasToBrat.cpython-37.pyc
--- a/pandasToBrat.py
+++ b/pandasToBrat.py
@@ -601,11 +601,15 @@ class pandasToBrat:
 
				         )
			
 
				         ann_offsets["start"] = ann_offsets["start"].astype(int)
			
 
				         ann_offsets["end"] = ann_offsets["end"].astype(int)
			
 
				-        ann_offsets["offsets"] = ann_offsets \
			
 
				-            .apply(
			
 
				-                lambda x: list(range(x["start"], x["end"]+1)), axis = 1
			
 
				-            )
			
 
				-        
			
 
				+
			
 
				+        if (ann_offsets.shape[0] > 0):
			
 
				+            ann_offsets["offsets"] = ann_offsets \
			
 
				+                .apply(
			
 
				+                    lambda x: list(range(x["start"], x["end"]+1)), axis = 1
			
 
				+                )
			
 
				+        else:
			
 
				+            ann_offsets["offsets"] = ''
			
 
				+
			
 
				         ann_offsets = ann_offsets[["id","ann_id", "offsets"]] \
			
 
				                         .explode("offsets")
			
 
				             
			
@@ -658,7 +662,7 @@ class pandasToBrat:
 
				         
			
 
				         return(output_df)
			
 
				 
			
 
				-    def export(self, export_format = "conll-2003", tokenizer = default_tokenizer, keep_empty = False):
			
 
				+    def export(self, export_format = "conll-2003", tokenizer = default_tokenizer, keep_empty = False, entities = None):
			
 
				 
			
 
				         '''
			
 
				             Function that generate an export file.
			
@@ -669,6 +673,7 @@ class pandasToBrat:
 
				                 export_format : name of the export format
			
 
				                 tokenizer : tokenizer function from extract_tools
			
 
				                 keep_empty : default False, parameter boolean, if True empty token are not removed, otherwise they are removed
			
 
				+                entities : if None, all entities are send to the export file, if there is the conflict the most recent is used, otherwise the entities are selected before
			
 
				             Output :
			
 
				                 str : output string in selected export format
			
 
				         '''
			
@@ -687,6 +692,14 @@ class pandasToBrat:
 
				         ## Getting data from brat
			
 
				         text_data = self.read_text()
			
 
				         annotations_data = self.read_annotation()["annotations"]
			
 
				+
			
 
				+        ## Filtering entities
			
 
				+        if entities is not None:
			
 
				+            if type(entities) != type(list()):
			
 
				+                raise Exception("entities should be of type list")
			
 
				+
			
 
				+            annotations_data = annotations_data[annotations_data["label"].isin(entities)] \
			
 
				+                                                                         .reset_index(drop = True)
			
 
				         
			
 
				         ## Parsing data
			
 
				         data = self._get_tokenized_data(tokenizer=tokenizer,