
Upgrade to 1.1.1 : Adding filter of the extracted entities

Ali · 4 years ago · commit c3890ba16f
5 changed files with 24 additions and 7 deletions:
  1. CHANGES.txt (+2 -0)
  2. README.md (+3 -1)
  3. __pycache__/extract_tools.cpython-37.pyc (BIN)
  4. __pycache__/pandasToBrat.cpython-37.pyc (BIN)
  5. pandasToBrat.py (+19 -6)

+ 2 - 0
CHANGES.txt

@@ -1,3 +1,5 @@
+1.1.1 (11-10-2020)
+    Adding : option to filter the export to specified entities.
 1.1 (11-10-2020)
     Adding : export feature to ConLL format
     Adding : LICENSE

+ 3 - 1
README.md

@@ -229,7 +229,7 @@ The only currently supported format is ConLL-2003.
 To export data, you can use the export method.
 
 ```
-    bratData.export(export_format = EXPORT_FORMAT, tokenizer = TOKENIZER, keep_empty = KEEP_EMPTY_OPTION)
+    bratData.export(export_format = EXPORT_FORMAT, tokenizer = TOKENIZER, entities = ENTITIES_OPTION, keep_empty = KEEP_EMPTY_OPTION)
 ```
 
 The export_format parameter specifies the export format. The only supported format, which is also the default, is ConLL-2003.
@@ -246,5 +246,7 @@ You can also use Spacy tokenizer, in that case you should import the spacy_token
     bratData.export(tokenizer = spacy_tokenizer_loaded)
 ```
 
+You can restrict the export to a limited set of entities by passing a list of entity names in the entities parameter. If set to None, which is the default value, all entities are considered. If a word carries several entities, the last one is kept.
+
 Finally, the keep_empty option defaults to False, meaning every empty token is removed from the exported data.
 You can set it to True if you want to keep empty tokens.
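
A minimal usage sketch of the new parameter (the entity labels below are hypothetical, not taken from the project):

```
    drug_export = bratData.export(export_format = "conll-2003", entities = ["DRUG", "DOSE"])
```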

BIN
__pycache__/extract_tools.cpython-37.pyc


BIN
__pycache__/pandasToBrat.cpython-37.pyc


+ 19 - 6
pandasToBrat.py

@@ -601,11 +601,15 @@ class pandasToBrat:
         )
         ann_offsets["start"] = ann_offsets["start"].astype(int)
         ann_offsets["end"] = ann_offsets["end"].astype(int)
-        ann_offsets["offsets"] = ann_offsets \
-            .apply(
-                lambda x: list(range(x["start"], x["end"]+1)), axis = 1
-            )
-        
+
+        if ann_offsets.shape[0] > 0:
+            ann_offsets["offsets"] = ann_offsets \
+                .apply(
+                    lambda x: list(range(x["start"], x["end"]+1)), axis = 1
+                )
+        else:
+            ann_offsets["offsets"] = ''
+
         ann_offsets = ann_offsets[["id","ann_id", "offsets"]] \
                         .explode("offsets")
             
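The guard added above works around a pandas quirk: on an empty DataFrame, apply(..., axis = 1) may return an empty DataFrame rather than a Series, and assigning that result to a single column fails. A minimal standalone sketch of the issue (an assumption about the motivation, not taken from the commit):

```
import pandas as pd

# Empty frame: apply(axis=1) may come back as a DataFrame instead of a
# Series, so the column assignment raises a ValueError in some pandas versions.
empty = pd.DataFrame(columns = ["start", "end"])
try:
    empty["offsets"] = empty.apply(lambda x: list(range(x["start"], x["end"] + 1)), axis = 1)
except ValueError:
    empty["offsets"] = ''  # fall back to an empty column, as the commit does

# Non-empty frame: apply returns a Series of lists, which explode()
# then expands into one row per character offset.
filled = pd.DataFrame({"start": [0], "end": [2]})
filled["offsets"] = filled.apply(lambda x: list(range(x["start"], x["end"] + 1)), axis = 1)
print(filled.explode("offsets"))  # rows with offsets 0, 1, 2
```
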
@@ -658,7 +662,7 @@ class pandasToBrat:
         
         return(output_df)
 
-    def export(self, export_format = "conll-2003", tokenizer = default_tokenizer, keep_empty = False):
+    def export(self, export_format = "conll-2003", tokenizer = default_tokenizer, keep_empty = False, entities = None):
 
         '''
             Function that generates an export file.
@@ -669,6 +673,7 @@ class pandasToBrat:
                 export_format : name of the export format
                 tokenizer : tokenizer function from extract_tools
                 keep_empty : boolean, default False; if True, empty tokens are kept, otherwise they are removed
+                entities : list or None, default None; if None, all entities are exported, otherwise only the listed entities are kept. When a token carries conflicting entities, the most recent one is used.
             Output :
                 str : output string in selected export format
         '''
@@ -687,6 +692,14 @@ class pandasToBrat:
         ## Getting data from brat
         text_data = self.read_text()
         annotations_data = self.read_annotation()["annotations"]
+
+        ## Filtering entities
+        if entities is not None:
+            if not isinstance(entities, list):
+                raise TypeError("entities should be of type list")
+
+            annotations_data = annotations_data[annotations_data["label"].isin(entities)] \
+                .reset_index(drop = True)
         
         ## Parsing data
         data = self._get_tokenized_data(tokenizer=tokenizer,
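
A minimal sketch of the new filtering step in isolation (the frame and labels below are hypothetical stand-ins for read_annotation()["annotations"]):

```
import pandas as pd

# Hypothetical annotations frame with the same "label" column
# that pandasToBrat reads back from the brat .ann files.
annotations_data = pd.DataFrame({
    "label": ["DRUG", "DOSE", "DRUG"],
    "word":  ["aspirin", "100mg", "ibuprofen"],
})

entities = ["DRUG"]
if not isinstance(entities, list):
    raise TypeError("entities should be of type list")

# Keep only the requested entity labels and rebuild a clean 0..n-1 index.
annotations_data = annotations_data[annotations_data["label"].isin(entities)] \
    .reset_index(drop = True)
print(annotations_data)  # two DRUG rows, index 0 and 1
```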