
Upgrade to 1.1.

Ali 4 years ago
parent
commit
cef03a905d
8 changed files with 308 additions and 3 deletions
  1. CHANGES.txt (+3 -0)
  2. LICENSE (+21 -0)
  3. README.md (+29 -1)
  4. __pycache__/extract_tools.cpython-37.pyc (BIN)
  5. __pycache__/pandasToBrat.cpython-37.pyc (BIN)
  6. extract_tools.py (+75 -0)
  7. pandasToBrat.py (+179 -1)
  8. setup.py (+1 -1)

+ 3 - 0
CHANGES.txt

@@ -1 +1,4 @@
+1.1 (11-10-2020)
+    Adding : export feature to CoNLL-2003 format
+    Adding : LICENSE
 1.0 Initial release

+ 21 - 0
LICENSE

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) [year] [fullname]
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

+ 29 - 1
README.md

@@ -15,6 +15,7 @@ pandasToBrat is a library to manage brat configuration and brat data from a Pyth
 - Writing brat text file from python pandas Series
 - Reading brat annotations and relations 
 - Writing brat annotations and relations from python pandas DataFrame
+- Exporting data to the CoNLL-2003 format
 
 ### What it doesn't support ?
 
@@ -219,4 +220,31 @@ It should be formated as described in the "Relations format" subpart.
 The text_id and relation are the names of the columns inside the dataframe which contain the related data.
 The other columns should contain the type_id of related entities, as output by the read_annotation method.
 
-The overwrite option can be set as True to overwrite existing annotations, otherwise the dataframe's data are added to existing annotations data.
+The overwrite option can be set to True to overwrite existing annotations; otherwise, the dataframe's data are appended to the existing annotation data.
+
+### Export data to standard format
+
+The only currently supported format is CoNLL-2003.
+
+To export data, you can use the export method.
+
+```
+    bratData.export(export_format = EXPORT_FORMAT, tokenizer = TOKENIZER, keep_empty = KEEP_EMPTY_OPTION)
+```
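+
+The method returns the exported data as a string, which you can then write to a file, for example:
+
+```
+    conll_export = bratData.export()
+
+    with open("brat_export.conll", "w") as output_file:
+        output_file.write(conll_export)
+```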
+
+The export_format parameter specifies the export format. The only supported format, which is also the default, is CoNLL-2003.
+The tokenizer parameter takes a tokenizer function. Tokenizer functions are provided in pandasToBrat.extract_tools; their purpose is to generate tokens and POS tags from text. The default one, _default_tokenizer_, is the simplest: it splits on space and new line characters.
+You can also use the spaCy tokenizer; in that case, import the spacy_tokenizer function as demonstrated in this example:
+
+```
+    from pandasToBrat.extract_tools import spacy_tokenizer
+    import spacy
+
+    nlp = spacy.load(SPACY_MODEL)
+    spacy_tokenizer_loaded = spacy_tokenizer(nlp)
+
+    bratData.export(tokenizer = spacy_tokenizer_loaded)
+```
+
+Finally, the keep_empty option defaults to False, which means that empty tokens are removed from the exported data.
+You can set it to True if you want to keep empty tokens.
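+
+For illustration only, assuming the spaCy tokenizer and a hypothetical Drug entity annotated on the first token, the exported string looks like this: one token per line with its POS tag, chunk tag (always O) and entity tag, preceded by a -DOCSTART- header and with blank lines between documents.
+
+```
+    -DOCSTART- -X- -X- O
+
+    Aspirin PROPN O B-Drug
+    was AUX O O
+    prescribed VERB O O
+    . PUNCT O O
+```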

BIN
__pycache__/extract_tools.cpython-37.pyc


BIN
__pycache__/pandasToBrat.cpython-37.pyc


+ 75 - 0
extract_tools.py

@@ -0,0 +1,75 @@
+import pandas as pd
+import numpy as np
+import re
+
+##
+# extract_tools : functions used by the export functionality
+##
+
+
+###             ###
+### TOKENIZERS  ###
+###             ###
+
+#
+# Tokenizers : functions that split a string into tokens and compute POS tags
+#
+# How to write a Tokenizer?
+#   input : should be a sentence
+#   output : should be an array [[token, start offset, end offset, pos tag], ...]
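+#   example : default_tokenizer("Hello world") gives [['Hello', 0, 5, 'O'], ['world', 6, 11, 'O']]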
+#
+
+def default_tokenizer (x):
+
+    '''
+        default_tokenizer
+        The minimal tokenizer : splits on blank spaces and new line characters.
+
+        input : str, sentence
+        output : array, [[token, start offset, end offset, pos tag], ...]
+    '''
+
+    split_characters = " |\n"
+
+    tokens = pd.DataFrame(re.split(split_characters, x)).rename(columns = {0:'token'})
+
+    # Length of each token
+    tokens["size"] = tokens["token"].apply(len)
+
+    # Each token is followed by exactly one separator character (space or new line)
+    tokens["empty_space"] = 1
+    
+    # Cumulative sums of token sizes and separators, used to compute offsets
+    temp_cum_sum = tokens.cumsum()
+    
+    tokens["start_offset"] = (temp_cum_sum["size"]-tokens["size"])+temp_cum_sum["empty_space"]-1
+    tokens["end_offset"] = tokens["size"]+tokens["start_offset"]
+    tokens["pos_tag"] = "O"
+    
+    tokens_list = tokens[["token", "start_offset","end_offset", "pos_tag"]].values.tolist()
+    
+    return (tokens_list)
+
+def spacy_tokenizer(nlp):
+    
+    '''
+        Function that generates a tokenizer from a spaCy nlp object.
+
+        input : spaCy nlp object
+        output : tokenizer function for the export method of pandasToBrat
+    '''
+    
+    def _spacy_tokenizer(x):
+        
+        tokens_data = pd.DataFrame(nlp(x))
+        tokens_data["tokens"] = tokens_data[0].apply(lambda x: x.text)
+        tokens_data["size"] = tokens_data["tokens"].str.len()
+        tokens_data["start_offset"] = tokens_data[0].apply(lambda x: x.idx)
+        tokens_data["end_offset"] = tokens_data["start_offset"]+tokens_data["size"]
+        tokens_data["pos"] = tokens_data[0].apply(lambda x: x.pos_)        
+        
+        output_list = tokens_data[["tokens", "start_offset", "end_offset", "pos"]].values.tolist()
+        
+        return(output_list)
+    
+    return(_spacy_tokenizer)
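+
+# Example usage (the model name below is illustrative, any installed spaCy model works):
+#   nlp = spacy.load("en_core_web_sm")
+#   my_tokenizer = spacy_tokenizer(nlp)
+#   my_tokenizer("Aspirin was prescribed.")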

+ 179 - 1
pandasToBrat.py

@@ -2,6 +2,7 @@ import re
 import os
 import pandas as pd
 import numpy as np
+from pandasToBrat.extract_tools import default_tokenizer
 
 def _getDictionnaryKeys(dictionnary):
     """
@@ -519,4 +520,181 @@ class pandasToBrat:
             return(data)
             
         else:
-            raise ValueError('Incorrect variable type, expected a Pandas DF.')
+            raise ValueError('Incorrect variable type, expected a Pandas DF.')
+
+    def _export_conll_2003 (self, data):
+
+        '''
+            Internal function that converts tokenized data (columns : id, token, pos, chunks, label, i-type) to a CoNLL-2003 formatted string.
+        '''
+
+        # Creating i-label
+        data["i-label"] = (data["label"] != "O").astype(int)*(data["i-type"]+'-')+data["label"]
+        
+        # Building one "token pos chunk entity-tag" line per token
+        data["str"] = data[["token","pos","chunks","i-label"]].apply(lambda x: ' '.join(x), axis = 1)
+        
+        conll_str = "-DOCSTART- -X- -X- O"+"\n\n"+"\n\n".join(
+            data.groupby("id").agg(lambda x: "\n".join(x))["str"].values.tolist()
+        )
+        
+        return(conll_str)
+
+    def _get_tokenized_data(self, text_data, annotations_data, tokenizer = default_tokenizer, keep_empty = False):
+        
+        '''
+            Internal function that processes text and annotation data to compute tokens, POS tags and chunks.
+            
+            Input :
+                text_data : text data exported from the current class
+                annotations_data : annotations data exported from the current class
+                tokenizer : tokenizer function from extract_tools
+                keep_empty : boolean, default False ; if True, empty tokens are kept, otherwise they are removed
+            Output :
+                Aggregated data in a Pandas DataFrame.
+        '''
+
+        # Applying tokenizer to text
+        text_data["tokens"] = text_data["text_data"].apply(tokenizer)
+
+        # Exploding dataframe by tokens and renaming columns
+        exploded_text_data = text_data[["id", "tokens"]].explode("tokens").reset_index(drop = True)
+        exploded_text_data = exploded_text_data.join(
+            exploded_text_data["tokens"] \
+                .apply(pd.Series) \
+                .rename(columns = {
+                    0:'token',1:'start_offset',2:'end_offset', 3:'pos'
+                })
+        ) \
+        .drop(columns = ["tokens"])
+                
+        # Getting entities from annotations
+        
+        ## We merge by offset
+        
+        ### Creating a word id and annotation id
+        exploded_text_data = exploded_text_data \
+                            .reset_index(drop = True) \
+                            .reset_index() \
+                            .rename(columns = {"index":"word_id"})
+        
+        annotations_data = annotations_data \
+                            .reset_index() \
+                            .rename(columns = {"index":"ann_id"})
+        
+        ### Offset of string
+        text_offsets = pd.DataFrame(exploded_text_data[["id","word_id","start_offset","end_offset"]])
+
+        text_offsets["start_offset"] = text_offsets["start_offset"].astype(int)
+        text_offsets["end_offset"] = text_offsets["end_offset"].astype(int)
+        text_offsets["offsets"] = text_offsets \
+            .apply(
+                lambda x: list(range(x["start_offset"], x["end_offset"]+1)), axis = 1
+            )
+        text_offsets = text_offsets[["id","word_id", "offsets"]] \
+                        .explode("offsets")
+        
+        ### Offset of annotations
+        
+        ann_offsets = pd.DataFrame(
+            annotations_data[["id", "ann_id", "start", "end"]]
+        )
+        ann_offsets["start"] = ann_offsets["start"].astype(int)
+        ann_offsets["end"] = ann_offsets["end"].astype(int)
+        ann_offsets["offsets"] = ann_offsets \
+            .apply(
+                lambda x: list(range(x["start"], x["end"]+1)), axis = 1
+            )
+        
+        ann_offsets = ann_offsets[["id","ann_id", "offsets"]] \
+                        .explode("offsets")
+            
+        # Merging tokens and annotations on shared character offsets
+        
+        text_offsets["uid"] = text_offsets["id"].astype(str) \
+                            + text_offsets["offsets"].astype(str)
+        
+        ann_offsets["uid"] = ann_offsets["id"].astype(str) \
+                            + ann_offsets["offsets"].astype(str)
+        
+        merged_id = text_offsets \
+                    .join(
+                        ann_offsets[["ann_id","uid"]].set_index("uid"), 
+                        on = "uid"
+                    ) \
+                    .dropna()
+        merged_id["ann_id"] = merged_id["ann_id"].astype(int)
+        
+        merged_id = merged_id[["word_id", "ann_id"]] \
+            .set_index("ann_id") \
+            .drop_duplicates() \
+            .join(annotations_data, on = "ann_id")
+        
+        # Keeping last when duplicate word_id
+        
+        merged_id = merged_id \
+            .drop_duplicates("word_id", keep = "last")
+        
+        # Joining annotation with word id
+
+        output_df = exploded_text_data \
+            .join(merged_id[["label","word_id"]] \
+                .set_index("word_id"), 
+                on = "word_id",
+                how = "left") \
+            .fillna("O")[["id", "token","label", "pos"]]
+        
+        # Creating the i-type prefix : the first token of each document gets B, the others I ; tokens labelled O are reset to B
+        output_df["i-type"] = output_df \
+            .groupby("id").agg(lambda x: ["B"]+["I"]*(len(x)-1))["label"].explode().reset_index()["label"]
+        output_df.loc[output_df["label"] == "O", "i-type"] = 'B'
+        
+        # Empty chunks : no chunk information is computed, the chunk tag is always O
+        output_df["chunks"] = 'O'
+
+        # Post-processing : removing empty tokens unless keep_empty is True
+        if (keep_empty == False):
+            output_df = output_df[output_df["token"] != ''].reset_index(drop = True)
+        
+        return(output_df)
+
+    def export(self, export_format = "conll-2003", tokenizer = default_tokenizer, keep_empty = False):
+
+        '''
+            Function that generates an export string.
+            Supported export formats are :
+                - conll-2003
+
+            input :
+                export_format : name of the export format
+                tokenizer : tokenizer function from extract_tools
+                keep_empty : boolean, default False ; if True, empty tokens are kept, otherwise they are removed
+            Output :
+                str : output string in the selected export format
+        '''
+
+        supported_export_format = {
+            "conll-2003":self._export_conll_2003
+        }
+            
+        # Check the export format
+        if (export_format not in supported_export_format.keys()):
+            raise Exception(str(export_format)+" format not supported. Export format should be one of these : {}".format(
+            ", ".join(supported_export_format.keys())  
+            ))
+        
+        # Create dataframe of tokenized words associated with annotations
+        ## Getting data from brat
+        text_data = self.read_text()
+        annotations_data = self.read_annotation()["annotations"]
+        
+        ## Parsing data
+        data = self._get_tokenized_data(tokenizer=tokenizer, 
+                                    text_data = text_data, 
+                                    annotations_data = annotations_data, 
+                                    keep_empty = keep_empty)
+
+        # Execute the export format associated function
+        data_str = supported_export_format[export_format](data = data)
+
+        return(data_str)

+ 1 - 1
setup.py

@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
  
 setup(name='pandasToBrat',
  
-      version='1.0',
+      version='1.1',
       license='',
       author='Ali BELLAMINE',
       author_email='contact@alibellamine.me',