import pandas as pd
import numpy as np
import re

##
# extract_tools : functions used in the extract functionality
##

### ###
### TOKENIZERS ###
### ###

#
# Tokenizers : functions that split a string into tokens and compute POS tags
#
# How to write a tokenizer?
# input : a sentence (str)
# output : an array [[token, start offset, end offset, pos tag], ...]
#

def default_tokenizer(x):
    '''
    default_tokenizer
    The minimal tokenizer: splits on a blank space or a new line.
    input : str, sentence
    output : array, [[token, start_offset, end_offset, pos_tag], ...]
    '''
    split_characters = " |\n"
    tokens = pd.DataFrame(re.split(split_characters, x)).rename(columns={0: 'token'})
    # Length of each split token
    tokens["size"] = tokens["token"].apply(len)
    # Each token is followed by exactly one separator character
    tokens["empty_space"] = 1
    # Cumulative sums of token sizes and separators give the character offsets
    temp_cum_sum = tokens[["size", "empty_space"]].cumsum()
    tokens["start_offset"] = (temp_cum_sum["size"] - tokens["size"]) + temp_cum_sum["empty_space"] - 1
    tokens["end_offset"] = tokens["size"] + tokens["start_offset"]
    # No POS tagging is performed, every token gets the default "O" tag
    tokens["pos_tag"] = "O"
    tokens_list = tokens[["token", "start_offset", "end_offset", "pos_tag"]].values.tolist()
    return tokens_list

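# Illustrative usage (a sketch traced from the function above, not part of the
# original module): offsets are 0-based character positions in the input
# sentence, with the end offset exclusive.
#
#   default_tokenizer("Hello world")
#   # -> [['Hello', 0, 5, 'O'], ['world', 6, 11, 'O']]
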
def spacy_tokenizer(nlp):
    '''
    Function that generates a tokenizer from a spaCy nlp object.
    input : spacy nlp function
    output : tokenizer function for the export function of pandasToBrat
    '''
    def _spacy_tokenizer(x):
        # Each row of column 0 is a spaCy Token object
        tokens_data = pd.DataFrame(nlp(x))
        tokens_data["tokens"] = tokens_data[0].apply(lambda token: token.text)
        tokens_data["size"] = tokens_data["tokens"].str.len()
        tokens_data["start_offset"] = tokens_data[0].apply(lambda token: token.idx)
        tokens_data["end_offset"] = tokens_data["start_offset"] + tokens_data["size"]
        tokens_data["pos"] = tokens_data[0].apply(lambda token: token.pos_)
        output_list = tokens_data[["tokens", "start_offset", "end_offset", "pos"]].values.tolist()
        return output_list

    return _spacy_tokenizer
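
# Illustrative usage (a sketch, assuming the spaCy "en_core_web_sm" model is
# installed; any loaded spaCy pipeline exposing token.idx and token.pos_
# should work; the exact POS tags depend on the model used):
#
#   import spacy
#   nlp = spacy.load("en_core_web_sm")
#   tokenizer = spacy_tokenizer(nlp)
#   tokenizer("Hello world")
#   # -> e.g. [['Hello', 0, 5, 'INTJ'], ['world', 6, 11, 'NOUN']]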