import pandas as pd
import numpy as np
import re

##
# extract_tools : functions used in the extract functionality
##

### ###
### TOKENIZERS ###
### ###

#
# Tokenizers : functions that split a string into tokens and compute POS tags
#
# How to write a tokenizer?
# input : a sentence (str)
# output : an array [[token, start offset, end offset, pos tag], ...]
#

def default_tokenizer(x):
    '''
    default_tokenizer
    The minimal tokenizer: splits on a blank space or a new line.
    input : str, sentence
    output : array, [[token, start_offset, end_offset, pos_tag], ...]
    '''
    split_characters = " |\n"
    tokens = pd.DataFrame(re.split(split_characters, x)).rename(columns={0: 'token'})
    # Length of each split token
    tokens["size"] = tokens["token"].apply(len)
    # Each token is followed by exactly one separator character
    tokens["empty_space"] = 1
    # Cumulative sums of token sizes and separators give the character offsets
    temp_cum_sum = tokens[["size", "empty_space"]].cumsum()
    tokens["start_offset"] = (temp_cum_sum["size"] - tokens["size"]) + temp_cum_sum["empty_space"] - 1
    tokens["end_offset"] = tokens["size"] + tokens["start_offset"]
    # No POS tagging is performed, every token gets the default "O" tag
    tokens["pos_tag"] = "O"
    tokens_list = tokens[["token", "start_offset", "end_offset", "pos_tag"]].values.tolist()
    return tokens_list

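# Illustrative usage (a sketch traced from the function above, not part of the
# original module): offsets are 0-based character positions in the input
# sentence, with the end offset exclusive.
#
#   default_tokenizer("Hello world")
#   # -> [['Hello', 0, 5, 'O'], ['world', 6, 11, 'O']]
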
def spacy_tokenizer(nlp):
    '''
    Function that generates a tokenizer from a spaCy nlp object.
    input : spacy nlp function
    output : tokenizer function for the export function of pandasToBrat
    '''
    def _spacy_tokenizer(x):
        # Each row of column 0 is a spaCy Token object
        tokens_data = pd.DataFrame(nlp(x))
        tokens_data["tokens"] = tokens_data[0].apply(lambda token: token.text)
        tokens_data["size"] = tokens_data["tokens"].str.len()
        tokens_data["start_offset"] = tokens_data[0].apply(lambda token: token.idx)
        tokens_data["end_offset"] = tokens_data["start_offset"] + tokens_data["size"]
        tokens_data["pos"] = tokens_data[0].apply(lambda token: token.pos_)
        output_list = tokens_data[["tokens", "start_offset", "end_offset", "pos"]].values.tolist()
        return output_list

    return _spacy_tokenizer
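
# Illustrative usage (a sketch, assuming the spaCy "en_core_web_sm" model is
# installed; any loaded spaCy pipeline exposing token.idx and token.pos_
# should work; the exact POS tags depend on the model used):
#
#   import spacy
#   nlp = spacy.load("en_core_web_sm")
#   tokenizer = spacy_tokenizer(nlp)
#   tokenizer("Hello world")
#   # -> e.g. [['Hello', 0, 5, 'INTJ'], ['world', 6, 11, 'NOUN']]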