# lex_features.py (2.0 KB)
  1. from featureforge.feature import output_schema
  2. @output_schema({str})
  3. def bag_of_left_entity_IOB_chain(datapoint):
  4. print (hash(datapoint))
  5. eo = datapoint.left_entity_occurrence
  6. return set()
  7. def _bag_of_eo_IOB_chain(datapoint, eo):
  8. tokens = datapoint.segment.tokens
  9. eo_tokens = tokens[eo.segment_offset: eo.segment_offset_end]
  10. result = set()
  11. lex_trees = datapoint.segment.let_trees
  12. if not lex_trees:
  13. return set() # was not parsed on preprocess
  14. sentences = datapoint.segment.sentences
  15. for idx, eo_tk in enumerate(eo_tokens, eo.segment_offset):
  16. sentence = max(filter(lambda x: x<=idx, sentences))
  17. sentence_idx = sentences.index(sentence)
  18. tree = lex_trees[sentence_idx]
  19. tk_actual_idx = idx - sentence
  20. assert tk_actual_idx >= 0
  21. path = tree.leaf_treeposition(tk_actual_idx)
  22. #chain =
  23. #######
  24. def walk_tree(tree, path):
  25. result = tree
  26. for i in path:
  27. result = result[i]
  28. return result
  29. def chunk_tag(evidence):
  30. result = set()
  31. tree = evidence.segment.lex_trees[0]
  32. for i, _ in enumerate(tree.leaves()):
  33. path = tree.leaf_treeposition(i)
  34. parent = walk_tree(tree, path[:-2])
  35. parent_label = parent.label()
  36. position_in_sentence = path[-2]
  37. if parent_label == "S":
  38. tag = "O"
  39. else:
  40. modifier = "B" if position_in_sentence == 0 else "I"
  41. tag = "{}-{}".format(modifier, parent_label)
  42. result.add(tag)
  43. return result
  44. def iob_chain(evidence):
  45. result = set()
  46. tree = evidence.segment.lex_trees[0]
  47. for i, _ in enumerate(tree.leaves()):
  48. path = tree.leaf_treeposition(i)[:-1]
  49. chain = []
  50. subtree = tree
  51. for (step, next_step) in zip(path, path[1:]):
  52. subtree = subtree[step]
  53. modifier = "B" if next_step == 0 else "I"
  54. tag = "{}-{}".format(modifier, subtree.label())
  55. chain.append(tag)
  56. result.add("/".join(chain))
  57. return result