feature_entity_vec.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. #!/usr/bin/env python
  2. from deepdive import *
  3. import ddlib
  @tsv_extractor
  @returns(lambda
          entity_id   = "text",
          feature     = "text",
          window_size = "int",
      :[])
  def extract(
          entity_id          = "text",
          entity_begin_index = "int",
          entity_end_index   = "int",
          doc_id             = "text",
          sent_index         = "int",
          tokens             = "text[]",
          pos_tags           = "text[]",
          ner_tags           = "text[]",
          window_size        = "int",
      ):
      """Emit the tokens surrounding an entity mention as features.

      For one entity mention, yields one ``[entity_id, token, window_size]``
      row for each token found in the context window around the mention:
      up to ``window_size`` tokens immediately before ``entity_begin_index``
      followed by up to ``window_size`` tokens immediately after
      ``entity_end_index``.  Tokens belonging to the mention itself are never
      emitted.  The window is truncated at the sentence boundaries.

      ``entity_end_index`` is treated as inclusive (the index of the mention's
      last token).  An earlier ddlib-based variant of this extractor computed
      the span length as ``entity_end_index - entity_begin_index + 1``, which
      is what establishes that convention here.

      ``doc_id``, ``sent_index``, ``pos_tags`` and ``ner_tags`` are accepted
      but not used by the current implementation.
      NOTE(review): they presumably have to stay in the signature so that it
      matches the columns of the input query -- confirm before removing.

      The string defaults in the signature and in ``@returns`` are column type
      declarations for the deepdive decorators, not real default values.
      """
      # A non-positive window selects no context at all.  Clamping it here also
      # keeps the slice arithmetic below from producing negative bounds.
      span = max(window_size, 0)

      # Left context.  Slicing clamps at the end of a sequence on its own, but
      # a negative start would wrap around, so the lower bound needs a floor.
      left_start = max(entity_begin_index - span, 0)
      before = tokens[left_start:entity_begin_index]

      # Right context.  The end index is inclusive, so the context starts one
      # token past it; starting at entity_end_index itself would leak the
      # mention's last token into the features and drop one real context token.
      right_start = entity_end_index + 1
      after = tokens[right_start:right_start + span]

      for token in before + after:
          yield [entity_id, token, window_size]