feature_entity.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  1. #!/usr/bin/env python
  2. from deepdive import *
  3. import ddlib
  @tsv_extractor
  @returns(lambda
          entity_id="text",
          feature="text",
          window_size="int",
      : [])
  def extract(
          entity_id="text",
          entity_begin_index="int",
          entity_end_index="int",
          doc_id="text",
          sent_index="int",
          tokens="text[]",
          pos_tags="text[]",
          ner_tags="text[]",
          window_size="int",
      ):
      """
      Yield generic DDLIB features for a single entity mention.

      The string defaults on the parameters (and on the ``@returns`` lambda)
      are column type declarations; presumably they are consumed by the
      deepdive decorators -- confirm against the deepdive UDF documentation.

      Parameters used by the body:
        entity_id          -- copied unchanged into every output row.
        entity_begin_index -- index of the first token of the mention.
        entity_end_index   -- index of the last token of the mention
                              (treated as inclusive).
        tokens, pos_tags, ner_tags -- per-token arrays, indexed in parallel.
        window_size        -- passed to DDLIB as ``window`` and copied into
                              every output row.

      ``doc_id`` and ``sent_index`` are accepted but not used here.

      Yields one ``[entity_id, feature, window_size]`` list per feature
      produced by ``ddlib.get_generic_features_mention``.
      """
      # A DDLIB sentence is a plain list of ddlib.Word objects, one per token.
      # The input has no lemma or dependency columns, so the surface token
      # doubles as the lemma and every word gets dep_par=-1 (DDLIB's ROOT
      # marker; CoreNLP stores ROOT as 0) with an empty dependency label.
      sentence = [
          ddlib.Word(
              begin_char_offset=None,
              end_char_offset=None,
              word=token,
              lemma=token,
              pos=pos_tags[position],
              ner=ner_tags[position],
              dep_par=-1,
              dep_label='')
          for position, token in enumerate(tokens)
      ]

      # The end index is inclusive, hence the +1 when computing the length.
      mention_length = entity_end_index - entity_begin_index + 1
      mention_span = ddlib.Span(
          begin_word_id=entity_begin_index,
          length=mention_length)

      generated = ddlib.get_generic_features_mention(
          sentence, mention_span, window=window_size)
      for generated_feature in generated:
          yield [entity_id, generated_feature, window_size]