feature_entity_pair.py

#!/usr/bin/env python
from deepdive import *
import ddlib

@tsv_extractor
# Output columns of each emitted TSV row.
@returns(lambda
        p1_id       = "text",
        p2_id       = "text",
        feature     = "text",
        window_size = "int",
    :[])
# Input columns; the default values declare the DeepDive column types.
def extract(
        p1_id          = "text",
        p2_id          = "text",
        p1_begin_index = "int",
        p1_end_index   = "int",
        p2_begin_index = "int",
        p2_end_index   = "int",
        doc_id         = "text",
        sent_index     = "int",
        tokens         = "text[]",
        lemmas         = "text[]",
        pos_tags       = "text[]",
        ner_tags       = "text[]",
        dep_types      = "text[]",
        dep_parents    = "int[]",
        window_size    = "int",
    ):
    """
    Uses DDLIB to generate features for the spouse relation.
    """
    # Create a DDLIB sentence object, which is just a list of DDLIB Word objects
    sent = []
    for i, t in enumerate(tokens):
        sent.append(ddlib.Word(
            begin_char_offset=None,
            end_char_offset=None,
            word=t,
            lemma=lemmas[i],
            pos=pos_tags[i],
            ner=ner_tags[i],
            # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT
            dep_par=dep_parents[i] - 1,
            dep_label=dep_types[i]))
    # Create DDLIB Spans for the two person mentions
    p1_span = ddlib.Span(begin_word_id=p1_begin_index, length=(p1_end_index - p1_begin_index + 1))
    p2_span = ddlib.Span(begin_word_id=p2_begin_index, length=(p2_end_index - p2_begin_index + 1))

    # Generate the generic features using DDLIB
    for feature in ddlib.get_generic_features_relation(sent, p1_span, p2_span, window=window_size):
        yield [p1_id, p2_id, feature, window_size]
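
For a quick sanity check of the feature generation outside DeepDive, a minimal sketch along these lines can be run directly. It assumes the stock ddlib API (the window keyword used above may come from a locally modified ddlib), and the toy sentence, tags, and variable names below are purely illustrative:

# Standalone sketch, not part of the UDF above; illustrative values only.
import ddlib

toy_tokens = ["Barack", "married", "Michelle"]
toy_lemmas = ["Barack", "marry", "Michelle"]
toy_pos    = ["NNP", "VBD", "NNP"]
toy_ner    = ["PERSON", "O", "PERSON"]

toy_sent = [ddlib.Word(begin_char_offset=None, end_char_offset=None,
                       word=t, lemma=toy_lemmas[i], pos=toy_pos[i], ner=toy_ner[i],
                       dep_par=-1, dep_label='')  # no parse: every word hangs off ROOT
            for i, t in enumerate(toy_tokens)]

p1 = ddlib.Span(begin_word_id=0, length=1)  # "Barack"
p2 = ddlib.Span(begin_word_id=2, length=1)  # "Michelle"

# Print the generic relation features ddlib emits for this mention pair
for feature in ddlib.get_generic_features_relation(toy_sent, p1, p2):
    print(feature)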