supervise_guest_Person.py 1.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. #!/usr/bin/env python
  2. #encoding:utf-8
  3. from deepdive import *
  4. import random
  5. from collections import namedtuple
  6. from commonutil import *
  7. import re
  8. Label = namedtuple('Label', 'entity_id, label, type')
  9. @tsv_extractor
  10. @returns(lambda
  11. entity_id = "text",
  12. label = "int",
  13. rule_id = "text",
  14. :[])
  15. # heuristic rules for finding positive/negative examples of transaction relationship mentions
  16. def supervise(
  17. entity_id="text", entity_begin="int", entity_end="int",
  18. doc_id="text", sentence_index="int", sentence_text="text",
  19. tokens="text[]", pos_tags="text[]", ner_tags="text[]",
  20. ):
  21. # Constants
  22. label = Label(entity_id=entity_id,label=None,type=None)
  23. MAX_DIST = 10
  24. TYPE_MENTION = frozenset(["org","company"])
  25. # Common data objects
  26. if entity_begin>MAX_DIST:
  27. begin = entity_begin-MAX_DIST
  28. else:
  29. begin = 0
  30. if len(tokens)-entity_end>MAX_DIST:
  31. end = entity_end+MAX_DIST
  32. else:
  33. end = -1
  34. front_tokens = tokens[begin:entity_begin]
  35. end_tokens = tokens[entity_end:end]
  36. front_ner = ner_tags[begin:entity_begin]
  37. end_ner = ner_tags[end:entity_end]
  38. pattern_person = re.compile("联(\s*|,|,)?系(\s*|,|,)?(人|方式)|项目(经理|负责人)|经办人")
  39. if re.search(pattern_person,"".join(front_tokens)) is not None:
  40. yield label._replace(label=0,type="match")
  41. else:
  42. yield label._replace(label=1,type="not match")