supervise_guest_SecondCandidate.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. #!/usr/bin/env python
  2. #encoding:utf-8
  3. from deepdive import *
  4. import random
  5. from collections import namedtuple
  6. from commonutil import *
  7. import re
  8. Label = namedtuple('Label', 'entity_id, label, type')
  9. @tsv_extractor
  10. @returns(lambda
  11. entity_id = "text",
  12. label = "int",
  13. rule_id = "text",
  14. :[])
  15. # heuristic rules for finding positive/negative examples of transaction relationship mentions
  16. def supervise(
  17. entity_id="text", entity_begin="int", entity_end="int",
  18. doc_id="text", sentence_index="int", sentence_text="text",
  19. tokens="text[]", pos_tags="text[]", ner_tags="text[]",
  20. ):
  21. # Constants
  22. label = Label(entity_id=entity_id,label=None,type=None)
  23. SORT = frozenset(["第二"])
  24. Bidding = frozenset(["中标",])
  25. CANDIDATE = frozenset(["候选人","候选单位"])
  26. MAX_DIST = 10
  27. TYPE_MENTION = frozenset(["org","company"])
  28. # Common data objects
  29. if entity_begin>MAX_DIST:
  30. begin = entity_begin-MAX_DIST
  31. else:
  32. begin = 0
  33. if len(tokens)-entity_end>MAX_DIST:
  34. end = entity_end+MAX_DIST
  35. else:
  36. end = -1
  37. front_tokens = tokens[begin:entity_begin]
  38. end_tokens = tokens[entity_end:end]
  39. front_ner = ner_tags[begin:entity_begin]
  40. end_ner = ner_tags[end:entity_end]
  41. log(sentence_text)
  42. patten1 = "[^#]*第[2二](名|[^#]{0,5}中标[^#]{0,5}候选[^#]{0,5}(机构|人|单位|企业|供应商))"
  43. patten2 = "(排名)?第[2二]的(?:是)?(?P<element0>.{3,30})(?:作)?为第[2二](?:中标)?(?:候选)?人"
  44. if re.match(patten1,"".join(front_tokens)):
  45. yield label._replace(label=1,type="第二中标候选人:entity")
  46. elif re.match(patten2,"".join(end_tokens)):
  47. yield label._replace(label=1,type="entity:作为第二中标候选人")
  48. else:
  49. yield label._replace(label=-1,type="pos:no match")
  50. # Rule: Candidates that are too far apart
  51. '''
  52. if len(SORT.intersection(front_tokens))>0 and len(Bidding.intersection(front_tokens))>0 and len(CANDIDATE.intersection(front_tokens))>0:
  53. yield label._replace(label=1,type="第二中标候选人")
  54. else:
  55. yield label._replace(label=-1,type="pos:no match")
  56. '''