supervise_guest_BiddingAgency.py 3.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. #!/usr/bin/env python
  2. #encoding:utf-8
  3. from deepdive import *
  4. import random
  5. from collections import namedtuple
  6. import logging
  7. from commonutil import *
  8. Label = namedtuple('Label', 'entity_id, label, type')
  9. @tsv_extractor
  10. @returns(lambda
  11. entity_id = "text",
  12. label = "int",
  13. rule_id = "text",
  14. :[])
  15. # heuristic rules for finding positive/negative examples of transaction relationship mentions
  16. def supervise(
  17. entity_id="text", entity_begin="int", entity_end="int",
  18. doc_id="text", sentence_index="int", sentence_text="text",
  19. tokens="text[]", pos_tags="text[]", ner_tags="text[]",
  20. ):
  21. # Constants
  22. label = Label(entity_id=entity_id,label=None,type=None)
  23. Bidding = frozenset(["招标"])
  24. Agency = frozenset(["代理"])
  25. UNIT = frozenset(["单位","机构"])
  26. R_append = frozenset(["作为","以下","简称"])
  27. MAX_DIST = 15
  28. TYPE_MENTION = frozenset(["org","company"])
  29. # Common data objects
  30. if entity_begin>MAX_DIST:
  31. begin = entity_begin-MAX_DIST
  32. else:
  33. begin = 0
  34. if len(tokens)-entity_end>MAX_DIST:
  35. end = entity_end+MAX_DIST
  36. else:
  37. end = -1
  38. front_tokens = tokens[begin:entity_begin]
  39. end_tokens = tokens[entity_end:end]
  40. front_ner = ner_tags[begin:entity_begin]
  41. end_ner = ner_tags[end:entity_end]
  42. logging.info(sentence_text)
  43. # Rule: Candidates that are too far apart
  44. inter_bidd_front = Bidding.intersection(front_tokens)
  45. inter_unit_front = UNIT.intersection(front_tokens)
  46. inter_agent_front = Agency.intersection(front_tokens)
  47. inter_bidd_end = Bidding.intersection(end_tokens)
  48. inter_unit_end = UNIT.intersection(end_tokens)
  49. inter_agent_end = Agency.intersection(end_tokens)
  50. if len(inter_bidd_front)>0 and len(inter_unit_front)>0:
  51. if len(inter_agent_front)>0:
  52. if len(R_append.intersection(end_tokens))==0:
  53. index = find_index(combine(combine(list(inter_bidd_front),list(inter_agent_front)),list(inter_unit_front)),''.join(front_tokens))
  54. if index>=0:
  55. if len(TYPE_MENTION.intersection(front_ner[index:entity_begin]))>0:
  56. yield label._replace(label=-1,type="pos:third entity between")
  57. else:
  58. yield label._replace(label=1,type="招标代理机构:entity")
  59. else:
  60. yield label._replace(label=-1,type="pos:no match")
  61. else:
  62. yield label._replace(label=-1,type="pos:非代理机构")
  63. elif len(inter_bidd_end)>0 and len(inter_unit_end)>0:
  64. if len(inter_agent_end)>0:
  65. if len(R_append.intersection(end_tokens))>0:
  66. index = find_index(combine(combine(list(inter_bidd_end),list(inter_agent_end)),list(inter_unit_end)),''.join(end_tokens))
  67. if index>=0:
  68. if len(TYPE_MENTION.intersection(end_ner[entity_end+1:index]))>0:
  69. yield label._replace(label=-1,type="pos:third entity between")
  70. else:
  71. yield label._replace(label=1,type="entity:招标代理机构")
  72. else:
  73. yield label._replace(label=-1,type="pos:no match")
  74. else:
  75. yield label._replace(label=-1,type="pos:非代理机构")
  76. else:
  77. yield label._replace(label=-1,type="no match")