supervise_guest_Tenderer.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. #!/usr/bin/env python
  2. #encoding:utf-8
  3. from deepdive import *
  4. import random
  5. from collections import namedtuple
  6. from commonutil import *
  7. Label = namedtuple('Label', 'entity_id, label, type')
  8. @tsv_extractor
  9. @returns(lambda
  10. entity_id = "text",
  11. label = "int",
  12. rule_id = "text",
  13. :[])
  14. # heuristic rules for finding positive/negative examples of transaction relationship mentions
  15. def supervise(
  16. entity_id="text", entity_begin="int", entity_end="int",
  17. doc_id="text", sentence_index="int", sentence_text="text",
  18. tokens="text[]", pos_tags="text[]", ner_tags="text[]",
  19. ):
  20. # Constants
  21. label = Label(entity_id=entity_id,label=None,type=None)
  22. Bidding = frozenset(["招标"])
  23. Agency = frozenset(["代理"])
  24. UNIT = frozenset(["人","单位","机构"])
  25. R_append = frozenset(["作为","以下","简称"])
  26. MAX_DIST = 15
  27. TYPE_MENTION = frozenset(["org","company"])
  28. # Common data objects
  29. if entity_begin>MAX_DIST:
  30. begin = entity_begin-MAX_DIST
  31. else:
  32. begin = 0
  33. if len(tokens)-entity_end>MAX_DIST:
  34. end = entity_end+MAX_DIST
  35. else:
  36. end = -1
  37. front_tokens = tokens[begin:entity_begin]
  38. end_tokens = tokens[entity_end:end]
  39. front_ner = ner_tags[begin:entity_begin]
  40. end_ner = ner_tags[end:entity_end]
  41. log(sentence_text)
  42. # Rule: Candidates that are too far apart
  43. '''
  44. if len(intermediate_lemmas) > MAX_DIST:
  45. yield transaction._replace(label=-1, type='neg:far_apart')
  46. # Rule: Candidates that have a third company in between
  47. if 'company' in intermediate_ner_tags:
  48. yield transaction._replace(label=-1, type='neg:third_company_between')
  49. # Rule: Sentences that contain wife/husband in between
  50. # (<P1>)([ A-Za-z]+)(wife|husband)([ A-Za-z]+)(<P2>)
  51. #if len(TRANSLATION.intersection(intermediate_lemmas)) > 0 and len(STOCK.intersection(intermediate_lemmas)) > 0:
  52. # yield transaction._replace(label=1, type='A购买股权B')
  53. '''
  54. inter_bidd_front = Bidding.intersection(front_tokens)
  55. inter_unit_fron = UNIT.intersection(front_tokens)
  56. inter_bidd_end = Bidding.intersection(end_tokens)
  57. inter_unit_end = UNIT.intersection(end_tokens)
  58. #实体前向判断
  59. if len(inter_bidd_front)>0 and len(inter_unit_fron)>0:
  60. if len(Agency.intersection(front_tokens))==0:
  61. if len(R_append.intersection(end_tokens))==0:
  62. index = find_index(combine(list(inter_bidd_front),list(inter_unit_fron)),''.join(front_tokens))
  63. if index>=0:
  64. if len(TYPE_MENTION.intersection(ner_tags[index:entity_begin]))>0:
  65. yield label._replace(label=-1,type="pos: third entity between")
  66. else:
  67. yield label._replace(label=1,type="招标人/机构:entity")
  68. else:
  69. yield label._replace(label=-1,type="pos:no match")
  70. else:
  71. yield label._replace(label=-1,type="pos:代理机构")
  72. #实体后向判断
  73. elif len(inter_bidd_end)>0 and len(inter_unit_end)>0:
  74. if len(Agency.intersection(end_tokens))==0:
  75. if len(R_append.intersection(end_tokens))>0:
  76. index = find_index(combine(list(inter_bidd_end),list(inter_unit_end)),''.join(end_tokens))
  77. if index>=0:
  78. if len(TYPE_MENTION.intersection(ner_tags[entity_end+1:index]))>0:
  79. yield label._replace(label=-1,type="post:third entity between")
  80. else:
  81. yield label._replace(label=1,type="entity:以下简称招标人/机构")
  82. else:
  83. yield label._replace(label=-1,type="pos:代理机构")
  84. else:
  85. yield label._replace(label=-1,type="no match")