supervise_guest_WinnerOfBidding.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. #!/usr/bin/env python
  2. #encoding:utf-8
  3. from deepdive import *
  4. import random
  5. from collections import namedtuple
  6. from commonutil import *
  7. import re
  8. Label = namedtuple('Label', 'entity_id, label, type')
  9. @tsv_extractor
  10. @returns(lambda
  11. entity_id = "text",
  12. label = "int",
  13. rule_id = "text",
  14. :[])
  15. # heuristic rules for finding positive/negative examples of transaction relationship mentions
  16. def supervise(
  17. entity_id="text", entity_begin="int", entity_end="int",
  18. doc_id="text", sentence_index="int", sentence_text="text",
  19. tokens="text[]", pos_tags="text[]", ner_tags="text[]",
  20. ):
  21. # Constants
  22. label = Label(entity_id=entity_id,label=None,type=None)
  23. Bidding = frozenset(["中标","中标人","中"])
  24. LOCA = frozenset(["单位","人","中标人","标人"])
  25. MAX_DIST = 15
  26. TYPE_MENTION = frozenset(["org","company"])
  27. # Common data objects
  28. if entity_begin>MAX_DIST:
  29. begin = entity_begin-MAX_DIST
  30. else:
  31. begin = 0
  32. if len(tokens)-entity_end>MAX_DIST:
  33. end = entity_end+MAX_DIST
  34. else:
  35. end = -1
  36. front_tokens = tokens[begin:entity_begin]
  37. end_tokens = tokens[entity_end:end]
  38. front_ner = ner_tags[begin:entity_begin]
  39. end_ner = ner_tags[end:entity_end]
  40. log(sentence_text)
  41. # Rule: Candidates that are too far apart
  42. patten = "(中标[人|单位|机构|])"
  43. sear = re.search(patten,"".join(front_tokens))
  44. if sear:
  45. if len(TYPE_MENTION.intersection(front_ner[get_index_in_tokens(sear.end(0),front_tokens):]))>0:
  46. yield label._replace(label=-1,type="pos:third entity between")
  47. else:
  48. yield label._replace(label=1,type="中标单位:entity")
  49. else:
  50. yield label._replace(label=-1,type="pos:no match")
  51. '''
  52. if len(LOCA.intersection(front_tokens))>0:
  53. if len(Bidding.intersection(front_tokens))>0:
  54. comb = combine(list(Bidding),list(LOCA))
  55. index = find_index(comb,"".join(front_tokens))
  56. if index>0:
  57. if len(TYPE_MENTION.intersection(front_ner[index:entity_begin]))>0:
  58. yield label._replace(label=-1,type="pos:third entity between")
  59. else:
  60. yield label._replace(label=1,type="中标单位")
  61. else:
  62. yield label._replace(label=-1,type="pos:not match all")
  63. else:
  64. yield label._replace(label=-1,type="pos:no match 单位")
  65. else:
  66. yield label._replace(label=-1,type="pos:no match")
  67. '''