supervise_guest_BiddingOT.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. #!/usr/bin/env python
  2. #encoding:utf-8
  3. from deepdive import *
  4. import random
  5. from collections import namedtuple
  6. from commonutil import *
  7. Label = namedtuple('Label', 'entity_id, label, type')
  8. @tsv_extractor
  9. @returns(lambda
  10. entity_id = "text",
  11. label = "int",
  12. rule_id = "text",
  13. :[])
  14. # heuristic rules for finding positive/negative examples of transaction relationship mentions
  15. def supervise(
  16. entity_id="text", entity_begin="int", entity_end="int",
  17. doc_id="text", sentence_index="int", sentence_text="text",
  18. tokens="text[]", pos_tags="text[]", ner_tags="text[]",
  19. ):
  20. # Constants
  21. label = Label(entity_id=entity_id,label=None,type=None)
  22. Bidding = frozenset(["开标"])
  23. TIME = frozenset(["时间"])
  24. SpecificTime = frozenset(["年","月"])
  25. MAX_DIST = 10
  26. TYPE_MENTION = frozenset(["org","company"])
  27. # Common data objects
  28. if entity_begin>MAX_DIST:
  29. begin = entity_begin-MAX_DIST
  30. else:
  31. begin = 0
  32. if len(tokens)-entity_end>MAX_DIST:
  33. end = entity_end+MAX_DIST
  34. else:
  35. end = -1
  36. front_tokens = tokens[begin:entity_begin]
  37. end_tokens = tokens[entity_end:end]
  38. front_ner = ner_tags[begin:entity_begin]
  39. end_ner = ner_tags[end:entity_end]
  40. log(sentence_text)
  41. # Rule: Candidates that are too far apart
  42. '''
  43. if len(intermediate_lemmas) > MAX_DIST:
  44. yield transaction._replace(label=-1, type='neg:far_apart')
  45. # Rule: Candidates that have a third company in between
  46. if 'company' in intermediate_ner_tags:
  47. yield transaction._replace(label=-1, type='neg:third_company_between')
  48. # Rule: Sentences that contain wife/husband in between
  49. # (<P1>)([ A-Za-z]+)(wife|husband)([ A-Za-z]+)(<P2>)
  50. #if len(TRANSLATION.intersection(intermediate_lemmas)) > 0 and len(STOCK.intersection(intermediate_lemmas)) > 0:
  51. # yield transaction._replace(label=1, type='A购买股权B')
  52. '''
  53. if len(TIME.intersection(front_tokens))>0:
  54. if len(Bidding.intersection(front_tokens))>0:
  55. if find_index(list(SpecificTime),"".join(tokens[entity_begin:entity_end+1]))>=0:
  56. yield label._replace(label=1,type="开标:time")
  57. else:
  58. yield label._replace(label=-1,type="pos:非具体时间")
  59. else:
  60. yield label._replace(label=-1,type="pos:no match 开标")
  61. else:
  62. yield label._replace(label=-1,type="pos:no match")