supervise_guest_SignUpStart.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. #!/usr/bin/env python
  2. #encoding:utf-8
  3. from deepdive import *
  4. import random
  5. from collections import namedtuple
  6. from commonutil import *
  7. Label = namedtuple('Label', 'entity_id, label, type')
  8. @tsv_extractor
  9. @returns(lambda
  10. entity_id = "text",
  11. label = "int",
  12. rule_id = "text",
  13. :[])
  14. # heuristic rules for finding positive/negative examples of transaction relationship mentions
  15. def supervise(
  16. entity_id="text", entity_begin="int", entity_end="int",
  17. doc_id="text", sentence_index="int", sentence_text="text",
  18. tokens="text[]", pos_tags="text[]", ner_tags="text[]",
  19. ):
  20. # Constants
  21. label = Label(entity_id=entity_id,label=None,type=None)
  22. Bidding = frozenset(["招标","报名","投标"])
  23. TIME_PREFIX = frozenset(["开始"])
  24. NOT_TIME_PREFIX = frozenset(["结束","截止"])
  25. TIME = frozenset(["时间"])
  26. SpecificTime = frozenset(["年","月"])
  27. MAX_DIST = 10
  28. TYPE_MENTION = frozenset(["org","company"])
  29. # Common data objects
  30. if entity_begin>MAX_DIST:
  31. begin = entity_begin-MAX_DIST
  32. else:
  33. begin = 0
  34. if len(tokens)-entity_end>MAX_DIST:
  35. end = entity_end+MAX_DIST
  36. else:
  37. end = -1
  38. front_tokens = tokens[begin:entity_begin]
  39. end_tokens = tokens[entity_end:end]
  40. front_ner = ner_tags[begin:entity_begin]
  41. end_ner = ner_tags[end:entity_end]
  42. log(sentence_text)
  43. # Rule: Candidates that are too far apart
  44. '''
  45. if len(intermediate_lemmas) > MAX_DIST:
  46. yield transaction._replace(label=-1, type='neg:far_apart')
  47. # Rule: Candidates that have a third company in between
  48. if 'company' in intermediate_ner_tags:
  49. yield transaction._replace(label=-1, type='neg:third_company_between')
  50. # Rule: Sentences that contain wife/husband in between
  51. # (<P1>)([ A-Za-z]+)(wife|husband)([ A-Za-z]+)(<P2>)
  52. #if len(TRANSLATION.intersection(intermediate_lemmas)) > 0 and len(STOCK.intersection(intermediate_lemmas)) > 0:
  53. # yield transaction._replace(label=1, type='A购买股权B')
  54. '''
  55. if len(Bidding.intersection(front_tokens))>0:
  56. if len(TIME.intersection(front_tokens))>0:
  57. if len(TIME_PREFIX.intersection(front_tokens))>=0 and len(NOT_TIME_PREFIX.intersection(front_tokens))==0:
  58. if find_index(list(SpecificTime),"".join(tokens[entity_begin:entity_end+1]))>=0:
  59. yield label._replace(label=1,type="投标开始时间:time")
  60. else:
  61. yield label._replace(label=-1,type="pos:非具体时间")
  62. else:
  63. yield label._replace(label=-1,type="pos:截止时间")
  64. else:
  65. yield label._replace(label=-1,type="pos:no match")
  66. else:
  67. yield label._replace(label=-1,type="pos:no match")