supervise_guest_SignUpEnd.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. #!/usr/bin/env python
  2. #encoding:utf-8
  3. from deepdive import *
  4. import random
  5. from collections import namedtuple
  6. from commonutil import *
  7. Label = namedtuple('Label', 'entity_id, label, type')
  8. @tsv_extractor
  9. @returns(lambda
  10. entity_id = "text",
  11. label = "int",
  12. rule_id = "text",
  13. :[])
  14. # heuristic rules for finding positive/negative examples of transaction relationship mentions
  15. def supervise(
  16. entity_id="text", entity_begin="int", entity_end="int",
  17. doc_id="text", sentence_index="int", sentence_text="text",
  18. tokens="text[]", pos_tags="text[]", ner_tags="text[]",
  19. ):
  20. # Constants
  21. label = Label(entity_id=entity_id,label=None,type=None)
  22. Bidding = frozenset(["招标","报名","投标"])
  23. TIME_PREFIX = frozenset(["结束","截止"])
  24. TIME = frozenset(["时间"])
  25. SpecificTime = frozenset(["年","月"])
  26. MAX_DIST = 10
  27. TYPE_MENTION = frozenset(["org","company"])
  28. # Common data objects
  29. if entity_begin>MAX_DIST:
  30. begin = entity_begin-MAX_DIST
  31. else:
  32. begin = 0
  33. if len(tokens)-entity_end>MAX_DIST:
  34. end = entity_end+MAX_DIST
  35. else:
  36. end = -1
  37. front_tokens = tokens[begin:entity_begin]
  38. end_tokens = tokens[entity_end:end]
  39. front_ner = ner_tags[begin:entity_begin]
  40. end_ner = ner_tags[end:entity_end]
  41. log(sentence_text)
  42. # Rule: Candidates that are too far apart
  43. '''
  44. if len(intermediate_lemmas) > MAX_DIST:
  45. yield transaction._replace(label=-1, type='neg:far_apart')
  46. # Rule: Candidates that have a third company in between
  47. if 'company' in intermediate_ner_tags:
  48. yield transaction._replace(label=-1, type='neg:third_company_between')
  49. # Rule: Sentences that contain wife/husband in between
  50. # (<P1>)([ A-Za-z]+)(wife|husband)([ A-Za-z]+)(<P2>)
  51. #if len(TRANSLATION.intersection(intermediate_lemmas)) > 0 and len(STOCK.intersection(intermediate_lemmas)) > 0:
  52. # yield transaction._replace(label=1, type='A购买股权B')
  53. '''
  54. if len(Bidding.intersection(front_tokens))>0:
  55. if len(TIME.intersection(front_tokens))>0:
  56. if len(TIME_PREFIX.intersection(front_tokens))>=0:
  57. if find_index(list(SpecificTime),"".join(tokens[entity_begin:entity_end+1]))>=0:
  58. yield label._replace(label=1,type="投标截止时间:time")
  59. else:
  60. yield label._replace(label=-1,type="pos:非具体时间")
  61. else:
  62. yield label._replace(label=-1,type="pos:其他时间")
  63. else:
  64. yield label._replace(label=-1,type="pos:no match")
  65. else:
  66. yield label._replace(label=-1,type="pos:no match")