supervise_guest_BiddingAddress.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. #!/usr/bin/env python
  2. #encoding:utf-8
  3. from deepdive import *
  4. import random
  5. from collections import namedtuple
  6. from commonutil import *
  7. Label = namedtuple('Label', 'entity_id, label, type')
  8. @tsv_extractor
  9. @returns(lambda
  10. entity_id = "text",
  11. label = "int",
  12. rule_id = "text",
  13. :[])
  14. # heuristic rules for finding positive/negative examples of transaction relationship mentions
  15. def supervise(
  16. entity_id="text", entity_begin="int", entity_end="int",
  17. doc_id="text", sentence_index="int", sentence_text="text",
  18. tokens="text[]", pos_tags="text[]", ner_tags="text[]",
  19. ):
  20. # Constants
  21. label = Label(entity_id=entity_id,label=None,type=None)
  22. Bidding = frozenset(["开标"])
  23. LOCA = frozenset(["地点","地址"])
  24. MAX_DIST = 10
  25. TYPE_MENTION = frozenset(["org","company"])
  26. # Common data objects
  27. if entity_begin>MAX_DIST:
  28. begin = entity_begin-MAX_DIST
  29. else:
  30. begin = 0
  31. if len(tokens)-entity_end>MAX_DIST:
  32. end = entity_end+MAX_DIST
  33. else:
  34. end = -1
  35. front_tokens = tokens[begin:entity_begin]
  36. end_tokens = tokens[entity_end:end]
  37. front_ner = ner_tags[begin:entity_begin]
  38. end_ner = ner_tags[end:entity_end]
  39. log(sentence_text)
  40. # Rule: Candidates that are too far apart
  41. '''
  42. if len(intermediate_lemmas) > MAX_DIST:
  43. yield transaction._replace(label=-1, type='neg:far_apart')
  44. # Rule: Candidates that have a third company in between
  45. if 'company' in intermediate_ner_tags:
  46. yield transaction._replace(label=-1, type='neg:third_company_between')
  47. # Rule: Sentences that contain wife/husband in between
  48. # (<P1>)([ A-Za-z]+)(wife|husband)([ A-Za-z]+)(<P2>)
  49. #if len(TRANSLATION.intersection(intermediate_lemmas)) > 0 and len(STOCK.intersection(intermediate_lemmas)) > 0:
  50. # yield transaction._replace(label=1, type='A购买股权B')
  51. '''
  52. if len(LOCA.intersection(front_tokens))>0:
  53. if len(Bidding.intersection(front_tokens))>0:
  54. yield label._replace(label=1,type="开标地点:location")
  55. else:
  56. yield label._replace(label=-1,type="pos:no match 开标")
  57. else:
  58. yield label._replace(label=-1,type="pos:no match")