supervise_entrust.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. #!/usr/bin/env python
  2. #encoding:utf-8
  3. from deepdive import *
  4. import random
  5. from collections import namedtuple
  6. import logging
  7. TransactionLabel = namedtuple('TransactionLabel', 'p1_id, p2_id, label, type')
  8. @tsv_extractor
  9. @returns(lambda
  10. p1_id = "text",
  11. p2_id = "text",
  12. label = "int",
  13. rule_id = "text",
  14. :[])
  15. # heuristic rules for finding positive/negative examples of transaction relationship mentions
  16. def supervise(
  17. p1_id="text", p1_begin="int", p1_end="int",
  18. p2_id="text", p2_begin="int", p2_end="int",
  19. doc_id="text", sentence_index="int", sentence_text="text",
  20. tokens="text[]", lemmas="text[]", pos_tags="text[]", ner_tags="text[]",
  21. dep_types="text[]", dep_token_indexes="int[]",
  22. ):
  23. # Constants
  24. TRANSLATION = frozenset(["转让", "交易", "卖出", "购买","收购","购入","拥有", "持有", "卖给", "买入", "获得"])
  25. STOCK = frozenset(["股权", "股份", "股"])
  26. TRANSLATION_COM = frozenset(["持股","买股","卖股"])
  27. TRANSLATION_AFTER = frozenset(["融资", "投资"])
  28. OTHERS_AFTER = frozenset(["产品", "委托", "贷款", "保险"])
  29. OTHERS_COM = frozenset(["购买", "提供", "申请", "销售"])
  30. CONF = frozenset(["对", "向"])
  31. OTHERS = frozenset(["销售产品", "提供担保","提供服务"])
  32. COMMAS = frozenset([":", ":","1","2","3","4","5","6","7","8","9","0","、", ";", ";"])
  33. #FAMILY = frozenset(["mother", "father", "sister", "brother", "brother-in-law"])
  34. ENTRUST = frozenset(["委托"])
  35. BEEN = frozenset(["受"])
  36. MAX_DIST = 20
  37. # Common data objects
  38. p1_end_idx = min(p1_end, p2_end)
  39. p2_start_idx = max(p1_begin, p2_begin)
  40. if p2_begin==p2_start_idx:
  41. reverse = False
  42. else:
  43. reverse = True
  44. p2_end_idx = max(p1_end,p2_end)
  45. intermediate_lemmas = lemmas[p1_end_idx+1:p2_start_idx]
  46. intermediate_ner_tags = ner_tags[p1_end_idx+1:p2_start_idx]
  47. tail_lemmas = lemmas[p2_end_idx+1:]
  48. transaction = TransactionLabel(p1_id=p1_id, p2_id=p2_id, label=None, type=None)
  49. logging.info(sentence_text)
  50. # Rule: Candidates that are too far apart
  51. if len(intermediate_lemmas) > MAX_DIST:
  52. yield transaction._replace(label=-1, type='neg:far_apart')
  53. if 'company' in intermediate_ner_tags:
  54. yield transaction._replace(label=-1, type='neg:third_company_between')
  55. if len(BEEN.intersection(intermediate_lemmas))>0:
  56. if len(ENTRUST.intersection(tail_lemmas))>0:
  57. if reverse:
  58. yield transaction._replace(p1_id=p2_id,p2_id=p1_id,label=1,type="pos:A受B委托")
  59. else:
  60. yield transaction._replace(p1_id=p1_id,p2_id=p2_id,label=1,type="pos:A受B委托")