123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596 |
- #!/usr/bin/env python
- #encoding:utf-8
- from deepdive import *
- import random
- from collections import namedtuple
- from commonutil import *
# One supervision record: the candidate entity, its label (1 / -1, or None
# while unset) and the name of the heuristic rule that produced the label.
Label = namedtuple("Label", ("entity_id", "label", "type"))
@tsv_extractor
@returns(lambda
        entity_id = "text",
        label = "int",
        rule_id = "text",
    :[])
def supervise(
        entity_id="text", entity_begin="int", entity_end="int",
        doc_id="text", sentence_index="int", sentence_text="text",
        tokens="text[]", pos_tags="text[]", ner_tags="text[]",
    ):
    """Yield heuristic (distant-supervision) labels for one candidate entity.

    The rules look at a window of up to MAX_DIST tokens on each side of the
    entity for a bidding cue word together with a unit word, and label the
    entity as a positive (1) or negative (-1) example.

    Parameters (the string defaults are the column-type declarations of the
    input rows, following the DeepDive UDF convention):
        entity_id      -- identifier copied into every yielded Label.
        entity_begin   -- token index where the entity starts (used as the
                          exclusive end of the "front" window).
        entity_end     -- token index where the "end" window starts.
        sentence_text  -- raw sentence; only passed to log().
        tokens         -- tokens of the sentence.
        ner_tags       -- NER tag per token.
        doc_id, sentence_index, pos_tags -- accepted but not used here.

    Yields:
        Label tuples (entity_id, label, type) where label is 1 or -1 and
        type names the rule that fired.  Some paths yield nothing.
    """
    label = Label(entity_id=entity_id, label=None, type=None)

    # Cue-word vocabularies.
    BIDDING = frozenset(["招标"])
    AGENCY = frozenset(["代理"])
    UNIT = frozenset(["人", "单位", "机构"])
    R_APPEND = frozenset(["作为", "以下", "简称"])
    MAX_DIST = 15
    TYPE_MENTION = frozenset(["org", "company"])

    # Context windows around the entity, clamped to the sentence bounds.
    # The upper bound is len(tokens) rather than -1: a -1 slice end would
    # silently drop the last token of the sentence from the window.
    begin = max(0, entity_begin - MAX_DIST)
    end = min(len(tokens), entity_end + MAX_DIST)
    front_tokens = tokens[begin:entity_begin]
    end_tokens = tokens[entity_end:end]

    log(sentence_text)

    bidding_front = BIDDING.intersection(front_tokens)
    unit_front = UNIT.intersection(front_tokens)
    bidding_end = BIDDING.intersection(end_tokens)
    unit_end = UNIT.intersection(end_tokens)
    alias_after = not R_APPEND.isdisjoint(end_tokens)

    # NOTE(review): find_index / combine are defined outside this block.
    # find_index is given the *joined* token string, so its result looks like
    # a character offset inside the window, yet it is used below as an index
    # into ner_tags -- confirm against the helper's contract.

    if bidding_front and unit_front:
        # Forward check: bidding + unit cue words appear before the entity.
        if not AGENCY.isdisjoint(front_tokens):
            # An agency cue in front means this is the bidding agent.
            yield label._replace(label=-1, type="pos:代理机构")
        elif not alias_after:
            index = find_index(
                combine(list(bidding_front), list(unit_front)),
                ''.join(front_tokens))
            if index < 0:
                yield label._replace(label=-1, type="pos:no match")
            elif not TYPE_MENTION.isdisjoint(ner_tags[index:entity_begin]):
                # Another org/company sits between the cue and the entity.
                yield label._replace(label=-1, type="pos: third entity between")
            else:
                yield label._replace(label=1, type="招标人/机构:entity")
    elif bidding_end and unit_end:
        # Backward check: bidding + unit cue words appear after the entity.
        if not AGENCY.isdisjoint(end_tokens):
            yield label._replace(label=-1, type="pos:代理机构")
        elif alias_after:
            index = find_index(
                combine(list(bidding_end), list(unit_end)),
                ''.join(end_tokens))
            if index >= 0:
                if not TYPE_MENTION.isdisjoint(ner_tags[entity_end + 1:index]):
                    yield label._replace(label=-1, type="post:third entity between")
                else:
                    yield label._replace(label=1, type="entity:以下简称招标人/机构")
    else:
        # No bidding + unit cue pair on either side of the entity.
        yield label._replace(label=-1, type="no match")
-
|