#!/usr/bin/env python
#encoding:utf-8
from deepdive import *
import random
from collections import namedtuple
from commonutil import *

# One supervision record: the candidate entity, its label (+1 / -1) and the
# name of the rule that produced the label (stored in the `type` field).
Label = namedtuple('Label', 'entity_id, label, type')

# Cue words, built once at import time instead of on every input row.
BIDDING = frozenset(["招标"])                  # "bidding / tender"
AGENCY = frozenset(["代理"])                   # "agency"
UNIT = frozenset(["人", "单位", "机构"])        # "person", "unit", "organization"
R_APPEND = frozenset(["作为", "以下", "简称"])  # "as", "hereinafter", "referred to as"
MAX_DIST = 15                                  # context window size, in tokens
TYPE_MENTION = frozenset(["org", "company"])   # NER tags treated as competing entities


@tsv_extractor
@returns(lambda
        entity_id = "text",
        label     = "int",
        rule_id   = "text",
    :[])
# heuristic rules for finding positive/negative examples of transaction relationship mentions
def supervise(
        entity_id="text", entity_begin="int", entity_end="int",
        doc_id="text", sentence_index="int", sentence_text="text",
        tokens="text[]", pos_tags="text[]", ner_tags="text[]",
    ):
    """Yield distant-supervision labels for one candidate entity mention.

    The string defaults in the signature are column type declarations
    (presumably consumed by the ``@tsv_extractor`` / ``@returns`` decorators
    -- confirm against the DeepDive ddlib docs); at run time the arguments
    hold the parsed values of one input row.

    A window of up to ``MAX_DIST`` tokens on each side of the entity is
    inspected:

    * Forward rule -- the window before the entity contains a bidding word
      and a unit word.  An agency word there gives a negative label;
      otherwise, if no "hereinafter"-style word follows the entity, the cue
      phrase is located with ``find_index`` and the label is positive unless
      an ``org``/``company`` NER tag lies between the cue and the entity.
    * Backward rule (only tried if the forward rule did not apply) -- the
      same idea for the window after the entity, requiring a
      "hereinafter"-style word to be present.
    * Otherwise a negative ``"no match"`` label is produced.

    Some paths intentionally yield nothing (e.g. forward cue found but a
    "hereinafter" word follows the entity), leaving the candidate unlabeled.

    Yields:
        Label: ``entity_id`` with ``label`` in {1, -1} and ``type`` set to
        the rule name.
    """
    label = Label(entity_id=entity_id, label=None, type=None)

    # Context windows, clamped to the sentence boundaries.  The tail bound
    # must be len(tokens): using -1 as "end of sentence" would silently drop
    # the last token from the slice.
    begin = max(0, entity_begin - MAX_DIST)
    end = min(len(tokens), entity_end + MAX_DIST)
    front_tokens = tokens[begin:entity_begin]
    # NOTE(review): this slice starts AT entity_end while the NER slice below
    # starts at entity_end + 1 -- confirm whether entity_end is inclusive.
    end_tokens = tokens[entity_end:end]

    log(sentence_text)

    bidding_front = BIDDING.intersection(front_tokens)
    unit_front = UNIT.intersection(front_tokens)
    bidding_end = BIDDING.intersection(end_tokens)
    unit_end = UNIT.intersection(end_tokens)

    # Forward check: bidding + unit cue words appear before the entity.
    if bidding_front and unit_front:
        if AGENCY.intersection(front_tokens):
            yield label._replace(label=-1, type="pos:代理机构")
        elif not R_APPEND.intersection(end_tokens):
            # NOTE(review): find_index is given the joined window text, so the
            # returned index may be a character offset relative to the window,
            # yet it is used as a token index into the full ner_tags list --
            # verify against commonutil.find_index.
            index = find_index(
                combine(list(bidding_front), list(unit_front)),
                ''.join(front_tokens))
            if index < 0:
                yield label._replace(label=-1, type="pos:no match")
            elif TYPE_MENTION.intersection(ner_tags[index:entity_begin]):
                yield label._replace(label=-1, type="pos: third entity between")
            else:
                yield label._replace(label=1, type="招标人/机构:entity")
    # Backward check: bidding + unit cue words appear after the entity.
    elif bidding_end and unit_end:
        if AGENCY.intersection(end_tokens):
            yield label._replace(label=-1, type="pos:代理机构")
        elif R_APPEND.intersection(end_tokens):
            # NOTE(review): same index-space caveat as in the forward check.
            index = find_index(
                combine(list(bidding_end), list(unit_end)),
                ''.join(end_tokens))
            if index >= 0:
                if TYPE_MENTION.intersection(ner_tags[entity_end + 1:index]):
                    yield label._replace(label=-1, type="post:third entity between")
                else:
                    yield label._replace(label=1, type="entity:以下简称招标人/机构")
    else:
        yield label._replace(label=-1, type="no match")