#!/usr/bin/env python
#encoding:utf-8
"""Supervision rules that label candidate entity mentions.

Each input row describes one candidate entity inside a tokenized sentence.
``supervise`` yields exactly one ``Label`` per row: ``1`` when the text
surrounding the mention matches one of the patterns defined below, and ``-1``
otherwise.
"""
from deepdive import *
import random
from collections import namedtuple
from commonutil import *
import re

# One output row: the candidate's id, the label (1 / -1) and the name of the
# rule that produced the label.
Label = namedtuple('Label', 'entity_id, label, type')

# Number of tokens inspected on each side of the entity mention.
_MAX_DIST = 10

# Applied (anchored at the start) to the joined tokens that precede the
# mention: any run of non-'#' characters, then "第3/第三" followed either by
# "名" or by "中标 ... 候选 ... <机构|人|单位|企业|供应商>" with at most five
# non-'#' characters in each gap.
_BEFORE_ENTITY = re.compile(
    "[^#]*第[3三](名|[^#]{0,5}中标[^#]{0,5}候选[^#]{0,5}(机构|人|单位|企业|供应商))"
)

# Applied (anchored at the start) to the joined tokens that follow the
# mention.
# NOTE(review): the original pattern read "(?P.{3,30})", which is not valid
# regex syntax and raised re.error as soon as the first rule failed to match.
# The group name was presumably lost in transit; "entity" is a placeholder --
# the group is never read back, so the name does not affect matching.
_AFTER_ENTITY = re.compile(
    "(排名)?第[3三]的(?:是)?(?P<entity>.{3,30})(?:作)?为第[3三](?:中标)?(?:候选)?人"
)


@tsv_extractor
@returns(lambda
        entity_id = "text",
        label     = "int",
        rule_id   = "text",
    :[])
def supervise(
        entity_id="text", entity_begin="int", entity_end="int",
        doc_id="text", sentence_index="int", sentence_text="text",
        tokens="text[]", pos_tags="text[]", ner_tags="text[]",
    ):
    """Yield one heuristic label for a candidate entity mention.

    The parameter defaults are the column type declarations consumed by the
    ``tsv_extractor`` decorator; at run time each argument holds the value of
    the corresponding input column.

    Args:
        entity_id: Identifier copied into the emitted ``Label``.
        entity_begin: Token index where the mention starts; tokens before it
            form the "before" context.
        entity_end: Token index where the "after" context starts.
            NOTE(review): whether this index is inclusive or exclusive of the
            mention's last token is not visible here -- confirm with the
            candidate extractor.
        doc_id: Unused by the rules.
        sentence_index: Unused by the rules.
        sentence_text: Raw sentence; only passed to ``log``.
        tokens: Sentence tokens; joined without separators before matching.
        pos_tags: Unused by the rules.
        ner_tags: Unused by the rules.

    Yields:
        A single ``Label``: ``label=1`` with a rule name when the context
        before the mention matches ``_BEFORE_ENTITY`` or, failing that, the
        context after it matches ``_AFTER_ENTITY``; ``label=-1`` with
        ``"pos:no match"`` otherwise.
    """
    label = Label(entity_id=entity_id, label=None, type=None)

    # Clamp the context window to the sentence.  The upper bound used to be
    # -1 near the end of the sentence, which silently dropped the last token.
    window_start = max(0, entity_begin - _MAX_DIST)
    window_end = min(len(tokens), entity_end + _MAX_DIST)

    text_before = "".join(tokens[window_start:entity_begin])
    text_after = "".join(tokens[entity_end:window_end])

    log(sentence_text)

    if _BEFORE_ENTITY.match(text_before):
        yield label._replace(label=1, type="第三中标候选人:entity")
    elif _AFTER_ENTITY.match(text_after):
        yield label._replace(label=1, type="entity:作为第三中标候选人")
    else:
        yield label._replace(label=-1, type="pos:no match")