#!/usr/bin/env python
#encoding:utf-8
from deepdive import *
import random
from collections import namedtuple
from commonutil import *
import re

# One output row: the entity id, its numeric label, and a tag saying whether
# any rule matched.
# NOTE(review): the third field is named `type` here while the extractor
# declares the column as `rule_id`; presumably rows are emitted positionally
# -- confirm against deepdive's tsv_extractor.
Label = namedtuple('Label', 'entity_id, label, type')

# How many tokens immediately before the entity are inspected for cue phrases.
_MAX_DIST = 10

# Cue phrases are compiled once at import time instead of once per input row.
# Raw strings keep the backslash escapes (\( \) \s) intact without relying on
# Python passing unknown escapes through unchanged.
_PATTERN_TENDEREE = re.compile(
    r"报价上限|限价|造价|控制(总?价|金额)|预算|概算|(?:造?价|投资|规模)预?估?算|预?估算?(?:造?价|投资|规模|金额)|(?:总|项目|计划)(?:[估预概]算|投资)|(?:投资|采购)(?:单价|总)?(?:额|金额)|投资约"
)
_PATTERN_WINTENDERER = re.compile(
    r"[\((]?(?:中标|成交|评标|评审|投标|报价|合同|入围)[\),)]?(候选人|单位|人|候选单位|供应商|候选)?后?[单总]?(?:价|金额|价格|报价|标价)|报价|第[一二三](中标|投标|候选|名|成交)|(公司\s*[::]?$)"
)


@tsv_extractor
@returns(lambda entity_id="text", label="int", rule_id="text": [])
def supervise(
    entity_id="text",
    entity_begin="int",
    entity_end="int",
    doc_id="text",
    sentence_index="int",
    sentence_text="text",
    tokens="text[]",
    pos_tags="text[]",
    ner_tags="text[]",
):
    """Label one entity mention from the tokens that precede it.

    The (up to) ``_MAX_DIST`` tokens directly before ``entity_begin`` are
    concatenated without separators and searched for cue phrases.  Exactly
    one ``Label`` row is yielded per call:

    * ``label=0, type="match"``     -- the tenderee pattern matches
      (budget / price-ceiling style phrases such as ``预算`` or ``限价``);
    * ``label=1, type="match"``     -- otherwise, the win-tenderer pattern
      matches (bid / quotation style phrases such as ``中标`` or ``报价``);
    * ``label=2, type="not match"`` -- neither pattern matches.

    The tenderee pattern is tried first, so a context matching both patterns
    is labelled 0.

    The parameter defaults are the column type declarations consumed by the
    deepdive decorators, not runtime default values.  ``entity_end``,
    ``doc_id``, ``sentence_index``, ``sentence_text``, ``pos_tags`` and
    ``ner_tags`` are accepted as input columns but not consulted by the
    current rules.
    """
    label = Label(entity_id=entity_id, label=None, type=None)

    # Clamp the window start at 0 so an entity near the start of the sentence
    # cannot produce a negative (wrap-around) slice index.
    begin = max(entity_begin - _MAX_DIST, 0)
    front_text = "".join(tokens[begin:entity_begin])

    if _PATTERN_TENDEREE.search(front_text) is not None:
        yield label._replace(label=0, type="match")
    elif _PATTERN_WINTENDERER.search(front_text) is not None:
        yield label._replace(label=1, type="match")
    else:
        yield label._replace(label=2, type="not match")