123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354 |
- #!/usr/bin/env python
- #encoding:utf-8
- from deepdive import *
- import random
- from collections import namedtuple
- from commonutil import *
- import re
# Row emitted by the supervision extractor below: the entity being labelled,
# the integer label assigned to it, and a short tag naming the outcome
# ("match" / "not match").  The third field lands in the column that the
# extractor's @returns declaration names `rule_id`.
Label = namedtuple('Label', ['entity_id', 'label', 'type'])
# How many tokens immediately before the mention the heuristics inspect.
_MAX_DIST = 10

# Cue phrases for a budget / ceiling / estimated amount (e.g. budget, price
# limit, estimated investment).  Compiled once at import time; the pattern
# text is unchanged, only written as a raw string so that the backslash
# escapes are not treated as (invalid) string escapes.
_PATTERN_TENDEREE = re.compile(r"报价上限|限价|造价|控制(总?价|金额)|预算|概算|(?:造?价|投资|规模)预?估?算|预?估算?(?:造?价|投资|规模|金额)|(?:总|项目|计划)(?:[估预概]算|投资)|(?:投资|采购)(?:单价|总)?(?:额|金额)|投资约")

# Cue phrases for a winning-bid / deal / quoted amount, a ranked candidate
# ("第一中标" etc.), or a company name sitting directly before the mention
# (the trailing `$` anchors to the end of the preceding-token window).
_PATTERN_WINTENDERER = re.compile(r"[\((]?(?:中标|成交|评标|评审|投标|报价|合同|入围)[\),)]?(候选人|单位|人|候选单位|供应商|候选)?后?[单总]?(?:价|金额|价格|报价|标价)|报价|第[一二三](中标|投标|候选|名|成交)|(公司\s*[::]?$)")


@tsv_extractor
@returns(lambda
        entity_id = "text",
        label = "int",
        rule_id = "text",
    :[])
def supervise(
        entity_id="text", entity_begin="int", entity_end="int",
        doc_id="text", sentence_index="int", sentence_text="text",
        tokens="text[]", pos_tags="text[]", ner_tags="text[]",
    ):
    """Assign a heuristic supervision label to one entity mention.

    The up-to-``_MAX_DIST`` tokens that precede the mention are concatenated
    (no separator) and matched against two cue-phrase patterns.  Exactly one
    row is yielded per call:

    * label ``0``, tag ``"match"``      -- the tenderee pattern matched;
    * label ``1``, tag ``"match"``      -- otherwise, the win-tenderer
      pattern matched;
    * label ``2``, tag ``"not match"``  -- neither pattern matched.

    The tenderee pattern takes precedence when both would match.

    Parameters (the string defaults are the column types read by the
    ``tsv_extractor`` decorator, not real default values):
        entity_id: identifier copied into the output row.
        entity_begin: index into ``tokens`` of the mention's first token.
        entity_end: declared by the input schema; not consulted here.
        doc_id, sentence_index, sentence_text: declared by the input
            schema; not consulted here.
        tokens: the sentence's tokens.
        pos_tags, ner_tags: declared by the input schema; not consulted by
            the current rules.

    Yields:
        A ``Label(entity_id, label, type)`` tuple, whose positions line up
        with the ``entity_id`` / ``label`` / ``rule_id`` output columns.
    """
    # Only the left context drives the rules.  The previous revision also
    # sliced a right-hand token window and NER windows, but never read them
    # (and sliced them with wrong bounds), so they have been dropped.
    window_start = max(0, entity_begin - _MAX_DIST)
    preceding_text = "".join(tokens[window_start:entity_begin])

    row = Label(entity_id=entity_id, label=None, type=None)
    if _PATTERN_TENDEREE.search(preceding_text) is not None:
        yield row._replace(label=0, type="match")
    elif _PATTERN_WINTENDERER.search(preceding_text) is not None:
        yield row._replace(label=1, type="match")
    else:
        yield row._replace(label=2, type="not match")
-
|