12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- #!/usr/bin/env python
- #encoding:utf-8
- from deepdive import *
- import random
- from collections import namedtuple
- from commonutil import *
- import re
# Output row emitted by the supervision rules below: the candidate's id, the
# label value assigned to it, and a tag naming the rule that assigned it.
Label = namedtuple("Label", ["entity_id", "label", "type"])
# Number of tokens inspected on each side of the entity mention.
_MAX_DIST = 10

# Left-context rule: the text before the entity reads like
# "... 第一名" or "... 第一 ... 中标 ... 候选 ... 人/单位/...".
_LEFT_CONTEXT_RE = re.compile(
    r"[^#]*第[1一](名|[^#]{0,5}中标[^#]{0,5}候选[^#]{0,5}(机构|人|单位|企业|供应商))"
)

# Right-context rule: the text starting at the entity's end index reads like
# "(排名)第一的(是) <3-30 chars> (作)为第一(中标)(候选)人".
_RIGHT_CONTEXT_RE = re.compile(
    r"(排名)?第[1一]的(?:是)?(?P<element0>.{3,30})(?:作)?为第[1一](?:中标)?(?:候选)?人"
)


@tsv_extractor
@returns(lambda
        entity_id = "text",
        label = "int",
        rule_id = "text",
    :[])
def supervise(
        entity_id="text", entity_begin="int", entity_end="int",
        doc_id="text", sentence_index="int", sentence_text="text",
        tokens="text[]", pos_tags="text[]", ner_tags="text[]",
    ):
    """Heuristically label one entity mention as a first-ranked bid candidate.

    The tokens up to ``_MAX_DIST`` positions before and after the mention are
    joined into two strings and tested against two regexes. Exactly one
    ``Label`` row is yielded per call:

    * label ``1``, type ``"第一中标候选人:entity"`` if the left context matches;
    * label ``1``, type ``"entity:作为第一中标候选人"`` if only the right
      context matches;
    * label ``-1``, type ``"pos:no match"`` otherwise.

    ``doc_id``, ``sentence_index``, ``pos_tags`` and ``ner_tags`` are part of
    the input schema but are not consulted by the current rules.
    """
    label = Label(entity_id=entity_id, label=None, type=None)

    # Clamp the context window to the sentence. The right edge is clamped to
    # len(tokens) (not -1) so the final token of the sentence is kept when the
    # mention sits near the end.
    window_start = max(entity_begin - _MAX_DIST, 0)
    window_end = min(entity_end + _MAX_DIST, len(tokens))

    left_text = "".join(tokens[window_start:entity_begin])
    # NOTE(review): the right window starts at entity_end itself; if
    # entity_end is an inclusive index, the mention's last token is part of
    # this context -- confirm against the candidate extractor.
    right_text = "".join(tokens[entity_end:window_end])

    log(sentence_text)

    # NOTE: an earlier, disabled token-set rule (requiring "第一", "中标" and
    # "候选人"/"候选单位" tokens in the left window) is superseded by the
    # regexes above.
    if _LEFT_CONTEXT_RE.match(left_text):
        yield label._replace(label=1, type="第一中标候选人:entity")
    elif _RIGHT_CONTEXT_RE.match(right_text):
        yield label._replace(label=1, type="entity:作为第一中标候选人")
    else:
        yield label._replace(label=-1, type="pos:no match")
|