1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- #!/usr/bin/env python
- #encoding:utf-8
- from deepdive import *
- import random
- from collections import namedtuple
- from commonutil import *
- import re
# One output row of the supervision extractor: the candidate entity id, the
# label assigned to it (1 = positive, -1 = negative, None = not yet decided)
# and the name of the heuristic rule that produced the label.
# NOTE(review): the third field is called `type` here but `rule_id` in the
# @returns declaration on supervise(); rows are presumably emitted
# positionally, so the order of the fields matters -- confirm before renaming.
Label = namedtuple('Label', 'entity_id, label, type')
# Constants used by supervise(). They are defined ahead of the decorated
# function so that they exist whenever the function body is invoked.

# Maximum number of tokens inspected in front of the entity mention.
_MAX_DIST = 15
# NER tags that count as another organisation-like mention.
_TYPE_MENTION = frozenset(["org", "company"])
# Bid-winner cue: "中标", optionally followed by 人, 单位 or 机构.
# A non-capturing alternation is required here; a character class such as
# [人|单位|机构|] would match exactly one of those characters (or a literal
# "|"), truncating "中标单位" to "中标单" and rejecting a bare "中标".
_BID_WINNER_RE = re.compile(r"中标(?:人|单位|机构)?")


@tsv_extractor
@returns(lambda
        entity_id = "text",
        label     = "int",
        rule_id   = "text",
    :[])
def supervise(
        entity_id="text", entity_begin="int", entity_end="int",
        doc_id="text", sentence_index="int", sentence_text="text",
        tokens="text[]", pos_tags="text[]", ner_tags="text[]",
    ):
    """Yield one heuristic label for a candidate bid-winner entity mention.

    Only the window of at most ``_MAX_DIST`` tokens immediately before the
    mention is inspected; tokens after the mention are not consulted.

    Rules, in order:

    * no bid-winner cue (``中标`` / ``中标人`` / ``中标单位`` / ``中标机构``)
      in the window -> label ``-1``, rule ``"pos:no match"``;
    * a cue is present but a token tagged ``org`` or ``company`` lies between
      the cue and the mention -> label ``-1``,
      rule ``"pos:third entity between"``;
    * otherwise -> label ``1``, rule ``"中标单位:entity"``.

    When the cue occurs several times, the left-most occurrence is used.

    Args:
        entity_id: Identifier of the candidate mention, copied to the output.
        entity_begin: Index in ``tokens`` of the mention's first token.
        entity_end: Unused by the current rules; part of the input format.
        doc_id: Unused by the current rules; part of the input format.
        sentence_index: Unused by the current rules; part of the input format.
        sentence_text: Sentence text; only passed to ``log``.
        tokens: Tokens of the sentence.
        pos_tags: Unused by the current rules; part of the input format.
        ner_tags: NER tags, indexed in parallel with ``tokens``.

    Yields:
        Exactly one ``Label`` row ``(entity_id, label, rule name)``.
    """
    base = Label(entity_id=entity_id, label=None, type=None)

    # Leading context window, clamped at the start of the sentence.
    begin = max(0, entity_begin - _MAX_DIST)
    front_tokens = tokens[begin:entity_begin]
    front_ner = ner_tags[begin:entity_begin]

    log(sentence_text)

    cue = _BID_WINNER_RE.search("".join(front_tokens))
    if cue is None:
        yield base._replace(label=-1, type="pos:no match")
        return

    # get_index_in_tokens is provided by a star import; presumably it maps a
    # character offset in the joined window to an index into front_tokens --
    # TODO confirm against commonutil.
    first_after_cue = get_index_in_tokens(cue.end(0), front_tokens)
    if _TYPE_MENTION.intersection(front_ner[first_after_cue:]):
        # Another organisation sits between the cue and this mention, so the
        # cue most likely refers to that organisation instead.
        yield base._replace(label=-1, type="pos:third entity between")
    else:
        yield base._replace(label=1, type="中标单位:entity")
|