|
@@ -1919,9 +1919,9 @@ class TendereeRuleRecall():
|
|
|
"(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
|
|
|
"[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
|
|
|
# 未识别实体尾部判断
|
|
|
- self.unrecognized_end1 = re.compile(
|
|
|
- "^[\u4e00-\u9fa5]{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心|联合社|合作社)")
|
|
|
- self.unrecognized_end2 = re.compile("^[\u4e00-\u9fa5]{4,}(?:署|局|厅|处|室|科|部|站|所|股|行|园)")
|
|
|
+ # self.unrecognized_end1 = re.compile(
|
|
|
+ # "^[\u4e00-\u9fa5]{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心|联合社|合作社)")
|
|
|
+ # self.unrecognized_end2 = re.compile("^[\u4e00-\u9fa5]{4,}(?:署|局|厅|处|室|科|部|站|所|股|行|园)")
|
|
|
|
|
|
def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
|
|
|
|
|
@@ -1946,10 +1946,10 @@ class TendereeRuleRecall():
|
|
|
self.entity_context_rule(ents,list_name,list_sentences,list(agency_set))
|
|
|
if not self.get_tenderee:
|
|
|
self.subject_rule(ents,list_articles,list_sentences)
|
|
|
- if not self.get_tenderee:
|
|
|
- self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
|
|
|
- if not self.get_tenderee:
|
|
|
- self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)
|
|
|
+ # if not self.get_tenderee:
|
|
|
+ # self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
|
|
|
+ # if not self.get_tenderee:
|
|
|
+ # self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)
|
|
|
|
|
|
#entity上下文正则判断
|
|
|
def entity_context_rule(self,entitys,list_name,list_sentences,list_agency):
|
|
@@ -2110,59 +2110,59 @@ class TendereeRuleRecall():
|
|
|
self.get_tenderee = True
|
|
|
|
|
|
# 正则召回未识别实体
|
|
|
- def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
|
|
|
- list_sentence = list_sentences[0]
|
|
|
- for in_attachment in [False,True]:
|
|
|
- for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
|
|
|
- sentence_text = sentence.sentence_text
|
|
|
- tokens = sentence.tokens
|
|
|
- doc_id = sentence.doc_id
|
|
|
- in_attachment = sentence.in_attachment
|
|
|
- list_tokenbegin = []
|
|
|
- begin = 0
|
|
|
- for i in range(0, len(tokens)):
|
|
|
- list_tokenbegin.append(begin)
|
|
|
- begin += len(str(tokens[i]))
|
|
|
- list_tokenbegin.append(begin + 1)
|
|
|
- for _match in re.finditer(pattern,sentence_text):
|
|
|
- _groupdict = _match.groupdict()
|
|
|
- _match_text = _match.group()
|
|
|
- _unrecognized_text = _groupdict["unrecognized"]
|
|
|
- _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
|
|
|
- if not _unrecognized:
|
|
|
- _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
|
|
|
- if _unrecognized:
|
|
|
- _unrecognized = _unrecognized.group()
|
|
|
- else:
|
|
|
- continue
|
|
|
- # print(_unrecognized)
|
|
|
- if re.search("某|乙方|代理",_unrecognized) or len(_unrecognized)>15:
|
|
|
- continue
|
|
|
- begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
|
|
|
- for j in range(len(list_tokenbegin)):
|
|
|
- if list_tokenbegin[j] == begin_index_temp:
|
|
|
- begin_index = j
|
|
|
- break
|
|
|
- elif list_tokenbegin[j] > begin_index_temp:
|
|
|
- begin_index = j - 1
|
|
|
- break
|
|
|
- index = begin_index_temp + len(_unrecognized)
|
|
|
- end_index_temp = index
|
|
|
- for j in range(begin_index, len(list_tokenbegin)):
|
|
|
- if list_tokenbegin[j] >= index:
|
|
|
- end_index = j - 1
|
|
|
- break
|
|
|
- entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
|
|
|
- entity_text = _unrecognized
|
|
|
- new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
|
|
|
- begin_index_temp, end_index_temp, in_attachment=in_attachment)
|
|
|
- new_entity.label = 0
|
|
|
- new_entity.values = [on_value,0,0,0,0,0]
|
|
|
- list_entitys[0].append(new_entity)
|
|
|
- self.get_tenderee = True
|
|
|
- if self.get_tenderee:
|
|
|
- list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
|
|
|
- break
|
|
|
+ # def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
|
|
|
+ # list_sentence = list_sentences[0]
|
|
|
+ # for in_attachment in [False,True]:
|
|
|
+ # for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
|
|
|
+ # sentence_text = sentence.sentence_text
|
|
|
+ # tokens = sentence.tokens
|
|
|
+ # doc_id = sentence.doc_id
|
|
|
+ # in_attachment = sentence.in_attachment
|
|
|
+ # list_tokenbegin = []
|
|
|
+ # begin = 0
|
|
|
+ # for i in range(0, len(tokens)):
|
|
|
+ # list_tokenbegin.append(begin)
|
|
|
+ # begin += len(str(tokens[i]))
|
|
|
+ # list_tokenbegin.append(begin + 1)
|
|
|
+ # for _match in re.finditer(pattern,sentence_text):
|
|
|
+ # _groupdict = _match.groupdict()
|
|
|
+ # _match_text = _match.group()
|
|
|
+ # _unrecognized_text = _groupdict["unrecognized"]
|
|
|
+ # _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
|
|
|
+ # if not _unrecognized:
|
|
|
+ # _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
|
|
|
+ # if _unrecognized:
|
|
|
+ # _unrecognized = _unrecognized.group()
|
|
|
+ # else:
|
|
|
+ # continue
|
|
|
+ # # print(_unrecognized)
|
|
|
+ # if re.search("某|乙方|代理",_unrecognized) or len(_unrecognized)>15:
|
|
|
+ # continue
|
|
|
+ # begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
|
|
|
+ # for j in range(len(list_tokenbegin)):
|
|
|
+ # if list_tokenbegin[j] == begin_index_temp:
|
|
|
+ # begin_index = j
|
|
|
+ # break
|
|
|
+ # elif list_tokenbegin[j] > begin_index_temp:
|
|
|
+ # begin_index = j - 1
|
|
|
+ # break
|
|
|
+ # index = begin_index_temp + len(_unrecognized)
|
|
|
+ # end_index_temp = index
|
|
|
+ # for j in range(begin_index, len(list_tokenbegin)):
|
|
|
+ # if list_tokenbegin[j] >= index:
|
|
|
+ # end_index = j - 1
|
|
|
+ # break
|
|
|
+ # entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
|
|
|
+ # entity_text = _unrecognized
|
|
|
+ # new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
|
|
|
+ # begin_index_temp, end_index_temp, in_attachment=in_attachment)
|
|
|
+ # new_entity.label = 0
|
|
|
+ # new_entity.values = [on_value,0,0,0,0,0]
|
|
|
+ # list_entitys[0].append(new_entity)
|
|
|
+ # self.get_tenderee = True
|
|
|
+ # if self.get_tenderee:
|
|
|
+ # list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
|
|
|
+ # break
|
|
|
|
|
|
class RoleGrade():
|
|
|
def __init__(self):
|