|
@@ -35,6 +35,10 @@ sess_config = tf.ConfigProto(
|
|
|
log_device_placement=True)
|
|
|
sess_config = None
|
|
|
|
|
|
+file = os.path.dirname(__file__) + '/agency_set.pkl'  # agency_set.pkl is looked up in the same directory as this module
|
|
|
+with open(file, 'rb') as f:  # binary mode, as pickle data must be read as bytes
|
|
|
+ agency_set = pickle.load(f)  # loaded once at import time; used further down for `entity_text in agency_set` membership tests. NOTE(review): pickle.load can execute arbitrary code, so this presumably relies on agency_set.pkl being a trusted, bundled file - confirm
|
|
|
+
|
|
|
from threading import RLock
|
|
|
dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
|
"prem":{"predictor":None,"Lock":RLock()},
|
|
@@ -418,6 +422,8 @@ class CodeNamePredict():
|
|
|
# item['code'] = list(code_set)
|
|
|
for iter in re.finditer(self.PN_pattern,join_predict):
|
|
|
_name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
|
|
|
+ if len(_name)>200: # 避免模型预测类似 202750503 这种很长重复字很多的错误项目名称
|
|
|
+ continue
|
|
|
|
|
|
#add name to entitys
|
|
|
_entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
|
|
@@ -426,7 +432,8 @@ class CodeNamePredict():
|
|
|
w = 1 if re.search('(项目|工程|招标|采购|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题|项目)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
|
|
|
if _name not in dict_name_freq_score:
|
|
|
# dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
|
|
|
- dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w+(5-sentence.sentence_index)*0.2]
|
|
|
+ len_name = len(_name) if len(_name) <50 else 100-len(_name) # 2023/03/02 超出50长度的逐渐递减
|
|
|
+ dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len_name * 0.05)*w+(5-sentence.sentence_index)*0.2]
|
|
|
else:
|
|
|
dict_name_freq_score[_name][0] += 1
|
|
|
'''
|
|
@@ -454,6 +461,9 @@ class CodeNamePredict():
|
|
|
othername = re.search(name_re1, sentence.sentence_text)
|
|
|
if othername != None:
|
|
|
project_name = othername.group('name')
|
|
|
+ if re.search('[\u4e00-\u9fa5]+', project_name) == None: # 没有中文的项目名称去除
|
|
|
+ # log('没有中文的项目名称去除')
|
|
|
+ continue
|
|
|
beg = find_index([project_name], sentence.sentence_text)[0]
|
|
|
end = beg + len(project_name)
|
|
|
_name = self.fitDataByRule(sentence.sentence_text[beg:end])
|
|
@@ -1351,11 +1361,18 @@ class RoleRulePredictor():
|
|
|
p_entity.entity_text)))
|
|
|
if str(_span[1] + _span[2][:len(str(_name))]).find(
|
|
|
_name) >= 0:
|
|
|
- find_flag = True
|
|
|
- _label = 0
|
|
|
- p_entity.label = _label
|
|
|
- p_entity.values[int(_label)] = on_value
|
|
|
- break
|
|
|
+ if p_entity.entity_text in agency_set: # 在代理人集合的作为代理人
|
|
|
+ find_flag = True
|
|
|
+ _label = 1
|
|
|
+ p_entity.label = _label
|
|
|
+ p_entity.values[int(_label)] = on_value
|
|
|
+ break
|
|
|
+ else:
|
|
|
+ find_flag = True
|
|
|
+ _label = 0
|
|
|
+ p_entity.label = _label
|
|
|
+ p_entity.values[int(_label)] = on_value
|
|
|
+ break
|
|
|
if p_entity.sentence_index >= 4:
|
|
|
break
|
|
|
if find_flag:
|
|
@@ -1549,6 +1566,7 @@ class RoleRuleFinalAdd():
|
|
|
for sentence in main_sentences[-5:]:
|
|
|
end_tokens.extend(sentence.tokens)
|
|
|
text_end = "".join(end_tokens[-30:])
|
|
|
+ text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处)', '', text_end) # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
|
|
|
# print(text_end)
|
|
|
# sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
|
sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
@@ -1561,6 +1579,7 @@ class RoleRuleFinalAdd():
|
|
|
tenderee_notfound = True
|
|
|
agency_notfound = True
|
|
|
tenderee_list = []
|
|
|
+ agency_list = []
|
|
|
ents = []
|
|
|
for ent in list_entitys[0]:
|
|
|
if ent.entity_type in ['org', 'company']:
|
|
@@ -1571,6 +1590,7 @@ class RoleRuleFinalAdd():
|
|
|
tenderee_list.append(ent.entity_text)
|
|
|
tenderee_notfound = False
|
|
|
elif ent.label == 1:
|
|
|
+ agency_list.append(ent.entity_text)
|
|
|
agency_notfound = False
|
|
|
elif ent.label == 5:
|
|
|
if '公共资源交易中心' in ent.entity_text:
|
|
@@ -1581,8 +1601,9 @@ class RoleRuleFinalAdd():
|
|
|
ent_re = _sear_ent.group('entity')
|
|
|
ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
|
|
|
|
|
|
- if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
|
|
|
- or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
|
|
|
+ if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站)$', ent_re)
|
|
|
+ or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None) \
|
|
|
+ and ent_re not in agency_list and ent_re not in agency_set:
|
|
|
n = 0
|
|
|
for i in range(len(ents) - 1, -1, -1):
|
|
|
if not ents[i].in_attachment:
|
|
@@ -1595,7 +1616,8 @@ class RoleRuleFinalAdd():
|
|
|
tenderee_notfound = False
|
|
|
# log('正则最后补充实体: %s'%(ent_re))
|
|
|
break
|
|
|
- elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) and ent_re not in tenderee_list:
|
|
|
+ elif agency_notfound == True and ent_re not in tenderee_list and (
|
|
|
+ re.search('(招投?标|采购|代理|咨询|管理)(服务)?(有限)?(责任)?公司|(采购|招投?标|交易|代理)(服务)?中心|(招标|代理|咨询|管理|监理)', ent_re) or ent_re in agency_set):
|
|
|
n = 0
|
|
|
for i in range(len(ents) - 1, -1, -1):
|
|
|
if not ents[i].in_attachment:
|
|
@@ -1927,6 +1949,7 @@ class RoleGrade():
|
|
|
def predict(self, list_sentences, list_entitys, span=15, min_prob=0.7):
|
|
|
'''
|
|
|
根据规则给角色分配不同等级概率;分三级:0.9-1,0.8-0.9,0.7-0.8;附件0.7-0.8,0.6-0.7,0.5-0.6
|
|
|
+ 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
|
|
|
:param list_articles:
|
|
|
:param list_sentences:
|
|
|
:param list_entitys:
|
|
@@ -1984,6 +2007,10 @@ class RoleGrade():
|
|
|
company_winner.append(entity) # 保存中标人实体
|
|
|
if entity.label == 0 and entity.values[entity.label]> min_prob:
|
|
|
org_tenderee.append(entity.entity_text) # 保存所有招标人名称
|
|
|
+ if entity.entity_type in ['org', 'company'] and entity.label == 0 and entity.entity_text in agency_set and entity.values[entity.label]<0.6: # 修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
|
|
|
+ # log('修改概率小于0.6的且在大数据代理集合里面的招标人为代理人%s:'%entity.entity_text)
|
|
|
+ entity.label = 1
|
|
|
+ entity.values[entity.label] = 0.5
|
|
|
|
|
|
if org_winner != []:
|
|
|
flag = 0
|