|
@@ -341,6 +341,7 @@ def text_process(text):
|
|
|
|
|
|
# 优化部分未识别表达
|
|
|
text = re.sub("中止", "终止", text)
|
|
|
+ text = re.sub("遴选", "招标", text)
|
|
|
|
|
|
return text
|
|
|
|
|
@@ -416,12 +417,18 @@ def channel_predict(title,text):
|
|
|
# process text
|
|
|
if title in text:
|
|
|
text = text.replace(title, '', 1)
|
|
|
- text = text_process(text)
|
|
|
+ if "##attachment##" in text:
|
|
|
+ main_text,attachment_text = text.split("##attachment##",maxsplit=1)
|
|
|
+ # print('main_text',main_text)
|
|
|
+ if len(main_text)>=500: # 正文有足够的内容时不需要使用附件预测
|
|
|
+ text = main_text
|
|
|
text = re.sub("##attachment##。?","",text)
|
|
|
+ text = text_process(text)
|
|
|
+
|
|
|
if len(text)<=100:
|
|
|
# 正文内容过短时,不预测
|
|
|
return
|
|
|
- elif len(text)<=200:
|
|
|
+ elif len(text)<=150:
|
|
|
# 正文内容过短时,重复正文
|
|
|
text = text * 2
|
|
|
text = text[:2000]
|
|
@@ -429,6 +436,7 @@ def channel_predict(title,text):
|
|
|
title = title[:100]
|
|
|
text = "公告标题:" + title + "。" + "公告内容:" + text
|
|
|
text = text[:2000]
|
|
|
+ # print('predict text:',text)
|
|
|
|
|
|
# to torch data
|
|
|
text = [text]
|
|
@@ -455,9 +463,15 @@ def channel_predict(title,text):
|
|
|
pred_class = label2class_dict[pred_label]
|
|
|
else:
|
|
|
return
|
|
|
- # check
|
|
|
+ # print('check rule before',pred_class)
|
|
|
+ # check rule
|
|
|
if pred_class==101 and re.search("((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示",title): # 纠正部分‘资审结果’模型错误识别为中标
|
|
|
pred_class = 105
|
|
|
+ elif pred_class==122 and re.search("验收服务",title):
|
|
|
+ pred_class = None
|
|
|
+ # elif pred_class==118 and re.search("重新招标",title): #重新招标类公告,因之前公告的废标原因而错识别为废标公告
|
|
|
+ # pred_class = 52
|
|
|
+
|
|
|
|
|
|
return pred_class
|
|
|
|
|
@@ -488,9 +502,12 @@ class_dict = {51: '公告变更',
|
|
|
122: '验收合同'
|
|
|
}
|
|
|
|
|
|
+tenderee_type = ['公告变更','招标公告','招标预告','招标答疑','资审结果','采购意向']
|
|
|
+win_type = ['中标信息','废标公告','候选人公示','合同公告','开标记录','验收合同']
|
|
|
+
|
|
|
def merge_channel(list_articles,channel_dic,original_docchannel):
|
|
|
|
|
|
- def merge_rule(title,text,docchannel,pred_channel,channel_dic):
|
|
|
+ def merge_rule(title,text,docchannel,pred_channel,channel_dic,original_docchannel):
|
|
|
front_text_len = len(text)//3 if len(text)>300 else 100
|
|
|
front_text = text[:front_text_len]
|
|
|
pred_channel = class_dict[pred_channel]
|
|
@@ -512,10 +529,18 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
|
|
|
channel_dic['docchannel']['use_original_docchannel'] = 0
|
|
|
|
|
|
else:
|
|
|
- channel_dic = {'docchannel': {'doctype': '采招数据',
|
|
|
- 'docchannel': class_dict.get(original_docchannel, '原始类别'),
|
|
|
- 'life_docchannel': class_dict.get(original_docchannel, '原始类别')}}
|
|
|
- channel_dic['docchannel']['use_original_docchannel'] = 1
|
|
|
+ original_type = class_dict.get(original_docchannel, '原始类别')
|
|
|
+ if pred_channel in tenderee_type and docchannel in tenderee_type and original_type not in tenderee_type:
|
|
|
+ # pred_channel和docchannel都是同一(招标/中标)类型时,original_docchannel不一致时不使用原网类型
|
|
|
+ channel_dic['docchannel']['use_original_docchannel'] = 0
|
|
|
+ elif pred_channel in win_type and docchannel in win_type and original_type not in win_type:
|
|
|
+ # pred_channel和docchannel都是同一(招标/中标)类型时,original_docchannel不一致时不使用原网类型
|
|
|
+ channel_dic['docchannel']['use_original_docchannel'] = 0
|
|
|
+ else:
|
|
|
+ channel_dic = {'docchannel': {'doctype': '采招数据',
|
|
|
+ 'docchannel': original_type,
|
|
|
+ 'life_docchannel': original_type}}
|
|
|
+ channel_dic['docchannel']['use_original_docchannel'] = 1
|
|
|
|
|
|
return channel_dic
|
|
|
|
|
@@ -526,7 +551,7 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
|
|
|
|
|
|
doctype = channel_dic['docchannel']['doctype']
|
|
|
docchannel = channel_dic['docchannel']['docchannel']
|
|
|
- # print('doctype',doctype,'docchannel',docchannel)
|
|
|
+ # print('doctype',doctype,'docchannel',docchannel,'original_docchannel',original_docchannel)
|
|
|
compare_type = ['公告变更','招标公告','中标信息','招标预告','招标答疑','资审结果','采购意向','废标公告','候选人公示',
|
|
|
'合同公告','开标记录','验收合同']
|
|
|
# 仅比较部分数据
|
|
@@ -535,15 +560,8 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
|
|
|
pred = channel_predict(title, text)
|
|
|
# print('pred_res', pred)
|
|
|
if pred is not None and original_docchannel: # 无original_docchannel时不进行对比校正
|
|
|
- # if class_dict[pred] == docchannel:
|
|
|
- # channel_dic['docchannel']['use_original_docchannel'] = 0
|
|
|
- # else:
|
|
|
- # channel_dic = {'docchannel': {'docchannel': '采招数据',
|
|
|
- # 'doctype': class_dict.get(original_docchannel, '原始类别'),
|
|
|
- # 'life_docchannel': class_dict.get(original_docchannel, '原始类别')}}
|
|
|
- # channel_dic['docchannel']['use_original_docchannel'] = 1
|
|
|
-
|
|
|
- channel_dic = merge_rule(title,text,docchannel,pred,channel_dic)
|
|
|
+ channel_dic = merge_rule(title,text,docchannel,pred,channel_dic,original_docchannel)
|
|
|
+
|
|
|
elif doctype=='采招数据' and docchannel=="":
|
|
|
pred = channel_predict(title, text)
|
|
|
if pred is not None:
|
|
@@ -551,6 +569,22 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
|
|
|
channel_dic['docchannel']['docchannel'] = pred
|
|
|
channel_dic['docchannel']['use_original_docchannel'] = 0
|
|
|
|
|
|
+ # '招标预告'类 规则纠正
|
|
|
+ if channel_dic['docchannel']['doctype']=='采招数据' and channel_dic['docchannel']['docchannel']=="招标公告":
|
|
|
+ if "##attachment##" in text:
|
|
|
+ main_text, attachment_text = text.split("##attachment##", maxsplit=1)
|
|
|
+ else:
|
|
|
+ main_text = text
|
|
|
+ main_text = text_process(main_text)
|
|
|
+ if re.search("采购实施月份|采购月份|预计(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",main_text[:len(main_text)//2]):
|
|
|
+ front_text_len = len(main_text) // 3 if len(main_text) > 300 else 100
|
|
|
+ front_text = main_text[:front_text_len]
|
|
|
+ if re.search("意向|意愿",title) or re.search("意向|意愿",front_text):
|
|
|
+ channel_dic['docchannel']['docchannel'] = "采购意向"
|
|
|
+ else:
|
|
|
+ channel_dic['docchannel']['docchannel'] = "招标预告"
|
|
|
+ channel_dic['docchannel']['use_original_docchannel'] = 0
|
|
|
+
|
|
|
return channel_dic
|
|
|
|
|
|
|