|
@@ -1281,7 +1281,7 @@ class RoleRulePredictor():
|
|
_weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
|
|
_weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
|
|
# _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
# _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
# "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
# "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
- if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|交易服务单位', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
|
|
|
|
+ if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
|
|
list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
|
|
_flag = True
|
|
_flag = True
|
|
_label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
_label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
@@ -2439,13 +2439,60 @@ class DocChannel():
|
|
self.id2type = {k: v for k, v in enumerate(lb_type)}
|
|
self.id2type = {k: v for k, v in enumerate(lb_type)}
|
|
self.id2life = {k: v for k, v in enumerate(lb_life)}
|
|
self.id2life = {k: v for k, v in enumerate(lb_life)}
|
|
|
|
|
|
|
|
+ self.load_pattern()
|
|
|
|
+
|
|
|
|
def load_pattern(self):
    """Build the regular-expression tables used for rule-based channel prediction.

    Sets five attributes on ``self``:

    * ``type_dic`` / ``title_type_dic`` -- announcement-type patterns that
      are searched in the body text / in the title respectively.
    * ``life_dic`` / ``title_life_dic`` -- life-cycle-stage patterns that
      are searched in the body text / in the title respectively.
    * ``wrong_win`` -- phrases that, when present, indicate an apparent
      award statement is really part of a tender notice.

    Several ``life_dic`` values hold more than one pattern separated by
    ``;``. Consumers in this class either split on ``;`` to count how many
    distinct sub-patterns hit, or replace ``;`` with ``|`` to search them
    all at once. Every other value is passed to ``re.search`` unchanged, so
    it must not contain a stray ``;``.

    Patterns that contain regex escapes (``\\w``, ``\\s``, ``\\d``) are raw
    strings so they do not rely on invalid string escape sequences.
    """
    self.type_dic = {
        '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
        '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
        '产权交易': r'(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租|买受)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
        # This value is searched as a single regex (never split on ';'), so
        # it must not end with the ';' separator: a trailing ';' would be a
        # literal character required after the last alternative.
        # Disabled alternatives: |变更|答疑|澄清|中标|成交|合同|废标|流标
        '采招数据': '(采购|招标|代理)(人|机构|单位)|(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求',
    }

    self.title_type_dic = {
        '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地',
        '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍',
        '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
        # Disabled alternatives: 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
        '采招数据': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)',
        '新闻资讯': r'(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)',
    }

    self.life_dic = {
        '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
        '招标预告': '预计(采购|招标)(时间|日期)',
        # ';'-separated list of sub-patterns.
        '招标公告': '(采购|招标|竞选|报名)条件;报名时间;报名流程;报名方法;报名需提供的材料;参加竞价采购交易资格;(申请人|投标人|供应商|报价人|参选人)的?资格要求;获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件;(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
        '资审结果': '招标资审公告|评审入围公示|资审及业绩公示|资格后审情况报告|资格后审结果公告|资格后审结果公示|资格预审结果公告|资格预审结果公示|预审公示|预审结果公示',
        '招标答疑': '现澄清为|答疑澄清公告|异议的回复|(最高(投标)?限价|控制价|拦标价)公示',
        '公告变更': r'原公告(主要)?(信息|内容)|变更[前后]内容|现在?(变更|更正|修改|更改)为|(变更|更正)内容为|更正理由|更正人名称|[、\s](更正信息|更正内容):',
        '候选人公示': '候选人公示|评标结果公示',
        '中标信息': r'供地结果信息|采用单源直接采购的?情况说明|现将\w{,4}(成交|中标|中选|选定结果|选取结果)\w{2,8}(进行公示|公示如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|(中标(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
        '中标信息2': r'(成交|中标)(日期|时间)[::\s]|成交金额:',
        '中标信息3': r'(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
        # ';'-separated list of sub-patterns.
        '合同公告': '合同(公告|公示)信息;合同(公告|公示)日期;合同(公告|公示)内容;合同编号;合同名称;合同签订日期;合同主体;供应商乙方',
        '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):废标|((本|该)项目|本标段|本次(招标)?)((采购|招标)?(失败|终止|流标|废标)|(按|做|作)(流标|废标)处理)',
    }

    self.title_life_dic = {
        '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
        '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|供应计划$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
        '公告变更': '(变更|更正(事项)?|更改|延期|暂停)的?(公告|公示|通知)|变更$|更正$',
        '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)公示',
        '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销|取消成交)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)',
        # Disabled alternative: 合同$|
        '合同公告': '(合同(成交)?|履约验收|履约|验收结果)(公告|公示|信息|公式)|合同备案|合同书',
        '候选人公示': '候选人公示|评标(结果)?公示|中标前?公示|中标预公示',
        '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)结果|开标(记录|信息|情况)|中标通知书|中标$',
        # Superseded pattern, kept for reference:
        # '资审结果': '(资质|资格)(预审|后审)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|(资质|资格)(审查|预审)结果(公示)?|资审结果公示|未?入围(公示|公告)|资审及业绩公示',
        '资审结果': '((资格|资质)(审查|预审|后审|审核|入围项?目?)|资审|入围)结果(公告|公示)?|(资质|资格)(预审|后审|入围)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|未?入围(公示|公告)|资审及业绩公示',
        '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)',
    }

    self.wrong_win = r'按项目控制价下浮\d%即为成交价|不得确定为(中标|成交)|招标人按下列原则选择中标人|确定成交供应商:|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)|竞拍起止时间:'
|
|
|
|
+
|
|
def load_life(self,life_model):
|
|
def load_life(self,life_model):
|
|
with tf.Graph().as_default() as graph:
|
|
with tf.Graph().as_default() as graph:
|
|
output_graph_def = graph.as_graph_def()
|
|
output_graph_def = graph.as_graph_def()
|
|
with open(os.path.dirname(__file__)+life_model, 'rb') as f:
|
|
with open(os.path.dirname(__file__)+life_model, 'rb') as f:
|
|
output_graph_def.ParseFromString(f.read())
|
|
output_graph_def.ParseFromString(f.read())
|
|
tf.import_graph_def(output_graph_def, name='')
|
|
tf.import_graph_def(output_graph_def, name='')
|
|
- print("%d ops in the final graph" % len(output_graph_def.node))
|
|
|
|
|
|
+ # print("%d ops in the final graph" % len(output_graph_def.node))
|
|
del output_graph_def
|
|
del output_graph_def
|
|
sess = tf.Session(graph=graph)
|
|
sess = tf.Session(graph=graph)
|
|
sess.run(tf.global_variables_initializer())
|
|
sess.run(tf.global_variables_initializer())
|
|
@@ -2464,7 +2511,7 @@ class DocChannel():
|
|
with open(os.path.dirname(__file__)+type_model, 'rb') as f:
|
|
with open(os.path.dirname(__file__)+type_model, 'rb') as f:
|
|
output_graph_def.ParseFromString(f.read())
|
|
output_graph_def.ParseFromString(f.read())
|
|
tf.import_graph_def(output_graph_def, name='')
|
|
tf.import_graph_def(output_graph_def, name='')
|
|
- print("%d ops in the final graph" % len(output_graph_def.node))
|
|
|
|
|
|
+ # print("%d ops in the final graph" % len(output_graph_def.node))
|
|
del output_graph_def
|
|
del output_graph_def
|
|
sess = tf.Session(graph=graph)
|
|
sess = tf.Session(graph=graph)
|
|
sess.run(tf.global_variables_initializer())
|
|
sess.run(tf.global_variables_initializer())
|
|
@@ -2611,7 +2658,7 @@ class DocChannel():
|
|
id = np.argmax(pred, axis=1)[0]
|
|
id = np.argmax(pred, axis=1)[0]
|
|
prob = pred[0][id]
|
|
prob = pred[0][id]
|
|
result['docchannel']['docchannel'] = self.id2life[id]
|
|
result['docchannel']['docchannel'] = self.id2life[id]
|
|
- # print('生命周期:',self.id2life[id], '概率:',prob)
|
|
|
|
|
|
+ # print('生命周期:纯模型预测',self.id2life[id], '概率:',prob)
|
|
# if id == 6:
|
|
# if id == 6:
|
|
if result['docchannel']['docchannel'] == '中标信息':
|
|
if result['docchannel']['docchannel'] == '中标信息':
|
|
if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
|
|
if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
|
|
@@ -2666,6 +2713,281 @@ class DocChannel():
|
|
log('正则把中标信息修改为空')
|
|
log('正则把中标信息修改为空')
|
|
return channel_dic
|
|
return channel_dic
|
|
|
|
|
|
|
|
def predict_merge(self, title, list_sentence, html, bidway, prem, original_docchannel='', web_source_no=''):
    """Predict announcement type and life-cycle stage with rules first, models second.

    The title and the tag-stripped HTML are matched against the pattern
    tables on ``self`` (``type_dic``, ``title_type_dic``, ``life_dic``,
    ``title_life_dic``, ``wrong_win``). Only when the rules leave the type
    and/or the life-cycle stage empty are the TensorFlow sessions on
    ``self`` consulted for the missing part.

    :param title: announcement title.
    :param list_sentence: pre-processed sentence objects; each is read for
        ``sentence_index`` and ``tokens`` when the model fallback runs.
    :param html: original announcement HTML.
    :param bidway: bidding method; the value ``'单一来源'`` (single source)
        is treated as an award signal.
    :param prem: extracted "prem" structure; it is serialised to JSON and
        searched for the substring ``win_tenderer``.
    :param original_docchannel: channel supplied by the data source. The
        integer codes 104 and 106-113 short-circuit the prediction.
    :param web_source_no: data-source identifier; two sources are known
        to be unclassifiable and short-circuit the prediction.
    :return: ``{'docchannel': {'docchannel': <life-cycle>, 'doctype': <type>}}``;
        either value may be the empty string.
    """

    def cut_single_cn_space(text):
        """Re-join whitespace-separated words, gluing single characters
        and words that start with "<CJK char>:" directly to the preceding
        text and putting one space before every other word."""
        pieces = []
        for word in text.split():
            if len(word) == 1 or re.search('^[\u4e00-\u9fa5][::]', word):
                pieces.append(word)
            else:
                pieces.append(' ' + word)
        return ''.join(pieces)

    def html2text(html):
        """Strip tags from ``html`` and normalise whitespace (max 20000 chars)."""
        attachment = re.search('<div[^<>]*richTextFetch', html)
        if attachment:
            # Drop everything from the richTextFetch <div> on and leave a
            # marker so later rules can tell that content was cut here.
            html = html[:attachment.start()] + '##richTextFetch##'
        text = re.sub('<[^<]*?>', '', html)
        # The original replace(' ', ' ') was a no-op as written; entity
        # normalisation is assumed to be the intent.
        # NOTE(review): confirm against the upstream file.
        text = text.replace('&nbsp;', ' ').replace('\xa0', ' ')
        text = re.sub(r'\s+', ' ', text)
        text = re.sub('[/|[()()]', '', text)
        text = cut_single_cn_space(text)
        return text[:20000]

    def count_diffser(pattern, text):
        """Count how many ';'-separated sub-patterns hit ``text``.

        Returns ``(count, 'hit1;hit2;...')``.
        """
        hits = []
        for sub_pattern in pattern.split(';'):
            found = re.search(sub_pattern, text)
            if found:
                hits.append(found.group(0))
        return len(hits), ';'.join(hits)

    def is_contain_winner(extract_json):
        """True when the serialised prem mentions a winning-tenderer role."""
        return bool(re.search('win_tenderer', extract_json))

    def is_single_source(bidway, title):
        """True for single-source procurement (by title wording or bidway)."""
        if re.search('单一来源|单一性采购', title):
            return True
        return bidway == '单一来源'

    def get_type(title, text):
        """Return ``(doc_type, keyword)``; ``doc_type`` is '' when no rule hits."""
        purchase_title_pat = self.title_type_dic['采招数据']

        # Land / auction / property-rights announcements are checked in
        # this order; a procurement keyword in the title or the first 50
        # characters of the text overrides each of them.
        for type_name in ('土地矿产', '拍卖出让', '产权交易'):
            hit = (re.search(self.title_type_dic[type_name], title)
                   or re.search(self.type_dic[type_name], text))
            if hit:
                purchase_hit = re.search(purchase_title_pat, title + text[:50])
                if purchase_hit:
                    return '采招数据', purchase_hit.group(0)
                return type_name, hit.group(0)

        hit = (re.search(purchase_title_pat, title)
               or re.search(self.type_dic['采招数据'], title + text))
        if hit:
            return '采招数据', hit.group(0)

        hit = re.search(self.title_type_dic['新闻资讯'], title)
        if hit:
            purchase_hit = re.search(purchase_title_pat, title + text[:150])
            if purchase_hit:
                return '采招数据', purchase_hit.group(0)
            return '新闻资讯', hit.group(0)

        return '', '没有公告类型关键词,返回空'

    def get_life(title, text, extract_json="", bidway="", original_docchannel=''):
        """Return ``(life_cycle, keyword)``; ``life_cycle`` is '' when undecided.

        The rules are tried strictly in order and the first one that hits
        decides the result. ``original_docchannel`` is accepted but not
        used by the rules.
        """
        t_dic = self.title_life_dic
        c_dic = self.life_dic

        def change_hit():
            return re.search(t_dic['公告变更'], title) or re.search(c_dic['公告变更'], text)

        def early_stage_override():
            """Later-stage signals that outrank 采购意向 / 招标预告."""
            hit = change_hit()
            if hit:
                return '公告变更', hit.group(0)
            hit = re.search(t_dic['候选人公示'], title)
            if hit:
                return '候选人公示', hit.group(0)
            hit = re.search(t_dic['中标信息'], title)
            if hit:
                return '中标信息', hit.group(0)
            hit = re.search('终止|废标|流标', title)
            if hit:
                return '废标公告', hit.group(0)
            # Fix: the helper expects the bidding method, not the prem JSON.
            if is_single_source(bidway, title):
                return '中标信息', 'bidway单一来源'
            return None

        # Procurement intention: needs the keyword in the title AND in the
        # first 100 characters of the text; the text hit is the keyword.
        hit = re.search(t_dic['采购意向'], title) and re.search(c_dic['采购意向'], text[:100])
        if hit:
            return early_stage_override() or ('采购意向', hit.group(0))

        # Tender pre-announcement.
        hit = re.search(t_dic['招标预告'], title) or re.search(c_dic['招标预告'], text)
        if hit:
            return early_stage_override() or ('招标预告', hit.group(0))

        # Change / correction notice.
        hit = change_hit()
        if hit:
            failed = re.search(t_dic['废标公告'], title)
            if failed:
                return '废标公告', failed.group(0)
            return '公告变更', hit.group(0)

        # Q&A / clarification: keyword hit, or at least two answer markers.
        hit = re.search(t_dic['招标答疑'], title) or re.search(c_dic['招标答疑'], text)
        if hit or len(re.findall('(答:|回复:)', text)) >= 2:
            failed = re.search(t_dic['废标公告'], title)
            if failed:
                return '废标公告', failed.group(0)
            result_hit = re.search('(中标|成交)结果', title[-8:])
            if result_hit:
                return '中标信息', result_hit.group(0)
            return '招标答疑', (hit or re.search('(答:|回复:)', text)).group(0)

        # Failed / terminated tender.
        hit = (re.search(t_dic['废标公告'], title + text[:150])
               or re.search(c_dic['废标公告'], text[:150]))
        if hit:
            return '废标公告', hit.group(0)

        # Candidate publicity, unless the text lacks any candidate wording.
        hit = re.search(t_dic['候选人公示'], title) or re.search(c_dic['候选人公示'], text[:150])
        if hit:
            if re.search('候选人|公示期?(已?满|已经?结束)|中标(结果|公告)', text) is None:
                return '中标信息', '候选人公示排除,修改为中标信息'
            return '候选人公示', hit.group(0)

        # Contract notice by title pattern (title or first 150 characters).
        hit = re.search(t_dic['合同公告'], title) or re.search(t_dic['合同公告'], text[:150])
        if hit:
            return '合同公告', hit.group(0)

        # Contract notice by body fields.
        hit = re.search(c_dic['合同公告'].replace(';', '|'), text)
        if hit:
            num, kw = count_diffser(c_dic['合同公告'], text)
            if num >= 3:
                return '合同公告', kw
            tender_hit = re.search(t_dic['招标公告'], title[-8:])
            if tender_hit:
                return '招标公告', tender_hit.group(0)
            if not is_contain_winner(extract_json):
                return '', '有合同关键词无中标角色返回空'
            return '合同公告', hit.group(0)

        # Fix: pass the bidding method here as well (was the prem JSON).
        if is_single_source(bidway, title):
            return '中标信息', '单一来源采购'

        # Award information by title.
        hit = re.search(t_dic['中标信息'], title)
        if hit:
            qualification = re.search(t_dic['资审结果'], title + text[:150])
            if qualification:
                return '资审结果', qualification.group(0)
            return '中标信息', hit.group(0)

        # Award information by text.
        hit = re.search(t_dic['中标信息'], text[:100]) or re.search(c_dic['中标信息'], text)
        if hit:
            qualification = re.search(t_dic['资审结果'], title + text[:150])
            if qualification:
                return '资审结果', qualification.group(0)
            return '中标信息', hit.group(0)

        # Weaker award signals; wrong_win phrases turn them into tender notices.
        hit = re.search(c_dic['中标信息2'], text)
        if hit:
            wrong = re.search(self.wrong_win, text)
            if wrong:
                return '招标公告', wrong.group(0)
            return '中标信息', hit.group(0)

        hit = re.search(c_dic['中标信息3'], text)
        if hit and is_contain_winner(extract_json):
            wrong = re.search(self.wrong_win, text)
            if wrong:
                return '招标公告', wrong.group(0)
            return '中标信息', hit.group(0)

        if re.search('公开选取.{,20}机构的公告', title):
            if re.search(r'(中标|成交|中选)(中介|服务)?机构(名称)?[::\s]', text):
                return '中标信息', '机构选取有中选机构'
            return '招标公告', '公开选取机构'

        if is_contain_winner(extract_json):
            num, kw = count_diffser(c_dic['招标公告'], text)
            wrong = re.search(self.wrong_win, text)
            if wrong:
                return '招标公告', wrong.group(0)
            if num >= 2:
                return '招标公告', kw
            if re.search('##richTextFetch##', text):
                return '', '提取到中标人但包含附件返回空'
            return '中标信息', '提取到中标人'

        hit = (re.search(t_dic['资审结果'], title + text[:150])
               or re.search(c_dic['资审结果'], text))
        if hit:
            return '资审结果', hit.group(0)

        hit = (re.search(t_dic['招标公告'], title)
               or re.search(c_dic['招标公告'].replace(';', '|'), text))
        if hit:
            if re.search('意向|预告|变更|更正|中标|中选|成交|答疑|废标|流标|终止', title):
                return '', '招标正则召回标题有其他类别关键词,返回空'
            return '招标公告', hit.group(0)

        return '', '未预测到关键词, 返回空'

    # Source channels for which no life-cycle stage is predicted.
    not_extract_dic = {
        104: '招标文件',
        106: '法律法规',
        107: '新闻资讯',
        108: '拟建项目',
        109: '展会推广',
        110: '企业名录',
        111: '企业资质',
        112: '全国工程人员',
        113: '业主采购'
    }
    if original_docchannel in not_extract_dic:
        return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel]}}
    if web_source_no in ['02104-7', '04733']:  # these data sources cannot be classified
        return {'docchannel': {'docchannel': '', 'doctype': '采招数据'}}

    # Keep only CJK characters; long titles keep their first 20 and last 30.
    title = re.sub('[^\u4e00-\u9fa5]', '', title)
    if len(title) > 50:
        title = title[:20] + title[-30:]

    text = html2text(html)
    prem_json = json.dumps(prem, ensure_ascii=False)
    result = {'docchannel': {'docchannel': '', 'doctype': ''}}

    doc_type, type_kw = get_type(title, text)
    doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel)
    if doc_type in self.title_type_dic:
        result['docchannel']['doctype'] = doc_type
    if doc_life in self.title_life_dic:
        result['docchannel']['docchannel'] = doc_life

    if doc_type == "" or doc_life == "":
        # Model fallback for whatever the rules left undecided.
        list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
        tokens = [tok for sentence in list_sentence for tok in sentence.tokens]
        content = ' '.join(tokens[:500])
        data_content, data_title = self.predict_process(docid='', doctitle=title[-50:],
                                                        dochtmlcon=content)  # title: at most 50 characters
        text_len = min(len(data_content[0]), self.sequen_len)
        title_len = min(len(data_title[0]), self.title_len)

        array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
        array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))

        content_mask = [[0] * text_len + [1] * (self.sequen_len - text_len)]
        title_mask = [[0] * title_len + [1] * (self.title_len - title_len)]

        if doc_type == "":
            pred = self.type_sess.run(self.type_softmax,
                                      feed_dict={
                                          self.type_title: array_title,
                                          self.type_content: array_content,
                                          self.type_mask: content_mask,
                                          self.type_mask_title: title_mask,
                                          self.type_prob: 1}
                                      )
            type_id = np.argmax(pred, axis=1)[0]
            result['docchannel']['doctype'] = self.id2type[type_id]

        if doc_life == "" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
            if len(text) > 150 and re.search(self.kws, content):
                pred = self.lift_sess.run(self.lift_softmax,
                                          feed_dict={
                                              self.lift_title: array_title,
                                              self.lift_content: array_content,
                                              self.mask: content_mask,
                                              self.mask_title: title_mask,
                                              self.lift_prob: 1}
                                          )
                life_id = np.argmax(pred, axis=1)[0]
                predicted_life = self.id2life[life_id]
                if (predicted_life == '中标信息'
                        and original_docchannel in [52, '52', '招标公告']
                        and not is_contain_winner(prem_json)):
                    # The source says tender notice and no winner was
                    # extracted: do not trust a model "award" prediction.
                    result['docchannel']['docchannel'] = '招标公告'
                else:
                    result['docchannel']['docchannel'] = predicted_life
                    # NOTE(review): nesting reconstructed from a view with
                    # no indentation; the candidate check is kept inside
                    # the model branch -- confirm against the source file.
                    if predicted_life == '中标信息':
                        if self.is_houxuan(''.join([it for it in title if it.isalpha()]),
                                           ''.join([it for it in content if it.isalpha()])):
                            result['docchannel']['docchannel'] = '候选人公示'
    return result
|
|
|
|
+
|
|
# 保证金支付方式提取
|
|
# 保证金支付方式提取
|
|
class DepositPaymentWay():
|
|
class DepositPaymentWay():
|
|
def __init__(self,):
|
|
def __init__(self,):
|