|
@@ -2503,49 +2503,49 @@ class DocChannel():
|
|
|
|
|
|
def load_pattern(self):
    """Install the keyword regex tables used by the rule-based classifier.

    Sets four attributes on ``self``.  Each one maps a category label to a
    regular expression (an alternation of keywords):

    * ``type_dic`` / ``title_type_dic`` -- announcement *type* keywords;
      presumably matched against the body text and the title respectively
      (the consumer is not visible here -- confirm against ``get_type``).
    * ``life_dic`` -- life-cycle keywords searched in the body text.
    * ``title_life_dic`` -- life-cycle keywords searched in the title.

    Key naming convention relied on by ``get_life``: digits and lower-case
    ASCII letters are stripped from ``life_dic`` keys to obtain the category
    name, so ``'中标信息2'`` and ``'中标信息3'`` are extra pattern groups for
    ``'中标信息'``; a key containing ``'neg'`` holds counter-evidence whose
    hits are subtracted from that category's score.

    All patterns are raw strings so that ``\\w``, ``\\d`` and ``\\s`` reach
    the regex engine without triggering Python's invalid-escape warning.
    """
    self.type_dic = {
        '土地矿产': r'供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
        '拍卖出让': r'(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
        '产权交易': r'(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租|买受)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
        # Alternatives currently disabled for this category:
        #   变更|答疑|澄清|中标|成交|合同|废标|流标 and (采购|招标|代理)(人|机构|单位)
        # NOTE(review): the trailing ';' belongs to the last alternative, so
        # that alternative only matches when a literal ';' follows the
        # keyword.  Kept as-is because the consumer is not visible here.
        '采招数据': r'(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;',
    }

    self.title_type_dic = {
        '土地矿产': r'(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地',
        '拍卖出让': r'(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍',
        '产权交易': r'经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
        '采招数据': r'(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)|工程招标',
        # "竞价" is left out above: both procurement and property-rights
        # announcements use bidding.  Other disabled candidates:
        #   意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理
        #   变更|答疑|澄清|中标|成交|合同|废标|流标
        '新闻资讯': r'(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)',
    }

    self.life_dic = {
        '采购意向': r'采购意向|招标意向|选取意向|意向公告|意向公示',
        '招标预告': r'(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
        '招标公告': r'(采购|招标|竞选|报名)条件|报名(时间|流程|方法|\w{,5}材料)|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格要求|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
        '资审结果': r'资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过(原因|资格)',
        # Disabled alternative: 异议的回复
        '招标答疑': r'现澄清(为|如下)|答疑补遗|澄清内容如下|第[0-9一二三四五]次澄清|答疑澄清|(最高(投标)?限价|控制价|拦标价)公示',
        '公告变更': r'第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
        '候选人公示': r'候选人公示|评标结果公示',
        '中标信息': r'供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果)\w{,4}(进行公示|公[示布]如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
        '中标信息2': r'\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源采购原因|拟采取单一来源方式采购',
        '中标信息3': r'(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
        # Counter-evidence for '中标信息' (phrases typical of tender notices).
        # Disabled alternative: 确定成交供应商[:,\s]
        '中标信息neg': r'按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示',
        '合同公告': r'合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|签订日期)|(供应商乙方|乙方供应商):|合同总?金额',
        '废标公告': r'(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)项目|本标段|本次(招标)?)((采购|招标)?(失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
        '废标公告2': r'(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?原因|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人数|供应商|单位)不足|已终止',
    }

    self.title_life_dic = {
        '采购意向': r'采购意向|招标意向|选取意向|意向公告|意向公示|意向公开',
        '招标预告': r'预公?告|预公示|报建公告|(批前|标前)公示|(供应|招标)计划表?$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
        '公告变更': r'第[\d一二]次变更|(变更|更正(事项)?|更改|延期|暂停)(招标|采购)?的?(公告|公示|通知)|变更$|更正$',
        '招标答疑': r'质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告|$)',
        '废标公告': r'(终止|中止|废标|废除|流标|失败|作废|异常|撤销|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)',
        '合同公告': r'(合同(成交|变更)?|(履约|验收)(结果)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$',
        '候选人公示': r'候选人(变更)?公示|评标(结果)?公示|中标前?公示|中标预公示',
        '中标信息': r'(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|开标(记录|信息|情况)|单一来源|中标通知书|中标$',
        '资审结果': r'((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
        '招标公告': r'(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
    }
|
|
|
|
|
|
def load_life(self,life_model):
|
|
|
with tf.Graph().as_default() as graph:
|
|
@@ -2795,8 +2795,13 @@ class DocChannel():
|
|
|
|
|
|
def html2text(html):
|
|
|
ser = re.search('<div[^<>]*richTextFetch', html)
|
|
|
- if ser and len(re.sub('[^\u4e00-\u9fa5]', '', html[:ser.start()]))>500:
|
|
|
- html = html[:ser.start()]+'##richTextFetch##'
|
|
|
+ # if ser and len(re.sub('[^\u4e00-\u9fa5]', '', html[:ser.start()]))>500:
|
|
|
+ # html = html[:ser.start()]+'##richTextFetch##'
|
|
|
+ if ser:
|
|
|
+ if len(re.sub('[^\u4e00-\u9fa5]', '', html[:ser.start()])) > 200:
|
|
|
+ html = html[:ser.start()] + '##richTextFetch##'
|
|
|
+ else:
|
|
|
+ html = html[:ser.start() + 500]
|
|
|
text = re.sub('<[^<]*?>', '', html).replace(' ', ' ')
|
|
|
text = re.sub('http[0-9a-zA-Z-.:/]+|[0-9a-zA-Z-./@]+', '', text)
|
|
|
text = re.sub('\s+', ' ', text)
|
|
@@ -2852,125 +2857,202 @@ class DocChannel():
|
|
|
else:
|
|
|
return '', '没有公告类型关键词,返回空'
|
|
|
|
|
|
- def get_life(title, text, extract_json="", bidway="", original_docchannel=''):
|
|
|
- if re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100]):
|
|
|
- if re.search(self.title_life_dic['候选人公示'], title):
|
|
|
- return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0)
|
|
|
- elif re.search(self.title_life_dic['中标信息'], title):
|
|
|
- return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
|
|
|
- elif re.search('终止|废标|流标', title):
|
|
|
- return '废标公告', re.search('终止|废标|流标', title).group(0)
|
|
|
- elif is_single_source(bidway, title):
|
|
|
- return '中标信息', 'bidway单一来源'
|
|
|
- elif re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
|
|
|
- return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(
|
|
|
- 0)
|
|
|
- return '采购意向', (
|
|
|
- re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100])).group(0)
|
|
|
- elif re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text):
|
|
|
- if re.search(self.title_life_dic['候选人公示'], title):
|
|
|
- return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0)
|
|
|
- elif re.search(self.title_life_dic['中标信息'], title):
|
|
|
- return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
|
|
|
- elif re.search('终止|废标|流标', title):
|
|
|
- return '废标公告', re.search('终止|废标|流标', title).group(0)
|
|
|
- elif is_single_source(extract_json, title):
|
|
|
- return '中标信息', 'bidway单一来源'
|
|
|
- elif re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
|
|
|
- return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(
|
|
|
- 0)
|
|
|
- return '招标预告', (re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text)).group(0)
|
|
|
- elif re.search(self.title_life_dic['废标公告'], title+text.strip().split(' ')[0]) or re.search(self.life_dic['废标公告'], text[:150]):
|
|
|
- return '废标公告', (
|
|
|
- re.search(self.title_life_dic['废标公告'], title+text.strip().split(' ')[0]) or re.search(self.life_dic['废标公告'], text[:150])).group(0)
|
|
|
- elif re.search(self.life_dic['废标公告2'], text[:]) and not is_contain_winner(extract_json):
|
|
|
- return '废标公告', re.search(self.life_dic['废标公告2'], text[:]).group(0)
|
|
|
- elif re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150]):
|
|
|
- if re.search('候选人|公示期?(已?满|已经?结束)|中标(结果|公告)', text) == None:
|
|
|
- return '中标信息', '候选人公示排除,修改为中标信息'
|
|
|
- return '候选人公示', (
|
|
|
- re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150])).group(
|
|
|
- 0)
|
|
|
- elif re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'], text[
|
|
|
- :150]):
|
|
|
- return '合同公告', (re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'],
|
|
|
- text[:150]) or re.search(
|
|
|
- self.life_dic['合同公告'], text)).group(0)
|
|
|
- elif re.search(self.life_dic['合同公告'].replace(';', '|'), text): # or re.search(self.life_dic['合同公告'], text[:300]):
|
|
|
- num, kw = count_diffser(self.life_dic['合同公告'], text)
|
|
|
- if num >= 3:
|
|
|
- return '合同公告', kw
|
|
|
- elif re.search(self.title_life_dic['招标公告'], title[-8:]):
|
|
|
- return '招标公告', re.search(self.title_life_dic['招标公告'], title[-8:]).group(0)
|
|
|
- elif not is_contain_winner(extract_json):
|
|
|
- return '', '有合同关键词无中标角色返回空'
|
|
|
- return '合同公告', re.search(self.life_dic['合同公告'].replace(';', '|'), text).group(0)
|
|
|
- elif is_single_source(extract_json, title):
|
|
|
- return '中标信息', '单一来源采购'
|
|
|
- elif re.search(self.title_life_dic['中标信息'], title):
|
|
|
- if re.search(self.title_life_dic['资审结果'], title+text.strip().split(' ')[0]):
|
|
|
- return '资审结果', re.search(self.title_life_dic['资审结果'], title+text.strip().split(' ')[0]).group(0)
|
|
|
- return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
|
|
|
- elif re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text[:]):
|
|
|
- if re.search(self.title_life_dic['资审结果'], title+text.strip().split(' ')[0]):
|
|
|
- return '资审结果', re.search(self.title_life_dic['资审结果'], title+text.strip().split(' ')[0]).group(0)
|
|
|
- if re.search(self.wrong_win, text):
|
|
|
- return '招标公告', re.search(self.wrong_win, text).group(0)
|
|
|
- return '中标信息', (
|
|
|
- re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text[:])).group(
|
|
|
- 0)
|
|
|
- elif re.search(self.life_dic['中标信息2'], text[:]):
|
|
|
- if re.search(self.wrong_win, text):
|
|
|
- return '招标公告', re.search(self.wrong_win, text).group(0)
|
|
|
- return '中标信息', re.search(self.life_dic['中标信息2'], text[:]).group(0)
|
|
|
- elif re.search(self.life_dic['中标信息3'], text[:]) and is_contain_winner(extract_json):
|
|
|
- if re.search(self.wrong_win, text):
|
|
|
- return '招标公告', re.search(self.wrong_win, text).group(0)
|
|
|
- return '中标信息', re.search(self.life_dic['中标信息3'], text[:]).group(0)
|
|
|
- elif re.search('公开选取.{,20}机构的公告', title):
|
|
|
- if re.search('(中标|成交|中选)(中介|服务)?机构(名称)?[::\s]', text):
|
|
|
- return '中标信息', '机构选取有中选机构'
|
|
|
def get_life(title, text):
    """Pick a life-cycle label for an announcement from keyword hits.

    The title is searched with ``self.title_life_dic`` and the body with
    ``self.life_dic`` (both read from the enclosing scope).  Body hits are
    turned into a per-category score, and a fixed priority chain then
    chooses the label.

    Args:
        title: announcement title.
        text: announcement body as plain text.

    Returns:
        ``(label, msc)``.  ``label`` is a category name, or ``''`` when the
        rules cannot decide.  ``msc`` is a JSON string holding
        ``[title_hits, content_hits, scores]`` for diagnostics.
    """
    # Drop ASCII digits / lower-case letters, brackets, hyphens and
    # "第N次公告"-style ordinals before matching the title patterns.
    title = re.sub(r'[-()()0-9a-z]|第?[二三四]次公?告?', '', title)
    words = text.split()
    first_line = words[0] if len(words) > 2 else ''
    # When the title does not itself end in 公告/公示 but the first token of
    # the body looks like a heading that does, treat it as part of the title.
    if (title.strip()[-2:] not in ['公告', '公示']
            and 5 < len(first_line) < 30
            and first_line[-2:] in ['公告', '公示']):
        title += first_line

    def count_score(l):
        # One point per hit plus two per distinct keyword.
        return len(l) + len(set(l)) * 2

    life_kw_title = {}
    life_kw_content = {}
    life_score = {}

    # Collect title keywords per category.
    for k, v in self.title_life_dic.items():
        k2 = re.sub(r'[\da-z]', '', k)
        life_kw_title.setdefault(k2, []).extend(
            m.group(0) for m in re.finditer(v, title))

    # Collect body keywords per category and accumulate the score.
    for k, v in self.life_dic.items():
        # '中标信息2', '中标信息neg', ... all fold into '中标信息'.
        k2 = re.sub(r'[\da-z]', '', k)
        is_neg = 'neg' in k
        bucket = life_kw_content.setdefault(k2, {'pos': [], 'neg': []})
        bucket['neg' if is_neg else 'pos'].extend(
            m.group(0) for m in re.finditer(v, text))
        # NOTE(review): the whole accumulated hit list is re-scored for every
        # pattern group of a category, so hits from earlier groups are counted
        # again by later ones.  This is kept exactly as-is because the
        # thresholds below appear to be tuned against it -- confirm before
        # changing.
        if is_neg:
            delta = -count_score(bucket['neg'])
        else:
            delta = count_score(bucket['pos'])
        life_score[k2] = life_score.get(k2, 0) + delta

    life_kw_title = {k: v for k, v in life_kw_title.items() if v != []}
    life_kw_content = {k: v for k, v in life_kw_content.items() if life_score[k] > 0}
    msc = json.dumps([life_kw_title, life_kw_content, life_score], ensure_ascii=False)

    # Categories sharing the highest positive body score.
    max_score = 0
    life_list = []
    for k in life_score.keys():
        if life_score[k] > max_score:
            max_score = life_score[k]
            life_list = [k]
        elif life_score[k] == max_score and life_score[k] > 0:
            life_list.append(k)

    # Priority chain: a category is considered when it was hit in the title
    # or is (one of) the top-scoring body categories.
    if '采购意向' in life_kw_title or '采购意向' in life_list:
        return '采购意向', msc
    elif '招标预告' in life_kw_title or '招标预告' in life_list:
        # A forecast whose body also shows award/candidate/contract evidence
        # is left undecided.
        if {'中标信息', '候选人公示', '合同公告'} & set(life_kw_content):
            return '', msc
        return '招标预告', msc
    elif '公告变更' in life_kw_title or '公告变更' in life_list:
        if life_score.get('候选人公示', 0) > 3 or '候选人公示' in life_kw_title:
            return '候选人公示', msc
        elif life_score.get('合同公告', 0) > 3 or '合同公告' in life_kw_title:
            return '合同公告', msc
        elif life_score.get('中标信息', 0) > 3 or '中标信息' in life_kw_title:
            return '中标信息', msc
        elif '招标公告' in life_kw_title and life_score.get('公告变更', 0) < 4:
            return '招标公告', msc
        return '公告变更', msc
    elif '招标答疑' in life_kw_title or '招标答疑' in life_list:
        if '招标公告' in life_kw_title and life_score.get('招标答疑', 0) < 4:
            return '招标公告', msc
        elif life_score.get('招标答疑', 0) < max_score:
            if max_score > 3 and len(life_list) == 1:
                return life_list[0], msc
            return '', msc
        return '招标答疑', msc
    elif '候选人公示' in life_kw_title or '候选人公示' in life_list:
        if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3:
            return '招标公告', msc
        return '候选人公示', msc
    elif '合同公告' in life_kw_title or '合同公告' in life_list:
        if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3:
            return '招标公告', msc
        return '合同公告', msc
    elif '中标信息' in life_kw_title or '中标信息' in life_list:
        # Alternative condition once evaluated (score noted as 0.7886):
        # life_score.get('招标公告', 0) > 2 or life_score.get('中标信息', 0) < 4
        if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 2:
            return '招标公告', msc
        elif '废标公告' in life_kw_title:
            return '废标公告', msc
        elif life_score.get('候选人公示', 0) > 3:
            return '候选人公示', msc
        elif life_score.get('合同公告', 0) > 5:
            return '合同公告', msc
        return '中标信息', msc
    elif '废标公告' in life_kw_title or '废标公告' in life_list:
        if life_score.get('招标公告', 0) > 3:
            return '招标公告', msc
        return '废标公告', msc
    elif '资审结果' in life_kw_title or '资审结果' in life_list:
        return '资审结果', msc
    elif '招标公告' in life_kw_title or '招标公告' in life_list:
        return '招标公告', msc

    return '', msc
|
|
|
+
|
|
|
def get_model_inputs(list_sentence):
    """Build the model inputs from the document's sentences.

    Sentences are ordered by ``sentence_index``, their tokens are flattened,
    and the first 500 tokens are joined with spaces to form ``content``.
    ``content`` and the last 50 characters of ``title`` (read from the
    enclosing scope) are passed to ``self.predict_process``, and its two
    outputs are run through ``embedding``.

    Returns:
        ``(array_content, array_title, text_len, title_len, content)`` where
        the two lengths are capped at ``self.sequen_len`` and
        ``self.title_len`` respectively.
    """
    ordered = sorted(list_sentence, key=lambda sent: sent.sentence_index)
    flat_tokens = []
    for sent in ordered:
        flat_tokens.extend(sent.tokens)
    content = ' '.join(flat_tokens[:500])

    # The title is truncated to its last 50 characters.
    data_content, data_title = self.predict_process(
        docid='', doctitle=title[-50:], dochtmlcon=content)

    text_len = min(len(data_content[0]), self.sequen_len)
    title_len = min(len(data_title[0]), self.title_len)

    content_shape = (len(data_content), self.sequen_len, 128)
    title_shape = (len(data_title), self.title_len, 128)
    array_content = embedding(data_content, shape=content_shape)
    array_title = embedding(data_title, shape=title_shape)
    return array_content, array_title, text_len, title_len, content
|
|
|
+
|
|
|
def type_model_predict():
    """Run the type model on the prepared inputs and return its top class.

    Reads ``array_title``, ``array_content``, ``text_len`` and ``title_len``
    from the enclosing scope.  Each mask is a single row holding 0 for the
    first ``*_len`` positions and 1 for the rest.

    Returns:
        ``(index, probability)`` of the highest-scoring class in the first
        row of the model output.
    """
    content_mask = [[0] * text_len + [1] * (self.sequen_len - text_len)]
    title_mask = [[0] * title_len + [1] * (self.title_len - title_len)]
    feed = {
        self.type_title: array_title,
        self.type_content: array_content,
        self.type_mask: content_mask,
        self.type_mask_title: title_mask,
        # presumably a keep-probability placeholder fixed to 1 at inference -- TODO confirm
        self.type_prob: 1,
    }
    pred = self.type_sess.run(self.type_softmax, feed_dict=feed)
    best = np.argmax(pred, axis=1)[0]
    return best, pred[0][best]
|
|
|
+
|
|
|
def life_model_predict():
    """Run the life-cycle model on the prepared inputs and return its top class.

    Reads ``array_title``, ``array_content``, ``text_len`` and ``title_len``
    from the enclosing scope.  Each mask is a single row holding 0 for the
    first ``*_len`` positions and 1 for the rest.

    Returns:
        ``(index, probability)`` of the highest-scoring class in the first
        row of the model output.
    """
    content_mask = [[0] * text_len + [1] * (self.sequen_len - text_len)]
    title_mask = [[0] * title_len + [1] * (self.title_len - title_len)]
    feed = {
        self.lift_title: array_title,
        self.lift_content: array_content,
        self.mask: content_mask,
        self.mask_title: title_mask,
        # presumably a keep-probability placeholder fixed to 1 at inference -- TODO confirm
        self.lift_prob: 1,
    }
    pred = self.lift_sess.run(self.lift_softmax, feed_dict=feed)
    best = np.argmax(pred, axis=1)[0]
    return best, pred[0][best]
|
|
|
+
|
|
|
def final_change(msc):
    """Apply the last-step correction rules to ``result`` and log what changed.

    Works on ``result``, ``origin_dic``, ``original_docchannel``,
    ``prem_json``, ``title`` and ``text`` from the enclosing scope.  At most
    one rule fires (first match wins):

    1. Predicted 中标信息/合同公告, no winner extracted, and the source
       category is a tender-stage one -> use the source category.
    2. Predicted 废标公告 but a winner was extracted and the title has no
       failed-bid keyword -> 中标信息.
    3. Predicted 招标答疑, the title has no Q&A keyword, and the source
       category is a tender-stage one -> use the source category.
    4. Predicted 招标公告 but a winner was extracted and the source
       category is 中标信息 -> 中标信息.
    5. Prediction and source are both in 招标公告/采购意向/招标预告 -> use
       the source category.
    6. Prediction and source are both in 招标答疑/公告变更 -> use the
       source category.
    7. Predicted type 采招数据, source is 产权交易/土地矿产, and the text
       contains a property-rights keyword -> use the source as the type.

    Afterwards ``life_docchannel`` is set to the predicted life cycle, or to
    the source category when no life cycle was predicted.

    Args:
        msc: diagnostic message accumulated so far.

    Returns:
        ``msc`` with a note appended for the rule that fired, if any.
    """
    channel = result['docchannel']
    predicted = channel['docchannel']
    origin = origin_dic.get(original_docchannel, '')
    tender_stage = ['招标公告', '采购意向', '招标预告']

    # Rule 1 checks for a missing winner, as its description and log message
    # state; without that check it would also revert awards that do have a
    # winner.
    if (predicted in ['中标信息', '合同公告'] and origin in tender_stage
            and not is_contain_winner(prem_json)):
        channel['docchannel'] = origin
        msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
    elif (predicted == '废标公告' and is_contain_winner(prem_json)
            and re.search(self.title_life_dic['废标公告'], title) is None):
        channel['docchannel'] = '中标信息'
        msc += '最终规则修改:预测为废标却有中标人且标题无废标关键词改为中标信息;'
    elif (predicted in ['招标答疑']
            and re.search(self.title_life_dic['招标答疑'], title) is None
            and origin in tender_stage):
        # Falls back to the source category, as the rule and its message say
        # (this branch used to assign '中标信息').
        channel['docchannel'] = origin
        msc += '最终规则修改:答疑公告标题无答疑关键且原始为招标,返回原始类别;'
    elif (predicted == '招标公告' and is_contain_winner(prem_json)
            and origin == '中标信息'):
        # The condition used to test for '中标信息', which made this branch a
        # no-op; the rule targets a '招标公告' prediction.
        channel['docchannel'] = '中标信息'
        msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;'
    elif predicted in tender_stage and origin in tender_stage:
        channel['docchannel'] = origin
        msc += '最终规则修改:预测及原始均在招标、预告、意向,返回原始类别'
    elif predicted in ['招标答疑', '公告变更'] and origin in ['招标答疑', '公告变更']:
        channel['docchannel'] = origin
        msc += '最终规则修改:预测及原始均在答疑、变更,返回原始类别'
    elif (channel['doctype'] == '采招数据'
            and origin in ['产权交易', '土地矿产']
            and re.search('产权|转让|受让|招租|出租|承租|竞价|资产', text)):
        channel['doctype'] = origin
        msc += '最终规则修改:预测为采招数据,原始为产权且有关键词,返回原始类别'

    # Extra field of the new output format: copy the predicted life cycle
    # when there is one, otherwise fall back to the source category.
    if channel['docchannel'] != '':
        channel['life_docchannel'] = channel['docchannel']
    else:
        channel['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别')
    return msc
|
|
|
|
|
|
not_extract_dic = {
|
|
|
104: '招标文件',
|
|
@@ -3020,7 +3102,8 @@ class DocChannel():
|
|
|
result = {'docchannel': {'docchannel': '', 'doctype': ''}}
|
|
|
|
|
|
doc_type, type_kw = get_type(title, text)
|
|
|
- doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel)
|
|
|
+ # doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel)
|
|
|
+ doc_life, life_kw = get_life(title, text)
|
|
|
if doc_type in self.title_type_dic:
|
|
|
result['docchannel']['doctype'] = doc_type
|
|
|
if doc_life in self.title_life_dic:
|
|
@@ -3028,78 +3111,24 @@ class DocChannel():
|
|
|
# print('channel正则预测结果:', result)
|
|
|
msc = '正则结果:类型:%s, 关键词:%s, 周期:%s, 关键词:%s'%(doc_type, type_kw,doc_life, life_kw)+'\n'+'模型结果:'
|
|
|
# print('类型:%s, 关键词:%s, 周期:%s, 关键词:%s'%(doc_type, type_kw,doc_life, life_kw))
|
|
|
- if doc_type=="" or doc_life=="":
|
|
|
- list_sentence = sorted(list_sentence, key=lambda x:x.sentence_index)
|
|
|
- token_l = [it.tokens for it in list_sentence]
|
|
|
- tokens = [it for l in token_l for it in l]
|
|
|
- content = ' '.join(tokens[:500])
|
|
|
- data_content, data_title = self.predict_process(docid='', doctitle=title[-50:],
|
|
|
- dochtmlcon=content) # 标题最多取50字
|
|
|
- text_len = len(data_content[0]) if len(data_content[0]) < self.sequen_len else self.sequen_len
|
|
|
- title_len = len(data_title[0]) if len(data_title[0]) < self.title_len else self.title_len
|
|
|
-
|
|
|
- array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
|
|
|
- array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
|
|
|
-
|
|
|
- if doc_type == "":
|
|
|
- pred = self.type_sess.run(self.type_softmax,
|
|
|
- feed_dict={
|
|
|
- self.type_title: array_title,
|
|
|
- self.type_content: array_content,
|
|
|
- self.type_mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
|
|
|
- self.type_mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
|
|
|
- self.type_prob: 1}
|
|
|
- )
|
|
|
- id = np.argmax(pred, axis=1)[0]
|
|
|
- prob = pred[0][id]
|
|
|
- result['docchannel']['doctype'] = self.id2type[id]
|
|
|
- msc += self.id2type[id] + ';'
|
|
|
+ if doc_type == "" or doc_life == "":
|
|
|
+ array_content, array_title, text_len, title_len, content = get_model_inputs(list_sentence)
|
|
|
+ if doc_type =="":
|
|
|
+ type_id, type_prob = type_model_predict()
|
|
|
+ type_model = self.id2type[type_id]
|
|
|
+ result['docchannel']['doctype'] = type_model
|
|
|
+ msc += type_model + ';'
|
|
|
# print('公告类别:', self.id2type[id], '概率:',prob)
|
|
|
# if id == 0:
|
|
|
if doc_life=="" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
|
|
|
if len(text)>150 and re.search(self.kws, content):
|
|
|
- pred = self.lift_sess.run(self.lift_softmax,
|
|
|
- feed_dict={
|
|
|
- self.lift_title: array_title,
|
|
|
- self.lift_content: array_content,
|
|
|
- self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
|
|
|
- self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
|
|
|
- self.lift_prob: 1}
|
|
|
- )
|
|
|
- id = np.argmax(pred, axis=1)[0]
|
|
|
- msc += self.id2life[id] + ';\n'
|
|
|
- prob = pred[0][id]
|
|
|
- if self.id2life[id] == '中标信息' and original_docchannel in [51,52,102,103,114] and not is_contain_winner(prem_json):
|
|
|
- result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '原始类别')
|
|
|
- msc += '模型预测为中标而无中标人且原始为非中标,返回原始类别;'
|
|
|
- elif self.id2life[id] == '采购意向' and re.search('意向品牌|意向单位', text):
|
|
|
- result['docchannel']['docchannel'] = '招标公告'
|
|
|
- msc += '模型为意向,正文有意向品牌等词改为招标公告;'
|
|
|
- elif self.id2life[id] == '招标答疑' and re.search('质疑|澄清|答疑(文件)?|补遗书?|(投标)?限价|控制价|拦标价', title+text[:500])==None:
|
|
|
- result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '原始类别')
|
|
|
- msc += '模型为答疑,正文无答疑关键词改为原始类别;'
|
|
|
- elif prob<0.5:
|
|
|
- result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '原始类别')
|
|
|
- msc += '模型概率小于0.5,返回原始类别;'
|
|
|
- else:
|
|
|
- result['docchannel']['docchannel'] = self.id2life[id]
|
|
|
- # print('生命周期:',self.id2life[id], '概率:',prob)
|
|
|
- # if id == 6:
|
|
|
- if result['docchannel']['docchannel'] == '中标信息':
|
|
|
- if self.is_houxuan(''.join([it for it in title if it.isalpha()]),
|
|
|
- ''.join([it for it in content if it.isalpha()])):
|
|
|
- result['docchannel']['docchannel'] = '候选人公示'
|
|
|
- # return '候选人公示', prob
|
|
|
- # return [{'docchannel': '候选人公示'}]
|
|
|
- # print('公告类型:%s, 生命周期:%s, 关键词:%s '%(doc_type, doc_life, life_kw))
|
|
|
- # print('result: ', result)
|
|
|
- if result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(self.title_life_dic['废标公告'], title)==None:
|
|
|
- result['docchannel']['docchannel'] = '中标信息'
|
|
|
- msc += '最终规则修改:预测为废标却有中标人且标题无废标关键词改为中标信息;'
|
|
|
- if result['docchannel']['docchannel'] != '': # 预测到生命周期的复制到life_docchannel,否则用数据源结果
|
|
|
- result['docchannel']['life_docchannel'] = result['docchannel']['docchannel']
|
|
|
- else:
|
|
|
- result['docchannel']['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别')
|
|
|
+ life_id, life_prob = life_model_predict()
|
|
|
+ life_model = self.id2life[life_id]
|
|
|
+ result['docchannel']['docchannel'] = life_model
|
|
|
+ msc += life_model + ';\n'
|
|
|
+
|
|
|
+ msc = final_change(msc)
|
|
|
+ # print('channel ', msc)
|
|
|
return result, msc
|
|
|
|
|
|
# 保证金支付方式提取
|