|
@@ -664,6 +664,9 @@ class PREMPredict():
|
|
|
elif re.search('尊敬的供应商:.{,25}我公司', text):
|
|
|
label = 0
|
|
|
values[label] = 0.801
|
|
|
+ elif re.search('尊敬的供应商:', text):
|
|
|
+ label = 0
|
|
|
+ values[label] = 0.501
|
|
|
elif label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
|
|
|
label = 0
|
|
|
values[label] = 0.501
|
|
@@ -1120,22 +1123,22 @@ class RoleRulePredictor():
|
|
|
self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)" \
|
|
|
"(人|公司|单位|组织|用户|业主|主体|方|部门))" \
|
|
|
"(是|为|:|:|\s*)+$)"
|
|
|
- self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托|现将[\w()()]{5,20}[\d年月季度至]+采购意向))"
|
|
|
- self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向))" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
|
|
|
+ self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向))"
|
|
|
+ self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束))" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
|
|
|
self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
|
|
|
- self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
|
|
|
+ self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
|
|
|
self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号
|
|
|
# 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
|
|
|
self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|承租|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
|
|
|
"(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
|
|
|
- "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)"
|
|
|
- self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
|
|
|
+ "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|承保|承包|承接|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)"
|
|
|
+ self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标(投标)?|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
|
|
|
self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
|
|
|
# self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
|
|
|
# self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
|
|
|
self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
|
|
|
- "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^成为[\w、()()]+项目的成交供应商|^[((]中标人名称[))]))"
|
|
|
- self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|(谈判结果:|确定)由.{5,20}(向我单位)?供货)|中标通知书.{,15}你方" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
+ "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^[成作]?为([\w、()()]+|本|此|该)项目的?(成交|中选|中标|服务)(供应商|单位|人)|^[((]中标人名称[))]))"
|
|
|
+ self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|决定由.{5,20}承办|(谈判结果:|确定)由.{5,20}(向我单位)?供货)|中标通知书.{,15}你方" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
|
|
|
# self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
|
|
|
|
|
@@ -1213,7 +1216,7 @@ class RoleRulePredictor():
|
|
|
word_flag=True, use_text=True,
|
|
|
text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
|
|
|
for _name in list_name:
|
|
|
- if _name != "" and str(_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:
|
|
|
+ if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0: #加上前面一些信息,修复公司不在项目名称开头的,检测不到
|
|
|
find_flag = True
|
|
|
if p_entity.values[0] > on_value:
|
|
|
p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10
|
|
@@ -1307,7 +1310,7 @@ class RoleRulePredictor():
|
|
|
_weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
|
|
|
# _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
# "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
- if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
|
+ if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
|
list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
|
|
|
_flag = True
|
|
|
_label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
@@ -1414,6 +1417,14 @@ class RoleRulePredictor():
|
|
|
'''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
|
|
|
class RoleRuleFinalAdd():
|
|
|
def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
|
|
|
+ '''
|
|
|
+ 最终规则召回角色
|
|
|
+ :param list_articles:
|
|
|
+ :param list_sentences:
|
|
|
+ :param list_entitys:
|
|
|
+ :param list_codenames:
|
|
|
+ :return:
|
|
|
+ '''
|
|
|
# text_end = list_articles[0].content.split('##attachment##')[0][-40:]
|
|
|
main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
|
|
|
end_tokens = []
|
|
@@ -1422,11 +1433,12 @@ class RoleRuleFinalAdd():
|
|
|
text_end = "".join(end_tokens[-30:])
|
|
|
# print(text_end)
|
|
|
# sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
|
- sear_ent = re.search('[,。;]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
|
- sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
|
- sear_ent3 = re.search('(报名咨询|[收送交]货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
|
|
|
- sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主)[,::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
|
- sear_list = [sear_ent4 , sear_ent3 , sear_ent2 , sear_ent]
|
|
|
+ sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
|
+ sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
|
|
|
+ sear_ent2 = re.search('(户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
|
+ sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
|
|
|
+ sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|尊敬的供应商|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
|
|
|
+ sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent]
|
|
|
|
|
|
tenderee_notfound = True
|
|
|
agency_notfound = True
|
|
@@ -1434,32 +1446,21 @@ class RoleRuleFinalAdd():
|
|
|
ents = []
|
|
|
for ent in list_entitys[0]:
|
|
|
if ent.entity_type in ['org', 'company']:
|
|
|
- if ent.label == 0:
|
|
|
+ if ent.label == 0 and ent.values[ent.label]>=0.5:
|
|
|
+ if '公共资源交易中心' in ent.entity_text:
|
|
|
+ ent.label = 5
|
|
|
+ continue
|
|
|
tenderee_list.append(ent.entity_text)
|
|
|
tenderee_notfound = False
|
|
|
elif ent.label == 1:
|
|
|
agency_notfound = False
|
|
|
elif ent.label == 5:
|
|
|
+ if '公共资源交易中心' in ent.entity_text:
|
|
|
+ continue
|
|
|
ents.append(ent)
|
|
|
- if sear_ent or sear_ent2 or sear_ent3 or sear_ent4:
|
|
|
+ if sear_ent or sear_ent1 or sear_ent2 or sear_ent3 or sear_ent4:
|
|
|
for _sear_ent in [_sear for _sear in sear_list if _sear]:
|
|
|
- # if sear_ent4:
|
|
|
- # ent_re = sear_ent4.group(2)
|
|
|
- # elif sear_ent3:
|
|
|
- # ent_re = sear_ent3.group(2)
|
|
|
- # elif sear_ent2:
|
|
|
- # ent_re = sear_ent2.group(2)
|
|
|
- # else:
|
|
|
- # ent_re = sear_ent.group(1)
|
|
|
- if _sear_ent==sear_ent4:
|
|
|
- ent_re = _sear_ent.group(2)
|
|
|
- elif _sear_ent==sear_ent3:
|
|
|
- ent_re = _sear_ent.group(2)
|
|
|
- elif _sear_ent==sear_ent2:
|
|
|
- ent_re = _sear_ent.group(2)
|
|
|
- else:
|
|
|
- ent_re = _sear_ent.group(1)
|
|
|
- # print('ent_re', ent_re)
|
|
|
+ ent_re = _sear_ent.group('entity')
|
|
|
ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
|
|
|
|
|
|
if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
|
|
@@ -1489,8 +1490,8 @@ class RoleRuleFinalAdd():
|
|
|
agency_notfound = False
|
|
|
# log('正则最后补充实体: %s'%(ent_re))
|
|
|
break
|
|
|
- if not tenderee_notfound:
|
|
|
- break
|
|
|
+ if not tenderee_notfound:
|
|
|
+ break
|
|
|
|
|
|
elif list_codenames[0]['name'] != "": #把标题包含的公司实体作为招标人
|
|
|
# tenderee_notfound = True
|
|
@@ -1509,6 +1510,7 @@ class RoleRuleFinalAdd():
|
|
|
if ent.entity_text in list_codenames[0]['name']:
|
|
|
ent.label = 0
|
|
|
ent.values[0] = 0.5
|
|
|
+ tenderee_notfound = False
|
|
|
# log('正则召回标题中包含的实体:%s'%ent.entity_text)
|
|
|
break
|
|
|
|
|
@@ -3045,6 +3047,7 @@ class DocChannel():
|
|
|
6、预测及原始均在变更、答疑,返回原始类别
|
|
|
7、预测为采招数据,原始为产权且有关键词,返回原始类别
|
|
|
8、废标公告原始为招标、预告且标题无废标关键期,返回原始类别
|
|
|
+ 9、若预测为非采招数据且源网为采招数据且标题无关键词返回采招数据
|
|
|
'''
|
|
|
if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
|
|
|
original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(prem_json)==False:
|
|
@@ -3080,6 +3083,10 @@ class DocChannel():
|
|
|
self.title_life_dic['废标公告'], title) == None:
|
|
|
result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
|
|
|
msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
|
|
|
+ elif result['docchannel']['doctype'] != '采招数据' and origin_dic.get(
|
|
|
+ original_docchannel, '') not in ['产权交易', '土地矿产', '拍卖出让'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产|挂牌|出让|拍卖|招拍|划拨', title)==None:
|
|
|
+ result['docchannel']['doctype'] = '采招数据'
|
|
|
+ msc += '最终规则修改:预测为非采招数据,原始为采招数据且无关键词,返回采招数据'
|
|
|
|
|
|
'''下面是新格式增加返回字段'''
|
|
|
if result['docchannel']['docchannel'] != '': # 预测到生命周期的复制到life_docchannel,否则用数据源结果
|
|
@@ -3151,7 +3158,7 @@ class DocChannel():
|
|
|
type_id, type_prob = type_model_predict()
|
|
|
type_model = self.id2type[type_id]
|
|
|
result['docchannel']['doctype'] = type_model
|
|
|
- msc += type_model + ';'
|
|
|
+ msc += type_model + ' 概率:%.4f;'%type_prob
|
|
|
# print('公告类别:', self.id2type[id], '概率:',prob)
|
|
|
# if id == 0:
|
|
|
if doc_life=="" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
|
|
@@ -3159,7 +3166,7 @@ class DocChannel():
|
|
|
life_id, life_prob = life_model_predict()
|
|
|
life_model = self.id2life[life_id]
|
|
|
result['docchannel']['docchannel'] = life_model
|
|
|
- msc += life_model + ';\n'
|
|
|
+ msc += life_model + ' 概率:%.4f;\n'%life_prob
|
|
|
|
|
|
msc = final_change(msc)
|
|
|
# print('channel ', msc)
|