|
@@ -348,6 +348,7 @@ class CodeNamePredict():
|
|
# print(join_predict)
|
|
# print(join_predict)
|
|
code_x = []
|
|
code_x = []
|
|
code_text = []
|
|
code_text = []
|
|
|
|
+ pre_text = []
|
|
temp_entitys = []
|
|
temp_entitys = []
|
|
for iter in re.finditer(self.PC_pattern,join_predict):
|
|
for iter in re.finditer(self.PC_pattern,join_predict):
|
|
get_len = 40
|
|
get_len = 40
|
|
@@ -358,6 +359,7 @@ class CodeNamePredict():
|
|
end = iter.span()[1]+get_len
|
|
end = iter.span()[1]+get_len
|
|
code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
|
|
code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
|
|
code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]].replace(",", ""))
|
|
code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]].replace(",", ""))
|
|
|
|
+ pre_text.append(pad_sentence[begin:iter.span()[0]])
|
|
_entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
|
|
_entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
|
|
temp_entitys.append(_entity)
|
|
temp_entitys.append(_entity)
|
|
#print("code",code_text)
|
|
#print("code",code_text)
|
|
@@ -402,20 +404,52 @@ class CodeNamePredict():
|
|
if len(it) > 8:
|
|
if len(it) > 8:
|
|
if it not in code_set:
|
|
if it not in code_set:
|
|
code_set.add(it)
|
|
code_set.add(it)
|
|
- item['code'].append(it)
|
|
|
|
|
|
+ # item['code'].append(it)
|
|
|
|
+ if re.search("(项目编号|招标编号):?$", pre_text[h]):
|
|
|
|
+ item['code'].append((it, 0))
|
|
|
|
+ elif re.search('采购(计划)?编号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((it, 1))
|
|
|
|
+ elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((it, 2))
|
|
|
|
+ else:
|
|
|
|
+ item['code'].append((it, 3))
|
|
elif len(item['code']) > 0:
|
|
elif len(item['code']) > 0:
|
|
- new_it = item['code'][-1] + re.search(',|/|;|、|,', the_code).group(0) + it
|
|
|
|
|
|
+ new_it = item['code'][-1][0] + re.search(',|/|;|、|,', the_code).group(0) + it
|
|
if new_it not in code_set:
|
|
if new_it not in code_set:
|
|
code_set.add(new_it)
|
|
code_set.add(new_it)
|
|
- item['code'][-1] = new_it
|
|
|
|
|
|
+ # item['code'][-1] = new_it
|
|
|
|
+ if re.search("(项目编号|招标编号):?$", pre_text[h]):
|
|
|
|
+ item['code'][-1] = (new_it, 0)
|
|
|
|
+ elif re.search('采购(计划)?编号:?$', pre_text[h]):
|
|
|
|
+ item['code'][-1] = (new_it, 1)
|
|
|
|
+ elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
|
|
+ item['code'][-1] = (new_it, 2)
|
|
|
|
+ else:
|
|
|
|
+ item['code'][-1] = (new_it, 3)
|
|
else:
|
|
else:
|
|
if the_code not in code_set:
|
|
if the_code not in code_set:
|
|
code_set.add(the_code)
|
|
code_set.add(the_code)
|
|
- item['code'].append(the_code)
|
|
|
|
|
|
+ # item['code'].append(the_code)
|
|
|
|
+ if re.search("(项目编号|招标编号):?$", pre_text[h]):
|
|
|
|
+ item['code'].append((the_code, 0))
|
|
|
|
+ elif re.search('采购(计划)?编号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((the_code, 1))
|
|
|
|
+ elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((the_code, 2))
|
|
|
|
+ else:
|
|
|
|
+ item['code'].append((the_code, 3))
|
|
break
|
|
break
|
|
elif the_code not in code_set:
|
|
elif the_code not in code_set:
|
|
code_set.add(the_code)
|
|
code_set.add(the_code)
|
|
- item['code'].append(the_code)
|
|
|
|
|
|
+ # item['code'].append(the_code)
|
|
|
|
+ if re.search("(项目编号|招标编号):?$", pre_text[h]):
|
|
|
|
+ item['code'].append((the_code, 0))
|
|
|
|
+ elif re.search('采购(计划)?编号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((the_code, 1))
|
|
|
|
+ elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((the_code, 2))
|
|
|
|
+ else:
|
|
|
|
+ item['code'].append((the_code, 3))
|
|
|
|
|
|
# if the_code not in code_set:
|
|
# if the_code not in code_set:
|
|
# code_set.add(the_code)
|
|
# code_set.add(the_code)
|
|
@@ -511,10 +545,21 @@ class CodeNamePredict():
|
|
# 2020/11/23 大网站规则调整
|
|
# 2020/11/23 大网站规则调整
|
|
othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P<code>[^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。\u4e00-\u9fa5]', sentence.sentence_text)
|
|
othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P<code>[^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。\u4e00-\u9fa5]', sentence.sentence_text)
|
|
if othercode != None:
|
|
if othercode != None:
|
|
- item['code'].append(othercode.group('code'))
|
|
|
|
|
|
+ # item['code'].append(othercode.group('code'))
|
|
|
|
+ if re.search("(项目编号|招标编号):?$", othercode.group(0)):
|
|
|
|
+ item['code'].append((othercode.group('code'), 0))
|
|
|
|
+ elif re.search('采购(计划)?编号:?$', othercode.group(0)):
|
|
|
|
+ item['code'].append((othercode.group('code'), 1))
|
|
|
|
+ elif re.search('(询价|合同)编号:?$', othercode.group(0)):
|
|
|
|
+ item['code'].append((othercode.group('code'), 2))
|
|
|
|
+ else:
|
|
|
|
+ item['code'].append((othercode.group('code'), 3))
|
|
# print('规则召回项目编号:', othercode.group('code'))
|
|
# print('规则召回项目编号:', othercode.group('code'))
|
|
- item['code'] = [code for code in item['code'] if len(code)<500]
|
|
|
|
- item['code'].sort(key=lambda x:len(x),reverse=True)
|
|
|
|
|
|
+ # item['code'] = [code for code in item['code'] if len(code)<500]
|
|
|
|
+ # item['code'].sort(key=lambda x:len(x),reverse=True)
|
|
|
|
+ item['code'] = [code for code in item['code'] if len(code[0]) < 500]
|
|
|
|
+ item['code'].sort(key=lambda x: x[1])
|
|
|
|
+ item['code'] = [it[0] for it in item['code']]
|
|
result.append(item)
|
|
result.append(item)
|
|
|
|
|
|
list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
|
|
list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
|
|
@@ -728,7 +773,7 @@ class PREMPredict():
|
|
text_tup = text_list[i]
|
|
text_tup = text_list[i]
|
|
front, middle, behind = text_tup
|
|
front, middle, behind = text_tup
|
|
whole = "".join(text_tup)
|
|
whole = "".join(text_tup)
|
|
- # print('模型预测角色:', front, entity.entity_text, label, values)
|
|
|
|
|
|
+ # print('模型预测角色:', front, entity.entity_text, behind,label, values)
|
|
if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
|
|
if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
|
|
label = 5
|
|
label = 5
|
|
elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
|
|
elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
|
|
@@ -755,7 +800,7 @@ class PREMPredict():
|
|
elif label == 1 and re.search('委托(单位|人|方)[是为:]+',front) and re.search('受委托(单位|人|方)[是为:]+', front)==None:
|
|
elif label == 1 and re.search('委托(单位|人|方)[是为:]+',front) and re.search('受委托(单位|人|方)[是为:]+', front)==None:
|
|
label = 0
|
|
label = 0
|
|
values[label] = 0.501
|
|
values[label] = 0.501
|
|
- elif label == 1 and re.search('([,。:]|^)(服务|中选)机构(名称)?', front):
|
|
|
|
|
|
+ elif label == 1 and re.search('([,。:]|^)(第一)?(服务|中选|中标)(中介服务|代理)?(公司|机构)(名称)?', front):
|
|
label = 2
|
|
label = 2
|
|
values[label] = 0.501
|
|
values[label] = 0.501
|
|
elif label in [3,4] and re.search('第[二三]分(公司|店),中标(人|供应商|单位|公司):$', front):
|
|
elif label in [3,4] and re.search('第[二三]分(公司|店),中标(人|供应商|单位|公司):$', front):
|
|
@@ -814,14 +859,16 @@ class PREMPredict():
|
|
values[label] = 0.49
|
|
values[label] = 0.49
|
|
elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', front): # 处理例如 241561780 如中标金额为 500-1000万元,则代理服务费=100 万元×0.5%+400万元×0.35%+(中标金额-500)万元
|
|
elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', front): # 处理例如 241561780 如中标金额为 500-1000万元,则代理服务费=100 万元×0.5%+400万元×0.35%+(中标金额-500)万元
|
|
values[label] = 0.49
|
|
values[label] = 0.49
|
|
- elif re.search('^(以上)?按[\d.%]+收取|^[+×*-][\d.%]+', behind):
|
|
|
|
|
|
+ elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
|
|
values[label] = 0.49
|
|
values[label] = 0.49
|
|
- elif re.search('(含|在|包括)$', front):
|
|
|
|
|
|
+ elif re.search('(含|在|包括|[大小等高低]于)$|[\d.%]+[+×*-]$', front):
|
|
values[label] = 0.49
|
|
values[label] = 0.49
|
|
elif label ==0: # 错误招标金额处理
|
|
elif label ==0: # 错误招标金额处理
|
|
- if entity.notes in ["投资", "工程造价"] or re.search('最低限价:?$', front):
|
|
|
|
|
|
+ if entity.notes in ["投资", "总投资","工程造价"] or re.search('最低限价:?$', front) or re.search('服务内容:([\d,.]+万?亿?元?-?)$', front):
|
|
values[label] = 0.49
|
|
values[label] = 0.49
|
|
- elif re.search('(含|在|包括)$', front):
|
|
|
|
|
|
+ elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
|
|
|
|
+ values[label] = 0.49
|
|
|
|
+ elif re.search('(含|在|包括|[大小等高低]于)$|[\d.%]+[+×*-]$', front):
|
|
values[label] = 0.49
|
|
values[label] = 0.49
|
|
elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
|
|
elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
|
|
label = 1
|
|
label = 1
|
|
@@ -1251,18 +1298,24 @@ class RoleRulePredictor():
|
|
self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
|
|
self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
|
|
self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号
|
|
self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号
|
|
# 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
|
|
# 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
|
|
- self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|承租|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
|
|
|
|
- "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
|
|
|
|
- "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|承保|承包|承接|服务|实施|合作)(机构|单位|商|方)(名称)?[::是为]+$)"
|
|
|
|
- self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标(投标)?|中选|中价|成交)(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
|
|
|
|
- self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交|入选)(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # 中标候选人不能作为中标
|
|
|
|
- # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
|
|
|
|
- # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
|
|
|
|
- self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
|
|
|
|
- "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^[成作]?为([\w、()()]+|本|此|该)项目的?(成交|中选|中标|服务)(供应商|单位|人)|^[((](中标|成交|承包)人名?称?[))]))"
|
|
|
|
- self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|决定由.{5,20}承办|(谈判结果:|确定)由.{5,20}(向我单位)?供货|中标通知书.{,15}你方|单一来源从[()\w]{5,20}采购)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
|
-
|
|
|
|
- # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
|
|
|
|
|
|
+ self.pattern_winTenderer_left = "(?P<winTenderer_left>" \
|
|
|
|
+ "(乙|竞得|受让|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承租((包))?)(候选)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
|
|
|
|
+ "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致)[::是为]+$" \
|
|
|
|
+ "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$" \
|
|
|
|
+ "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$" \
|
|
|
|
+ "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$)" # 承办单位:不作为中标 83914772
|
|
|
|
+ self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w0>" \
|
|
|
|
+ "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|第?[一1]名|第一(中标)?候选人)" \
|
|
|
|
+ "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$)" # 解决表头识别不到加逗号情况,需前面为,。空
|
|
|
|
+ self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)" \
|
|
|
|
+ "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$)" # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # 中标候选人不能作为中标
|
|
|
|
+
|
|
|
|
+ self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|厂商)))|" \
|
|
|
|
+ "^((报价|价格)最低,|以\w{5,10}|\w{,20})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
|
|
|
|
+ "|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格" \
|
|
|
|
+ "|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))"
|
|
|
|
+ self.pattern_winTenderer_whole = "(?P<winTenderer_center>(贵公司|由).{,15}以\w{,15}中标" \
|
|
|
|
+ "|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
|
|
|
self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
|
|
self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
|
|
self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
@@ -1270,6 +1323,8 @@ class RoleRulePredictor():
|
|
self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
|
|
self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
|
|
self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
|
|
|
|
|
|
+ self.condadate_left = "(?P<candidate_left>((中标|成交|入围)候选(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)[::是为]+$)"
|
|
|
|
+
|
|
self.pattern_whole = [self.pattern_tenderee_left_w1,
|
|
self.pattern_whole = [self.pattern_tenderee_left_w1,
|
|
self.pattern_tenderee_left,
|
|
self.pattern_tenderee_left,
|
|
self.pattern_tenderee_left_w0,
|
|
self.pattern_tenderee_left_w0,
|
|
@@ -1318,6 +1373,8 @@ class RoleRulePredictor():
|
|
list_sentence.sort(key=lambda x: x.sentence_index) # 2022/1/5 按句子顺序排序
|
|
list_sentence.sort(key=lambda x: x.sentence_index) # 2022/1/5 按句子顺序排序
|
|
# list_name = list_codename["name"]
|
|
# list_name = list_codename["name"]
|
|
list_name = [] # 2022/1/5 改为实体列表内所有项目名称
|
|
list_name = [] # 2022/1/5 改为实体列表内所有项目名称
|
|
|
|
+ candidates = [] # 保存不能确定为第几的候选人 2023/04/14
|
|
|
|
+ notfound_tenderer = True # 未找到前三候选人
|
|
for entity in list_entity:
|
|
for entity in list_entity:
|
|
if entity.entity_type == 'name':
|
|
if entity.entity_type == 'name':
|
|
list_name.append(entity.entity_text)
|
|
list_name.append(entity.entity_text)
|
|
@@ -1385,12 +1442,6 @@ class RoleRulePredictor():
|
|
break
|
|
break
|
|
if find_flag:
|
|
if find_flag:
|
|
break
|
|
break
|
|
- # if str(_name).find(p_entity.entity_text)>=0:
|
|
|
|
- # find_flag = True
|
|
|
|
- # _label = 0
|
|
|
|
- # p_entity.label = _label
|
|
|
|
- # p_entity.values[int(_label)] = on_value
|
|
|
|
- # break
|
|
|
|
# 若是实体在标题中,默认为招标人,不进行以下的规则匹配
|
|
# 若是实体在标题中,默认为招标人,不进行以下的规则匹配
|
|
if find_flag:
|
|
if find_flag:
|
|
continue
|
|
continue
|
|
@@ -1412,6 +1463,7 @@ class RoleRulePredictor():
|
|
list_sentence[s_index].sentence_text.replace('(', '').replace(')', '')[:100]):
|
|
list_sentence[s_index].sentence_text.replace('(', '').replace(')', '')[:100]):
|
|
p_entity.label = 2
|
|
p_entity.label = 2
|
|
p_entity.values[2] = 0.5
|
|
p_entity.values[2] = 0.5
|
|
|
|
+ notfound_tenderer = False
|
|
# log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
|
|
# log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
|
|
break
|
|
break
|
|
except Exception as e:
|
|
except Exception as e:
|
|
@@ -1443,7 +1495,7 @@ class RoleRulePredictor():
|
|
_weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
|
|
_weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
|
|
# _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
# _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
# "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
# "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
- if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
|
|
|
|
+ if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选|(排名|排序|名次):([4-9]|\d{2,})', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
|
|
list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
|
|
_flag = True
|
|
_flag = True
|
|
_label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
_label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
@@ -1451,7 +1503,7 @@ class RoleRulePredictor():
|
|
_prob_weight = 1.2 if _weight=='w1' else 1
|
|
_prob_weight = 1.2 if _weight=='w1' else 1
|
|
# print('_v_group:',_group, _v_group, p_entity.entity_text)
|
|
# print('_v_group:',_group, _v_group, p_entity.entity_text)
|
|
|
|
|
|
- if _i_span == 1 and _direct == "center":
|
|
|
|
|
|
+ if _i_span == 1 and _direct == "center" and _v_group.find(p_entity.entity_text) != -1 and re.search('以[^,。;]{10,30}为准', list_spans[1])==None:
|
|
_flag = True
|
|
_flag = True
|
|
_label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
_label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
"secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
"secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
@@ -1465,12 +1517,19 @@ class RoleRulePredictor():
|
|
_prob_weight = 1.2 if _weight == 'w1' else 1
|
|
_prob_weight = 1.2 if _weight == 'w1' else 1
|
|
# print('_v_group:', _group, _v_group, p_entity.entity_text)
|
|
# print('_v_group:', _group, _v_group, p_entity.entity_text)
|
|
|
|
|
|
- # 得到结果
|
|
|
|
|
|
+ # 得到结果
|
|
if _flag:
|
|
if _flag:
|
|
|
|
+ if _label in [2, 3, 4]:
|
|
|
|
+ notfound_tenderer = False
|
|
p_entity.label = _label
|
|
p_entity.label = _label
|
|
p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
|
|
p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
|
|
# log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
|
|
# log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
|
|
break
|
|
break
|
|
|
|
+ if _i_span == 0 and re.search(self.condadate_left, list_spans[_i_span]):
|
|
|
|
+ candidates.append(p_entity)
|
|
|
|
+
|
|
|
|
+ elif str(p_entity.label) in ['2', '3', '4']:
|
|
|
|
+ notfound_tenderer = False
|
|
|
|
|
|
# 其他金额通过正则召回可能是招标或中投标的金额
|
|
# 其他金额通过正则召回可能是招标或中投标的金额
|
|
if p_entity.entity_type in ["money"]:
|
|
if p_entity.entity_type in ["money"]:
|
|
@@ -1509,6 +1568,14 @@ class RoleRulePredictor():
|
|
p_entity.label = 0
|
|
p_entity.label = 0
|
|
# print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
|
|
# print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
|
|
|
|
|
|
|
|
+ if notfound_tenderer and len(candidates) == 1 and re.search(
|
|
|
|
+ '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书',
|
|
|
|
+ article.content[:100]):
|
|
|
|
+ for p_entity in candidates:
|
|
|
|
+ # print('只有一个候选人的作为中标人', p_entity.entity_text)
|
|
|
|
+ p_entity.label = 2
|
|
|
|
+ p_entity.values[2] = on_value
|
|
|
|
+
|
|
# 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额
|
|
# 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额
|
|
list_p = []
|
|
list_p = []
|
|
state = 0
|
|
state = 0
|
|
@@ -2860,7 +2927,9 @@ class ProductAttributesPredictor():
|
|
if re.search('^\w{1,4}$', tds[id2_2]):
|
|
if re.search('^\w{1,4}$', tds[id2_2]):
|
|
quantity_unit = tds[id2_2]
|
|
quantity_unit = tds[id2_2]
|
|
if id3 != "":
|
|
if id3 != "":
|
|
- if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
|
|
|
|
|
|
+ if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
|
|
|
|
+ unitPrice = tds[id3]
|
|
|
|
+ elif re.search('^[\d,.亿万元人民币欧美日金额:()()]+$', tds[id3].strip()):
|
|
unitPrice = tds[id3]
|
|
unitPrice = tds[id3]
|
|
# _unitPrice = tds[id3]
|
|
# _unitPrice = tds[id3]
|
|
# re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
|
|
# re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
|
|
@@ -3254,11 +3323,11 @@ class DocChannel():
|
|
'公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
|
|
'公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
|
|
'公告变更neg': '履约变更内容',
|
|
'公告变更neg': '履约变更内容',
|
|
'候选人公示': '候选人公示|评标结果公示|中标候选人名单公示|现将中标候选人(进行公示|公[示布]如下)|(中标|中选)候选人(信息|情况)[::\s]',
|
|
'候选人公示': '候选人公示|评标结果公示|中标候选人名单公示|现将中标候选人(进行公示|公[示布]如下)|(中标|中选)候选人(信息|情况)[::\s]',
|
|
- '候选人公示neg': '中标候选人公示期',
|
|
|
|
|
|
+ '候选人公示neg': '中标候选人公示期|中标候选人公示前',
|
|
'中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果|竞价结果)\w{,4}(进行公示|公[示布]如下)|(询价|竞价|遴选)(成交|中标|中选)(公告|公示)|(成交|中标|中选|选定|选取|入围|询价)结果(如下|公告|公示)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(人|成交)|成交)\w{,3}(信息|情况)[::\s]',
|
|
'中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果|竞价结果)\w{,4}(进行公示|公[示布]如下)|(询价|竞价|遴选)(成交|中标|中选)(公告|公示)|(成交|中标|中选|选定|选取|入围|询价)结果(如下|公告|公示)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(人|成交)|成交)\w{,3}(信息|情况)[::\s]',
|
|
'中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源(采购|招标)?的?(中标|成交|结果)', # |单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示
|
|
'中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源(采购|招标)?的?(中标|成交|结果)', # |单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示
|
|
'中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]', # |唯一
|
|
'中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]', # |唯一
|
|
- '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家|通知中标单位',
|
|
|
|
|
|
+ '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家|通知中标单位|影响(成交|中标)结果',
|
|
# |确定成交供应商[:,\s]
|
|
# |确定成交供应商[:,\s]
|
|
'合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|完成(日期|时间))|(供应商乙方|乙方供应商):|合同总?金额|履约信息',
|
|
'合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|完成(日期|时间))|(供应商乙方|乙方供应商):|合同总?金额|履约信息',
|
|
'废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标|废置)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
|
|
'废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标|废置)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
|
|
@@ -4918,7 +4987,7 @@ class TablePremExtractor(object):
|
|
roles = []
|
|
roles = []
|
|
if ners:
|
|
if ners:
|
|
for ner in ners[0]:
|
|
for ner in ners[0]:
|
|
- if ner[2] in ['org', 'company']:
|
|
|
|
|
|
+ if ner[2] in ['org', 'company', 'location']:
|
|
roles.append(ner[3])
|
|
roles.append(ner[3])
|
|
if roles and len(''.join(roles)) > len(text)*0.8:
|
|
if roles and len(''.join(roles)) > len(text)*0.8:
|
|
return roles[0]
|
|
return roles[0]
|
|
@@ -4943,8 +5012,10 @@ class TablePremExtractor(object):
|
|
win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
|
|
win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
|
|
|
|
|
|
if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset != set(): # 只要有一项为表头 停止匹配
|
|
if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset != set(): # 只要有一项为表头 停止匹配
|
|
|
|
+ # print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
|
|
break
|
|
break
|
|
if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2: # 内容为空或全部一样 停止匹配
|
|
if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2: # 内容为空或全部一样 停止匹配
|
|
|
|
+ # print('内容为空或全部一样 停止匹配')
|
|
break
|
|
break
|
|
if re.search('详见', project_name): # 去除某些表达: 详见招标文件
|
|
if re.search('详见', project_name): # 去除某些表达: 详见招标文件
|
|
project_name = ""
|
|
project_name = ""
|
|
@@ -5205,7 +5276,7 @@ class CandidateExtractor(object):
|
|
roles = []
|
|
roles = []
|
|
if ners:
|
|
if ners:
|
|
for ner in ners[0]:
|
|
for ner in ners[0]:
|
|
- if ner[2] in ['org', 'company']:
|
|
|
|
|
|
+ if ner[2] in ['org', 'company', 'location']:
|
|
roles.append(ner[3])
|
|
roles.append(ner[3])
|
|
if roles and len(''.join(roles)) > len(text)*0.8:
|
|
if roles and len(''.join(roles)) > len(text)*0.8:
|
|
return roles[0]
|
|
return roles[0]
|
|
@@ -5228,9 +5299,9 @@ class CandidateExtractor(object):
|
|
second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
|
|
second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
|
|
third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
|
|
third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
|
|
|
|
|
|
- if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_sort, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配
|
|
|
|
|
|
+ if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配
|
|
break
|
|
break
|
|
- if len(set([package_code_raw, candidate_, win_or_not, bid_amount_, win_sort, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2: # 全部为空或内容一样 停止匹配
|
|
|
|
|
|
+ if len(set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2: # 全部为空或内容一样 停止匹配
|
|
break
|
|
break
|
|
|
|
|
|
if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0: # 修复某些表头不说 排名,直接用候选人代替
|
|
if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0: # 修复某些表头不说 排名,直接用候选人代替
|
|
@@ -5412,6 +5483,7 @@ class CandidateExtractor(object):
|
|
|
|
|
|
def predict(self, html, list_sentences, list_entitys, nlp_enterprise):
|
|
def predict(self, html, list_sentences, list_entitys, nlp_enterprise):
|
|
self.nlp_enterprise = nlp_enterprise
|
|
self.nlp_enterprise = nlp_enterprise
|
|
|
|
+ html = html.replace('比选申请单位', '中标候选人') # 82347769
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
|
|
richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
|
|
if richText:
|
|
if richText:
|