|
@@ -471,6 +471,7 @@ class CodeNamePredict():
|
|
|
# print(the_code)
|
|
|
#add code to entitys
|
|
|
list_entity.append(temp_entitys[h])
|
|
|
+ in_att = 1 if temp_entitys[h].in_attachment else 0 # 是否在附件
|
|
|
if re.search(',|/|;|、|,', the_code) and len(the_code)>25:
|
|
|
for it in re.split(',|/|;|、|,', the_code):
|
|
|
if len(it) > 8:
|
|
@@ -478,44 +479,44 @@ class CodeNamePredict():
|
|
|
code_set.add(it)
|
|
|
# item['code'].append(it)
|
|
|
if re.search("(项目编号|招标编号):?$", pre_text[h]):
|
|
|
- item['code'].append((it, 0, sentence.sentence_index))
|
|
|
+ item['code'].append((it, in_att, 0, sentence.sentence_index))
|
|
|
elif re.search('采购(计划)?编号:?$', pre_text[h]):
|
|
|
- item['code'].append((it, 1, sentence.sentence_index))
|
|
|
+ item['code'].append((it, in_att, 1, sentence.sentence_index))
|
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
|
- item['code'].append((it, 2, sentence.sentence_index))
|
|
|
+ item['code'].append((it, in_att, 2, sentence.sentence_index))
|
|
|
elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
- item['code'].append((it, 2.5, sentence.sentence_index))
|
|
|
+ item['code'].append((it, in_att, 2.5, sentence.sentence_index))
|
|
|
else:
|
|
|
- item['code'].append((it, 3, sentence.sentence_index))
|
|
|
+ item['code'].append((it, in_att, 3, sentence.sentence_index))
|
|
|
elif len(item['code']) > 0:
|
|
|
new_it = item['code'][-1][0] + re.search(',|/|;|、|,', the_code).group(0) + it
|
|
|
if new_it not in code_set:
|
|
|
code_set.add(new_it)
|
|
|
# item['code'][-1] = new_it
|
|
|
if re.search("(项目编号|招标编号):?$", pre_text[h]):
|
|
|
- item['code'][-1] = (new_it, 0, sentence.sentence_index)
|
|
|
+ item['code'][-1] = (new_it, in_att, 0, sentence.sentence_index)
|
|
|
elif re.search('采购(计划)?编号:?$', pre_text[h]):
|
|
|
- item['code'][-1] = (new_it, 1, sentence.sentence_index)
|
|
|
+ item['code'][-1] = (new_it, in_att, 1, sentence.sentence_index)
|
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
|
- item['code'][-1] = (new_it, 2, sentence.sentence_index)
|
|
|
+ item['code'][-1] = (new_it, in_att, 2, sentence.sentence_index)
|
|
|
elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
- item['code'].append((new_it, 2.5, sentence.sentence_index))
|
|
|
+ item['code'].append((new_it, in_att, 2.5, sentence.sentence_index))
|
|
|
else:
|
|
|
- item['code'][-1] = (new_it, 3, sentence.sentence_index)
|
|
|
+ item['code'][-1] = (new_it, in_att, 3, sentence.sentence_index)
|
|
|
else:
|
|
|
if the_code not in code_set:
|
|
|
code_set.add(the_code)
|
|
|
# item['code'].append(the_code)
|
|
|
if re.search("(项目编号|招标编号):?$", pre_text[h]):
|
|
|
- item['code'].append((the_code, 0, sentence.sentence_index))
|
|
|
+ item['code'].append((the_code, in_att, 0, sentence.sentence_index))
|
|
|
elif re.search('采购(计划)?编号:?$', pre_text[h]):
|
|
|
- item['code'].append((the_code, 1, sentence.sentence_index))
|
|
|
+ item['code'].append((the_code, in_att, 1, sentence.sentence_index))
|
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
|
- item['code'].append((the_code, 2, sentence.sentence_index))
|
|
|
+ item['code'].append((the_code, in_att, 2, sentence.sentence_index))
|
|
|
elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
- item['code'].append((the_code, 2.5, sentence.sentence_index))
|
|
|
+ item['code'].append((the_code, in_att, 2.5, sentence.sentence_index))
|
|
|
else:
|
|
|
- item['code'].append((the_code, 3, sentence.sentence_index))
|
|
|
+ item['code'].append((the_code, in_att, 3, sentence.sentence_index))
|
|
|
break
|
|
|
elif the_code not in code_set:
|
|
|
if len(the_code)<5: # 避免510545935 这种把 招标项目编号:2024年第二期 只提取2024
|
|
@@ -523,15 +524,15 @@ class CodeNamePredict():
|
|
|
code_set.add(the_code)
|
|
|
# item['code'].append(the_code)
|
|
|
if re.search("(项目编号|招标编号):?$", pre_text[h]):
|
|
|
- item['code'].append((the_code, 0, sentence.sentence_index))
|
|
|
+ item['code'].append((the_code, in_att, 0, sentence.sentence_index))
|
|
|
elif re.search('采购(计划)?编号:?$', pre_text[h]):
|
|
|
- item['code'].append((the_code, 1, sentence.sentence_index))
|
|
|
+ item['code'].append((the_code, in_att, 1, sentence.sentence_index))
|
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
|
- item['code'].append((the_code, 2, sentence.sentence_index))
|
|
|
+ item['code'].append((the_code, in_att, 2, sentence.sentence_index))
|
|
|
elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
- item['code'].append((the_code, 2.5, sentence.sentence_index))
|
|
|
+ item['code'].append((the_code, in_att, 2.5, sentence.sentence_index))
|
|
|
else:
|
|
|
- item['code'].append((the_code, 3, sentence.sentence_index))
|
|
|
+ item['code'].append((the_code, in_att, 3, sentence.sentence_index))
|
|
|
|
|
|
# if the_code not in code_set:
|
|
|
# code_set.add(the_code)
|
|
@@ -634,24 +635,25 @@ class CodeNamePredict():
|
|
|
# if othercode != None:
|
|
|
# item[1]['code'].append(othercode.group(2))
|
|
|
# 2020/11/23 大网站规则调整
|
|
|
+ in_att = 1 if sentence.in_attachment else 0
|
|
|
othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P<code>[^,。;:、]{6,30}[a-zA-Z0-9\号期])[\),。\u4e00-\u9fa5]', sentence.sentence_text)
|
|
|
if othercode != None:
|
|
|
# item['code'].append(othercode.group('code'))
|
|
|
- if re.search("(项目编号|招标编号):?$", othercode.group(0)):
|
|
|
- item['code'].append((othercode.group('code'), 0, sentence.sentence_index))
|
|
|
- elif re.search('采购(计划)?编号:?$', othercode.group(0)):
|
|
|
- item['code'].append((othercode.group('code'), 1, sentence.sentence_index))
|
|
|
- elif re.search('(询价|合同)编号:?$', othercode.group(0)):
|
|
|
- item['code'].append((othercode.group('code'), 2, sentence.sentence_index))
|
|
|
- elif re.search('(询价|合同|采购|招标|项目)标号:?$', othercode.group(0)):
|
|
|
- item['code'].append((othercode.group('code'), 2.5, sentence.sentence_index))
|
|
|
+ if re.search("(项目编号|招标编号):?", othercode.group(0)):
|
|
|
+ item['code'].append((othercode.group('code'), in_att, 0, sentence.sentence_index))
|
|
|
+ elif re.search('采购(计划)?编号:?', othercode.group(0)):
|
|
|
+ item['code'].append((othercode.group('code'), in_att, 1, sentence.sentence_index))
|
|
|
+ elif re.search('(询价|合同)编号:?', othercode.group(0)):
|
|
|
+ item['code'].append((othercode.group('code'), in_att, 2, sentence.sentence_index))
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?', othercode.group(0)):
|
|
|
+ item['code'].append((othercode.group('code'), in_att, 2.5, sentence.sentence_index))
|
|
|
else:
|
|
|
- item['code'].append((othercode.group('code'), 3, sentence.sentence_index))
|
|
|
+ item['code'].append((othercode.group('code'), in_att, 3, sentence.sentence_index))
|
|
|
# print('规则召回项目编号:', othercode.group('code'))
|
|
|
# item['code'] = [code for code in item['code'] if len(code)<500]
|
|
|
# item['code'].sort(key=lambda x:len(x),reverse=True)
|
|
|
item['code'] = [code for code in item['code'] if len(code[0]) < 500]
|
|
|
- item['code'].sort(key=lambda x: [x[1],x[2]])
|
|
|
+ item['code'].sort(key=lambda x: [x[1],x[2],x[3]])
|
|
|
item['code'] = [it[0] for it in item['code']]
|
|
|
result.append(item)
|
|
|
|
|
@@ -881,7 +883,7 @@ class PREMPredict():
|
|
|
elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
|
|
|
label = 5
|
|
|
elif label == 0:
|
|
|
- if re.search('拟邀请$|受邀谈判方|流入方名称:$|拟(选用|采用|选取)(单位|公司|企业)(名称)?:$|选择(建设|\w{,2})?服务单位:$|单一来源采购单位:$', front): # 修复 626700009 二、拟选用单位:海南和泰消防技术服务有限公司。 632486555 选择建设服务单位:四川富吉兴工程管理有限公司, 642115802 拟采用公司:山东久木影视传媒有限公司 654427839 单一来源采购单位:长沙新天地金融服务科技有限公司
|
|
|
+ if re.search('拟邀请$|受邀谈判方|流入方名称:$|拟?(选用|采用|选取)(单位|公司|企业)(名称)?:$|选择(建设|\w{,2})?服务单位:$|单一来源采购单位:$|中标通知书,致:$', front): # 修复 626700009 二、拟选用单位:海南和泰消防技术服务有限公司。 632486555 选择建设服务单位:四川富吉兴工程管理有限公司, 642115802 拟采用公司:山东久木影视传媒有限公司 654427839 单一来源采购单位:长沙新天地金融服务科技有限公司 659519320
|
|
|
label = 2
|
|
|
values[label] = 0.501
|
|
|
elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?|(采购|招标|发布)机构)(名称)?[是为:]+', front) and is_agency(entity.entity_text):
|
|
@@ -903,6 +905,12 @@ class PREMPredict():
|
|
|
label = 5
|
|
|
elif re.search('交易单位|组织单位|发起组织', front[-10:]):
|
|
|
values[0] = 0.5
|
|
|
+ elif re.search('拟委托单位:$', front) and re.search('报价|价格', behind[:6]): # 修复 653102137 拟委托单位:海南炫光视听文化传媒有限公司,项目报价:382125元,
|
|
|
+ label = 2
|
|
|
+ values[2] = 0.501
|
|
|
+ elif re.search('拟将$', front) and re.search('^作为', behind): # 修复 659891919 中标预测错招标 拟将 贵州海柏纳斯工程项目管理有限公司 作为2025年度黔东南州公益性演出服务采购项目的招标代理机构
|
|
|
+ label = 2
|
|
|
+ values[2] = 0.501
|
|
|
elif label == 2:
|
|
|
if re.search('中标单位和.{,25}签订合同', whole):
|
|
|
label = 0
|
|
@@ -1517,11 +1525,11 @@ class RoleRulePredictor():
|
|
|
self.pattern_tenderee_left_60 = "(?P<tenderee_left_60>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包)" \
|
|
|
"(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行))"\
|
|
|
"[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章|异议受理部门)[))])?(是|为|:|:|,|\s*)+$)" # 367784094 隆道-大企业采购平台 采购商:C5石油树脂-中国建材集团有限公司-四川省/成都市/市辖区
|
|
|
- self.pattern_tenderee_left_50 = "(?P<tenderee_left_50>((所需|需[用求]|购货|征集|发布|交易发起|开户|申报|填报|开票|收货)" \
|
|
|
- "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
|
|
|
+ self.pattern_tenderee_left_50 = "(?P<tenderee_left_50>((所需|需[用求]|购货|征集|发布|交易发起|开户|申报|填报|开票|收货|竞拍发起|发标)" \
|
|
|
+ "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址|询价商家)" \
|
|
|
"[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章|异议受理部门)[))])?(是|为|:|:|\s*)+$|(采购商|招标人):(\w{2,10}-)?$|实施主体(基本情况,)?名称:$)"
|
|
|
- self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
|
|
|
- self.pattern_tenderee_right = "(?P<tenderee_right_50>^(机关)?,?([((](以下简称)?[,\"“]*((招标|采购)(人|单位|机构)|(服务)?购买方)(名称)?[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位))" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价) # 20250605去掉 |^关于 根据《广东省自然资源厅关于开展2024年度耕地资源分区分类评价更新工作的通知》
|
|
|
+ self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)|与\w{5,20}签订《成交确认书》))" # 659470425 成交后,应当即与澄迈县自然资源和规划局签订《成交确认书》
|
|
|
+ self.pattern_tenderee_right = "(?P<tenderee_right_50>^(机关)?,?([((](以下简称)?[,\"“:]*((招标|采购|发标)(人|单位|机构)|(服务)?购买方)(名称)?[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位))" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价) # 20250605去掉 |^关于 根据《广东省自然资源厅关于开展2024年度耕地资源分区分类评价更新工作的通知》
|
|
|
self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
|
|
|
self.pattern_agency_left = "(?P<agency_left>((代理|拍卖)(?:人|机构|公司|企业|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构|(采购|招标)代理)(名称|.{,4}名,?称|全称)?(是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
|
|
|
self.pattern_agency_right = "(?P<agency_right>^,?([((](以下简称)?[,\"“]*(代理)(人|单位|机构)(名称)?[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号
|
|
@@ -1536,23 +1544,23 @@ class RoleRulePredictor():
|
|
|
"(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
|
|
|
"(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$|选取(情况|说明):中选,中介机构名称:$|排名如下:1、$|第[一1]名,?投标(人|单位|银行|公司):$)" # 解决表头识别不到加逗号情况,需前面为,。空 20240621补充 中选 云南省投资审批中介超市 补充排名如下 南阳师范学院
|
|
|
self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)" \
|
|
|
- "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \
|
|
|
+ "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取|[及和](\w{,2}金额|\w{,2}价格|资格))?[::是为]+$" \
|
|
|
"|结果公示如下:摇出球号:\d+号,中介机构:$|(中标|成交|中选)(单位|人|供应商)及\w{,2}金额:$)" # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # 中标候选人不能作为中标 # |直购企业:$不能作为中标人,看到有些公告会又多个公司,然后还会发布中选结果的公告,其中一个公司中标
|
|
|
|
|
|
self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商|银行)))|" \
|
|
|
"^((报价|价格)最低,|以\w{5,10})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
|
|
|
"|^:贵公司参与|^:?你方于|^(胜出)?(中标|成交)[,。]|^取得中标(单位)?资格|^以\d+[\d,.]+万?元(中标|成交|中选)|^(公司)?:恭喜您中标" \
|
|
|
- "|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))])|^确定为(中标|成交|中选)人)" # 去掉 |\w{,20} 修复 460216955 网上公布的与本次采购项目有关的信息视为已送达各响应供应商。 作为中标 # 633061180 尊敬的如皋市中正机械有限公司公司:恭喜您中标荆州建华张拉套筒询价
|
|
|
+ "|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))])|^确定为(中标|成交|中选)人|^,中标候选单位名次:第一中标候选人|^分数排[在名]第一[位名]?)" # 去掉 |\w{,20} 修复 460216955 网上公布的与本次采购项目有关的信息视为已送达各响应供应商。 作为中标 # 633061180 尊敬的如皋市中正机械有限公司公司:恭喜您中标荆州建华张拉套筒询价
|
|
|
self.pattern_winTenderer_whole = "(?P<winTenderer_center>(贵公司|由).{,15}以\w{,15}中标|确定[\w()]{5,20}为[^,。;]{5,50}的?中标单位" \
|
|
|
"|选定报价最低的[“”\w()]{5,25}为[^,。;]{5,50}的?(服务|中标|成交)单位" \
|
|
|
"|拟邀请[\w()]{5,20}(进行)?单一来源谈判|(承办单位|报价人|投标人|中介机构)(名称)?:[\w()]{5,20},(中标|承办|中选)(价格|金额)" \
|
|
|
"|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购|供应商名称:[()\w]{5,20},独家采购原因)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
|
|
|
self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
|
|
|
- self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
|
|
|
+ self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行))|^,中标候选单位名次:第二中标候选人)"
|
|
|
|
|
|
self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
|
|
|
- self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
|
|
|
+ self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行))|^,中标候选单位名次:第三中标候选人)"
|
|
|
|
|
|
self.candidate_left = "(?P<candidate_left>(((中[标选商]|成交|入围|入选)?候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位|候选企业)(1.)?(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人|[及与和](成交|中标)金额)?[::是为【]+(1[.、])?$)"
|
|
|
|
|
@@ -2081,9 +2089,9 @@ class RoleRuleFinalAdd():
|
|
|
# text_end = "".join(end_tokens)
|
|
|
text_end = "".join(sentence.tokens)
|
|
|
text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
|
|
|
- text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:.{,100})', '', text_end)[-200:] # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
|
|
|
+ text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:\w{,100})', '', text_end)[-200:] # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
|
|
|
# sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
|
|
|
- sear_ent = re.search('([,。;]|^)(?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
|
+ sear_ent = re.search('([,。;]|^)(?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*((公告|发布)(日期|时间):|(电子签章),)?[0-9零一二三四五六七八九十〇○]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/]([0-9零一二三四五六七八九十]{1,2}日?)?', text_end)
|
|
|
if sear_ent:
|
|
|
b, e = sear_ent.span()
|
|
|
if re.search('报价记录|竞价成交', text_end[max(b-10, 0):b] + text_end[e:]):
|
|
@@ -2092,16 +2100,16 @@ class RoleRuleFinalAdd():
|
|
|
if sear_ent == None:
|
|
|
text_end = list_articles[0].content[-100:]
|
|
|
sear_ent = re.search(
|
|
|
- '([,。;]|^)(?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?',
|
|
|
+ '([,。;]|^)(?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*((公告|发布)(日期|时间):|(电子签章),)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?',
|
|
|
text_end)
|
|
|
if sear_ent:
|
|
|
b, e = sear_ent.span()
|
|
|
if re.search('报价记录|竞价成交', text_end[max(b-10, 0):b] + text_end[e:]):
|
|
|
sear_ent = None
|
|
|
sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
|
|
|
- sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
|
- if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
|
|
|
- sear_ent2 = None
|
|
|
+ sear_ent2 = re.search('[,:](户名|开户名称|发票抬头)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000]) # |单位名称|名称
|
|
|
+ # if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
|
|
|
+ # sear_ent2 = None
|
|
|
sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
|
|
|
sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
|
|
|
sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent]
|
|
@@ -2462,16 +2470,23 @@ class RoleGrade():
|
|
|
:param codeName:
|
|
|
:return:
|
|
|
'''
|
|
|
- bid_info = []
|
|
|
+ bid_info = [] # 中标人大纲
|
|
|
+ ree_info = [] # 招标人大纲
|
|
|
if outlines:
|
|
|
- win_pattern = re.compile("^([((][一二三四五六七八九十\d]+[))]|[一二三四五六七八九十]+\s*[..、])[预拟]?((中标|中选|成交|(采购|招标|比选|比价)结果)((成交))?(人|单位|供应商)?的?(基本|主要)?(信息|情况|概况|结果)(如下)?|[预拟]?(中标|中选|成交)((成交))?(人|供应商|单位)(名称)?(、地址)?([及和]\w{,2}(价格|报价|金额))?(如下)?|中标公示单位:)[,:]?$")
|
|
|
+ ree_pattern = re.compile('^([((][一二三四五六七八九十\d]+[))]|[一二三四五六七八九十]+\s*[..、])凡?对本次(招标|采购|公告内容)提出询问,')
|
|
|
+ win_pattern = re.compile("^([((][一二三四五六七八九十\d]+[))]|[一二三四五六七八九十]+\s*[..、])[预拟]?((中标|中选|成交|(采购|招标|比选|比价|定标)结果)((成交))?(人|单位|供应商)?的?(基本|主要)?(信息|情况|概况|结果)(如下)?|[预拟]?(中标|中选|成交)((成交))?(人|供应商|单位)(名称)?(、地址)?([及和]\w{,2}(价格|报价|金额))?(如下)?|中标公示单位|(采购|招标|比选|比价|定标)结果)[,:]?$")
|
|
|
for outline in outlines:
|
|
|
text_, title_type, title_index, next_index, scope = outline
|
|
|
if re.search(win_pattern, text_) and re.search('(未|没|是否)(中标|成交)|中标单位合同签订主体|业绩|提供', text_)==None:
|
|
|
bid_info.append(scope)
|
|
|
- # log("提取的中标大纲:%s, docid:%s"%(text_, docid))
|
|
|
+ log("提取的中标大纲:%s, docid:%s"%(text_, docid))
|
|
|
+ elif re.search(ree_pattern, text_):
|
|
|
+ ree_info.append(scope)
|
|
|
+ log("提取的招标大纲:%s, docid:%s"%(text_, docid))
|
|
|
have_winner = False
|
|
|
+ have_ree = False
|
|
|
bid_info_company = []
|
|
|
+ ree_info_company = []
|
|
|
|
|
|
sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
|
|
|
role2id = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}
|
|
@@ -2580,6 +2595,21 @@ class RoleGrade():
|
|
|
if entity.label == 2 and entity.values[entity.label] > 0.5:
|
|
|
have_winner = True
|
|
|
|
|
|
+ if ree_info and entity.entity_type in ['org', 'company']: # 招标大纲下实体
|
|
|
+ for scope in ree_info:
|
|
|
+ if scope[0][0] == scope[1][0]:
|
|
|
+ if entity.sentence_index == scope[0][0] and scope[0][1] < entity.wordOffset_begin and entity.wordOffset_end <= scope[1][1]:
|
|
|
+ ree_info_company.append(entity)
|
|
|
+ else:
|
|
|
+ if entity.sentence_index == scope[0][0] and entity.wordOffset_begin > scope[0][1]:
|
|
|
+ ree_info_company.append(entity)
|
|
|
+ elif scope[0][0] < entity.sentence_index < scope[1][0]:
|
|
|
+ ree_info_company.append(entity)
|
|
|
+ elif entity.sentence_index == scope[1][0] and entity.wordOffset_end <scope[1][1]:
|
|
|
+ ree_info_company.append(entity)
|
|
|
+ if entity.label == 0 and entity.values[entity.label] > 0.5:
|
|
|
+ have_ree = True
|
|
|
+
|
|
|
if org_tenderee == [] and agency_like_tenderee:
|
|
|
for entity in agency_like_tenderee:
|
|
|
entity.label = 0
|
|
@@ -2628,6 +2658,17 @@ class RoleGrade():
|
|
|
# entity.label = 2
|
|
|
# entity.values[entity.label] = 0.55
|
|
|
# log('大纲规则补充中标人:%s, docid:%s' % (entity.entity_text, docid))
|
|
|
+ if have_ree == False and ree_info_company: # 优化 638075055 二、推荐中标候选人信息,邯郸市婷元紧固件制造有限公司第一名,
|
|
|
+ for entity in ree_info_company:
|
|
|
+ text = sentences[entity.sentence_index].sentence_text
|
|
|
+ b = entity.wordOffset_begin
|
|
|
+ e = entity.wordOffset_end
|
|
|
+ if re.search('(单位名称|([^\w]|^)名称):$', text[max(0, b-span-2):b]) and entity.label == 5:
|
|
|
+ entity.label = 0
|
|
|
+ entity.values[entity.label] = 0.55
|
|
|
+ log('大纲规则补充招标人:%s, docid:%s'%(entity.entity_text, docid))
|
|
|
+ break
|
|
|
+
|
|
|
|
|
|
class MoneyGrade():
|
|
|
def __init__(self):
|
|
@@ -6345,11 +6386,15 @@ class DistrictPredictor():
|
|
|
if pro_idx in tmp_pro:
|
|
|
tmp_pro[pro_idx] += score
|
|
|
else:
|
|
|
+ if score < 1: # 分数小于1的县级全称为多个城市重复名称,需过滤
|
|
|
+ continue
|
|
|
tmp_pro[pro_idx] = score
|
|
|
city_idx = idx_dic[idx]['市']
|
|
|
if city_idx in tmp_city:
|
|
|
tmp_city[city_idx] += score
|
|
|
else:
|
|
|
+ if score < 1: # 分数小于1的县级全称为多个城市重复名称,需过滤
|
|
|
+ continue
|
|
|
tmp_city[city_idx] = score
|
|
|
elif name in short_dic['district']:
|
|
|
for idx in short_dic['district'][name]:
|
|
@@ -7059,6 +7104,8 @@ class TableTag2List():
|
|
|
# check multiple rows
|
|
|
# pdb.set_trace()
|
|
|
row_span = int(re.sub('[^0-9]', '', cell.get('rowspan'))) if cell.get('rowspan') and cell.get('rowspan').isdigit() else 1
|
|
|
+ if row_span == 0: # 20250806 修复 659303627 附件OCR重构表格span为0导致缺少问题
|
|
|
+ row_span = 1
|
|
|
|
|
|
# try updating smallest_row_span
|
|
|
smallest_row_span = min(smallest_row_span, row_span)
|
|
@@ -7067,6 +7114,8 @@ class TableTag2List():
|
|
|
col_span = int(re.sub('[^0-9]', '', cell.get('colspan'))) if cell.get('colspan') and cell.get('colspan').isdigit() else 1
|
|
|
if col_span > 20: # 修复 335590254 山东港口阳光智采e平台 数据源表格第一行colspan为200超过50列造成无法提取问题
|
|
|
col_span = 20
|
|
|
+ elif col_span == 0: # 20250806 修复 659303627 附件OCR重构表格span为0导致缺少问题
|
|
|
+ col_span = 1
|
|
|
|
|
|
# find the right index
|
|
|
while True:
|
|
@@ -7210,7 +7259,7 @@ class TablePremExtractor(object):
|
|
|
self.head_rule_dic = {
|
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码|代码)",
|
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
|
|
|
- "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
|
+ "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|^包)(名称?|内容)", # |货物|商品|产品|设备|通用|主要标的 20250812 货物的不作为项目名称
|
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
|
'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因|中标情况|^中标结果$',
|
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟?推荐(入选|入围)?)?供应商(名称)?$",
|
|
@@ -7341,6 +7390,7 @@ class TablePremExtractor(object):
|
|
|
def extract_from_df(self, df, headers, web_source_name, all_winner=False):
|
|
|
prem_dic = {}
|
|
|
previous_package = "" # 上一行包号
|
|
|
+ previous_project_name = "" # 上一行项目名称
|
|
|
multi_same_package = False # 非连续的重复包号
|
|
|
package_fix2raw = dict() # 处理后包号:处理前包号 字典
|
|
|
link_set = set()
|
|
@@ -7386,10 +7436,10 @@ class TablePremExtractor(object):
|
|
|
if re.search('合计|总计', package_code+project_code+project_name):
|
|
|
continue
|
|
|
if package_code + project_code == previous_package: # 处理 208162730 一个包采购多种东西情况
|
|
|
- same_package = True
|
|
|
+ if project_name == previous_project_name:
|
|
|
+ same_package = True
|
|
|
if previous_package!="": # 有包号或项目编号且跟上一行相同时,去除项目名称
|
|
|
project_name = ''
|
|
|
- previous_package = package_code + project_code
|
|
|
if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取 防止类似 328485591 作为多包
|
|
|
break
|
|
|
if win_or_not != "" and (re.search('(建议|推荐)(中标|成交|中选)|是|^(中标|成交|中选)', win_or_not)==None or re.search('\w', win_or_not)==None): # 2024/04/2 修复 252208201 为空的不中标
|
|
@@ -7407,6 +7457,8 @@ class TablePremExtractor(object):
|
|
|
# tenderer = tenderer if self.is_role(tenderer) else ""
|
|
|
|
|
|
package = uniform_package_name(package_code) if package_code else '自增1' # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
|
|
|
+ if package == '自增1' and project_name != '':
|
|
|
+ package = "自增%s"%str(len(prem_dic) + 1) if previous_project_name != project_name else "自增%s"%str(len(prem_dic))
|
|
|
if project_name != "" and package.startswith('自增'):
|
|
|
pk_l = find_package(project_name)
|
|
|
if len(pk_l)==1:
|
|
@@ -7575,6 +7627,8 @@ class TablePremExtractor(object):
|
|
|
for k, v in package_fix2raw.items():
|
|
|
if k in prem_dic:
|
|
|
prem_dic[v] = prem_dic.pop(k)
|
|
|
+ previous_package = package_code + project_code
|
|
|
+ previous_project_name = project_name
|
|
|
if len(tenderer_list)>2 and len(set(tenderer_list))==1 and "package_code" not in headers: # 没提取到包号且中标人一样应该是错误多包,需去掉多包 例 244355092 281854766
|
|
|
total_money = 0
|
|
|
for v in prem_dic.values():
|
|
@@ -8986,7 +9040,7 @@ class EntityTypeRulePredictor():
|
|
|
self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址区]([((]网址[))])?[:为]'
|
|
|
self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|供货|卸货)((期|时间)[及和、])?)?(地[点址区]?|区域)[:为]'
|
|
|
self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|展示|看样|拍卖|标的物)(实施|服务|现场)?(地[点址区]|位置|所在地区?)(位于)?[:为]|项目位于|[^\w]所[属在](区域|地区?):|存放地[点址]?[:为]' # 银行所属区域:北京市西城区 不作项目地址
|
|
|
- self.pattern_addr_contact = '((联系|收件人?|邮寄)地[点址区]|行政区)[:为]'
|
|
|
+ self.pattern_addr_contact = '((联系|收件人?|邮寄|报名)地[点址区]|行政区)[:为]'
|
|
|
self.pattern_time_planned = '(计划|预计|预期)(招标|采购|发标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
|
|
|
self.pattern_code_investment = '投资(审批)?项目[编代]码[:为]'
|
|
|
self.pattern_addr_dic = {'addr_bidopen': self.pattern_addr_bidopen,
|
|
@@ -9026,7 +9080,8 @@ class EntityTypeRulePredictor():
|
|
|
ser2 = re.search('(%s)(?P<addr>[\w():\.-]{5,100}[,。]|(\w{2,8}[省市县])+)'%self.pattern_addr_bidsend, list_articles[0].content)
|
|
|
ser3 = re.search('(%s)(?P<addr>[\w()-]{5,100}[,。]|(\w{2,8}[省市县])+)'%self.pattern_addr_delivery, list_articles[0].content)
|
|
|
ser4 = re.search('(%s)(?P<addr>[\w()-]{5,100}[,。]|(\w{2,8}[省市县])+)'%self.pattern_addr_project, list_articles[0].content)
|
|
|
- ser5 = re.search('(%s)(?P<code>[\da-zA-Z()-]{5,30})[,。]'%self.pattern_code_investment, list_articles[0].content)
|
|
|
+ ser5 = re.search('(%s)(?P<addr>[\w()-]{5,100}[,。]|(\w{2,8}[省市县])+)'%self.pattern_addr_contact, list_articles[0].content)
|
|
|
+ ser6 = re.search('(%s)(?P<code>[\da-zA-Z()-]{5,30})[,。]'%self.pattern_code_investment, list_articles[0].content)
|
|
|
if ser1 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司', ser1.group('addr')) and addr_dic.get('addr_bidopen', '') in ser1.group('addr'):
|
|
|
addr_dic['addr_bidopen'] = ser1.group('addr')
|
|
|
if ser2 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司', ser2.group('addr')) and addr_dic.get('addr_bidsend', '') in ser2.group('addr'):
|
|
@@ -9035,8 +9090,10 @@ class EntityTypeRulePredictor():
|
|
|
addr_dic['addr_delivery'] = ser3.group('addr')
|
|
|
if ser4 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]', ser4.group('addr')) and addr_dic.get('addr_project', '') in ser4.group('addr'):
|
|
|
addr_dic['addr_project'] = ser4.group('addr')
|
|
|
- if ser5 and code_investment == '':
|
|
|
- code_investment = ser5.group('code')
|
|
|
+ if ser5 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]', ser5.group('addr')) and addr_dic.get('addr_contact', '') in ser5.group('addr'):
|
|
|
+ addr_dic['addr_contact'] = ser5.group('addr')
|
|
|
+ if ser6 and code_investment == '':
|
|
|
+ code_investment = ser6.group('code')
|
|
|
|
|
|
return addr_dic, time_dic, code_investment
|
|
|
|
|
@@ -9388,25 +9445,25 @@ if __name__=="__main__":
|
|
|
# # print("cost_time:", json.loads(requests_result.text)['cost_time'])
|
|
|
# # print(MAX_LEN, len(sentence), len(list_sentence))
|
|
|
|
|
|
- docid = ""
|
|
|
- title = ''
|
|
|
- with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
- html = f.read()
|
|
|
- product_attr = ProductAttributesPredictor()
|
|
|
- rs = product_attr.predict(docid='', html=html, page_time="")
|
|
|
- print(rs)
|
|
|
-
|
|
|
# docid = ""
|
|
|
- # title = '甘肃省妇幼保健院(甘肃省中心医院)(第二期)采购结果公告'
|
|
|
+ # title = ''
|
|
|
# with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
# html = f.read()
|
|
|
- # tb_extract = TablePremExtractor()
|
|
|
- # rs = tb_extract.predict(html, [
|
|
|
- # "江苏中联铸本混凝土有限公司",
|
|
|
- # "鼓楼区协荣机械设备经销部"
|
|
|
- # ], web_source_name = '', all_winner=False)
|
|
|
- # print('标段数:',len(rs[0]))
|
|
|
+ # product_attr = ProductAttributesPredictor()
|
|
|
+ # rs = product_attr.predict(docid='', html=html, page_time="")
|
|
|
# print(rs)
|
|
|
+
|
|
|
+ docid = ""
|
|
|
+ title = '甘肃省妇幼保健院(甘肃省中心医院)(第二期)采购结果公告'
|
|
|
+ with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
+ html = f.read()
|
|
|
+ tb_extract = TablePremExtractor()
|
|
|
+ rs = tb_extract.predict(html, [
|
|
|
+ "江苏中联铸本混凝土有限公司",
|
|
|
+ "鼓楼区协荣机械设备经销部"
|
|
|
+ ], web_source_name = '', all_winner=False)
|
|
|
+ print('标段数:',len(rs[0]))
|
|
|
+ print(rs)
|
|
|
# # bdscore = BiddingScore()
|
|
|
# # rs = bdscore.predict(html)
|
|
|
# # print(type(rs), len(rs))
|