|
@@ -28,6 +28,8 @@ import calendar
|
|
import datetime
|
|
import datetime
|
|
from BiddingKG.dl.entityLink.entityLink import get_business_data
|
|
from BiddingKG.dl.entityLink.entityLink import get_business_data
|
|
from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
|
|
from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
|
|
|
|
+# from BiddingKG.dl.interface.getAttributes import turnMoneySource
|
|
|
|
+from BiddingKG.dl.common.Utils import del_tabel_achievement
|
|
from BiddingKG.dl.interface.getAttributes import turnMoneySource, extract_serviceTime
|
|
from BiddingKG.dl.interface.getAttributes import turnMoneySource, extract_serviceTime
|
|
from BiddingKG.dl.time.re_servicetime import extract_servicetime
|
|
from BiddingKG.dl.time.re_servicetime import extract_servicetime
|
|
# import fool # 统一用 selffool ,阿里云上只有selffool 包
|
|
# import fool # 统一用 selffool ,阿里云上只有selffool 包
|
|
@@ -436,6 +438,8 @@ class CodeNamePredict():
|
|
item['code'].append((it, 1, sentence.sentence_index))
|
|
item['code'].append((it, 1, sentence.sentence_index))
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
item['code'].append((it, 2, sentence.sentence_index))
|
|
item['code'].append((it, 2, sentence.sentence_index))
|
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((it, 2.5, sentence.sentence_index))
|
|
else:
|
|
else:
|
|
item['code'].append((it, 3, sentence.sentence_index))
|
|
item['code'].append((it, 3, sentence.sentence_index))
|
|
elif len(item['code']) > 0:
|
|
elif len(item['code']) > 0:
|
|
@@ -449,6 +453,8 @@ class CodeNamePredict():
|
|
item['code'][-1] = (new_it, 1, sentence.sentence_index)
|
|
item['code'][-1] = (new_it, 1, sentence.sentence_index)
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
item['code'][-1] = (new_it, 2, sentence.sentence_index)
|
|
item['code'][-1] = (new_it, 2, sentence.sentence_index)
|
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((new_it, 2.5, sentence.sentence_index))
|
|
else:
|
|
else:
|
|
item['code'][-1] = (new_it, 3, sentence.sentence_index)
|
|
item['code'][-1] = (new_it, 3, sentence.sentence_index)
|
|
else:
|
|
else:
|
|
@@ -461,6 +467,8 @@ class CodeNamePredict():
|
|
item['code'].append((the_code, 1, sentence.sentence_index))
|
|
item['code'].append((the_code, 1, sentence.sentence_index))
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
item['code'].append((the_code, 2, sentence.sentence_index))
|
|
item['code'].append((the_code, 2, sentence.sentence_index))
|
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((the_code, 2.5, sentence.sentence_index))
|
|
else:
|
|
else:
|
|
item['code'].append((the_code, 3, sentence.sentence_index))
|
|
item['code'].append((the_code, 3, sentence.sentence_index))
|
|
break
|
|
break
|
|
@@ -475,6 +483,8 @@ class CodeNamePredict():
|
|
item['code'].append((the_code, 1, sentence.sentence_index))
|
|
item['code'].append((the_code, 1, sentence.sentence_index))
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
item['code'].append((the_code, 2, sentence.sentence_index))
|
|
item['code'].append((the_code, 2, sentence.sentence_index))
|
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((the_code, 2.5, sentence.sentence_index))
|
|
else:
|
|
else:
|
|
item['code'].append((the_code, 3, sentence.sentence_index))
|
|
item['code'].append((the_code, 3, sentence.sentence_index))
|
|
|
|
|
|
@@ -581,6 +591,8 @@ class CodeNamePredict():
|
|
item['code'].append((othercode.group('code'), 1, sentence.sentence_index))
|
|
item['code'].append((othercode.group('code'), 1, sentence.sentence_index))
|
|
elif re.search('(询价|合同)编号:?$', othercode.group(0)):
|
|
elif re.search('(询价|合同)编号:?$', othercode.group(0)):
|
|
item['code'].append((othercode.group('code'), 2, sentence.sentence_index))
|
|
item['code'].append((othercode.group('code'), 2, sentence.sentence_index))
|
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', othercode.group(0)):
|
|
|
|
+ item['code'].append((othercode.group('code'), 2.5, sentence.sentence_index))
|
|
else:
|
|
else:
|
|
item['code'].append((othercode.group('code'), 3, sentence.sentence_index))
|
|
item['code'].append((othercode.group('code'), 3, sentence.sentence_index))
|
|
# print('规则召回项目编号:', othercode.group('code'))
|
|
# print('规则召回项目编号:', othercode.group('code'))
|
|
@@ -841,9 +853,9 @@ class PREMPredict():
|
|
elif re.search('尊敬的供应商:$', front):
|
|
elif re.search('尊敬的供应商:$', front):
|
|
label = 0
|
|
label = 0
|
|
values[label] = 0.501
|
|
values[label] = 0.501
|
|
- elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$', front): #修复第4以上的预测错为中标人
|
|
|
|
|
|
+ elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$|第[4-9四五六七八九十]名', front): #修复第4以上的预测错为中标人
|
|
label = 5
|
|
label = 5
|
|
- values[label] = 0.5
|
|
|
|
|
|
+ values[2] = 0.5
|
|
elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front) or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
|
|
elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front) or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
|
|
values[2] = 0.5
|
|
values[2] = 0.5
|
|
label = 5
|
|
label = 5
|
|
@@ -1394,10 +1406,10 @@ class RoleRulePredictor():
|
|
def __init__(self):
|
|
def __init__(self):
|
|
# (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
|
|
# (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
|
|
self.pattern_tenderee_left_55 = "(?P<tenderee_left_55>((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)" \
|
|
self.pattern_tenderee_left_55 = "(?P<tenderee_left_55>((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)" \
|
|
- "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|采购(执行|实施)单位)"\
|
|
|
|
|
|
+ "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行)|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|采购(执行|实施)单位)"\
|
|
"[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
|
|
"[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
|
|
self.pattern_tenderee_left_60 = "(?P<tenderee_left_60>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包)" \
|
|
self.pattern_tenderee_left_60 = "(?P<tenderee_left_60>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包)" \
|
|
- "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂))"\
|
|
|
|
|
|
+ "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行))"\
|
|
"[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)" # 367784094 隆道-大企业采购平台 采购商:C5石油树脂-中国建材集团有限公司-四川省/成都市/市辖区
|
|
"[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)" # 367784094 隆道-大企业采购平台 采购商:C5石油树脂-中国建材集团有限公司-四川省/成都市/市辖区
|
|
self.pattern_tenderee_left_50 = "(?P<tenderee_left_50>((所需|需[用求]|购货|征集|发布|交易发起|开户|申报|填报|开票|收货)" \
|
|
self.pattern_tenderee_left_50 = "(?P<tenderee_left_50>((所需|需[用求]|购货|征集|发布|交易发起|开户|申报|填报|开票|收货)" \
|
|
"(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
|
|
"(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
|
|
@@ -1409,19 +1421,19 @@ class RoleRulePredictor():
|
|
self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号
|
|
self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号
|
|
# 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
|
|
# 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
|
|
self.pattern_winTenderer_left_50 = "(?P<winTenderer_left_51>" \
|
|
self.pattern_winTenderer_left_50 = "(?P<winTenderer_left_51>" \
|
|
- "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
|
|
|
|
|
|
+ "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?|银行)(:?单位名称|:?名称|盖章)?[::是为]+$" \
|
|
"|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致|征集结果|选择中介|选择结果|成交对象|勘察人|(,|审计|处置|勘察|设计)服务单位|受托[人方])[::是为]+$" \
|
|
"|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致|征集结果|选择中介|选择结果|成交对象|勘察人|(,|审计|处置|勘察|设计)服务单位|受托[人方])[::是为]+$" \
|
|
- "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$" \
|
|
|
|
|
|
+ "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$|竞争性选择申请人名称:$" \
|
|
"|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格),$" \
|
|
"|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格),$" \
|
|
"|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$|经讨论,决定采用$)" # 承办单位:不作为中标 83914772
|
|
"|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$|经讨论,决定采用$)" # 承办单位:不作为中标 83914772
|
|
self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
|
|
self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
|
|
- "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
|
|
|
|
- "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$|选取(情况|说明):中选,中介机构名称:$|排名如下:1、$)" # 解决表头识别不到加逗号情况,需前面为,。空 20240621补充 中选 云南省投资审批中介超市 补充排名如下 南阳师范学院
|
|
|
|
- self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)" \
|
|
|
|
|
|
+ "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
|
|
|
|
+ "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$|选取(情况|说明):中选,中介机构名称:$|排名如下:1、$|第[一1]名,?投标(人|单位|银行|公司):$)" # 解决表头识别不到加逗号情况,需前面为,。空 20240621补充 中选 云南省投资审批中介超市 补充排名如下 南阳师范学院
|
|
|
|
+ self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)" \
|
|
"(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \
|
|
"(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \
|
|
"|结果公示如下:摇出球号:\d+号,中介机构:$)" # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # 中标候选人不能作为中标 # |直购企业:$不能作为中标人,看到有些公告会又多个公司,然后还会发布中选结果的公告,其中一个公司中标
|
|
"|结果公示如下:摇出球号:\d+号,中介机构:$)" # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系 # 中标候选人不能作为中标 # |直购企业:$不能作为中标人,看到有些公告会又多个公司,然后还会发布中选结果的公告,其中一个公司中标
|
|
|
|
|
|
- self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商)))|" \
|
|
|
|
|
|
+ self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商|银行)))|" \
|
|
"^((报价|价格)最低,|以\w{5,10})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
|
|
"^((报价|价格)最低,|以\w{5,10})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
|
|
"|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格|^以\d+[\d,.]+万?元(中标|成交|中选)" \
|
|
"|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格|^以\d+[\d,.]+万?元(中标|成交|中选)" \
|
|
"|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))" # 去掉 |\w{,20} 修复 460216955 网上公布的与本次采购项目有关的信息视为已送达各响应供应商。 作为中标
|
|
"|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))" # 去掉 |\w{,20} 修复 460216955 网上公布的与本次采购项目有关的信息视为已送达各响应供应商。 作为中标
|
|
@@ -1430,13 +1442,13 @@ class RoleRulePredictor():
|
|
"|拟邀请[\w()]{5,20}(进行)?单一来源谈判|(承办单位|报价人|投标人|中介机构)(名称)?:[\w()]{5,20},(中标|承办|中选)(价格|金额)" \
|
|
"|拟邀请[\w()]{5,20}(进行)?单一来源谈判|(承办单位|报价人|投标人|中介机构)(名称)?:[\w()]{5,20},(中标|承办|中选)(价格|金额)" \
|
|
"|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购|供应商名称:[()\w]{5,20},独家采购原因)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
"|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购|供应商名称:[()\w]{5,20},独家采购原因)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
|
|
|
- self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
|
|
|
|
- self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
|
|
|
|
+ self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
|
|
|
|
+ self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
|
|
|
|
|
|
- self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
|
|
|
|
- self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
|
|
|
|
+ self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
|
|
|
|
+ self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
|
|
|
|
|
|
- self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人)?[::是为]+$)"
|
|
|
|
|
|
+ self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人)?[::是为]+$)"
|
|
|
|
|
|
self.pattern_left = [
|
|
self.pattern_left = [
|
|
self.pattern_tenderee_left_60,
|
|
self.pattern_tenderee_left_60,
|
|
@@ -1547,7 +1559,7 @@ class RoleRulePredictor():
|
|
return (_label, _prob, _flag, keyword)
|
|
return (_label, _prob, _flag, keyword)
|
|
|
|
|
|
|
|
|
|
- def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
|
|
|
|
|
|
+ def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5, all_winner=False):
|
|
|
|
|
|
for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
|
|
for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
|
|
list_codenames):
|
|
list_codenames):
|
|
@@ -1679,6 +1691,25 @@ class RoleRulePredictor():
|
|
entity_text = p_entity.entity_text
|
|
entity_text = p_entity.entity_text
|
|
_label, _prob, _flag, kw = self.rule_predict(before, center, after, entity_text)
|
|
_label, _prob, _flag, kw = self.rule_predict(before, center, after, entity_text)
|
|
|
|
|
|
|
|
+ if _label == 5 and re.search(':(1[.、])?$', before) and re.search('^[、;,&/。]', after) and re.search(
|
|
|
|
+ '(中标|成交|中选))?(人|单位|供应商|银行|候选人|合作伙伴)?(公示)?(信息|情况|结果|如下|:)|(遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取)结果', list_sentence[s_index].sentence_text[:p_entity.wordOffset_begin]): # 补充召回 例:514053647 标段1:中国建设银行西安南大街支行,标段2:中国农业银行股份有限公司西安分行,
|
|
|
|
+ _flag = True
|
|
|
|
+ _label = 2
|
|
|
|
+ _prob = 0.5
|
|
|
|
+ elif _label == 5 and all_winner==1 or (all_winner==2 and re.search('(排[名序]|名次|顺序|第):?[0-9一二三四五六七八九十]+', before)==None):
|
|
|
|
+ if re.search('(中标|中选|成交|入围|入选)(人|单位|供应商|银行)(名称)?:', before) and re.search('未(中标|中选|成交|入围|入选)', before)==None:
|
|
|
|
+ _flag = True
|
|
|
|
+ _label = 2
|
|
|
|
+ _prob = 0.55
|
|
|
|
+ elif re.search('(:|[::,]\d{1,2}[.、])$', before) and re.search('^[、;,&/。]', after) and re.search('(入围|合格)(人|单位|供应商|银行|候选人|合作伙伴)?(公示)?(信息|情况|结果|如下|:)', list_sentence[s_index].sentence_text[:p_entity.wordOffset_begin]):
|
|
|
|
+ _flag = True
|
|
|
|
+ _label = 2
|
|
|
|
+ _prob = 0.51
|
|
|
|
+ elif re.search('(候选|投标|应答|响应)(人|单位|供应商|银行)(名称)?:', before):
|
|
|
|
+ _flag = True
|
|
|
|
+ _label = 2
|
|
|
|
+ _prob = 0.5
|
|
|
|
+
|
|
# if _label in [0, 1, 2, 3, 4]:
|
|
# if _label in [0, 1, 2, 3, 4]:
|
|
# self.role_file.write("{0}#split#{1}#split#{2}#split#{3}#split#{4}\n".format(before,
|
|
# self.role_file.write("{0}#split#{1}#split#{2}#split#{3}#split#{4}\n".format(before,
|
|
# entity.entity_text,
|
|
# entity.entity_text,
|
|
@@ -2247,7 +2278,7 @@ class RoleGrade():
|
|
self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]:2|名次:2))"
|
|
self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]:2|名次:2))"
|
|
self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))"
|
|
self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))"
|
|
self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.tenderee_left_5,self.agency_left_9,
|
|
self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.tenderee_left_5,self.agency_left_9,
|
|
- self.winTenderer_left_6, self.winTenderer_left_9,self.winTenderer_left_8, self.winTenderer_right_9, self.secondTenderer_left_9, self.thirdTenderer_left_9]
|
|
|
|
|
|
+ self.winTenderer_left_9,self.winTenderer_left_8, self.winTenderer_right_9, self.winTenderer_left_6, self.secondTenderer_left_9, self.thirdTenderer_left_9] # 概率要由高到低 274941849
|
|
def predict(self, list_sentences, list_entitys, original_docchannel, span=15, min_prob=0.7):
|
|
def predict(self, list_sentences, list_entitys, original_docchannel, span=15, min_prob=0.7):
|
|
'''
|
|
'''
|
|
根据规则给角色分配不同等级概率;分三级:0.9-1,0.8-0.9,0.7-0.8;附件0.7-0.8,0.6-0.7,0.5-0.6
|
|
根据规则给角色分配不同等级概率;分三级:0.9-1,0.8-0.9,0.7-0.8;附件0.7-0.8,0.6-0.7,0.5-0.6
|
|
@@ -2572,7 +2603,7 @@ class ProductPredictor():
|
|
paths.append(path[1:])
|
|
paths.append(path[1:])
|
|
return paths
|
|
return paths
|
|
|
|
|
|
- def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000):
|
|
|
|
|
|
+ def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000, out_lines=[]):
|
|
'''
|
|
'''
|
|
预测实体代码,每个句子最多取MAX_AREA个字,超过截断
|
|
预测实体代码,每个句子最多取MAX_AREA个字,超过截断
|
|
:param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]]
|
|
:param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]]
|
|
@@ -2580,6 +2611,19 @@ class ProductPredictor():
|
|
:param MAX_AREA: 每个句子最多截取多少字
|
|
:param MAX_AREA: 每个句子最多截取多少字
|
|
:return: 把预测出来的实体放进实体类
|
|
:return: 把预测出来的实体放进实体类
|
|
'''
|
|
'''
|
|
|
|
+ p = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设|分包)(的?(主要|简要|基本|具体|名称及))?" \
|
|
|
|
+ "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况|名称)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
|
|
|
|
+ "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模|(设备|材料|仪器|需求|产品|采购单?)(清单|名称|信息))为?([::,]|$)"
|
|
|
|
+ # sentence_range = [] #20240827 取消,修复线上接口产品耗时长问题
|
|
|
|
+ # if len(out_lines) >= 3: # 三个以上大纲
|
|
|
|
+ # for i in range(len(out_lines)-1):
|
|
|
|
+ # text, s1, b1 = out_lines[i]
|
|
|
|
+ # _, s2, b2 = out_lines[i+1]
|
|
|
|
+ # if 3<text.find(':')<20:
|
|
|
|
+ # text = text.split(':')[0]
|
|
|
|
+ # if re.search(p, text[:15]):
|
|
|
|
+ # sentence_range.append((s1, s2))
|
|
|
|
+
|
|
with self.sess.as_default() as sess:
|
|
with self.sess.as_default() as sess:
|
|
with self.sess.graph.as_default():
|
|
with self.sess.graph.as_default():
|
|
result = []
|
|
result = []
|
|
@@ -2646,6 +2690,25 @@ class ProductPredictor():
|
|
if len(list_sentence)==0:
|
|
if len(list_sentence)==0:
|
|
result.append({"product":[]})
|
|
result.append({"product":[]})
|
|
continue
|
|
continue
|
|
|
|
+ # 20240827 取消,修复线上接口产品耗时长问题
|
|
|
|
+ # if sentence_range: # 20240815 如果有招标内容大纲,只从前两句及大纲内提取产品,避免类似 514920213 提取错其他内容 银行流水
|
|
|
|
+ # new_list = []
|
|
|
|
+ # word_num = 0
|
|
|
|
+ # for sentence in list_sentence:
|
|
|
|
+ # if sentence.sentence_index<2:
|
|
|
|
+ # new_list.append(sentence)
|
|
|
|
+ # continue
|
|
|
|
+ # for s1, s2 in sentence_range:
|
|
|
|
+ # if sentence.sentence_index < s1:
|
|
|
|
+ # continue
|
|
|
|
+ # elif s1<=sentence.sentence_index <=s2:
|
|
|
|
+ # new_list.append(sentence)
|
|
|
|
+ # word_num += len(sentence.sentence_text)
|
|
|
|
+ # elif sentence.sentence_index >= s2:
|
|
|
|
+ # break
|
|
|
|
+ # if word_num > 100:
|
|
|
|
+ # list_sentence = new_list
|
|
|
|
+
|
|
list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
|
|
list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
|
|
_begin_index = 0
|
|
_begin_index = 0
|
|
item = {"product":[]}
|
|
item = {"product":[]}
|
|
@@ -6347,6 +6410,24 @@ class TableTag2List():
|
|
if self._output[i][j] == "":
|
|
if self._output[i][j] == "":
|
|
self._output[i][j] = val
|
|
self._output[i][j] = val
|
|
|
|
|
|
|
|
def is_head_line(list_item):
    '''
    Decide whether a table row is a header row using the table-header ("form") model.

    Every cell text is encoded and scored with ``getPredictor("form")``. A cell is
    counted as a header cell when ``values[1]`` of its prediction is above 0.6
    (presumably the header-class probability -- confirm against the form predictor).
    The row is treated as a header row when more than 60% of its cells are header cells.

    :param list_item: cell texts of one row, e.g. ['技术参数、要求', '变更项']
    :return: True if the row is judged to be a header row, otherwise False;
             an empty row is never a header row
    '''
    # Guard: an empty row has nothing to score and would divide by zero below.
    if not list_item:
        return False

    # Look the predictor up once instead of once per cell.
    predictor = getPredictor("form")
    encoded = [predictor.encode(item) for item in list_item]
    predict_y = predictor.predict(np.array(encoded), type="item")

    # The per-cell debug print was removed: this runs for every table row and
    # flooded stdout of the service.
    head_cells = sum(1 for _, values in zip(list_item, predict_y) if values[1] > 0.6)
    return head_cells / len(list_item) > 0.6
|
|
|
|
|
|
class TablePremExtractor(object):
|
|
class TablePremExtractor(object):
|
|
def __init__(self):
|
|
def __init__(self):
|
|
@@ -6357,10 +6438,10 @@ class TablePremExtractor(object):
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因',
|
|
'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因',
|
|
- "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟推荐(入选|入围)?)?供应商(名称)?$",
|
|
|
|
|
|
+ "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟?推荐(入选|入围)?)?供应商(名称)?$",
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
"budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
"budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
- "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格|中标存款金?额|中标资金|存放金额",
|
|
|
|
|
|
+ "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格|中标存款金?额|中标资金|中标存款|存放金额|分配额度",
|
|
"serviceTime": '合同期限|工期/交货期/服务期|工期\(交货期\)|合格工期|服务期限|工期' \
|
|
"serviceTime": '合同期限|工期/交货期/服务期|工期\(交货期\)|合格工期|服务期限|工期' \
|
|
'|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期限' \
|
|
'|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期限' \
|
|
'|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
|
|
'|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
|
|
@@ -6381,30 +6462,29 @@ class TablePremExtractor(object):
|
|
self.tb = TableTag2List()
|
|
self.tb = TableTag2List()
|
|
|
|
|
|
|
|
|
|
- def find_header(self, td_list):
|
|
|
|
|
|
+ def find_header(self, td_list, all_winner=False, first_line=False):
|
|
fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
|
|
fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
|
|
header_dic = dict()
|
|
header_dic = dict()
|
|
flag = False
|
|
flag = False
|
|
contain_header = False
|
|
contain_header = False
|
|
- # print('表头判断:', set(fix_td_list) - self.headerset)
|
|
|
|
- if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
|
|
|
|
|
|
+ not_sure_winner = False # 是否 不确定中标的中标人表达方式
|
|
|
|
+ for text in set(fix_td_list) - self.headerset:
|
|
|
|
+ if len(text)<10 and re.search(self.head_rule_dic['bid_amount'], text):
|
|
|
|
+ self.headerset.add(text)
|
|
|
|
+ if len(set(fix_td_list))>0 and (first_line or len(set(fix_td_list) & self.headerset)>=2) and (len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6 or is_head_line(fix_td_list)):
|
|
|
|
+ other_tenderer = ""
|
|
|
|
+ other_tenderer2 = ""
|
|
flag = True
|
|
flag = True
|
|
- need_replace = 0 # 是否需要替换表头名称
|
|
|
|
- if re.search('^(投标银行|供应商名称)$', '|'.join(td_list)) and re.search('中标存款金?额|中标资金存放额|中标利率|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', '|'.join(td_list)):
|
|
|
|
- need_replace = 1
|
|
|
|
for i in range(len(td_list)) :
|
|
for i in range(len(td_list)) :
|
|
text = td_list[i]
|
|
text = td_list[i]
|
|
- text = re.sub('\s', '', text)
|
|
|
|
- if need_replace and re.search('^(投标银行|供应商名称)$', text): # 银行类特殊处理
|
|
|
|
- text = '中标银行'
|
|
|
|
- if need_replace and re.search('排名|排序|名次|推荐顺序', text): # 银行类特殊处理
|
|
|
|
- text = '序号'
|
|
|
|
|
|
+ text = re.sub('\s|[((]排名不分先后[))]', '', text)
|
|
|
|
+ text = re.sub('^人选', '入选', text)
|
|
if text == '备选中标人':
|
|
if text == '备选中标人':
|
|
text = '第二候选人'
|
|
text = '第二候选人'
|
|
if len(re.sub('(([\w、×*/]{1,20}))$', '', text)) > 15: # 长度大于15 不进行表头匹配
|
|
if len(re.sub('(([\w、×*/]{1,20}))$', '', text)) > 15: # 长度大于15 不进行表头匹配
|
|
continue
|
|
continue
|
|
- if re.search('未(中标|成交)原因', text): # 不提取此种表格
|
|
|
|
- return flag, contain_header, dict()
|
|
|
|
|
|
+ if re.search('未(中标|成交|中选|入围)原因', text): # 不提取此种表格
|
|
|
|
+ return flag, contain_header, dict(), not_sure_winner
|
|
num = 0
|
|
num = 0
|
|
for k, v in self.head_rule_dic.items():
|
|
for k, v in self.head_rule_dic.items():
|
|
if re.search('评分|得分|分数|分值', text):
|
|
if re.search('评分|得分|分数|分值', text):
|
|
@@ -6414,6 +6494,8 @@ class TablePremExtractor(object):
|
|
continue
|
|
continue
|
|
if k == 'budget' and re.search('量', text): # 预算工作量 预算采购量 等不作为预算
|
|
if k == 'budget' and re.search('量', text): # 预算工作量 预算采购量 等不作为预算
|
|
continue
|
|
continue
|
|
|
|
+ elif k == 'bid_amount' and re.search('分配方案|基准利率|BP值', text): # 517987084 中标资金分配方案
|
|
|
|
+ continue
|
|
elif k in header_dic:
|
|
elif k in header_dic:
|
|
if k in ['budget', 'bid_amount'] and re.search('总(价|金?额)', text): # 总价替换单价
|
|
if k in ['budget', 'bid_amount'] and re.search('总(价|金?额)', text): # 总价替换单价
|
|
header_dic[k] = (i, text)
|
|
header_dic[k] = (i, text)
|
|
@@ -6424,9 +6506,13 @@ class TablePremExtractor(object):
|
|
continue
|
|
continue
|
|
header_dic[k] = (i, text)
|
|
header_dic[k] = (i, text)
|
|
num += 1
|
|
num += 1
|
|
|
|
+ elif re.search('^((中标|成交|中选|入围|入选)(候选)?)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)(名称)?$', text) and re.search('未', text)==None:
|
|
|
|
+ other_tenderer = (i, text)
|
|
|
|
+ elif re.search('^((投标|应答|响应|候选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|(存款|投标)?银行|供应商)(名称)?$|^机构名称$|^单位(名称)?$', text) and re.search('未', text)==None:
|
|
|
|
+ other_tenderer2 = (i, text)
|
|
if num>1:
|
|
if num>1:
|
|
# print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
# print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
- return flag, contain_header, dict()
|
|
|
|
|
|
+ return flag, contain_header, dict(), not_sure_winner
|
|
if re.search(';金额((万?元))?;', ';'.join(td_list)): # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
|
|
if re.search(';金额((万?元))?;', ';'.join(td_list)): # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
|
|
if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
|
|
if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
|
|
for i in range(len(td_list)):
|
|
for i in range(len(td_list)):
|
|
@@ -6440,23 +6526,32 @@ class TablePremExtractor(object):
|
|
if re.search('^金额((万?元))?$', text):
|
|
if re.search('^金额((万?元))?$', text):
|
|
header_dic['budget'] = (i, text)
|
|
header_dic['budget'] = (i, text)
|
|
break
|
|
break
|
|
|
|
+ if all_winner and 'tenderer' not in header_dic: # 标题有存款、入库、入围等公告补充其他表达做中标人
|
|
|
|
+ if other_tenderer!="":
|
|
|
|
+ header_dic['tenderer'] = other_tenderer
|
|
|
|
+ elif other_tenderer2!="":
|
|
|
|
+ header_dic['tenderer'] = other_tenderer2
|
|
|
|
+ if 'win_sort' not in header_dic:
|
|
|
|
+ not_sure_winner = True
|
|
|
|
+ if all_winner == 1 and 'win_sort' in header_dic: # 标题有存管类公告不分排名
|
|
|
|
+ header_dic.pop('win_sort')
|
|
if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
|
|
if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
|
|
'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
|
|
'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
|
|
- return flag, contain_header, header_dic
|
|
|
|
|
|
+ return flag, contain_header, header_dic, not_sure_winner
|
|
elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
|
|
elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
|
|
if 'win_sort' in header_dic: # 有排名的 用候选人提取类
|
|
if 'win_sort' in header_dic: # 有排名的 用候选人提取类
|
|
- return flag, contain_header, dict()
|
|
|
|
|
|
+ return flag, contain_header, dict(), not_sure_winner
|
|
elif re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_or_not' not in header_dic and re.search('(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', header_dic['bid_amount'][1])==None: # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
|
|
elif re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_or_not' not in header_dic and re.search('(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', header_dic['bid_amount'][1])==None: # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
|
|
# print('只有供应商名称 没排名和包号的去掉')
|
|
# print('只有供应商名称 没排名和包号的去掉')
|
|
- return flag, contain_header, dict()
|
|
|
|
- return flag,contain_header, header_dic
|
|
|
|
- elif 'tenderer' in header_dic and re.search('(中标|中选|中价|成交|竞得)(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)',header_dic['tenderer'][1]): # 有中标人,且有明确中标关键词的进行提取
|
|
|
|
- return flag, contain_header, header_dic
|
|
|
|
|
|
+ return flag, contain_header, dict(), not_sure_winner
|
|
|
|
+ return flag,contain_header, header_dic, not_sure_winner
|
|
|
|
+ elif 'tenderer' in header_dic and (re.search('(中标|中选|中价|成交|竞得)(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)',header_dic['tenderer'][1]) or all_winner): # 有中标人,且有明确中标关键词的进行提取
|
|
|
|
+ return flag, contain_header, header_dic, not_sure_winner
|
|
elif 'tenderer' in header_dic and 'serviceTime' in header_dic:
|
|
elif 'tenderer' in header_dic and 'serviceTime' in header_dic:
|
|
return flag, contain_header, header_dic
|
|
return flag, contain_header, header_dic
|
|
elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
contain_header = True
|
|
contain_header = True
|
|
- return flag, contain_header, dict()
|
|
|
|
|
|
+ return flag, contain_header, dict(), not_sure_winner
|
|
|
|
|
|
def get_role(self, text, nlp_enterprise):
|
|
def get_role(self, text, nlp_enterprise):
|
|
'''
|
|
'''
|
|
@@ -6468,7 +6563,7 @@ class TablePremExtractor(object):
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
, ',', text)
|
|
, ',', text)
|
|
text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
- text = re.sub('[一二三四五六七八九十]+标段:|标段[一二三四五六七八九十]+:', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
|
|
|
|
|
|
+ text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
|
|
text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
|
|
text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
|
|
if text in nlp_enterprise:
|
|
if text in nlp_enterprise:
|
|
return text
|
|
return text
|
|
@@ -6487,7 +6582,7 @@ class TablePremExtractor(object):
|
|
else:
|
|
else:
|
|
return ''
|
|
return ''
|
|
|
|
|
|
- def extract_from_df(self, df, headers, web_source_name):
|
|
|
|
|
|
+ def extract_from_df(self, df, headers, web_source_name, all_winner=False):
|
|
prem_dic = {}
|
|
prem_dic = {}
|
|
previous_package = "" # 上一行包号
|
|
previous_package = "" # 上一行包号
|
|
multi_same_package = False # 非连续的重复包号
|
|
multi_same_package = False # 非连续的重复包号
|
|
@@ -6502,7 +6597,9 @@ class TablePremExtractor(object):
|
|
or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683; 补充避免423647863采购意向被过滤
|
|
or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683; 补充避免423647863采购意向被过滤
|
|
# print('没有包号及角色的不要')
|
|
# print('没有包号及角色的不要')
|
|
return {}
|
|
return {}
|
|
-
|
|
|
|
|
|
+ have_bid_amount = False # 是否包含中标金额
|
|
|
|
+ if "bid_amount" in headers and re.search('[1-9]+', '#'.join([it.strip() for it in df[headers['bid_amount'][0]]])):
|
|
|
|
+ have_bid_amount = True
|
|
for i in df.index:
|
|
for i in df.index:
|
|
same_package = False # 连续重复包号,一般是 rowspan 造成;一包 多个采购
|
|
same_package = False # 连续重复包号,一般是 rowspan 造成;一包 多个采购
|
|
project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
|
|
project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
|
|
@@ -6542,7 +6639,7 @@ class TablePremExtractor(object):
|
|
continue
|
|
continue
|
|
if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
continue
|
|
continue
|
|
- if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None:
|
|
|
|
|
|
+ if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None and all_winner == False:
|
|
tenderer = ""
|
|
tenderer = ""
|
|
|
|
|
|
if tenderer in ['采购失败', '废标']: # 避免类似 353867205 这篇只提取到一个
|
|
if tenderer in ['采购失败', '废标']: # 避免类似 353867205 这篇只提取到一个
|
|
@@ -6604,11 +6701,11 @@ class TablePremExtractor(object):
|
|
prem_dic[package]['name'] = project_name
|
|
prem_dic[package]['name'] = project_name
|
|
|
|
|
|
if budget_ != "":
|
|
if budget_ != "":
|
|
- if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', budget_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税|(六个月|一年|\w{2,3})期加点\d+BP', '', budget_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
prem_dic.pop(package)
|
|
prem_dic.pop(package)
|
|
break
|
|
break
|
|
budget_header = headers['budget'][1] if 'budget' in headers else ''
|
|
budget_header = headers['budget'][1] if 'budget' in headers else ''
|
|
- budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
|
|
|
|
|
|
+ budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率|期加点\d+BP', budget_)==None else (0, '')
|
|
|
|
|
|
if (re.search('费率|下浮率|[%%‰折]',
|
|
if (re.search('费率|下浮率|[%%‰折]',
|
|
budget_header + budget_) and budget < 100) or budget > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
budget_header + budget_) and budget < 100) or budget > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
@@ -6635,16 +6732,20 @@ class TablePremExtractor(object):
|
|
"serviceTime": ""
|
|
"serviceTime": ""
|
|
})
|
|
})
|
|
if tenderer:
|
|
if tenderer:
|
|
- if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
|
|
|
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税|(六个月|一年|\w{2,3})期加点\d+BP', '',
|
|
bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
prem_dic.pop(package)
|
|
prem_dic.pop(package)
|
|
break
|
|
break
|
|
|
|
|
|
- bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率', bid_amount_)==None and 'bid_amount' in headers else (0, '')
|
|
|
|
|
|
+ bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率|期加点\d+BP', bid_amount_)==None and 'bid_amount' in headers else (0, '')
|
|
if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
|
|
if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃
|
|
prem_dic.pop(package)
|
|
prem_dic.pop(package)
|
|
continue
|
|
continue
|
|
|
|
+ elif 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and have_bid_amount and bid_amount_ in ['/','','0','0.0']: # 如果不是所有行中标金额都为0,则把为0的做非中标
|
|
|
|
+ if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃
|
|
|
|
+ prem_dic.pop(package)
|
|
|
|
+ continue
|
|
|
|
|
|
bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
|
|
bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
|
|
if (re.search('费率|下浮率|[%%‰折]',
|
|
if (re.search('费率|下浮率|[%%‰折]',
|
|
@@ -6678,9 +6779,10 @@ class TablePremExtractor(object):
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
|
|
elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
- if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
|
|
|
|
- prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
|
|
|
|
- prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit,"serviceTime":serviceTime})
|
|
|
|
|
|
+ if bid_amount != 0: # 有中标金额的才放进去
|
|
|
|
+ if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
|
|
|
|
+ prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
|
|
|
|
+ prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit,"serviceTime":serviceTime})
|
|
tenderer_list.append(tenderer)
|
|
tenderer_list.append(tenderer)
|
|
serviceTime_list.append(serviceTime)
|
|
serviceTime_list.append(serviceTime)
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配
|
|
@@ -6743,7 +6845,7 @@ class TablePremExtractor(object):
|
|
else:
|
|
else:
|
|
rs_dic[pack] = tmp_dic[pack]
|
|
rs_dic[pack] = tmp_dic[pack]
|
|
|
|
|
|
- def get_prem(self, soup, web_source_name=''):
|
|
|
|
|
|
+ def get_prem(self, soup, web_source_name='', all_winner=False):
|
|
tables = soup.find_all('table')
|
|
tables = soup.find_all('table')
|
|
tables.reverse()
|
|
tables.reverse()
|
|
|
|
|
|
@@ -6751,10 +6853,15 @@ class TablePremExtractor(object):
|
|
for table in tables:
|
|
for table in tables:
|
|
|
|
|
|
text = table.text.strip()
|
|
text = table.text.strip()
|
|
- previous = table.findPreviousSibling()
|
|
|
|
- text2 = previous.text.strip() if previous else ""
|
|
|
|
- # text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
|
|
|
|
- if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 包含业绩的表格过滤掉,不进行处理
|
|
|
|
|
|
+ pre_text = ""
|
|
|
|
+ previous = None
|
|
|
|
+ if table.findPreviousSibling() != None:
|
|
|
|
+ previous = table.findPreviousSibling()
|
|
|
|
+ pre_text = previous.text.strip()
|
|
|
|
+ if pre_text == "" and table.findPreviousSibling().findPreviousSibling() != None: # 修复表格前一标签没内容,再前一个才有内容情况
|
|
|
|
+ previous = table.findPreviousSibling().findPreviousSibling()
|
|
|
|
+ pre_text = previous.text.strip()
|
|
|
|
+ if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+pre_text): # 包含业绩的表格过滤掉,不进行处理
|
|
tb_ex = table.extract()
|
|
tb_ex = table.extract()
|
|
if previous:
|
|
if previous:
|
|
sib = previous.extract()
|
|
sib = previous.extract()
|
|
@@ -6766,19 +6873,29 @@ class TablePremExtractor(object):
|
|
headers = ""
|
|
headers = ""
|
|
table_prem = {}
|
|
table_prem = {}
|
|
while i < len(trs) - 1:
|
|
while i < len(trs) - 1:
|
|
- flag_, contain_header_, headers_ = self.find_header(trs[i])
|
|
|
|
|
|
+ flag_, contain_header_, headers_, not_sure_winner = self.find_header(trs[i], all_winner, first_line=i==0)
|
|
|
|
+
|
|
|
|
+ if flag_ and 'tenderer' in headers_ and not_sure_winner and re.search('中标|成交|中选|入围|入选', pre_text)==None:
|
|
|
|
+ # print('过滤:',headers_)
|
|
|
|
+ flag_ = False
|
|
|
|
+ headers_ = {}
|
|
|
|
+
|
|
if flag_ and headers_ != dict():
|
|
if flag_ and headers_ != dict():
|
|
table_items = []
|
|
table_items = []
|
|
headers = headers_
|
|
headers = headers_
|
|
for j in range(i + 1, len(trs)):
|
|
for j in range(i + 1, len(trs)):
|
|
if len(trs[j]) == len(trs[i]):
|
|
if len(trs[j]) == len(trs[i]):
|
|
- flag_2, contain_header_2, headers_2 = self.find_header(trs[j])
|
|
|
|
|
|
+ flag_2, contain_header_2, headers_2, not_sure_winner = self.find_header(trs[j], all_winner)
|
|
if flag_2 or contain_header_2:
|
|
if flag_2 or contain_header_2:
|
|
if j == i+1 and flag_2:
|
|
if j == i+1 and flag_2:
|
|
- if len(headers_)<len(headers_2):
|
|
|
|
|
|
+ if len(headers_)<=len(headers_2):
|
|
headers = headers_2
|
|
headers = headers_2
|
|
continue
|
|
continue
|
|
|
|
+ elif trs[i] == trs[j]: # 修复表格重复表头多次出现情况 例:514890585
|
|
|
|
+ continue
|
|
break
|
|
break
|
|
|
|
+ elif ''.join(trs[j]).strip() == '': # 修复整行为空的 例:514890585
|
|
|
|
+ continue
|
|
else:
|
|
else:
|
|
table_items.append(trs[j])
|
|
table_items.append(trs[j])
|
|
else:
|
|
else:
|
|
@@ -6786,7 +6903,7 @@ class TablePremExtractor(object):
|
|
break
|
|
break
|
|
if len(table_items) > 0:
|
|
if len(table_items) > 0:
|
|
df = pd.DataFrame(table_items)
|
|
df = pd.DataFrame(table_items)
|
|
- prem_ = self.extract_from_df(df, headers, web_source_name)
|
|
|
|
|
|
+ prem_ = self.extract_from_df(df, headers, web_source_name, all_winner)
|
|
# rs_dic.update(prem_)
|
|
# rs_dic.update(prem_)
|
|
# table_prem.update(prem_)
|
|
# table_prem.update(prem_)
|
|
self.update_prem(table_prem, prem_)
|
|
self.update_prem(table_prem, prem_)
|
|
@@ -6806,7 +6923,7 @@ class TablePremExtractor(object):
|
|
table.extract()
|
|
table.extract()
|
|
return rs_dic
|
|
return rs_dic
|
|
|
|
|
|
- def predict(self, html, nlp_enterprise, web_source_name=""):
|
|
|
|
|
|
+ def predict(self, html, nlp_enterprise, web_source_name="", all_winner=False):
|
|
html = re.sub("<html>|</html>|<body>|</body>","",html)
|
|
html = re.sub("<html>|</html>|<body>|</body>","",html)
|
|
html = re.sub("##attachment##","",html)
|
|
html = re.sub("##attachment##","",html)
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
@@ -6815,9 +6932,11 @@ class TablePremExtractor(object):
|
|
in_attachment = False
|
|
in_attachment = False
|
|
if richText:
|
|
if richText:
|
|
richText = richText.extract() # 过滤掉附件
|
|
richText = richText.extract() # 过滤掉附件
|
|
- prem = self.get_prem(soup, web_source_name)
|
|
|
|
|
|
+ del_tabel_achievement(soup) # 20240819 过滤掉业绩表格
|
|
|
|
+ prem = self.get_prem(soup, web_source_name, all_winner)
|
|
if prem == {} and richText:
|
|
if prem == {} and richText:
|
|
- prem = self.get_prem(richText, web_source_name)
|
|
|
|
|
|
+ del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
|
|
|
|
+ prem = self.get_prem(richText, web_source_name, all_winner)
|
|
in_attachment = True
|
|
in_attachment = True
|
|
if len(prem) == 1: # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
|
|
if len(prem) == 1: # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
|
|
k = list(prem)[0]
|
|
k = list(prem)[0]
|
|
@@ -6834,7 +6953,7 @@ class CandidateExtractor(object):
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论',
|
|
'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论',
|
|
- "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
|
|
|
|
|
|
+ "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
|
|
"bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
|
|
"bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
@@ -6842,7 +6961,7 @@ class CandidateExtractor(object):
|
|
}
|
|
}
|
|
'''非表格候选人正则'''
|
|
'''非表格候选人正则'''
|
|
# self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|应答人)|(通过)?名单)(名称|名单|全称|\d)?:$'
|
|
# self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|应答人)|(通过)?名单)(名称|名单|全称|\d)?:$'
|
|
- self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?:?$'
|
|
|
|
|
|
+ self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答|响应)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?[是为:]?$'
|
|
self.tb = TableTag2List()
|
|
self.tb = TableTag2List()
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
self.headerset = pickle.load(f)
|
|
self.headerset = pickle.load(f)
|
|
@@ -6906,6 +7025,9 @@ class CandidateExtractor(object):
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
, ',', text)
|
|
, ',', text)
|
|
text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
|
|
+ text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '',
|
|
|
|
+ text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
|
|
|
|
+ text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
|
|
if text in nlp_enterprise:
|
|
if text in nlp_enterprise:
|
|
return text
|
|
return text
|
|
if len(text) > 50 or len(text)<4:
|
|
if len(text) > 50 or len(text)<4:
|
|
@@ -6922,7 +7044,6 @@ class CandidateExtractor(object):
|
|
return ''
|
|
return ''
|
|
|
|
|
|
def extract_from_df(self, df, headers):
|
|
def extract_from_df(self, df, headers):
|
|
- print('表头: ', headers)
|
|
|
|
prem_dic = {}
|
|
prem_dic = {}
|
|
link_set = set()
|
|
link_set = set()
|
|
candidate_set = set()
|
|
candidate_set = set()
|
|
@@ -7193,8 +7314,10 @@ class CandidateExtractor(object):
|
|
in_attachment = False
|
|
in_attachment = False
|
|
if richText:
|
|
if richText:
|
|
richText = richText.extract() # 过滤掉附件
|
|
richText = richText.extract() # 过滤掉附件
|
|
|
|
+ del_tabel_achievement(soup) # 20240819 过滤掉业绩表格 例:500817166
|
|
prem, candidate_set = self.get_prem(soup)
|
|
prem, candidate_set = self.get_prem(soup)
|
|
if prem == {} and richText:
|
|
if prem == {} and richText:
|
|
|
|
+ del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
|
|
prem, candidate_set = self.get_prem(richText)
|
|
prem, candidate_set = self.get_prem(richText)
|
|
in_attachment = True
|
|
in_attachment = True
|
|
candidate_set2 = self.get_candidates_from_text(list_sentences, list_entitys)
|
|
candidate_set2 = self.get_candidates_from_text(list_sentences, list_entitys)
|
|
@@ -7306,7 +7429,7 @@ class ApprovalPredictor():
|
|
self.role_type = {
|
|
self.role_type = {
|
|
"declare_company": "(申[请报]|填报|呈报)(人|部门|机关|单位|企业|公司|机构|组织)", # 申报单位
|
|
"declare_company": "(申[请报]|填报|呈报)(人|部门|机关|单位|企业|公司|机构|组织)", # 申报单位
|
|
"construct_company": "(业主|建设|用地|委托|发包|产权|项目))?(部门|机关|单位|企业|公司|方|业主)|主送机关|法人单位|甲方", # 建设单位
|
|
"construct_company": "(业主|建设|用地|委托|发包|产权|项目))?(部门|机关|单位|企业|公司|方|业主)|主送机关|法人单位|甲方", # 建设单位
|
|
- "approver": "(审[批查核议图]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|承办)(部门|机关|单位|企业|公司|机构)|实施主体", # 审批部门
|
|
|
|
|
|
+ "approver": "(审[批查核议图]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|承办))?(部门|机关|单位|企业|公司|机构)|实施主体", # 审批部门
|
|
"evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)" , # 环评机构
|
|
"evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)" , # 环评机构
|
|
"compilation_unit": "编制单位", # 编制单位 20240701加
|
|
"compilation_unit": "编制单位", # 编制单位 20240701加
|
|
"publisher": "(发布|发文|公示|公告)(人|部门|机关|单位|企业|公司|机构|组织)" # 发布机构 20240703加
|
|
"publisher": "(发布|发文|公示|公告)(人|部门|机关|单位|企业|公司|机构|组织)" # 发布机构 20240703加
|
|
@@ -7440,7 +7563,7 @@ class ApprovalPredictor():
|
|
multi_project['district'] = district['district']['district']
|
|
multi_project['district'] = district['district']['district']
|
|
multi_project = {k:v for k,v in multi_project.items() if v != ''}
|
|
multi_project = {k:v for k,v in multi_project.items() if v != ''}
|
|
rs_l.append(multi_project)
|
|
rs_l.append(multi_project)
|
|
- if len(rs_l)>1 and len(set(rs_l[0].keys()))>2 and set(rs_l[0].keys())&set(rs_l[1].keys())!=set():
|
|
|
|
|
|
+ if len(rs_l)>1 and len(set(rs_l[0].keys()))>2 and set(rs_l[0].keys())==set(rs_l[1].keys()):
|
|
return rs_l
|
|
return rs_l
|
|
elif found_key == 1:
|
|
elif found_key == 1:
|
|
district = getPredictor('district').get_area(
|
|
district = getPredictor('district').get_area(
|
|
@@ -7813,14 +7936,14 @@ if __name__=="__main__":
|
|
# print(rs)
|
|
# print(rs)
|
|
|
|
|
|
docid = ""
|
|
docid = ""
|
|
- title = ''
|
|
|
|
|
|
+ title = '甘肃省妇幼保健院(甘肃省中心医院)2024年度大额资金定期存款竞争性存放项目(第二期)采购结果公告'
|
|
with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
html = f.read()
|
|
html = f.read()
|
|
tb_extract = TablePremExtractor()
|
|
tb_extract = TablePremExtractor()
|
|
rs = tb_extract.predict(html, [
|
|
rs = tb_extract.predict(html, [
|
|
"江苏中联铸本混凝土有限公司",
|
|
"江苏中联铸本混凝土有限公司",
|
|
"鼓楼区协荣机械设备经销部"
|
|
"鼓楼区协荣机械设备经销部"
|
|
- ], web_source_name = '河钢供应链管理平台')
|
|
|
|
|
|
+ ], web_source_name = '', all_winner=True)
|
|
print('标段数:',len(rs[0]))
|
|
print('标段数:',len(rs[0]))
|
|
print(rs)
|
|
print(rs)
|
|
|
|
|