|
@@ -26,6 +26,7 @@ from bs4 import BeautifulSoup
|
|
|
import copy
|
|
|
import calendar
|
|
|
import datetime
|
|
|
+from BiddingKG.dl.entityLink.entityLink import get_business_data
|
|
|
# import fool # 统一用 selffool ,阿里云上只有selffool 包
|
|
|
|
|
|
cpu_num = int(os.environ.get("CPU_NUM",0))
|
|
@@ -928,9 +929,9 @@ class PREMPredict():
|
|
|
elif label ==0: # 错误招标金额处理
|
|
|
if entity.notes in ["投资", "总投资","工程造价"] or re.search('最低限价:?$', front) or re.search('服务内容:([\d,.]+万?亿?元?-?)$', front):
|
|
|
values[label] = 0.49
|
|
|
- elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
|
|
|
+ elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+|(含)', behind):
|
|
|
values[label] = 0.49
|
|
|
- elif re.search('(含|在|包括|[大小等高低]于)$|[\d.%]+[+×*-]$', front):
|
|
|
+ elif re.search('(含|在|包括|[大小等高低]于|如预算金额为)$|[\d.%]+((含))?[+×*-]$', front):
|
|
|
values[label] = 0.49
|
|
|
elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
|
|
|
label = 1
|
|
@@ -938,12 +939,13 @@ class PREMPredict():
|
|
|
entity.set_Money(label, values)
|
|
|
|
|
|
def correct_money_by_rule(self, title, list_entitys, list_articles):
|
|
|
- if len(re.findall('监理|施工|设计|勘察', title)) == 1 and re.search('施工|总承包|epc|EPC', title) == None:
|
|
|
- keyword = re.search('监理|设计|勘察', title).group(0)
|
|
|
+ if (len(re.findall('监理|施工|设计|勘察', title)) == 1 and re.search('施工|总承包|epc|EPC', title) == None) or re.search('服务金额', list_articles[0].content):
|
|
|
+ # keyword = re.search('监理|设计|勘察', title).group(0)
|
|
|
for list_entity in list_entitys:
|
|
|
for _entity in list_entity:
|
|
|
# print('keyword:',keyword, '_entity.notes :',_entity.notes)
|
|
|
- if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label == 2:
|
|
|
+ # if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label == 2:
|
|
|
+ if _entity.entity_type == "money" and _entity.notes == '招标或中标金额' and _entity.label == 2:
|
|
|
# if channel_dic['docchannel'] == "招标公告":
|
|
|
if re.search('中标|成交|中选|中价|中租|结果|入围', title + list_articles[0].content[:100]) == None:
|
|
|
_entity.values[0] = 0.51
|
|
@@ -1417,7 +1419,7 @@ class RoleRulePredictor():
|
|
|
|
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
|
|
|
|
- self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|服务金额|采购成本价") # |建安费用 不作为招标金额
|
|
|
+ self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|采购成本价") # |建安费用 不作为招标金额
|
|
|
self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):") # 单写 总价 不能作为中标金额,很多表格有单价、总价
|
|
|
self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
|
|
|
self.pattern_money_other = re.compile("代理费|服务费")
|
|
@@ -5789,7 +5791,7 @@ class TablePremExtractor(object):
|
|
|
self.head_rule_dic = {
|
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
|
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
|
- "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的)(名称?|内容)",
|
|
|
+ "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的|^包)(名称?|内容)",
|
|
|
"win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
|
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
|
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
@@ -5825,6 +5827,8 @@ class TablePremExtractor(object):
|
|
|
if re.search(v, text):
|
|
|
if k in ['tenderer'] and re.search('是否', text):
|
|
|
continue
|
|
|
+ if k in header_dic:
|
|
|
+ continue
|
|
|
header_dic[k] = (i, text)
|
|
|
num += 1
|
|
|
if num>1:
|
|
@@ -5893,7 +5897,7 @@ class TablePremExtractor(object):
|
|
|
else:
|
|
|
return ''
|
|
|
|
|
|
- def extract_from_df(self, df, headers):
|
|
|
+ def extract_from_df(self, df, headers, web_source_name):
|
|
|
prem_dic = {}
|
|
|
previous_package = "" # 上一行包号
|
|
|
multi_same_package = False # 非连续的重复包号
|
|
@@ -5901,6 +5905,10 @@ class TablePremExtractor(object):
|
|
|
link_set = set()
|
|
|
not_package = True if 'project_name' in headers and re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1]) and \
|
|
|
'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
|
|
|
+
|
|
|
+ if set(['project_code', 'package_code', 'tenderee', 'tenderer']) & set(headers) == set(): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683
|
|
|
+ # print('没有包号及角色的不要')
|
|
|
+ return {}
|
|
|
for i in df.index:
|
|
|
same_package = False # 连续重复包号,一般是 rowspan 造成;一包 多个采购
|
|
|
project_code = df.loc[i, headers['project_code'][0]] if "project_code" in headers else ""
|
|
@@ -5961,7 +5969,11 @@ class TablePremExtractor(object):
|
|
|
link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
|
|
|
|
|
|
package = uniform_package_name(package_code) if package_code else str(len(prem_dic)+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
|
|
|
-
|
|
|
+ if project_code != "":
|
|
|
+ uni_project_code= uniform_package_name(project_code)
|
|
|
+ if uni_project_code != "" and package != "":
|
|
|
+ # print('重组包号:', '%s_%s'%(uni_project_code, package))
|
|
|
+ package = '%s_%s'%(uni_project_code, package)
|
|
|
if package_code_raw!='':
|
|
|
if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本
|
|
|
package_fix2raw[package] = package_code_raw
|
|
@@ -5991,7 +6003,7 @@ class TablePremExtractor(object):
|
|
|
budget_header + budget_) and budget < 100) or budget > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
|
budget = 0
|
|
|
if budget > 0:
|
|
|
- if same_package and prem_dic[package]['tendereeMoney'] != budget: #
|
|
|
+ if same_package and prem_dic[package]['tendereeMoney'] != budget: # 处理 类似 136839070 一包多物品多预算
|
|
|
prem_dic[package]['tendereeMoney'] += budget
|
|
|
else:
|
|
|
prem_dic[package]['tendereeMoney'] = budget
|
|
@@ -6017,7 +6029,7 @@ class TablePremExtractor(object):
|
|
|
break
|
|
|
|
|
|
bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率', bid_amount_)==None and 'bid_amount' in headers else (0, '')
|
|
|
- if 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
|
|
|
+ if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
|
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃
|
|
|
prem_dic.pop(package)
|
|
|
continue
|
|
@@ -6042,14 +6054,14 @@ class TablePremExtractor(object):
|
|
|
})
|
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配
|
|
|
prem_dic.pop(package)
|
|
|
- break
|
|
|
+ # break # 注释掉避免 400084571 某些包废标 中断匹配
|
|
|
if multi_same_package: # 预处理后包号重复的,使用原始包号
|
|
|
for k, v in package_fix2raw.items():
|
|
|
if k in prem_dic:
|
|
|
prem_dic[v] = prem_dic.pop(k)
|
|
|
return prem_dic
|
|
|
|
|
|
- def get_prem(self, soup):
|
|
|
+ def get_prem(self, soup, web_source_name=''):
|
|
|
tables = soup.find_all('table')
|
|
|
tables.reverse()
|
|
|
|
|
@@ -6088,7 +6100,7 @@ class TablePremExtractor(object):
|
|
|
break
|
|
|
if len(table_items) > 0:
|
|
|
df = pd.DataFrame(table_items)
|
|
|
- prem_ = self.extract_from_df(df, headers)
|
|
|
+ prem_ = self.extract_from_df(df, headers, web_source_name)
|
|
|
# rs_dic.update(prem_)
|
|
|
table_prem.update(prem_)
|
|
|
i = j - 1
|
|
@@ -6106,7 +6118,7 @@ class TablePremExtractor(object):
|
|
|
table.extract()
|
|
|
return rs_dic
|
|
|
|
|
|
- def predict(self, html, nlp_enterprise):
|
|
|
+ def predict(self, html, nlp_enterprise, web_source_name=""):
|
|
|
html = re.sub("<html>|</html>|<body>|</body>","",html)
|
|
|
html = re.sub("##attachment##","",html)
|
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
@@ -6114,9 +6126,9 @@ class TablePremExtractor(object):
|
|
|
self.nlp_enterprise = nlp_enterprise
|
|
|
if richText:
|
|
|
richText = richText.extract() # 过滤掉附件
|
|
|
- prem = self.get_prem(soup)
|
|
|
+ prem = self.get_prem(soup, web_source_name)
|
|
|
if prem == {} and richText:
|
|
|
- prem = self.get_prem(richText)
|
|
|
+ prem = self.get_prem(richText, web_source_name)
|
|
|
if len(prem) == 1: # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
|
|
|
k = list(prem)[0]
|
|
|
if k == '1' or len(k) > 2:
|
|
@@ -6487,6 +6499,8 @@ class WebsourceTenderee():
|
|
|
d['role_text'] = web_ree
|
|
|
elif re.search('大学$', web_ree) and re.search('学院$', d['role_text']) and web_ree not in d['role_text']:
|
|
|
d['role_text'] = web_ree
|
|
|
+ elif d.get('role_prob', 0) < 0.8 and get_business_data(d['role_text'])[0] == False: # 20240201 概率低于0.8且没有工商数据的替换为站源招标人
|
|
|
+ d['role_text'] = web_ree
|
|
|
# elif re.search(p, web_ree) and (re.search(p, d['role_text'])==None and len(d['role_text'])<6): # 数据源唯一招标人以医院等结尾,角色中无相关关键词的,替换为数据源招标人
|
|
|
# d['role_text'] = web_ree
|
|
|
# elif re.search('有限(责任)?公司', web_ree) and (re.search('有限(责任)?公司', d['role_text'])==None and len(d['role_text'])<6):
|
|
@@ -6865,13 +6879,13 @@ if __name__=="__main__":
|
|
|
# # print("cost_time:", json.loads(requests_result.text)['cost_time'])
|
|
|
# # print(MAX_LEN, len(sentence), len(list_sentence))
|
|
|
|
|
|
- docid = ""
|
|
|
- title = ''
|
|
|
- with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
- html = f.read()
|
|
|
- product_attr = ProductAttributesPredictor()
|
|
|
- rs = product_attr.predict(docid='', html=html, page_time="")
|
|
|
- print(rs)
|
|
|
+ # docid = ""
|
|
|
+ # title = ''
|
|
|
+ # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
+ # html = f.read()
|
|
|
+ # product_attr = ProductAttributesPredictor()
|
|
|
+ # rs = product_attr.predict(docid='', html=html, page_time="")
|
|
|
+ # print(rs)
|
|
|
|
|
|
docid = ""
|
|
|
title = ''
|
|
@@ -6882,7 +6896,8 @@ if __name__=="__main__":
|
|
|
"广东省广裕集团嘉顺实业有限责任公司",
|
|
|
"广州顺为招标采购有限公司",
|
|
|
"中华人民共和国"
|
|
|
- ])
|
|
|
+ ], web_source_name = '河钢供应链管理平台')
|
|
|
+ print('标段数:',len(rs))
|
|
|
print(rs)
|
|
|
|
|
|
# # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
|