|
@@ -29,6 +29,7 @@ import datetime
|
|
from BiddingKG.dl.entityLink.entityLink import get_business_data
|
|
from BiddingKG.dl.entityLink.entityLink import get_business_data
|
|
from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
|
|
from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
|
|
from BiddingKG.dl.interface.getAttributes import turnMoneySource
|
|
from BiddingKG.dl.interface.getAttributes import turnMoneySource
|
|
|
|
+from BiddingKG.dl.common.Utils import del_tabel_achievement
|
|
# import fool # 统一用 selffool ,阿里云上只有selffool 包
|
|
# import fool # 统一用 selffool ,阿里云上只有selffool 包
|
|
|
|
|
|
cpu_num = int(os.environ.get("CPU_NUM",0))
|
|
cpu_num = int(os.environ.get("CPU_NUM",0))
|
|
@@ -435,6 +436,8 @@ class CodeNamePredict():
|
|
item['code'].append((it, 1, sentence.sentence_index))
|
|
item['code'].append((it, 1, sentence.sentence_index))
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
item['code'].append((it, 2, sentence.sentence_index))
|
|
item['code'].append((it, 2, sentence.sentence_index))
|
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((it, 2.5, sentence.sentence_index))
|
|
else:
|
|
else:
|
|
item['code'].append((it, 3, sentence.sentence_index))
|
|
item['code'].append((it, 3, sentence.sentence_index))
|
|
elif len(item['code']) > 0:
|
|
elif len(item['code']) > 0:
|
|
@@ -448,6 +451,8 @@ class CodeNamePredict():
|
|
item['code'][-1] = (new_it, 1, sentence.sentence_index)
|
|
item['code'][-1] = (new_it, 1, sentence.sentence_index)
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
item['code'][-1] = (new_it, 2, sentence.sentence_index)
|
|
item['code'][-1] = (new_it, 2, sentence.sentence_index)
|
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((new_it, 2.5, sentence.sentence_index))
|
|
else:
|
|
else:
|
|
item['code'][-1] = (new_it, 3, sentence.sentence_index)
|
|
item['code'][-1] = (new_it, 3, sentence.sentence_index)
|
|
else:
|
|
else:
|
|
@@ -460,6 +465,8 @@ class CodeNamePredict():
|
|
item['code'].append((the_code, 1, sentence.sentence_index))
|
|
item['code'].append((the_code, 1, sentence.sentence_index))
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
item['code'].append((the_code, 2, sentence.sentence_index))
|
|
item['code'].append((the_code, 2, sentence.sentence_index))
|
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((the_code, 2.5, sentence.sentence_index))
|
|
else:
|
|
else:
|
|
item['code'].append((the_code, 3, sentence.sentence_index))
|
|
item['code'].append((the_code, 3, sentence.sentence_index))
|
|
break
|
|
break
|
|
@@ -474,6 +481,8 @@ class CodeNamePredict():
|
|
item['code'].append((the_code, 1, sentence.sentence_index))
|
|
item['code'].append((the_code, 1, sentence.sentence_index))
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
item['code'].append((the_code, 2, sentence.sentence_index))
|
|
item['code'].append((the_code, 2, sentence.sentence_index))
|
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
|
+ item['code'].append((the_code, 2.5, sentence.sentence_index))
|
|
else:
|
|
else:
|
|
item['code'].append((the_code, 3, sentence.sentence_index))
|
|
item['code'].append((the_code, 3, sentence.sentence_index))
|
|
|
|
|
|
@@ -580,6 +589,8 @@ class CodeNamePredict():
|
|
item['code'].append((othercode.group('code'), 1, sentence.sentence_index))
|
|
item['code'].append((othercode.group('code'), 1, sentence.sentence_index))
|
|
elif re.search('(询价|合同)编号:?$', othercode.group(0)):
|
|
elif re.search('(询价|合同)编号:?$', othercode.group(0)):
|
|
item['code'].append((othercode.group('code'), 2, sentence.sentence_index))
|
|
item['code'].append((othercode.group('code'), 2, sentence.sentence_index))
|
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', othercode.group(0)):
|
|
|
|
+ item['code'].append((othercode.group('code'), 2.5, sentence.sentence_index))
|
|
else:
|
|
else:
|
|
item['code'].append((othercode.group('code'), 3, sentence.sentence_index))
|
|
item['code'].append((othercode.group('code'), 3, sentence.sentence_index))
|
|
# print('规则召回项目编号:', othercode.group('code'))
|
|
# print('规则召回项目编号:', othercode.group('code'))
|
|
@@ -840,9 +851,9 @@ class PREMPredict():
|
|
elif re.search('尊敬的供应商:$', front):
|
|
elif re.search('尊敬的供应商:$', front):
|
|
label = 0
|
|
label = 0
|
|
values[label] = 0.501
|
|
values[label] = 0.501
|
|
- elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$', front): #修复第4以上的预测错为中标人
|
|
|
|
|
|
+ elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$|第[4-9四五六七八九十]名', front): #修复第4以上的预测错为中标人
|
|
label = 5
|
|
label = 5
|
|
- values[label] = 0.5
|
|
|
|
|
|
+ values[2] = 0.5
|
|
elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front) or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
|
|
elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front) or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
|
|
values[2] = 0.5
|
|
values[2] = 0.5
|
|
label = 5
|
|
label = 5
|
|
@@ -2571,7 +2582,7 @@ class ProductPredictor():
|
|
paths.append(path[1:])
|
|
paths.append(path[1:])
|
|
return paths
|
|
return paths
|
|
|
|
|
|
- def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000):
|
|
|
|
|
|
+ def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000, out_lines=[]):
|
|
'''
|
|
'''
|
|
预测实体代码,每个句子最多取MAX_AREA个字,超过截断
|
|
预测实体代码,每个句子最多取MAX_AREA个字,超过截断
|
|
:param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]]
|
|
:param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]]
|
|
@@ -2579,6 +2590,19 @@ class ProductPredictor():
|
|
:param MAX_AREA: 每个句子最多截取多少字
|
|
:param MAX_AREA: 每个句子最多截取多少字
|
|
:return: 把预测出来的实体放进实体类
|
|
:return: 把预测出来的实体放进实体类
|
|
'''
|
|
'''
|
|
|
|
+ p = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设|分包)(的?(主要|简要|基本|具体|名称及))?" \
|
|
|
|
+ "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况|名称)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
|
|
|
|
+ "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模|(设备|材料|仪器|需求|产品|采购单?)(清单|名称|信息))为?([::,]|$)"
|
|
|
|
+ sentence_range = []
|
|
|
|
+ if len(out_lines) >= 3: # 三个以上大纲
|
|
|
|
+ for i in range(len(out_lines)-1):
|
|
|
|
+ text, s1, b1 = out_lines[i]
|
|
|
|
+ _, s2, b2 = out_lines[i+1]
|
|
|
|
+ if 3<text.find(':')<20:
|
|
|
|
+ text = text.split(':')[0]
|
|
|
|
+ if re.search(p, text[:15]):
|
|
|
|
+ sentence_range.append((s1, s2))
|
|
|
|
+
|
|
with self.sess.as_default() as sess:
|
|
with self.sess.as_default() as sess:
|
|
with self.sess.graph.as_default():
|
|
with self.sess.graph.as_default():
|
|
result = []
|
|
result = []
|
|
@@ -2645,6 +2669,25 @@ class ProductPredictor():
|
|
if len(list_sentence)==0:
|
|
if len(list_sentence)==0:
|
|
result.append({"product":[]})
|
|
result.append({"product":[]})
|
|
continue
|
|
continue
|
|
|
|
+
|
|
|
|
+ if sentence_range: # 20240815 如果有招标内容大纲,只从前两句及大纲内提取产品,避免类似 514920213 提取错其他内容 银行流水
|
|
|
|
+ new_list = []
|
|
|
|
+ word_num = 0
|
|
|
|
+ for sentence in list_sentence:
|
|
|
|
+ if sentence.sentence_index<2:
|
|
|
|
+ new_list.append(sentence)
|
|
|
|
+ continue
|
|
|
|
+ for s1, s2 in sentence_range:
|
|
|
|
+ if sentence.sentence_index < s1:
|
|
|
|
+ continue
|
|
|
|
+ elif s1<=sentence.sentence_index <=s2:
|
|
|
|
+ new_list.append(sentence)
|
|
|
|
+ word_num += len(sentence.sentence_text)
|
|
|
|
+ elif sentence.sentence_index >= s2:
|
|
|
|
+ break
|
|
|
|
+ if word_num > 100:
|
|
|
|
+ list_sentence = new_list
|
|
|
|
+
|
|
list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
|
|
list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
|
|
_begin_index = 0
|
|
_begin_index = 0
|
|
item = {"product":[]}
|
|
item = {"product":[]}
|
|
@@ -6373,19 +6416,12 @@ class TablePremExtractor(object):
|
|
header_dic = dict()
|
|
header_dic = dict()
|
|
flag = False
|
|
flag = False
|
|
contain_header = False
|
|
contain_header = False
|
|
- # print('表头判断:', set(fix_td_list) - self.headerset)
|
|
|
|
if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
|
|
if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
|
|
flag = True
|
|
flag = True
|
|
need_replace = 0 # 是否需要替换表头名称
|
|
need_replace = 0 # 是否需要替换表头名称
|
|
- if re.search('^(投标银行|供应商名称)$', '|'.join(td_list)) and re.search('中标存款金?额|中标资金存放额|中标利率|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', '|'.join(td_list)):
|
|
|
|
- need_replace = 1
|
|
|
|
for i in range(len(td_list)) :
|
|
for i in range(len(td_list)) :
|
|
text = td_list[i]
|
|
text = td_list[i]
|
|
text = re.sub('\s', '', text)
|
|
text = re.sub('\s', '', text)
|
|
- if need_replace and re.search('^(投标银行|供应商名称)$', text): # 银行类特殊处理
|
|
|
|
- text = '中标银行'
|
|
|
|
- if need_replace and re.search('排名|排序|名次|推荐顺序', text): # 银行类特殊处理
|
|
|
|
- text = '序号'
|
|
|
|
if text == '备选中标人':
|
|
if text == '备选中标人':
|
|
text = '第二候选人'
|
|
text = '第二候选人'
|
|
if len(re.sub('(([\w、×*/]{1,20}))$', '', text)) > 15: # 长度大于15 不进行表头匹配
|
|
if len(re.sub('(([\w、×*/]{1,20}))$', '', text)) > 15: # 长度大于15 不进行表头匹配
|
|
@@ -6453,7 +6489,7 @@ class TablePremExtractor(object):
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
, ',', text)
|
|
, ',', text)
|
|
text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
- text = re.sub('[一二三四五六七八九十]+标段:|标段[一二三四五六七八九十]+:', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
|
|
|
|
|
|
+ text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
|
|
text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
|
|
text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
|
|
if text in nlp_enterprise:
|
|
if text in nlp_enterprise:
|
|
return text
|
|
return text
|
|
@@ -6486,7 +6522,9 @@ class TablePremExtractor(object):
|
|
or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683; 补充避免423647863采购意向被过滤
|
|
or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683; 补充避免423647863采购意向被过滤
|
|
# print('没有包号及角色的不要')
|
|
# print('没有包号及角色的不要')
|
|
return {}
|
|
return {}
|
|
-
|
|
|
|
|
|
+ have_bid_amount = False # 是否包含中标金额
|
|
|
|
+ if "bid_amount" in headers and re.search('[1-9]+', '#'.join([it.strip() for it in df[headers['bid_amount'][0]]])):
|
|
|
|
+ have_bid_amount = True
|
|
for i in df.index:
|
|
for i in df.index:
|
|
same_package = False # 连续重复包号,一般是 rowspan 造成;一包 多个采购
|
|
same_package = False # 连续重复包号,一般是 rowspan 造成;一包 多个采购
|
|
project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
|
|
project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
|
|
@@ -6507,7 +6545,7 @@ class TablePremExtractor(object):
|
|
break
|
|
break
|
|
if re.search('详见', project_name): # 去除某些表达: 详见招标文件
|
|
if re.search('详见', project_name): # 去除某些表达: 详见招标文件
|
|
project_name = ""
|
|
project_name = ""
|
|
- if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
|
|
|
|
|
|
+ if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}$', project_name):
|
|
package_code_raw = project_name
|
|
package_code_raw = project_name
|
|
project_name = ""
|
|
project_name = ""
|
|
|
|
|
|
@@ -6628,6 +6666,10 @@ class TablePremExtractor(object):
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃
|
|
prem_dic.pop(package)
|
|
prem_dic.pop(package)
|
|
continue
|
|
continue
|
|
|
|
+ elif 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and have_bid_amount and bid_amount_ in ['/','','0','0.0']: # 如果不是所有行中标金额都为0,则把为0的做非中标
|
|
|
|
+ if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃
|
|
|
|
+ prem_dic.pop(package)
|
|
|
|
+ continue
|
|
|
|
|
|
bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
|
|
bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
|
|
if (re.search('费率|下浮率|[%%‰折]',
|
|
if (re.search('费率|下浮率|[%%‰折]',
|
|
@@ -6654,9 +6696,10 @@ class TablePremExtractor(object):
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
|
|
elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
- if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
|
|
|
|
- prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
|
|
|
|
- prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit})
|
|
|
|
|
|
+ if bid_amount != 0: # 有中标金额的才放进去
|
|
|
|
+ if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
|
|
|
|
+ prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
|
|
|
|
+ prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit})
|
|
tenderer_list.append(tenderer)
|
|
tenderer_list.append(tenderer)
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配
|
|
prem_dic.pop(package)
|
|
prem_dic.pop(package)
|
|
@@ -6727,7 +6770,7 @@ class TablePremExtractor(object):
|
|
|
|
|
|
text = table.text.strip()
|
|
text = table.text.strip()
|
|
previous = table.findPreviousSibling()
|
|
previous = table.findPreviousSibling()
|
|
- text2 = previous .text.strip() if previous else ""
|
|
|
|
|
|
+ text2 = previous.text.strip() if previous else ""
|
|
# text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
|
|
# text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
|
|
if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 包含业绩的表格过滤掉,不进行处理
|
|
if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 包含业绩的表格过滤掉,不进行处理
|
|
tb_ex = table.extract()
|
|
tb_ex = table.extract()
|
|
@@ -6750,10 +6793,14 @@ class TablePremExtractor(object):
|
|
flag_2, contain_header_2, headers_2 = self.find_header(trs[j])
|
|
flag_2, contain_header_2, headers_2 = self.find_header(trs[j])
|
|
if flag_2 or contain_header_2:
|
|
if flag_2 or contain_header_2:
|
|
if j == i+1 and flag_2:
|
|
if j == i+1 and flag_2:
|
|
- if len(headers_)<len(headers_2):
|
|
|
|
|
|
+ if len(headers_)<=len(headers_2):
|
|
headers = headers_2
|
|
headers = headers_2
|
|
continue
|
|
continue
|
|
|
|
+ elif trs[i] == trs[j]: # 修复表格重复表头多次出现情况 例:514890585
|
|
|
|
+ continue
|
|
break
|
|
break
|
|
|
|
+ elif ''.join(trs[j]).strip() == '': # 修复整行为空的 例:514890585
|
|
|
|
+ continue
|
|
else:
|
|
else:
|
|
table_items.append(trs[j])
|
|
table_items.append(trs[j])
|
|
else:
|
|
else:
|
|
@@ -6770,7 +6817,7 @@ class TablePremExtractor(object):
|
|
if table_prem and 'project_code' not in headers and 'package_code' not in headers and '自增1' in table_prem and table.find_previous_sibling(): # 表格内没有标段的,从上一个兄弟标签找标段
|
|
if table_prem and 'project_code' not in headers and 'package_code' not in headers and '自增1' in table_prem and table.find_previous_sibling(): # 表格内没有标段的,从上一个兄弟标签找标段
|
|
sib = table.find_previous_sibling()
|
|
sib = table.find_previous_sibling()
|
|
sib_text = sib.get_text()
|
|
sib_text = sib.get_text()
|
|
- ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
|
|
|
|
|
|
+ ser_sib = re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
|
|
if sib.name in ['p','div','dl','ol','ul','h1','h2','h3','h4','h5','h6'] and len(sib_text)<100 and ser_sib:
|
|
if sib.name in ['p','div','dl','ol','ul','h1','h2','h3','h4','h5','h6'] and len(sib_text)<100 and ser_sib:
|
|
package_sib = ser_sib.group(0)
|
|
package_sib = ser_sib.group(0)
|
|
package_sib = uniform_package_name(package_sib)
|
|
package_sib = uniform_package_name(package_sib)
|
|
@@ -6790,8 +6837,10 @@ class TablePremExtractor(object):
|
|
in_attachment = False
|
|
in_attachment = False
|
|
if richText:
|
|
if richText:
|
|
richText = richText.extract() # 过滤掉附件
|
|
richText = richText.extract() # 过滤掉附件
|
|
|
|
+ del_tabel_achievement(soup) # 20240819 过滤掉业绩表格
|
|
prem = self.get_prem(soup, web_source_name)
|
|
prem = self.get_prem(soup, web_source_name)
|
|
if prem == {} and richText:
|
|
if prem == {} and richText:
|
|
|
|
+ del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
|
|
prem = self.get_prem(richText, web_source_name)
|
|
prem = self.get_prem(richText, web_source_name)
|
|
in_attachment = True
|
|
in_attachment = True
|
|
if len(prem) == 1: # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
|
|
if len(prem) == 1: # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
|
|
@@ -6817,7 +6866,7 @@ class CandidateExtractor(object):
|
|
}
|
|
}
|
|
'''非表格候选人正则'''
|
|
'''非表格候选人正则'''
|
|
# self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|应答人)|(通过)?名单)(名称|名单|全称|\d)?:$'
|
|
# self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|应答人)|(通过)?名单)(名称|名单|全称|\d)?:$'
|
|
- self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?:?$'
|
|
|
|
|
|
+ self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?[是为:]?$'
|
|
self.tb = TableTag2List()
|
|
self.tb = TableTag2List()
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
self.headerset = pickle.load(f)
|
|
self.headerset = pickle.load(f)
|
|
@@ -6881,6 +6930,9 @@ class CandidateExtractor(object):
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
, ',', text)
|
|
, ',', text)
|
|
text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
|
|
+ text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '',
|
|
|
|
+ text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
|
|
|
|
+ text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
|
|
if text in nlp_enterprise:
|
|
if text in nlp_enterprise:
|
|
return text
|
|
return text
|
|
if len(text) > 50 or len(text)<4:
|
|
if len(text) > 50 or len(text)<4:
|
|
@@ -6897,7 +6949,6 @@ class CandidateExtractor(object):
|
|
return ''
|
|
return ''
|
|
|
|
|
|
def extract_from_df(self, df, headers):
|
|
def extract_from_df(self, df, headers):
|
|
- print('表头: ', headers)
|
|
|
|
prem_dic = {}
|
|
prem_dic = {}
|
|
link_set = set()
|
|
link_set = set()
|
|
candidate_set = set()
|
|
candidate_set = set()
|
|
@@ -7128,7 +7179,7 @@ class CandidateExtractor(object):
|
|
if rs_dic and 'package_code' not in headers and 'Project' in rs_dic and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
|
|
if rs_dic and 'package_code' not in headers and 'Project' in rs_dic and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
|
|
sib = table.find_previous_sibling()
|
|
sib = table.find_previous_sibling()
|
|
sib_text = sib.get_text()
|
|
sib_text = sib.get_text()
|
|
- ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
|
|
|
|
|
|
+ ser_sib = re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
|
|
if sib.name in ['p', 'div'] and len(sib_text)<100 and ser_sib:
|
|
if sib.name in ['p', 'div'] and len(sib_text)<100 and ser_sib:
|
|
package_sib = ser_sib.group(0)
|
|
package_sib = ser_sib.group(0)
|
|
package_sib = uniform_package_name(package_sib)
|
|
package_sib = uniform_package_name(package_sib)
|
|
@@ -7168,8 +7219,10 @@ class CandidateExtractor(object):
|
|
in_attachment = False
|
|
in_attachment = False
|
|
if richText:
|
|
if richText:
|
|
richText = richText.extract() # 过滤掉附件
|
|
richText = richText.extract() # 过滤掉附件
|
|
|
|
+ del_tabel_achievement(soup) # 20240819 过滤掉业绩表格 例:500817166
|
|
prem, candidate_set = self.get_prem(soup)
|
|
prem, candidate_set = self.get_prem(soup)
|
|
if prem == {} and richText:
|
|
if prem == {} and richText:
|
|
|
|
+ del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
|
|
prem, candidate_set = self.get_prem(richText)
|
|
prem, candidate_set = self.get_prem(richText)
|
|
in_attachment = True
|
|
in_attachment = True
|
|
candidate_set2 = self.get_candidates_from_text(list_sentences, list_entitys)
|
|
candidate_set2 = self.get_candidates_from_text(list_sentences, list_entitys)
|