|
@@ -29,6 +29,7 @@ import datetime
|
|
|
from BiddingKG.dl.entityLink.entityLink import get_business_data
|
|
|
from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
|
|
|
from BiddingKG.dl.interface.getAttributes import turnMoneySource
|
|
|
+from BiddingKG.dl.common.Utils import del_tabel_achievement
|
|
|
# import fool # 统一用 selffool ,阿里云上只有selffool 包
|
|
|
|
|
|
cpu_num = int(os.environ.get("CPU_NUM",0))
|
|
@@ -435,6 +436,8 @@ class CodeNamePredict():
|
|
|
item['code'].append((it, 1, sentence.sentence_index))
|
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
|
item['code'].append((it, 2, sentence.sentence_index))
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
+ item['code'].append((it, 2.5, sentence.sentence_index))
|
|
|
else:
|
|
|
item['code'].append((it, 3, sentence.sentence_index))
|
|
|
elif len(item['code']) > 0:
|
|
@@ -448,6 +451,8 @@ class CodeNamePredict():
|
|
|
item['code'][-1] = (new_it, 1, sentence.sentence_index)
|
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
|
item['code'][-1] = (new_it, 2, sentence.sentence_index)
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
+ item['code'][-1] = (new_it, 2.5, sentence.sentence_index)  # new_it extends the previous code, so overwrite the last entry (as the sibling branches do) instead of appending a duplicate
|
|
|
else:
|
|
|
item['code'][-1] = (new_it, 3, sentence.sentence_index)
|
|
|
else:
|
|
@@ -460,10 +465,14 @@ class CodeNamePredict():
|
|
|
item['code'].append((the_code, 1, sentence.sentence_index))
|
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
|
item['code'].append((the_code, 2, sentence.sentence_index))
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
+ item['code'].append((the_code, 2.5, sentence.sentence_index))
|
|
|
else:
|
|
|
item['code'].append((the_code, 3, sentence.sentence_index))
|
|
|
break
|
|
|
elif the_code not in code_set:
|
|
|
+ if len(the_code)<5: # 避免510545935 这种把 招标项目编号:2024年第二期 只提取2024
|
|
|
+ continue
|
|
|
code_set.add(the_code)
|
|
|
# item['code'].append(the_code)
|
|
|
if re.search("(项目编号|招标编号):?$", pre_text[h]):
|
|
@@ -472,6 +481,8 @@ class CodeNamePredict():
|
|
|
item['code'].append((the_code, 1, sentence.sentence_index))
|
|
|
elif re.search('(询价|合同)编号:?$', pre_text[h]):
|
|
|
item['code'].append((the_code, 2, sentence.sentence_index))
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
|
|
|
+ item['code'].append((the_code, 2.5, sentence.sentence_index))
|
|
|
else:
|
|
|
item['code'].append((the_code, 3, sentence.sentence_index))
|
|
|
|
|
@@ -569,7 +580,7 @@ class CodeNamePredict():
|
|
|
# if othercode != None:
|
|
|
# item[1]['code'].append(othercode.group(2))
|
|
|
# 2020/11/23 大网站规则调整
|
|
|
- othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P<code>[^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。\u4e00-\u9fa5]', sentence.sentence_text)
|
|
|
+ othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P<code>[^,。;:、]{6,30}[a-zA-Z0-9\号期])[\),。\u4e00-\u9fa5]', sentence.sentence_text)
|
|
|
if othercode != None:
|
|
|
# item['code'].append(othercode.group('code'))
|
|
|
if re.search("(项目编号|招标编号):?$", othercode.group(0)):
|
|
@@ -578,6 +589,8 @@ class CodeNamePredict():
|
|
|
item['code'].append((othercode.group('code'), 1, sentence.sentence_index))
|
|
|
elif re.search('(询价|合同)编号:?$', othercode.group(0)):
|
|
|
item['code'].append((othercode.group('code'), 2, sentence.sentence_index))
|
|
|
+ elif re.search('(询价|合同|采购|招标|项目)标号:?$', othercode.group(0)):
|
|
|
+ item['code'].append((othercode.group('code'), 2.5, sentence.sentence_index))
|
|
|
else:
|
|
|
item['code'].append((othercode.group('code'), 3, sentence.sentence_index))
|
|
|
# print('规则召回项目编号:', othercode.group('code'))
|
|
@@ -838,9 +851,9 @@ class PREMPredict():
|
|
|
elif re.search('尊敬的供应商:$', front):
|
|
|
label = 0
|
|
|
values[label] = 0.501
|
|
|
- elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$', front): #修复第4以上的预测错为中标人
|
|
|
+ elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$|第[4-9四五六七八九十]名', front): #修复第4以上的预测错为中标人
|
|
|
label = 5
|
|
|
- values[label] = 0.5
|
|
|
+ values[2] = 0.5
|
|
|
elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front) or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
|
|
|
values[2] = 0.5
|
|
|
label = 5
|
|
@@ -1400,7 +1413,7 @@ class RoleRulePredictor():
|
|
|
"(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
|
|
|
"[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$|(采购商|招标人):(\w{2,10}-)?$)"
|
|
|
self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
|
|
|
- self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位)|^关于)" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
|
|
|
+ self.pattern_tenderee_right = "(?P<tenderee_right>^(机关)?([((](以下简称)?[,\"“]*((招标|采购)(人|单位|机构)|(服务)?购买方)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位)|^关于)" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
|
|
|
self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
|
|
|
self.pattern_agency_left = "(?P<agency_left>((代理|拍卖)(?:人|机构|公司|企业|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构|(采购|招标)代理)(名称|.{,4}名,?称|全称)?(是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
|
|
|
self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号
|
|
@@ -1462,9 +1475,9 @@ class RoleRulePredictor():
|
|
|
|
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
|
|
|
|
- self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|采购成本价|总费用约?为") # |建安费用 不作为招标金额
|
|
|
+ self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?[为:]+\w{2,4}资金|采购成本价|总费用约?为") # |建安费用 不作为招标金额
|
|
|
self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):|经评审的价格") # 单写 总价 不能作为中标金额,很多表格有单价、总价
|
|
|
- self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
|
|
|
+ self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元(报价)?(中标|中选|成交)")
|
|
|
self.pattern_money_other = re.compile("代理费|服务费")
|
|
|
self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
|
|
|
# self.role_file = open('/data/python/lsm/role_rule_predict.txt', 'a', encoding='utf-8')
|
|
@@ -2569,7 +2582,7 @@ class ProductPredictor():
|
|
|
paths.append(path[1:])
|
|
|
return paths
|
|
|
|
|
|
- def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000):
|
|
|
+ def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000, out_lines=[]):
|
|
|
'''
|
|
|
预测实体代码,每个句子最多取MAX_AREA个字,超过截断
|
|
|
:param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]]
|
|
@@ -2577,6 +2590,19 @@ class ProductPredictor():
|
|
|
:param MAX_AREA: 每个句子最多截取多少字
|
|
|
:return: 把预测出来的实体放进实体类
|
|
|
'''
|
|
|
+ p = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设|分包)(的?(主要|简要|基本|具体|名称及))?" \
|
|
|
+ "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况|名称)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
|
|
|
+ "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模|(设备|材料|仪器|需求|产品|采购单?)(清单|名称|信息))为?([::,]|$)"
|
|
|
+ sentence_range = []
|
|
|
+ if len(out_lines) >= 3: # 三个以上大纲
|
|
|
+ for i in range(len(out_lines)-1):
|
|
|
+ text, s1, b1 = out_lines[i]
|
|
|
+ _, s2, b2 = out_lines[i+1]
|
|
|
+ if 3<text.find(':')<20:
|
|
|
+ text = text.split(':')[0]
|
|
|
+ if re.search(p, text[:15]):
|
|
|
+ sentence_range.append((s1, s2))
|
|
|
+
|
|
|
with self.sess.as_default() as sess:
|
|
|
with self.sess.graph.as_default():
|
|
|
result = []
|
|
@@ -2643,6 +2669,25 @@ class ProductPredictor():
|
|
|
if len(list_sentence)==0:
|
|
|
result.append({"product":[]})
|
|
|
continue
|
|
|
+
|
|
|
+ if sentence_range: # 20240815 如果有招标内容大纲,只从前两句及大纲内提取产品,避免类似 514920213 提取错其他内容 银行流水
|
|
|
+ new_list = []
|
|
|
+ word_num = 0
|
|
|
+ for sentence in list_sentence:
|
|
|
+ if sentence.sentence_index<2:
|
|
|
+ new_list.append(sentence)
|
|
|
+ continue
|
|
|
+ for s1, s2 in sentence_range:
|
|
|
+ if sentence.sentence_index < s1:
|
|
|
+ continue
|
|
|
+ elif s1<=sentence.sentence_index <=s2:
|
|
|
+ new_list.append(sentence)
|
|
|
+ word_num += len(sentence.sentence_text)
|
|
|
+ elif sentence.sentence_index >= s2:
|
|
|
+ break
|
|
|
+ if word_num > 100:
|
|
|
+ list_sentence = new_list
|
|
|
+
|
|
|
list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
|
|
|
_begin_index = 0
|
|
|
item = {"product":[]}
|
|
@@ -3970,7 +4015,7 @@ class DocChannel():
|
|
|
'招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告|$)',
|
|
|
'废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
|
|
|
'合同公告': '(合同(成交|变更)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$', # |(履约|验收)(结果)?
|
|
|
- '候选人公示': '候选人(变更)?公示|评标(结果)?公示|中标前?公示|中标预公示|评审结果',
|
|
|
+ '候选人公示': '候选人(变更)?公示|评标(结果)?公示|评审结果', #中标前公示|中标预公示|
|
|
|
'中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$|项目中标', # |开标(记录|信息|情况)
|
|
|
'资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
|
|
|
'招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
|
|
@@ -6352,11 +6397,12 @@ class TablePremExtractor(object):
|
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码|代码)",
|
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
|
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
|
- "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
|
|
|
- "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请|拟推荐(入选|入围)?)?供应商(名称)?$",
|
|
|
+ "win_sort": "排名|排序|名次|推荐顺序",
|
|
|
+ 'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因',
|
|
|
+ "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟推荐(入选|入围)?)?供应商(名称)?$",
|
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
|
"budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
|
- "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
|
|
|
+ "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格|中标存款金?额|中标资金|存放金额",
|
|
|
}
|
|
|
|
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
@@ -6372,12 +6418,13 @@ class TablePremExtractor(object):
|
|
|
contain_header = False
|
|
|
if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
|
|
|
flag = True
|
|
|
+ need_replace = 0 # 是否需要替换表头名称
|
|
|
for i in range(len(td_list)) :
|
|
|
text = td_list[i]
|
|
|
text = re.sub('\s', '', text)
|
|
|
if text == '备选中标人':
|
|
|
text = '第二候选人'
|
|
|
- if len(text) > 15: # 长度大于15 不进行表头匹配
|
|
|
+ if len(re.sub('(([\w、×*/]{1,20}))$', '', text)) > 15: # 长度大于15 不进行表头匹配
|
|
|
continue
|
|
|
if re.search('未(中标|成交)原因', text): # 不提取此种表格
|
|
|
return flag, contain_header, dict()
|
|
@@ -6420,27 +6467,18 @@ class TablePremExtractor(object):
|
|
|
'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
|
|
|
return flag, contain_header, header_dic
|
|
|
elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
|
|
|
- if re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_sort' not in header_dic and re.search('(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', header_dic['bid_amount'][1])==None: # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
|
|
|
+ if 'win_sort' in header_dic: # 有排名的 用候选人提取类
|
|
|
+ return flag, contain_header, dict()
|
|
|
+ elif re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_or_not' not in header_dic and re.search('(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', header_dic['bid_amount'][1])==None: # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
|
|
|
# print('只有供应商名称 没排名和包号的去掉')
|
|
|
return flag, contain_header, dict()
|
|
|
return flag,contain_header, header_dic
|
|
|
+ elif 'tenderer' in header_dic and re.search('(中标|中选|中价|成交|竞得)(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)',header_dic['tenderer'][1]): # 有中标人,且有明确中标关键词的进行提取
|
|
|
+ return flag, contain_header, header_dic
|
|
|
elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
|
contain_header = True
|
|
|
return flag, contain_header, dict()
|
|
|
|
|
|
- def is_role(self, text):
|
|
|
- if len(text) > 25 or len(text)<4:
|
|
|
- return False
|
|
|
- elif len(re.findall('有限责?任?公司', text)) > 1:
|
|
|
- return False
|
|
|
- elif re.search('[\w()]{4,}(有限责?任?公司|学校|学院|大学|中学|小学|医院|管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处)$', text):
|
|
|
- return True
|
|
|
- else:
|
|
|
- ners = selffool.ner(text)
|
|
|
- if len(ners[0]) == 1 and ('company' in ners[0][0] or 'org' in ners[0][0]):
|
|
|
- return True
|
|
|
- return False
|
|
|
-
|
|
|
def get_role(self, text, nlp_enterprise):
|
|
|
'''
|
|
|
获取字符串text角色实体
|
|
@@ -6451,7 +6489,7 @@ class TablePremExtractor(object):
|
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
|
, ',', text)
|
|
|
text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
|
- text = re.sub('[一二三四五六七八九十]+标段:|标段[一二三四五六七八九十]+:', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
|
|
|
+ text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
|
|
|
text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
|
|
|
if text in nlp_enterprise:
|
|
|
return text
|
|
@@ -6484,7 +6522,9 @@ class TablePremExtractor(object):
|
|
|
or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683; 补充避免423647863采购意向被过滤
|
|
|
# print('没有包号及角色的不要')
|
|
|
return {}
|
|
|
-
|
|
|
+ have_bid_amount = False # 是否包含中标金额
|
|
|
+ if "bid_amount" in headers and re.search('[1-9]+', '#'.join([it.strip() for it in df[headers['bid_amount'][0]]])):
|
|
|
+ have_bid_amount = True
|
|
|
for i in df.index:
|
|
|
same_package = False # 连续重复包号,一般是 rowspan 造成;一包 多个采购
|
|
|
project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
|
|
@@ -6495,30 +6535,31 @@ class TablePremExtractor(object):
|
|
|
budget_ = df.loc[i, headers['budget'][0]].strip() if "budget" in headers else ""
|
|
|
bid_amount_ = df.loc[i, headers['bid_amount'][0]].strip() if "bid_amount" in headers else ""
|
|
|
win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
|
|
|
+ win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
|
|
|
|
|
|
if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
|
|
|
# print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
|
|
|
break
|
|
|
- if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2: # 内容为空或全部一样 停止匹配
|
|
|
+ if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2 and tenderer=='': # 内容为空或全部一样 停止匹配
|
|
|
# print('内容为空或全部一样 停止匹配')
|
|
|
break
|
|
|
if re.search('详见', project_name): # 去除某些表达: 详见招标文件
|
|
|
project_name = ""
|
|
|
- if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
|
|
|
+ if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}$', project_name):
|
|
|
package_code_raw = project_name
|
|
|
project_name = ""
|
|
|
|
|
|
package_code = package_code_raw
|
|
|
if re.search('合计|总计', package_code+project_code):
|
|
|
continue
|
|
|
- if package_code != '' and package_code + project_code == previous_package: # 处理 208162730 一个包采购多种东西情况
|
|
|
+ if package_code + project_code == previous_package: # 处理 208162730 一个包采购多种东西情况
|
|
|
same_package = True
|
|
|
- project_name = ''
|
|
|
+ if previous_package!="": # 有包号或项目编号且跟上一行相同时,去除项目名称
|
|
|
+ project_name = ''
|
|
|
previous_package = package_code + project_code
|
|
|
-
|
|
|
if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取 防止类似 328485591 作为多包
|
|
|
break
|
|
|
- if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and (re.search('否|未(中标|成交|中选)', win_sort) or win_sort==''): # 2024/04/2 修复 252208201 为空的不中标
|
|
|
+ if win_or_not != "" and (re.search('(建议|推荐)(中标|成交|中选)|是|^(中标|成交|中选)', win_or_not)==None or re.search('\w', win_or_not)==None): # 2024/04/2 修复 252208201 为空的不中标
|
|
|
continue
|
|
|
if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
|
continue
|
|
@@ -6530,7 +6571,7 @@ class TablePremExtractor(object):
|
|
|
# tenderee = tenderee if self.is_role(tenderee) else ""
|
|
|
# tenderer = tenderer if self.is_role(tenderer) else ""
|
|
|
|
|
|
- package = uniform_package_name(package_code) if package_code else '自增'+str(len(prem_dic)+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
|
|
|
+ package = uniform_package_name(package_code) if package_code else '自增1' # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
|
|
|
if project_name != "" and package.startswith('自增'):
|
|
|
pk_l = find_package(project_name)
|
|
|
if len(pk_l)==1:
|
|
@@ -6542,6 +6583,8 @@ class TablePremExtractor(object):
|
|
|
|
|
|
tenderee = self.get_role(tenderee, self.nlp_enterprise) if tenderee!="" else tenderee
|
|
|
tenderer = self.get_role(tenderer, self.nlp_enterprise) if tenderer!='' else tenderer
|
|
|
+ tenderee = cut_repeat_name(tenderee)
|
|
|
+ tenderer = cut_repeat_name(tenderer)
|
|
|
|
|
|
if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
|
|
|
break
|
|
@@ -6612,7 +6655,7 @@ class TablePremExtractor(object):
|
|
|
"role_text": tenderee,
|
|
|
"serviceTime": ""
|
|
|
})
|
|
|
- if tenderer and not same_package:
|
|
|
+ if tenderer:
|
|
|
if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
|
|
|
bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
prem_dic.pop(package)
|
|
@@ -6623,25 +6666,40 @@ class TablePremExtractor(object):
|
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃
|
|
|
prem_dic.pop(package)
|
|
|
continue
|
|
|
+ elif 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and have_bid_amount and bid_amount_ in ['/','','0','0.0']: # 如果不是所有行中标金额都为0,则把为0的做非中标
|
|
|
+ if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的包 丢弃
|
|
|
+ prem_dic.pop(package)
|
|
|
+ continue
|
|
|
|
|
|
bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
|
|
|
if (re.search('费率|下浮率|[%%‰折]',
|
|
|
bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000: # 如果是费率或大于500亿的金额改为0
|
|
|
bid_amount = 0
|
|
|
- prem_dic[package]['roleList'].append({
|
|
|
- "address": "",
|
|
|
- "linklist": [],
|
|
|
- "role_money": {
|
|
|
- "discount_ratio": "",
|
|
|
- "downward_floating_ratio": "",
|
|
|
- "floating_ratio": "",
|
|
|
- "money": bid_amount,
|
|
|
- "money_unit": money_unit
|
|
|
- },
|
|
|
- "role_name": "win_tenderer",
|
|
|
- "role_text": tenderer,
|
|
|
- "serviceTime": ""
|
|
|
- })
|
|
|
+ if not same_package or len(prem_dic[package]['roleList'])==0:
|
|
|
+ prem_dic[package]['roleList'].append({
|
|
|
+ "address": "",
|
|
|
+ "linklist": [],
|
|
|
+ "role_money": {
|
|
|
+ "discount_ratio": "",
|
|
|
+ "downward_floating_ratio": "",
|
|
|
+ "floating_ratio": "",
|
|
|
+ "money": bid_amount,
|
|
|
+ "money_unit": money_unit
|
|
|
+ },
|
|
|
+ "role_name": "win_tenderer",
|
|
|
+ "role_text": tenderer,
|
|
|
+ "serviceTime": ""
|
|
|
+ })
|
|
|
+ elif prem_dic[package]['roleList'] and prem_dic[package]['roleList'][-1].get('role_name', '')=='win_tenderer':
|
|
|
+ if 'multi_winner' not in prem_dic[package]['roleList'][-1]:
|
|
|
+ prem_dic[package]['roleList'][-1]['multi_winner'] = prem_dic[package]['roleList'][-1]['role_text']
|
|
|
+ prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
|
+ elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
|
|
|
+ prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
|
|
|
+ if bid_amount != 0: # 有中标金额的才放进去
|
|
|
+ if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
|
|
|
+ prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
|
|
|
+ prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit})
|
|
|
tenderer_list.append(tenderer)
|
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配
|
|
|
prem_dic.pop(package)
|
|
@@ -6656,6 +6714,9 @@ class TablePremExtractor(object):
|
|
|
for d in v['roleList']:
|
|
|
if d['role_name'] == "win_tenderer":
|
|
|
total_money += d['role_money']['money']
|
|
|
+ if 'other_winner_dic' in d:
|
|
|
+ for other in d['other_winner_dic']:
|
|
|
+ total_money += other.get('money', 0)
|
|
|
return {'自增1': {
|
|
|
'code': '',
|
|
|
'name': '',
|
|
@@ -6709,7 +6770,7 @@ class TablePremExtractor(object):
|
|
|
|
|
|
text = table.text.strip()
|
|
|
previous = table.findPreviousSibling()
|
|
|
- text2 = previous .text.strip() if previous else ""
|
|
|
+ text2 = previous.text.strip() if previous else ""
|
|
|
# text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
|
|
|
if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 包含业绩的表格过滤掉,不进行处理
|
|
|
tb_ex = table.extract()
|
|
@@ -6729,9 +6790,17 @@ class TablePremExtractor(object):
|
|
|
headers = headers_
|
|
|
for j in range(i + 1, len(trs)):
|
|
|
if len(trs[j]) == len(trs[i]):
|
|
|
- flag_, contain_header_, headers_ = self.find_header(trs[j])
|
|
|
- if flag_ or contain_header_:
|
|
|
+ flag_2, contain_header_2, headers_2 = self.find_header(trs[j])
|
|
|
+ if flag_2 or contain_header_2:
|
|
|
+ if j == i+1 and flag_2:
|
|
|
+ if len(headers_)<=len(headers_2):
|
|
|
+ headers = headers_2
|
|
|
+ continue
|
|
|
+ elif trs[i] == trs[j]: # 修复表格重复表头多次出现情况 例:514890585
|
|
|
+ continue
|
|
|
break
|
|
|
+ elif ''.join(trs[j]).strip() == '': # 修复整行为空的 例:514890585
|
|
|
+ continue
|
|
|
else:
|
|
|
table_items.append(trs[j])
|
|
|
else:
|
|
@@ -6745,11 +6814,11 @@ class TablePremExtractor(object):
|
|
|
self.update_prem(table_prem, prem_)
|
|
|
i = j - 1
|
|
|
i += 1
|
|
|
- if table_prem and len(trs) == 2 and 'package_code' not in headers and '自增1' in table_prem and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
|
|
|
+ if table_prem and 'project_code' not in headers and 'package_code' not in headers and '自增1' in table_prem and table.find_previous_sibling(): # 表格内没有标段的,从上一个兄弟标签找标段
|
|
|
sib = table.find_previous_sibling()
|
|
|
sib_text = sib.get_text()
|
|
|
- ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
|
|
|
- if sib.name in ['p', 'div'] and len(sib_text)<100 and ser_sib:
|
|
|
+ ser_sib = re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
|
|
|
+ if sib.name in ['p','div','dl','ol','ul','h1','h2','h3','h4','h5','h6'] and len(sib_text)<100 and ser_sib:
|
|
|
package_sib = ser_sib.group(0)
|
|
|
package_sib = uniform_package_name(package_sib)
|
|
|
table_prem[package_sib] = table_prem.pop('自增1')
|
|
@@ -6768,8 +6837,10 @@ class TablePremExtractor(object):
|
|
|
in_attachment = False
|
|
|
if richText:
|
|
|
richText = richText.extract() # 过滤掉附件
|
|
|
+ del_tabel_achievement(soup) # 20240819 过滤掉业绩表格
|
|
|
prem = self.get_prem(soup, web_source_name)
|
|
|
if prem == {} and richText:
|
|
|
+ del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
|
|
|
prem = self.get_prem(richText, web_source_name)
|
|
|
in_attachment = True
|
|
|
if len(prem) == 1: # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
|
|
@@ -6784,10 +6855,10 @@ class CandidateExtractor(object):
|
|
|
self.head_rule_dic = {
|
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
|
|
|
- "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
|
+ "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
|
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
|
'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论',
|
|
|
- "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
|
|
|
+ "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
|
|
|
"bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
|
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
@@ -6795,7 +6866,7 @@ class CandidateExtractor(object):
|
|
|
}
|
|
|
'''非表格候选人正则'''
|
|
|
# self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|应答人)|(通过)?名单)(名称|名单|全称|\d)?:$'
|
|
|
- self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?:?$'
|
|
|
+ self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?[是为:]?$'
|
|
|
self.tb = TableTag2List()
|
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
|
self.headerset = pickle.load(f)
|
|
@@ -6830,7 +6901,7 @@ class CandidateExtractor(object):
|
|
|
if num>1:
|
|
|
# print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
|
return flag, contain_header, dict()
|
|
|
- if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
|
|
|
+ if ('candidate' in header_dic and 'win_sort' in header_dic) or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic): # 有排名才返回表头进行提取
|
|
|
return flag, contain_header, header_dic
|
|
|
elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(fix_td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
|
contain_header = True
|
|
@@ -6859,6 +6930,9 @@ class CandidateExtractor(object):
|
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
|
, ',', text)
|
|
|
text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
|
+ text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '',
|
|
|
+ text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
|
|
|
+ text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
|
|
|
if text in nlp_enterprise:
|
|
|
return text
|
|
|
if len(text) > 50 or len(text)<4:
|
|
@@ -6883,6 +6957,14 @@ class CandidateExtractor(object):
|
|
|
findmoney = False
|
|
|
line_num = 0
|
|
|
line_package = None
|
|
|
+ package_flag = 0
|
|
|
+ if "package_code" in headers:
|
|
|
+ package_flag = 1
|
|
|
+ if len(df)!=len(set(df[headers["package_code"][0]])): # 如果有包号但重复,进行下列判断是否和跟其他字段组合包号
|
|
|
+ if "project_code" in headers and df[headers["project_code"][0]][0] != df[headers["package_code"][0]][0]:
|
|
|
+ package_flag = 2
|
|
|
+ elif "project_name" in headers and find_package(df[headers["package_code"][0]][0]):
|
|
|
+ package_flag = 3
|
|
|
for i in df.index:
|
|
|
package_code_raw = df.loc[i, headers['package_code'][0]].strip() if "package_code" in headers else ""
|
|
|
project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
|
|
@@ -6932,12 +7014,21 @@ class CandidateExtractor(object):
|
|
|
if package == "" and project_code != "": # 修复 395747178 多项目 只提取到一个
|
|
|
package = project_code
|
|
|
package = uniform_package_name(package) if package !="" else "Project"
|
|
|
+ if package_flag == 2 and project_code != "":
|
|
|
+ project_code_pk = uniform_package_name(project_code)
|
|
|
+ package = "%s_%s"%(project_code_pk, package)
|
|
|
+ elif package_flag == 3 and project_name != "":
|
|
|
+ for pk_match in find_package(project_name):  # only the first package match is used; renamed from `iter` to avoid shadowing the built-in
|
|
|
+ project_name_pk = uniform_package_name(pk_match.group(0))
|
|
|
+ package = "%s_%s"%(project_name_pk, package)
|
|
|
+ break
|
|
|
+
|
|
|
if candidate:
|
|
|
if win_or_not and re.search('否|未入围', win_or_not):
|
|
|
candidate_set.add(candidate)
|
|
|
- elif re.search('^((建议|推荐)(中标|成交)|是)$', win_or_not) and win_sort in ['', '参与投标单位及排名'] and win_tenderer=='':
|
|
|
- win_sort = '第一名'
|
|
|
- candidate_set.add(candidate)
|
|
|
+ # elif re.search('^((建议|推荐)(中标|成交)|是)$', win_or_not) and win_sort in ['', '参与投标单位及排名'] and win_tenderer=='':
|
|
|
+ # win_sort = '第一名'
|
|
|
+ # candidate_set.add(candidate)
|
|
|
else:
|
|
|
candidate_set.add(candidate)
|
|
|
|
|
@@ -7088,7 +7179,7 @@ class CandidateExtractor(object):
|
|
|
if rs_dic and 'package_code' not in headers and 'Project' in rs_dic and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
|
|
|
sib = table.find_previous_sibling()
|
|
|
sib_text = sib.get_text()
|
|
|
- ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
|
|
|
+ ser_sib = re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
|
|
|
if sib.name in ['p', 'div'] and len(sib_text)<100 and ser_sib:
|
|
|
package_sib = ser_sib.group(0)
|
|
|
package_sib = uniform_package_name(package_sib)
|
|
@@ -7128,8 +7219,10 @@ class CandidateExtractor(object):
|
|
|
in_attachment = False
|
|
|
if richText:
|
|
|
richText = richText.extract() # 过滤掉附件
|
|
|
+ del_tabel_achievement(soup) # 20240819 过滤掉业绩表格 例:500817166
|
|
|
prem, candidate_set = self.get_prem(soup)
|
|
|
if prem == {} and richText:
|
|
|
+ del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
|
|
|
prem, candidate_set = self.get_prem(richText)
|
|
|
in_attachment = True
|
|
|
candidate_set2 = self.get_candidates_from_text(list_sentences, list_entitys)
|
|
@@ -7367,7 +7460,7 @@ class ApprovalPredictor():
|
|
|
if (multi_project['project_code'] != "" or multi_project['project_name'] != "") and multi_project['project_code']+multi_project['project_name'] not in code_name_set:
|
|
|
code_name_set.add(multi_project['project_code']+multi_project['project_name'])
|
|
|
district = getPredictor('district').get_area(
|
|
|
- multi_project['project_name'] + multi_project['project_addr'], '')
|
|
|
+ multi_project['approver'] + multi_project['project_name'] + multi_project['project_addr'], '')
|
|
|
if district['district']['province'] != '全国':
|
|
|
multi_project['area'] = district['district']['area']
|
|
|
multi_project['province'] = district['district']['province']
|
|
@@ -7379,7 +7472,7 @@ class ApprovalPredictor():
|
|
|
return rs_l
|
|
|
elif found_key == 1:
|
|
|
district = getPredictor('district').get_area(
|
|
|
- rs_dic['construct_company'] + rs_dic['project_name'] + rs_dic['project_addr'], '')
|
|
|
+ rs_dic['approver'] + rs_dic['project_name'] + rs_dic['project_addr'], '')
|
|
|
if district['district']['province'] != '全国':
|
|
|
rs_dic['area'] = district['district']['area']
|
|
|
rs_dic['province'] = district['district']['province']
|
|
@@ -7747,18 +7840,17 @@ if __name__=="__main__":
|
|
|
# rs = product_attr.predict(docid='', html=html, page_time="")
|
|
|
# print(rs)
|
|
|
|
|
|
- # docid = ""
|
|
|
- # title = ''
|
|
|
- # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
- # html = f.read()
|
|
|
- # tb_extract = TablePremExtractor()
|
|
|
- # rs = tb_extract.predict(html, [
|
|
|
- # "广东省广裕集团嘉顺实业有限责任公司",
|
|
|
- # "广州顺为招标采购有限公司",
|
|
|
- # "中华人民共和国"
|
|
|
- # ], web_source_name = '河钢供应链管理平台')
|
|
|
- # print('标段数:',len(rs))
|
|
|
- # print(rs)
|
|
|
+ docid = ""
|
|
|
+ title = ''
|
|
|
+ with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
+ html = f.read()
|
|
|
+ tb_extract = TablePremExtractor()
|
|
|
+ rs = tb_extract.predict(html, [
|
|
|
+ "江苏中联铸本混凝土有限公司",
|
|
|
+ "鼓楼区协荣机械设备经销部"
|
|
|
+ ], web_source_name = '河钢供应链管理平台')
|
|
|
+ print('标段数:',len(rs[0]))
|
|
|
+ print(rs)
|
|
|
|
|
|
# # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
|
|
|
# # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
|