|
@@ -705,6 +705,9 @@ class PREMPredict():
|
|
|
elif re.search('第[4-9四五六]中标候选人', front): #修复第4以上的预测错为中标人
|
|
|
label = 5
|
|
|
values[label] = 0.5
|
|
|
+ elif re.search('(序号|排名|排序|名次):[4-9],', front): # 293225236 附件中 排名预测错误
|
|
|
+ values[2] = 0.5
|
|
|
+ label = 5
|
|
|
elif re.search('是否中标:是,供应商', front) and label == 5:
|
|
|
label = 2
|
|
|
values[label] = 0.9
|
|
@@ -761,12 +764,14 @@ class PREMPredict():
|
|
|
# print('金额: ', entity.entity_text, label, values, text)
|
|
|
if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额,让后面的规则召回重新判断
|
|
|
label = 2
|
|
|
- elif label == 1 and re.search('[::,。](总金额|总价|单价):?$', text) and re.search('(中标|投标|成交|中价)', text)==None:
|
|
|
- values[label] = 0.49
|
|
|
- elif label ==0 and entity.notes in ["投资", "工程造价"]:
|
|
|
- values[label] = 0.49
|
|
|
- elif label == 0 and re.search('最低限价:?$', text):
|
|
|
- values[label] = 0.49
|
|
|
+ elif label == 1: # 错误中标金额处理
|
|
|
+ if re.search('[::,。](总金额|总价|单价)((万?元))?:?$', text) and re.search('(中标|投标|成交|中价)', text)==None:
|
|
|
+ values[label] = 0.49
|
|
|
+ elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', text): # 处理例如 241561780 如中标金额为 500-1000万元,则代理服务费=100 万元×0.5%+400万元×0.35%+(中标金额-500)万元
|
|
|
+ values[label] = 0.49
|
|
|
+ elif label ==0: # 错误招标金额处理
|
|
|
+ if entity.notes in ["投资", "工程造价"] or re.search('最低限价:?$', text):
|
|
|
+ values[label] = 0.49
|
|
|
elif re.search('金额在$', text):
|
|
|
values[label] = 0.49
|
|
|
elif re.search('报价:预估不?含税总价[为:]$', text) and (label != 1 or values[label]<0.5):
|
|
@@ -1230,7 +1235,7 @@ class RoleRulePredictor():
|
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
|
|
|
|
self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金")
|
|
|
- self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况|承包价|报酬(含税):")
|
|
|
+ self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):") # 单写 总价 不能作为中标金额,很多表格有单价、总价
|
|
|
self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
|
|
|
self.pattern_money_other = re.compile("代理费|服务费")
|
|
|
self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
|
|
@@ -1370,7 +1375,7 @@ class RoleRulePredictor():
|
|
|
_weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
|
|
|
# _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
|
# "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
|
|
|
- if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
|
+ if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
|
|
|
list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
|
|
|
_flag = True
|
|
|
_label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
|
|
@@ -1407,6 +1412,8 @@ class RoleRulePredictor():
|
|
|
_span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
|
|
|
end_index=p_entity.end_index, size=10, center_include=True,
|
|
|
word_flag=True, text=p_entity.entity_text)
|
|
|
+ if re.search('金额在(\d+)?$', _span[0]):
|
|
|
+ continue
|
|
|
if re.search(',\w{2,}', _span[0]):
|
|
|
_span[0] = _span[0].split(',')[-1] #避免多个价格在一起造成误判
|
|
|
if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
|
|
@@ -2164,6 +2171,17 @@ class ProductPredictor():
|
|
|
reasons.append(it)
|
|
|
elif reasons == []:
|
|
|
reasons.append(it)
|
|
|
+ if reasons == []: # 如果模型识别不到失败原因 就用规则补充
|
|
|
+ for text in text_list:
|
|
|
+ ser1 = re.search('\w{,4}(理由|原因):\s*((第\d+包|标项\d+|原因类型)?[::]?[\s*\w,]{2,30}((不满?足|少于|未达)((法定)?[123一二三两]家|(规定)?要求)|(项目|采购)(终止|废标)),?)+',text)
|
|
|
+ ser2 = re.search(
|
|
|
+ '\w{,4}(理由|原因):\s*(第\d+包|标项\d+|原因类型)?[::]?[\s*\w]{4,30},', text)
|
|
|
+ if ser1:
|
|
|
+ reasons.append(ser1.group(0))
|
|
|
+ break
|
|
|
+ elif ser2:
|
|
|
+ reasons.append(ser2.group(0))
|
|
|
+ break
|
|
|
return {'fail_reason':';'.join(reasons)}, product_list
|
|
|
|
|
|
if list_entitys is None:
|
|
@@ -3078,9 +3096,9 @@ class DocChannel():
|
|
|
'候选人公示': '候选人公示|评标结果公示|中标候选人名单公示|现将中标候选人(进行公示|公[示布]如下)|(中标|中选)候选人(信息|情况)[::\s]',
|
|
|
'候选人公示neg': '中标候选人公示期',
|
|
|
'中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果|竞价结果)\w{,4}(进行公示|公[示布]如下)|(询价|竞价|遴选)(成交|中标|中选)(公告|公示)|(成交|中标|中选|选定|选取|入围|询价)结果(如下|公告|公示)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(人|成交)|成交)\w{,3}(信息|情况)[::\s]',
|
|
|
- '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示',
|
|
|
- '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
|
|
|
- '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家',
|
|
|
+ '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源(采购|招标)?的?(中标|成交|结果)', # |单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示
|
|
|
+ '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]', # |唯一
|
|
|
+ '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家|通知中标单位',
|
|
|
# |确定成交供应商[:,\s]
|
|
|
'合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|完成(日期|时间))|(供应商乙方|乙方供应商):|合同总?金额|履约信息',
|
|
|
'废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标|废置)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
|
|
@@ -3092,10 +3110,10 @@ class DocChannel():
|
|
|
'招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|(供应|招标)计划表?$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
|
|
|
'公告变更': '第[\d一二]次变更|(变更|更正(事项)?|更改|延期|暂停)(招标|采购)?的?(公告|公示|通知)|变更$|更正$',
|
|
|
'招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告|$)',
|
|
|
- '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
|
|
|
+ '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
|
|
|
'合同公告': '(合同(成交|变更)?|(履约|验收)(结果)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$',
|
|
|
'候选人公示': '候选人(变更)?公示|评标(结果)?公示|中标前?公示|中标预公示',
|
|
|
- '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|开标(记录|信息|情况)|单一来源|中标通知书|中标$',
|
|
|
+ '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|开标(记录|信息|情况)|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$',
|
|
|
'资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
|
|
|
'招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
|
|
|
}
|
|
@@ -4653,12 +4671,12 @@ class TablePremExtractor(object):
|
|
|
self.head_rule_dic = {
|
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
|
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
|
- "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程|货物|商品)(名称?|内容)",
|
|
|
+ "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
|
|
|
"win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
|
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
|
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
|
"budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
|
- "bid_amount": "投标[报总]价|(中标|成交))?([金总]?额|[报均总]价|价[格款]?)|承包价",
|
|
|
+ "bid_amount": "投标[报总]?价|报价金额|总报价|^\w{,3}报价|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
|
|
|
}
|
|
|
|
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
@@ -4668,8 +4686,10 @@ class TablePremExtractor(object):
|
|
|
|
|
|
|
|
|
def find_header(self, td_list):
|
|
|
+ td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]+、|(([\w、×*/]{1,20}))$', '', it) for it in td_list]
|
|
|
header_dic = dict()
|
|
|
flag = False
|
|
|
+ contain_header = False
|
|
|
if len(set(td_list))>=2 and len(set(td_list) & self.headerset)/len(set(td_list))>=0.6:
|
|
|
flag = True
|
|
|
for i in range(len(td_list)) :
|
|
@@ -4677,7 +4697,7 @@ class TablePremExtractor(object):
|
|
|
if len(text) > 15: # 长度大于15 不进行表头匹配
|
|
|
continue
|
|
|
if re.search('未(中标|成交)原因', text): # 不提取此种表格
|
|
|
- return flag, dict()
|
|
|
+ return flag, contain_header, dict()
|
|
|
num = 0
|
|
|
for k, v in self.head_rule_dic.items():
|
|
|
if re.search(v, text):
|
|
@@ -4687,24 +4707,26 @@ class TablePremExtractor(object):
|
|
|
num += 1
|
|
|
if num>1:
|
|
|
print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
|
- return flag, dict()
|
|
|
- if re.search(';金额(万?元);', ';'.join(td_list)): # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
|
|
|
+ return flag, contain_header, dict()
|
|
|
+ if re.search(';金额((万?元))?;', ';'.join(td_list)): # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
|
|
|
if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
|
|
|
for i in range(len(td_list)):
|
|
|
text = td_list[i]
|
|
|
- if re.search('^金额(万?元)$',text):
|
|
|
+ if re.search('^金额((万?元))?$',text):
|
|
|
header_dic['bid_amount'] = (i, text)
|
|
|
break
|
|
|
elif 'tenderee' in header_dic and 'budget' not in header_dic:
|
|
|
for i in range(len(td_list)):
|
|
|
text = td_list[i]
|
|
|
- if re.search('^金额(万?元)$', text):
|
|
|
+ if re.search('^金额((万?元))?$', text):
|
|
|
header_dic['budget'] = (i, text)
|
|
|
break
|
|
|
if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic or 'tenderer' in header_dic) and (
|
|
|
'budget' in header_dic or 'bid_amount' in header_dic):
|
|
|
- return flag, header_dic
|
|
|
- return flag, dict()
|
|
|
+ return flag, contain_header, header_dic
|
|
|
+ elif len(set(td_list) & self.headerset) >= 2 or (len(set(td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
|
+ contain_header = True
|
|
|
+ return flag, contain_header, dict()
|
|
|
|
|
|
def is_role(self, text):
|
|
|
if len(text) > 25 or len(text)<4:
|
|
@@ -4719,7 +4741,15 @@ class TablePremExtractor(object):
|
|
|
return True
|
|
|
return False
|
|
|
|
|
|
- def get_role(self, text):
|
|
|
+ def get_role(self, text, nlp_enterprise):
|
|
|
+ '''
|
|
|
+ 获取字符串text角色实体
|
|
|
+ :param text: 待获取实体字符串
|
|
|
+ :param nlp_enterprise: 公告中的角色实体列表
|
|
|
+ :return:
|
|
|
+ '''
|
|
|
+ if text in nlp_enterprise:
|
|
|
+ return text
|
|
|
if len(text) > 25 or len(text)<4:
|
|
|
return ''
|
|
|
ners = getNers([text], useselffool=True)
|
|
@@ -4754,7 +4784,6 @@ class TablePremExtractor(object):
|
|
|
break
|
|
|
if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2: # 内容为空或全部一样 停止匹配
|
|
|
break
|
|
|
-
|
|
|
if re.search('详见', project_name): # 去除某些表达: 详见招标文件
|
|
|
project_name = ""
|
|
|
if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
|
|
@@ -4779,8 +4808,8 @@ class TablePremExtractor(object):
|
|
|
# tenderee = tenderee if self.is_role(tenderee) else ""
|
|
|
# tenderer = tenderer if self.is_role(tenderer) else ""
|
|
|
|
|
|
- tenderee = self.get_role(tenderee)
|
|
|
- tenderer = self.get_role(tenderer)
|
|
|
+ tenderee = self.get_role(tenderee, self.nlp_enterprise)
|
|
|
+ tenderer = self.get_role(tenderer, self.nlp_enterprise)
|
|
|
|
|
|
if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
|
|
|
break
|
|
@@ -4808,19 +4837,18 @@ class TablePremExtractor(object):
|
|
|
|
|
|
prem_dic[package]['code'] = project_code
|
|
|
prem_dic[package]['name'] = project_name
|
|
|
- re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", budget_)
|
|
|
- if re_price:
|
|
|
- budget_ = re_price[0]
|
|
|
- if '万元' in headers['budget'][1] and '万' not in budget_:
|
|
|
- budget_ += '万元'
|
|
|
- budget = float(str(getUnifyMoney(budget_)))
|
|
|
- if budget > 10000000000000: # 大于万亿的去除
|
|
|
- budget = 0
|
|
|
- if same_package and prem_dic[package]['tendereeMoney'] != budget: #
|
|
|
- prem_dic[package]['tendereeMoney'] += budget
|
|
|
- else:
|
|
|
- prem_dic[package]['tendereeMoney'] = budget
|
|
|
- prem_dic[package]['tendereeMoneyUnit'] = '万元' if '万' in budget_ else '元'
|
|
|
+
|
|
|
+ if budget_ != "":
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', budget_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
+ break
|
|
|
+ budget_header = headers['budget'][1] if 'budget' in headers else ''
|
|
|
+ budget, money_unit = money_process(budget_, budget_header)
|
|
|
+ if budget > 0:
|
|
|
+ if same_package and prem_dic[package]['tendereeMoney'] != budget: #
|
|
|
+ prem_dic[package]['tendereeMoney'] += budget
|
|
|
+ else:
|
|
|
+ prem_dic[package]['tendereeMoney'] = budget
|
|
|
+ prem_dic[package]['tendereeMoneyUnit'] = money_unit
|
|
|
if tenderee and not same_package:
|
|
|
prem_dic[package]['roleList'].append({
|
|
|
"address": "",
|
|
@@ -4837,17 +4865,10 @@ class TablePremExtractor(object):
|
|
|
"serviceTime": ""
|
|
|
})
|
|
|
if tenderer and not same_package:
|
|
|
- bid_amount = 0
|
|
|
- money_unit = ""
|
|
|
- re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", bid_amount_)
|
|
|
- if re_price:
|
|
|
- bid_amount_ = re_price[0]
|
|
|
- if '万元' in headers['bid_amount'][1] and '万' not in bid_amount_:
|
|
|
- bid_amount_ += '万元'
|
|
|
- bid_amount = float(str(getUnifyMoney(bid_amount_)))
|
|
|
- if bid_amount > 10000000000000: # 大于万亿的去除
|
|
|
- bid_amount = 0
|
|
|
- money_unit = '万元' if '万' in bid_amount_ else '元'
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
|
|
|
+ bid_amount_)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
+ break
|
|
|
+ bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and 'bid_amount' in headers else (0, '')
|
|
|
prem_dic[package]['roleList'].append({
|
|
|
"address": "",
|
|
|
"linklist": [],
|
|
@@ -4882,14 +4903,14 @@ class TablePremExtractor(object):
|
|
|
headers = ""
|
|
|
table_prem = {}
|
|
|
while i < len(trs) - 1:
|
|
|
- flag_, headers_ = self.find_header(trs[i])
|
|
|
+ flag_, contain_header_, headers_ = self.find_header(trs[i])
|
|
|
if flag_ and headers_ != dict():
|
|
|
table_items = []
|
|
|
headers = headers_
|
|
|
for j in range(i + 1, len(trs)):
|
|
|
if len(trs[j]) == len(trs[i]):
|
|
|
- flag_, headers_ = self.find_header(trs[j])
|
|
|
- if flag_:
|
|
|
+ flag_, contain_header_, headers_ = self.find_header(trs[j])
|
|
|
+ if flag_ or contain_header_:
|
|
|
break
|
|
|
else:
|
|
|
table_items.append(trs[j])
|
|
@@ -4916,9 +4937,10 @@ class TablePremExtractor(object):
|
|
|
table.extract()
|
|
|
return rs_dic
|
|
|
|
|
|
- def predict(self, html):
|
|
|
+ def predict(self, html, nlp_enterprise):
|
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
|
richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
|
|
|
+ self.nlp_enterprise = nlp_enterprise
|
|
|
if richText:
|
|
|
richText = richText.extract() # 过滤掉附件
|
|
|
prem = self.get_prem(soup)
|
|
@@ -4938,7 +4960,7 @@ class CandidateExtractor(object):
|
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
|
'win_or_not': '是否中标|是否入围|是否入库|入围结论',
|
|
|
"candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
|
|
|
- "bid_amount": "投标[报总]价|(中标|成交))?([金总]额|[报均总]价|价[格款])|承包价",
|
|
|
+ "bid_amount": "投标[报总]?价|报价金额|总报价|^\w{,3}报价|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
|
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
|
"third_tenderer": "第三名|第三(中标|成交)?候选人",
|
|
@@ -4950,8 +4972,10 @@ class CandidateExtractor(object):
|
|
|
self.headerset = pickle.load(f)
|
|
|
|
|
|
def find_header(self, td_list):
|
|
|
+ td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]+、|(([\w、×*/]{1,20}))$', '', it) for it in td_list]
|
|
|
header_dic = dict()
|
|
|
flag = False
|
|
|
+ contain_header = False
|
|
|
if len(set(td_list))>=2 and len(set(td_list) & self.headerset)/len(set(td_list))>=0.6:
|
|
|
flag = True
|
|
|
for i in range(len(td_list)) :
|
|
@@ -4959,7 +4983,7 @@ class CandidateExtractor(object):
|
|
|
if len(text) > 15: # 长度大于15 不进行表头匹配
|
|
|
continue
|
|
|
if re.search('未(中标|成交)原因', text): # 不提取此种表格
|
|
|
- return flag, dict()
|
|
|
+ return flag, contain_header, dict()
|
|
|
num = 0
|
|
|
for k, v in self.head_rule_dic.items():
|
|
|
if re.search(v, text):
|
|
@@ -4970,10 +4994,12 @@ class CandidateExtractor(object):
|
|
|
num += 1
|
|
|
if num>1:
|
|
|
print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
|
- return flag, dict()
|
|
|
+ return flag, contain_header, dict()
|
|
|
if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
|
|
|
- return flag, header_dic
|
|
|
- return flag, dict()
|
|
|
+ return flag, contain_header, header_dic
|
|
|
+ elif len(set(td_list) & self.headerset) >= 2 or (len(set(td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
|
+ contain_header = True
|
|
|
+ return flag, contain_header, dict()
|
|
|
|
|
|
def is_role(self, text):
|
|
|
if len(text) > 25 or len(text) < 4:
|
|
@@ -4988,7 +5014,15 @@ class CandidateExtractor(object):
|
|
|
return True
|
|
|
return False
|
|
|
|
|
|
- def get_role(self, text):
|
|
|
+ def get_role(self, text, nlp_enterprise):
|
|
|
+ '''
|
|
|
+ 获取字符串text角色实体
|
|
|
+ :param text: 待获取实体字符串
|
|
|
+ :param nlp_enterprise: 公告中的角色实体列表
|
|
|
+ :return:
|
|
|
+ '''
|
|
|
+ if text in nlp_enterprise:
|
|
|
+ return text
|
|
|
if len(text) > 25 or len(text)<4:
|
|
|
return ''
|
|
|
ners = getNers([text], useselffool=True)
|
|
@@ -5002,26 +5036,6 @@ class CandidateExtractor(object):
|
|
|
else:
|
|
|
return ''
|
|
|
|
|
|
- def money_process(self, money_text, header):
|
|
|
- '''
|
|
|
- 输入金额文本及金额列表头,返回统一数字化金额及金额单位
|
|
|
- :param money_text:
|
|
|
- :param header:
|
|
|
- :return:
|
|
|
- '''
|
|
|
- money = 0
|
|
|
- money_unit = ""
|
|
|
- re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", money_text)
|
|
|
- if re_price:
|
|
|
- money_text = re_price.group(0)
|
|
|
- if '万元' in header and '万' not in money_text:
|
|
|
- money_text += '万元'
|
|
|
- money = float(str(getUnifyMoney(money_text)))
|
|
|
- if money > 10000000000000: # 大于万亿的去除
|
|
|
- money = 0
|
|
|
- money_unit = '万元' if '万' in money_text else '元'
|
|
|
- return (money, money_unit)
|
|
|
-
|
|
|
def extract_from_df(self, df, headers):
|
|
|
prem_dic = {}
|
|
|
link_set = set()
|
|
@@ -5055,7 +5069,7 @@ class CandidateExtractor(object):
|
|
|
|
|
|
# candidate = candidate_ if self.is_role(candidate_) else ""
|
|
|
# tenderer = tenderer if self.is_role(tenderer) else ""
|
|
|
- candidate = self.get_role(candidate_)
|
|
|
+ candidate = self.get_role(candidate_, self.nlp_enterprise)
|
|
|
|
|
|
# if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
|
|
|
# break
|
|
@@ -5074,7 +5088,7 @@ class CandidateExtractor(object):
|
|
|
if re.search("(候选人|投标人)名?称?$", df.loc[i, 0]) or re.search("(候选人|投标人)名?称?", df.loc[i, 1]):
|
|
|
for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
|
|
|
[win_tenderer, second_tenderer, third_tenderer]):
|
|
|
- text = self.get_role(text)
|
|
|
+ text = self.get_role(text, self.nlp_enterprise)
|
|
|
if text:
|
|
|
# if self.is_role(text):
|
|
|
if type not in role_dic:
|
|
@@ -5087,7 +5101,10 @@ class CandidateExtractor(object):
|
|
|
header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
|
|
|
for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
|
|
|
[win_tenderer, second_tenderer, third_tenderer]):
|
|
|
- money, money_unit = self.money_process(text, header)
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
|
|
|
+ text)) > 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
+ break
|
|
|
+ money, money_unit = money_process(text, header)
|
|
|
if money > 0:
|
|
|
if type not in role_dic:
|
|
|
role_dic[type] = dict()
|
|
@@ -5112,8 +5129,9 @@ class CandidateExtractor(object):
|
|
|
'tendereeMoney': 0,
|
|
|
'tendereeMoneyUnit': ""
|
|
|
}
|
|
|
-
|
|
|
- bid_amount, money_unit = self.money_process(bid_amount_, df.loc[i, headers['bid_amount'][0]]) if "bid_amount" in headers else (0, "")
|
|
|
+ if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', bid_amount_))> 5: # 金额字段出现超过5个非金额字符,中断匹配
|
|
|
+ break
|
|
|
+ bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if "bid_amount" in headers else (0, "")
|
|
|
prem_dic[package]['roleList'].append({
|
|
|
"address": "",
|
|
|
"linklist": [],
|
|
@@ -5175,14 +5193,14 @@ class CandidateExtractor(object):
|
|
|
i = 0
|
|
|
headers = ""
|
|
|
while i < len(trs) - 1:
|
|
|
- flag_, headers_ = self.find_header(trs[i])
|
|
|
+ flag_, contain_header_, headers_ = self.find_header(trs[i])
|
|
|
if flag_ and headers_ != dict():
|
|
|
table_items = []
|
|
|
headers = headers_
|
|
|
for j in range(i + 1, len(trs)):
|
|
|
if len(trs[j]) == len(trs[i]):
|
|
|
- flag_, headers_ = self.find_header(trs[j])
|
|
|
- if flag_:
|
|
|
+ flag_, contain_header_, headers_ = self.find_header(trs[j])
|
|
|
+ if flag_ or contain_header_:
|
|
|
break
|
|
|
else:
|
|
|
table_items.append(trs[j])
|
|
@@ -5216,7 +5234,8 @@ class CandidateExtractor(object):
|
|
|
candidates.add(ent.entity_text)
|
|
|
return candidates
|
|
|
|
|
|
- def predict(self, html, list_sentences, list_entitys):
|
|
|
+ def predict(self, html, list_sentences, list_entitys, nlp_enterprise):
|
|
|
+ self.nlp_enterprise = nlp_enterprise
|
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
|
richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
|
|
|
if richText:
|