|
@@ -497,6 +497,8 @@ class CodeNamePredict():
|
|
|
continue
|
|
|
elif '公司:你单位在' in _name: # 避免类似 339900030 这种作为项目名称,导致中标角色作为招标角色
|
|
|
continue
|
|
|
+ elif _name.endswith('公司') and len(_name)<20: # 修复 456957250 雄县辉茂纸塑包装制品销售有限公司 作为项目名称
|
|
|
+ continue
|
|
|
|
|
|
#add name to entitys
|
|
|
_entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
|
|
@@ -856,7 +858,7 @@ class PREMPredict():
|
|
|
elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$|第[4-9四五六七八九十]名', front): #修复第4以上的预测错为中标人
|
|
|
label = 5
|
|
|
values[2] = 0.5
|
|
|
- elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front) or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
|
|
|
+ elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front): # or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
|
|
|
values[2] = 0.5
|
|
|
label = 5
|
|
|
elif re.search('税费', front) and re.search('^承担', behind):
|
|
@@ -1477,7 +1479,7 @@ class RoleRulePredictor():
|
|
|
|
|
|
self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
|
|
|
|
|
|
- self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?[为:]+\w{2,4}资金|采购成本价|总费用约?为") # |建安费用 不作为招标金额
|
|
|
+ self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?[为:]+\w{2,4}资金|采购成本价|总费用约?为|招标规模") # |建安费用 不作为招标金额
|
|
|
self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):|经评审的价格") # 单写 总价 不能作为中标金额,很多表格有单价、总价
|
|
|
self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元(报价)?(中标|中选|成交)")
|
|
|
self.pattern_money_other = re.compile("代理费|服务费")
|
|
@@ -2391,6 +2393,8 @@ class RoleGrade():
|
|
|
for entity in low_prob_winner: # 如果低概率中标人在招标或代理列表,改为非角色
|
|
|
if entity.entity_text in all_tenderee_agency:
|
|
|
entity.label = 5
|
|
|
+ elif entity.in_attachment: # 附件低概率中标角色不要 避免:516109391 桂林银行崇左宁明支行,宁明县城中镇兴宁大道中70号,预测为中标
|
|
|
+ entity.label = 5
|
|
|
|
|
|
if org_winner != []:
|
|
|
flag = 0
|
|
@@ -2432,7 +2436,7 @@ class MoneyGrade():
|
|
|
if ser:
|
|
|
groupdict = pattern.split('>')[0].replace('(?P<', '')
|
|
|
_role, _direct, _prob = groupdict.split('_')
|
|
|
- if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context) or float(entity.entity_text)<100:
|
|
|
+ if re.search('单价', context[-4:]) or re.search('(最低|风险)控制价', context):# or float(entity.entity_text)<100:
|
|
|
_prob = 6
|
|
|
_label = role2id.get(_role)
|
|
|
if _label != entity.label:
|
|
@@ -2455,6 +2459,8 @@ class MoneyGrade():
|
|
|
# _prob = min_prob - 0.1 if in_att else min_prob
|
|
|
entity.values[entity.label] = _prob + entity.values[entity.label] / 20
|
|
|
# print('找不到规则修改金额概率:', entity.entity_text, entity.label, entity.values)
|
|
|
+ if entity.entity_type in ['money'] and entity.label in [0, 1] and 0.5<=entity.values[entity.label]<0.75 and float(entity.entity_text)<100: # 20241011 低概率小金额改为其他金额
|
|
|
+ entity.label = 2
|
|
|
|
|
|
|
|
|
# 时间类别
|
|
@@ -6022,7 +6028,7 @@ class DistrictPredictor():
|
|
|
return province_l, city_l, district_l
|
|
|
|
|
|
def get_pro_city_dis_score(text, text_weight=1):
|
|
|
- text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区', ' ', text)
|
|
|
+ text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河', ' ', text)
|
|
|
text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
|
text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
|
text = re.sub('茂名滨海新区', '茂名市', text)
|
|
@@ -6121,13 +6127,13 @@ class DistrictPredictor():
|
|
|
pro_idx = idx_dic[idx]['省']
|
|
|
if pro_idx in pro_ids:
|
|
|
pro_ids[pro_idx] += (score + 0) * w * weight
|
|
|
- else:
|
|
|
- pro_ids[pro_idx] = (score + 0) * w * weight * 0.5
|
|
|
+ # else: # 20241015 注销 区县简称且不在提取的省市下面,不加分,避免提取错误 例:536550843
|
|
|
+ # pro_ids[pro_idx] = (score + 0) * w * weight * 0.5
|
|
|
city_idx = idx_dic[idx]['市']
|
|
|
if city_idx in city_ids:
|
|
|
city_ids[city_idx] += (score + 0) * w * weight
|
|
|
- else:
|
|
|
- city_ids[city_idx] = (score + 0) * w * weight * 0.1
|
|
|
+ # else: # 20241015 注销 区县简称且不在提取的省市下面,不加分,避免提取错误 例:536550843
|
|
|
+ # city_ids[city_idx] = (score + 0) * w * weight * 0.1
|
|
|
|
|
|
for k, v in pro_ids.items():
|
|
|
pro_ids[k] = v * text_weight
|
|
@@ -6995,7 +7001,7 @@ class CandidateExtractor(object):
|
|
|
header_dic = dict()
|
|
|
flag = False
|
|
|
contain_header = False
|
|
|
- if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
|
|
|
+ if len(set(fix_td_list) & self.headerset)>=2 and (len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6 or is_head_line(fix_td_list)):
|
|
|
flag = True
|
|
|
for i in range(len(td_list)) :
|
|
|
text = td_list[i]
|
|
@@ -8080,7 +8086,7 @@ if __name__=="__main__":
|
|
|
rs = tb_extract.predict(html, [
|
|
|
"江苏中联铸本混凝土有限公司",
|
|
|
"鼓楼区协荣机械设备经销部"
|
|
|
- ], web_source_name = '', all_winner=True)
|
|
|
+ ], web_source_name = '', all_winner=False)
|
|
|
print('标段数:',len(rs[0]))
|
|
|
print(rs)
|
|
|
|