|
@@ -1498,10 +1498,11 @@ class RoleRulePredictor():
|
|
|
for _name in name_entitys:
|
|
|
if _name.sentence_index == p_entity.sentence_index and p_entity.wordOffset_begin >=_name.wordOffset_begin and p_entity.wordOffset_end < _name.wordOffset_end:
|
|
|
find_flag = True
|
|
|
- if p_entity.values[0] > on_value:
|
|
|
- p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
|
|
|
- else:
|
|
|
- p_entity.values[0] = on_value # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
|
|
|
+ p_entity.values[0] = on_value # entity lies inside a project-name span: always reset its score to on_value (per the author, the minimum probability)
|
|
|
+ # if p_entity.values[0] > on_value:
|
|
|
+ # p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
|
|
|
+ # else:
|
|
|
+ # p_entity.values[0] = on_value # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
|
|
|
|
|
|
# for _name in list_name:
|
|
|
# if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0: #加上前面一些信息,修复公司不在项目名称开头的,检测不到
|
|
@@ -1788,7 +1789,7 @@ class RoleRuleFinalAdd():
|
|
|
ents = []
|
|
|
for ent in list_entitys[0]:
|
|
|
if ent.entity_type in ['org', 'company']:
|
|
|
- if ent.label == 0 and ent.values[ent.label]>=0.5:
|
|
|
+ if ent.label == 0 and ent.values[ent.label]>0.5:
|
|
|
if '公共资源交易中心' in ent.entity_text: # 公共资源交易中心不算招标或代理,只算平台
|
|
|
# ent.label = 5
|
|
|
ent.values[ent.label] = 0.5 # 改为降低概率,不改类别,防止 336220759 明显招标人表达不提取
|
|
@@ -1818,7 +1819,7 @@ class RoleRuleFinalAdd():
|
|
|
break
|
|
|
if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
|
ents[i].label = 0
|
|
|
- ents[i].values[0] = 0.5
|
|
|
+ ents[i].values[0] = 0.51 # set slightly higher than the title probability
|
|
|
tenderee_notfound = False
|
|
|
# log('正则最后补充实体: %s'%(ent_re))
|
|
|
break
|
|
@@ -1832,7 +1833,7 @@ class RoleRuleFinalAdd():
|
|
|
break
|
|
|
if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
|
|
|
ents[i].label = 1
|
|
|
- ents[i].values[1] = 0.5
|
|
|
+ ents[i].values[1] = 0.51 # set slightly higher than the title probability
|
|
|
agency_notfound = False
|
|
|
# log('正则最后补充实体: %s'%(ent_re))
|
|
|
break
|
|
@@ -2554,8 +2555,8 @@ class ProductPredictor():
|
|
|
class ProductAttributesPredictor():
|
|
|
def __init__(self,):
|
|
|
self.p0 = '(类别|类型|物类|目录|类目|分类)(名称|$)|^品名|^品类|^品目|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)(名称|内容|描述)'
|
|
|
- self.p1 = '(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体|标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)[\))的]?([、\w]{,4}名称|内容|描述)'
|
|
|
- self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$'
|
|
|
+ self.p1 = '(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体)[\))的]?([、\w]{,4}名称|内容|描述)'
|
|
|
+ self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)(名称|内容|描述)'
|
|
|
# self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
|
|
|
# self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
|
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
@@ -5333,7 +5334,7 @@ class TablePremExtractor(object):
|
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
|
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
|
"budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
|
- "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
|
|
|
+ "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价",
|
|
|
}
|
|
|
|
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
@@ -5343,7 +5344,7 @@ class TablePremExtractor(object):
|
|
|
|
|
|
|
|
|
def find_header(self, td_list):
|
|
|
- fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|/万?元', '', it) for it in td_list] # 去除表头无关信息,方便匹配判断是否为表头
|
|
|
+ fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元', '', it) for it in td_list] # strip text irrelevant to header detection so the cells are easier to match as table headers
|
|
|
header_dic = dict()
|
|
|
flag = False
|
|
|
contain_header = False
|
|
@@ -5498,15 +5499,15 @@ class TablePremExtractor(object):
|
|
|
continue
|
|
|
link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
|
|
|
|
|
|
- package = package_code if package_code else str(len(prem_dic)+1) #str(i+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
|
|
|
- package = uniform_package_name(package)
|
|
|
+ package = uniform_package_name(package_code) if package_code else str(len(prem_dic)+1) # no package number: auto-number by how many packages have been extracted so far, since some rows may not be winning bids
|
|
|
|
|
|
- if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本
|
|
|
- package_fix2raw[package] = package_code_raw
|
|
|
- elif same_package == False:
|
|
|
- multi_same_package = True
|
|
|
- if multi_same_package:
|
|
|
- package = package_code_raw
|
|
|
+ if package_code_raw!='':
|
|
|
+ if multi_same_package == False and package not in package_fix2raw: # if the normalized package number is already recorded, the raw package text is used instead
|
|
|
+ package_fix2raw[package] = package_code_raw
|
|
|
+ elif same_package == False:
|
|
|
+ multi_same_package = True
|
|
|
+ if multi_same_package:
|
|
|
+ package = package_code_raw
|
|
|
if package not in prem_dic or not same_package:
|
|
|
prem_dic[package] = {
|
|
|
'code': '',
|
|
@@ -5555,6 +5556,10 @@ class TablePremExtractor(object):
|
|
|
break
|
|
|
|
|
|
bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率', bid_amount_)==None and 'bid_amount' in headers else (0, '')
|
|
|
+ if 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # skip rows that have a bid-amount column but a zero amount, to guard against bad source data such as the 河钢供应链管理平台 site; only a non-zero amount counts as a winning bid
|
|
|
+ if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # discard a package that holds only a project code and name
|
|
|
+ prem_dic.pop(package)
|
|
|
+ continue
|
|
|
|
|
|
bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
|
|
|
if (re.search('费率|下浮率|[%%‰折]',
|
|
@@ -5577,7 +5582,7 @@ class TablePremExtractor(object):
|
|
|
if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配
|
|
|
prem_dic.pop(package)
|
|
|
break
|
|
|
- if multi_same_package:
|
|
|
+ if multi_same_package: # normalized package numbers collided, so use the raw package numbers as keys
|
|
|
for k, v in package_fix2raw.items():
|
|
|
if k in prem_dic:
|
|
|
prem_dic[v] = prem_dic.pop(k)
|
|
@@ -6345,42 +6350,33 @@ if __name__=="__main__":
|
|
|
# # print("cost_time:", json.loads(requests_result.text)['cost_time'])
|
|
|
# # print(MAX_LEN, len(sentence), len(list_sentence))
|
|
|
|
|
|
- # docid = ""
|
|
|
- # title = ''
|
|
|
- # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
- # html = f.read()
|
|
|
- # product_attr = ProductAttributesPredictor()
|
|
|
- # rs = product_attr.predict(docid='', html=html, page_time="")
|
|
|
- # print(rs)
|
|
|
-
|
|
|
docid = ""
|
|
|
title = ''
|
|
|
with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
html = f.read()
|
|
|
- tb_extract = TablePremExtractor()
|
|
|
- rs = tb_extract.predict(html, [
|
|
|
- "广州供电局",
|
|
|
- "南方电网数字电网科技(广东)有限公司",
|
|
|
- "河北远东通信系统工程有限公司",
|
|
|
- "咸亨国际科技股份有限公司",
|
|
|
- "广东高德智能建筑股份有限公司",
|
|
|
- "杭州应敏科技有限公司",
|
|
|
- "广东冠电科技股份有限公司",
|
|
|
- "广州科玮实验室设备有限公司",
|
|
|
- "云南诺霆科技有限公司",
|
|
|
- "广州建能电力科技有限公司",
|
|
|
- "海鸿电气有限公司",
|
|
|
- "深圳市深联创展科技开发有限公司",
|
|
|
- "许继电气股份有限公司",
|
|
|
- "南方电网数字电网集团信息通信科技有限公司",
|
|
|
- "广州宇阳电力科技有限公司",
|
|
|
- "深圳市科陆电子科技股份有限公司",
|
|
|
- "中国南方电网有限责任公司",
|
|
|
- "广东电网有限责任公司",
|
|
|
- "南方电网供应链集团有限公司"
|
|
|
- ])
|
|
|
+ product_attr = ProductAttributesPredictor()
|
|
|
+ rs = product_attr.predict(docid='', html=html, page_time="")
|
|
|
print(rs)
|
|
|
|
|
|
+ # docid = ""
|
|
|
+ # title = ''
|
|
|
+ # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
+ # html = f.read()
|
|
|
+ # tb_extract = TablePremExtractor()
|
|
|
+ # rs = tb_extract.predict(html, [
|
|
|
+ # "河钢集团供应链管理有限公司邯郸分公司",
|
|
|
+ # "石家庄中达科技有限公司",
|
|
|
+ # "河北骥驰耐磨材料有限公司",
|
|
|
+ # "衡水奥诺工矿机械设备有限公司",
|
|
|
+ # "河北勤鹏机械设备科技有限公司",
|
|
|
+ # "邯郸市华北不锈钢厂有限公司",
|
|
|
+ # "邯郸市芳林机械备件制造有限公司",
|
|
|
+ # "济南宏鲁新型材料有限公司",
|
|
|
+ # "邯郸海博机械设备有限公司",
|
|
|
+ # "河北万革新能源科技有限公司"
|
|
|
+ # ])
|
|
|
+ # print(rs)
|
|
|
+
|
|
|
# # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
|
|
|
# # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
|
|
|
# # # ids = [42078089, 51828144, 60511017, 69042200, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
|