Browse Source

修复表格中标金额为0的不作中标人 例:河钢供应链管理平台站源;修复产品属性提取错误;修复批发零售行业限额造成变压器等项目少了万元

lsm cách đây 1 năm
mục cha
commit
1c577c1310

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -338,7 +338,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-09-04'}
+    version_date = {'version_date': '2023-09-13'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
 
     '''最终检查修正招标、中标金额'''

+ 1 - 1
BiddingKG/dl/interface/getAttributes.py

@@ -3473,7 +3473,7 @@ def limit_maximum_amount(dic, list_entity):
         else:
             maximum_amount = 50000000000
             minximum_amount = 500
-    elif re.search('(办公|体育)(用品|设备|器材)|耗材|打印机|复印机|打印纸|粉盒|墨粉|复印纸|网上超市|电子卖场|家电|配电箱采购|配件|备件', text) or category in ['零售批发']:
+    elif re.search('(办公|体育)(用品|设备|器材)|耗材|打印机|复印机|打印纸|粉盒|墨粉|复印纸|网上超市|电子卖场|家电|配电箱采购|配件|备件', text):
         # print('商品采购限额')
         maximum_amount = 80000000
         minximum_amount = 10

+ 46 - 50
BiddingKG/dl/interface/predictor.py

@@ -1498,10 +1498,11 @@ class RoleRulePredictor():
                                 for _name in name_entitys:
                                     if _name.sentence_index == p_entity.sentence_index and p_entity.wordOffset_begin >=_name.wordOffset_begin and p_entity.wordOffset_end < _name.wordOffset_end:
                                         find_flag = True
-                                        if p_entity.values[0] > on_value:
-                                            p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
-                                        else:
-                                            p_entity.values[0] = on_value  # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
+                                        p_entity.values[0] = on_value # 项目名称里面实体修改为最低概率
+                                        # if p_entity.values[0] > on_value:
+                                        #     p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
+                                        # else:
+                                        #     p_entity.values[0] = on_value  # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
 
                                 # for _name in list_name:
                                 #     if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:  #加上前面一些信息,修复公司不在项目名称开头的,检测不到
@@ -1788,7 +1789,7 @@ class RoleRuleFinalAdd():
         ents = []
         for ent in list_entitys[0]:
             if ent.entity_type in ['org', 'company']:
-                if ent.label == 0 and ent.values[ent.label]>=0.5:
+                if ent.label == 0 and ent.values[ent.label]>0.5:
                     if '公共资源交易中心' in ent.entity_text:  # 公共资源交易中心不算招标或代理,只算平台
                         # ent.label = 5
                         ent.values[ent.label] = 0.5 # 改为降低概率,不改类别,防止 336220759 明显招标人表达不提取
@@ -1818,7 +1819,7 @@ class RoleRuleFinalAdd():
                             break
                         if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
                             ents[i].label = 0
-                            ents[i].values[0] = 0.5
+                            ents[i].values[0] = 0.51 # 修改为比标题概率略高
                             tenderee_notfound = False
                             # log('正则最后补充实体: %s'%(ent_re))
                             break
@@ -1832,7 +1833,7 @@ class RoleRuleFinalAdd():
                             break
                         if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
                             ents[i].label = 1
-                            ents[i].values[1] = 0.5
+                            ents[i].values[1] = 0.51 # 修改为比标题概率略高
                             agency_notfound = False
                             # log('正则最后补充实体: %s'%(ent_re))
                             break
@@ -2554,8 +2555,8 @@ class ProductPredictor():
 class ProductAttributesPredictor():
     def __init__(self,):
         self.p0 = '(类别|类型|物类|目录|类目|分类)(名称|$)|^品名|^品类|^品目|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)(名称|内容|描述)'
-        self.p1 = '(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体|标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)[\))的]?([、\w]{,4}名称|内容|描述)'
-        self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$'
+        self.p1 = '(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体)[\))的]?([、\w]{,4}名称|内容|描述)'
+        self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)(名称|内容|描述)'
         # self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
         # self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
@@ -5333,7 +5334,7 @@ class TablePremExtractor(object):
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
-            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
+            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价",
         }
 
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
@@ -5343,7 +5344,7 @@ class TablePremExtractor(object):
 
 
     def find_header(self, td_list):
-        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|/万?元', '', it) for it in td_list]  # 去除表头无关信息,方便匹配判断是否为表头
+        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元', '', it) for it in td_list]  # 去除表头无关信息,方便匹配判断是否为表头
         header_dic = dict()
         flag = False
         contain_header = False
@@ -5498,15 +5499,15 @@ class TablePremExtractor(object):
                     continue
                 link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
 
-            package = package_code if package_code else str(len(prem_dic)+1) #str(i+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
-            package = uniform_package_name(package)
+            package = uniform_package_name(package_code) if package_code else str(len(prem_dic)+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
 
-            if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本
-                package_fix2raw[package] = package_code_raw
-            elif same_package == False:
-                multi_same_package = True
-            if multi_same_package:
-                package = package_code_raw
+            if package_code_raw!='':
+                if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本
+                    package_fix2raw[package] = package_code_raw
+                elif same_package == False:
+                    multi_same_package = True
+                if multi_same_package:
+                    package = package_code_raw
             if package not in prem_dic or not same_package:
                 prem_dic[package] = {
                     'code': '',
@@ -5555,6 +5556,10 @@ class TablePremExtractor(object):
                     break
 
                 bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率', bid_amount_)==None and 'bid_amount' in headers else (0, '')
+                if 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
+                    if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的包 丢弃
+                        prem_dic.pop(package)
+                    continue
 
                 bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
                 if (re.search('费率|下浮率|[%%‰折]',
@@ -5577,7 +5582,7 @@ class TablePremExtractor(object):
             if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的 丢弃 并不再继续往下匹配
                 prem_dic.pop(package)
                 break
-            if multi_same_package:
+            if multi_same_package: # 预处理后包号重复的,使用原始包号
                 for k, v in package_fix2raw.items():
                     if k in prem_dic:
                         prem_dic[v] = prem_dic.pop(k)
@@ -6345,42 +6350,33 @@ if __name__=="__main__":
     #     # print("cost_time:", json.loads(requests_result.text)['cost_time'])
     #     # print(MAX_LEN, len(sentence), len(list_sentence))
 
-    # docid = ""
-    # title = ''
-    # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
-    #     html = f.read()
-    # product_attr = ProductAttributesPredictor()
-    # rs = product_attr.predict(docid='', html=html, page_time="")
-    # print(rs)
-
     docid = ""
     title = ''
     with open('d:/html/2.html', 'r', encoding='utf-8') as f:
         html = f.read()
-    tb_extract = TablePremExtractor()
-    rs = tb_extract.predict(html, [
-        "广州供电局",
-        "南方电网数字电网科技(广东)有限公司",
-        "河北远东通信系统工程有限公司",
-        "咸亨国际科技股份有限公司",
-        "广东高德智能建筑股份有限公司",
-        "杭州应敏科技有限公司",
-        "广东冠电科技股份有限公司",
-        "广州科玮实验室设备有限公司",
-        "云南诺霆科技有限公司",
-        "广州建能电力科技有限公司",
-        "海鸿电气有限公司",
-        "深圳市深联创展科技开发有限公司",
-        "许继电气股份有限公司",
-        "南方电网数字电网集团信息通信科技有限公司",
-        "广州宇阳电力科技有限公司",
-        "深圳市科陆电子科技股份有限公司",
-        "中国南方电网有限责任公司",
-        "广东电网有限责任公司",
-        "南方电网供应链集团有限公司"
-    ])
+    product_attr = ProductAttributesPredictor()
+    rs = product_attr.predict(docid='', html=html, page_time="")
     print(rs)
 
+    # docid = ""
+    # title = ''
+    # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
+    #     html = f.read()
+    # tb_extract = TablePremExtractor()
+    # rs = tb_extract.predict(html, [
+    #     "河钢集团供应链管理有限公司邯郸分公司",
+    #     "石家庄中达科技有限公司",
+    #     "河北骥驰耐磨材料有限公司",
+    #     "衡水奥诺工矿机械设备有限公司",
+    #     "河北勤鹏机械设备科技有限公司",
+    #     "邯郸市华北不锈钢厂有限公司",
+    #     "邯郸市芳林机械备件制造有限公司",
+    #     "济南宏鲁新型材料有限公司",
+    #     "邯郸海博机械设备有限公司",
+    #     "河北万革新能源科技有限公司"
+    # ])
+    # print(rs)
+
     # # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
     # # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
     # # # ids = [42078089, 51828144, 60511017, 69042200, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]