Browse Source

只有一个包号的不提取包,避免359354697正文无包号附件有包号变两个包;调整行业限额;调整角色规则;修复公告多个表格重复产品数量不同造成重复问题

lsm cách đây 1 năm
mục cha
commit
03d1b1ebc6

+ 2 - 0
BiddingKG/dl/interface/Preprocessing.py

@@ -2507,6 +2507,8 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
 
             if filter != "":
                 continue
+            if len(entity_text)>30 or len(re.sub('[E-]', '', science))>2: # 限制数字长度,避免类似265339018附件金额错误,数值超大报错 decimal.InvalidOperation
+                continue
             start_index, end_index = _match.span()
             start_index += len(text_beforeMoney)
 

+ 23 - 0
BiddingKG/dl/interface/getAttributes.py

@@ -830,6 +830,8 @@ def getPackagesFromArticle(list_sentence, list_entity):
     if len(True_package2) > 2: # 同时包含多标段及多中标人的
         PackageList_scope = PackageList_scope + PackageList_scope2
     PackageList = get_package_scope(PackageList_scope)
+    if len(PackageSet)<2: # 20230922只提取到一个包号的去掉,都放在默认包project
+        return [], set(), {}
     return PackageList, PackageSet, dict_packageCode
 
 
@@ -3448,6 +3450,21 @@ def limit_maximum_amount(dic, list_entity):
     :param list_entity: 实体列表
     :return:
     '''
+    indu_amount = {
+        '计算机设备': 200000000,
+        '办公设备': 100000000,
+        '家具用具': 500000000,
+        '办公消耗用品及类似物品': 100000000,
+        '日杂用品': 100000000,
+        '餐饮业': 1000000000,
+        '物业管理': 1000000000,
+        '工程技术与设计服务': 1000000000,
+        '工程评价服务': 100000000,
+        '其他工程服务': 100000000,
+        '工程监理服务': 100000000,
+        '工程造价服务': 100000000,
+        '会计、审计及税务服务': 100000000,
+    }
     title = dic.get('doctitle_refine', '')
     name = dic.get('name', '')
     product = ','.join(dic.get('product', []))
@@ -3484,6 +3501,8 @@ def limit_maximum_amount(dic, list_entity):
         # print('快递限额')
         maximum_amount = 80000000
         minximum_amount = 10
+    elif industry in indu_amount:
+        maximum_amount = indu_amount[industry]
     # print('maximum_amount:', maximum_amount)
     for value in dic['prem'].values():
         for l in value['roleList']:
@@ -3503,6 +3522,8 @@ def limit_maximum_amount(dic, list_entity):
                     if flag and l["role_money"]['money_unit'] == '万元' or re.search('^\d{11,}(\.0)?$', str(l["role_money"]['money'])):
                         l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
                         # print('行业限额纠正连接金额')
+                    elif industry in ['餐饮业', '物业管理'] and maximum_amount == indu_amount[industry]:
+                        l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
                     # elif flag and l["role_money"]['money_unit'] == '元':
                     #     l["role_money"]['money'] = 0
                 elif 0<float(l["role_money"]['money']) < minximum_amount:
@@ -3524,6 +3545,8 @@ def limit_maximum_amount(dic, list_entity):
                 if flag and value['tendereeMoneyUnit'] == '万元' or re.search('^\d{11,}(\.0)?$', str(value['tendereeMoney'])):
                     value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) / 10000)
                     # print('行业限额纠正连接金额')
+                elif industry in ['餐饮业', '物业管理'] and maximum_amount == indu_amount[industry]:
+                    value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) / 10000)
                 # elif flag and value['tendereeMoneyUnit'] == '元':
                 #     value['tendereeMoney'] = 0
             elif 0<float(value['tendereeMoney']) < minximum_amount:

+ 16 - 13
BiddingKG/dl/interface/predictor.py

@@ -1323,7 +1323,7 @@ class RoleRulePredictor():
                                      "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
                                      "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
-        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^拟采购|^拟招标|^须购[买置]一批)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
+        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位))"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
         self.pattern_agency_left = "(?P<agency_left>((代理|拍卖)(?:人|机构|公司|企业|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构|(采购|招标)代理)(名称|.{,4}名,?称|全称)?(是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
@@ -1338,7 +1338,8 @@ class RoleRulePredictor():
                                            "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
                                            "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$)"  # 解决表头识别不到加逗号情况,需前面为,。空
         self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)" \
-                                           "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$)"  # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标
+                                           "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \
+                                           "|结果公示如下:摇出球号:\d+号,中介机构:$)"  # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标
 
         self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商)))|" \
                                          "^((报价|价格)最低,|以\w{5,10}|\w{,20})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
@@ -3096,7 +3097,7 @@ class ProductAttributesPredictor():
                             else:
                                 header_quan_unit = ""
 
-                    if found_header and len(headers)<1:  # 只保留出现的第一个表头
+                    if found_header and ('_'.join(header_list) not in headers or '_'.join(header_list2) not in headers_demand):# and len(headers)<1:  # 只保留出现的第一个表头
                         headers.append('_'.join(header_list))
                         headers_demand.append('_'.join(header_list2))
                         header_col.append('_'.join(tds))
@@ -3300,13 +3301,15 @@ class ProductAttributesPredictor():
                                 #         except:
                                 #             log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
 
-                                if (product, unitPrice, quantity) not in product_set:
-                                    product_set.add((product, unitPrice, quantity))
+                                # if (product, unitPrice, quantity) not in product_set:
+                                #     product_set.add((product, unitPrice, quantity))
+                                if (product, unitPrice,) not in product_set: # 2023/09/22 改为只判断产品/单价,只要两个一样就不作为新产品 避免多个表格重复表达有些没数量造成重复提取 353858683
+                                    product_set.add((product, unitPrice))
                                     product_link.append(link)
+                                    if link['unitPrice']:
+                                        unit_price_list.append(link['unitPrice'])
                                     if link['unitPrice'] != "" and link['quantity'] != '':
                                         try:
-                                            if link['unitPrice']:
-                                                unit_price_list.append(link['unitPrice'])
                                             total_product_money += float(link['unitPrice'])*float(link['quantity']) if float(link['quantity'])<50000 else 0
                                         except:
                                             log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
@@ -3326,17 +3329,17 @@ class ProductAttributesPredictor():
                     i += 1
                 else:
                     i += 1
-        if len(total_price_list)>0 and len(set(total_price_list))/len(total_price_list)<=0.5: # 2023/7/27 总价一半以上重复的为多行一个总价,需去掉
-            # print('总价一半以上重复的为多行一个总价,需去掉')
-            for link in product_link:
+        if len(total_price_list)>1 and len(set(total_price_list))/len(total_price_list)<=0.5: # 2023/7/27 总价一半以上重复的为多行一个总价,需去掉
+            # print('总价一半以上重复的为多行一个总价,需去掉', total_price_list)
+            for link in product_link:  # 预防最后一列总价为所有产品总价,列补全后所有产品总价一样情况
                 if 'total_price' in link:
                     link['total_price'] = ""
         if len(unit_price_list)>0 and len(unit_price_list)==len(product_link) and len(set(unit_price_list))/len(unit_price_list)<=0.5:  # 2023/7/18 如果单价重复率高不算总产品价避免错误
             # print('如果单价重复率高不算总产品价避免错误')
             total_product_money = 0
-            for link in product_link:
-                if 'unitPrice' in link:
-                    link['unitPrice'] = ""
+            # for link in product_link:
+            #     if 'unitPrice' in link:
+            #         link['unitPrice'] = ""
 
         if len(product_link)>0:
             attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}