Sfoglia il codice sorgente

修复标段、角色金额问题

lsm 1 giorno fa
parent
commit
94d8cbbf4e

+ 3 - 2
BiddingKG/dl/common/Utils.py

@@ -1000,6 +1000,7 @@ def find_package(content):
     content = content.replace('号,', '号:').replace(':', ':').replace('(', '(').replace(')', ')')
     # .replace('-包',' 包').replace('包-', '包 ').replace('-标', ' 标').replace('标段-', '标段 ').replace('-合同包', ' 合同包') # 72760191 标段:№10
     content = re.sub('[一二三四五六七八九十\d](标[段包项]|包[组件标])编号', ' 标段编号', content)
+    content = re.sub('打包|标段名称|标段(包)?名称', ' ', content) # 打包Zip下载
 
     for it in re.finditer(filter_package_pattern, content):
         content = content.replace(it.group(0), ' ' * len(it.group(0)))
@@ -1329,7 +1330,7 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
             elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
                            sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
                 notes = '成本警戒线'
-            elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]|服务金额', sentence_text[_match.span()[0]:_match.span()[1]]):
+            elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]|服务金额', sentence_text[_match.span()[0]:_match.span()[1]]) and re.search('缴纳', sentence_text[max(0, _match.span()[0]-5):_match.span()[1]]) == None: # 排除 20250702 ,应缴纳的中标服务金额:1200.00元
                 # cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
                 # notes = cost_re.group(1)
                 notes = '招标或中标金额'
@@ -1338,7 +1339,7 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
             elif re.search('^[/每]', sentence_text[_match.end():]):
                 # print('单价:', _match.group(0))
                 notes = '单价'
-            elif re.search('单价', sentence_text[max(0, _match.start()-10):_match.start()]):
+            elif re.search('单价', sentence_text[max(0, _match.start()-10):_match.start()]) and re.search('单价:\d+,', sentence_text[:_match.start()]) == None: # 单价:1298,合同金额:12.460800万元, 避免后面金额做单价
                 notes = '单价'
             elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
                 notes = '大写'

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -536,7 +536,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2025-07-01'}
+    version_date = {'version_date': '2025-07-03'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:

+ 3 - 3
BiddingKG/dl/interface/getAttributes.py

@@ -4358,7 +4358,7 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product' and entity.entity_text not in dict_other["product"]: #顺序去重保留
             dict_other["product"].append(entity.entity_text)
-        elif entity.entity_type=='money' and entity.notes=='总投资' and float(dict_other["total_tendereeMoney"])<float(entity.entity_text):
+        elif entity.entity_type=='money' and entity.notes=='总投资' and float(entity.entity_text)>5000000 and float(dict_other["total_tendereeMoney"])<float(entity.entity_text): # 20250702 限制总投资为500万的以上
             dict_other["total_tendereeMoney"] = str(Decimal(entity.entity_text))
             dict_other["total_tendereeMoneyUnit"] = entity.money_unit
 
@@ -4539,7 +4539,7 @@ def correct_rolemoney(docid, prem, total_product_money, total_budget, list_artic
                         #     l[2] = total_product_money
                         #     log('修改中标金额为所有产品总金额')
                         # if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money']) == 0 and float(l["role_money"]['money'])<total_product_money/10:
-                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or(float(l["role_money"]['money'])<total_product_money<500000) or (float(l["role_money"]['money'])<ree_money/2 and float(l["role_money"]['money'])<total_product_money<ree_money)): # 改为小于一半招标金额或为0时替换为合计金额
+                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or(float(l["role_money"]['money'])*2<total_product_money<500000) or (float(l["role_money"]['money'])<ree_money/2 and float(l["role_money"]['money'])<total_product_money<ree_money)): # 改为小于一半招标金额或为0时替换为合计金额
                             l["role_money"]['money'] = total_product_money
                             log('修改中标金额为产品总金额: %s, docid:%s' % (total_product_money, docid))
                     except Exception as e:
@@ -4560,7 +4560,7 @@ def correct_rolemoney(docid, prem, total_product_money, total_budget, list_artic
                                     log('修改中标金额为总价或合计金额: %s, docid:%s'%(money, docid))
                             except Exception as e:
                                 print('修正中标价格报错:%s' % e)
-    if 0 < total_budget < 2000000 and len(prem[0]['prem']) == 1 and 'Project' in prem[0]['prem'] and float(prem[0]['prem']['Project']["tendereeMoney"]) < total_budget:
+    if 0 < total_budget < 2000000 and len(prem[0]['prem']) == 1 and 'Project' in prem[0]['prem'] and float(prem[0]['prem']['Project']["tendereeMoney"])*2 < total_budget:
         prem[0]['prem']['Project']["tendereeMoney"] = total_budget
         log('修改招标金额为表格合计预算: %s, docid:%s' % (total_budget, docid))
 

+ 11 - 10
BiddingKG/dl/interface/predictor.py

@@ -950,7 +950,7 @@ class PREMPredict():
                 elif re.search('^为\w{,10}第二(成交|中标)单位', behind): # 中标预测错误,例:601143888 河南省创慧新材料科技有限公司为铸咀采购项目第二成交单位
                     label = 3
                     values[3] = 0.5
-                elif re.search('中标单位,$|被确定为$|成交电商:$|如果?我方成功中选$', front): # 632523961 现通知:贵司被确定为广州地铁传媒有限公司贵阳地铁广告媒体服务项目(2025年)的执行单位。 # 609280615 万银政采平台 罗山县政采平台等成交电商都不是中标人  632380222 如我方成功中选 中国人民保险 采购项目,
+                elif re.search('中标单位,$|被确定为$|如果?我方成功中选$', front): # 632523961 现通知:贵司被确定为广州地铁传媒有限公司贵阳地铁广告媒体服务项目(2025年)的执行单位。 632380222 如我方成功中选 中国人民保险 采购项目,
                     label = 5
                 elif re.search('^为预备中标单位', behind):
                     label = 3
@@ -2504,6 +2504,8 @@ class RoleGrade():
                             _prob -= 0.05
                         if _label == 0 and is_agency(entity.entity_text): # 20250116 修复 584333688 同时有招标单位 : 安徽省招标集团股份有限公司,.采购人信息 名 称:安徽开放大学
                             _prob -= 0.1
+                        if re.search('成交电商:$', text[:b]): # 609280615 万银政采平台 罗山县政采平台等成交电商都不是中标人
+                            _prob = 0.55
                         entity.values[_label] = _prob + entity.values[_label] / 20
                         not_found = 0
                         # print('规则修改角色概率后:', entity.entity_text, entity.label, entity.values)
@@ -2607,7 +2609,7 @@ class MoneyGrade():
                             _prob = max(0.5, _prob - 0.2)
                         entity.values[_label] = _prob + entity.values[_label] / 20
                         not_found = 0
-                        if _label == 0 and float(entity.entity_text)<10000 and entity.values[_label] > 0.6: # 20250624 小金额预算概率降低 634252534 包合计才是真正的预算
+                        if _label == 0 and float(entity.entity_text)<100 and entity.values[_label] > 0.6: # 20250624 小金额预算概率降低 634252534 包合计才是真正的预算
                             entity.values[_label] = 0.6
                         # print('规则修改金额概率后:', entity.entity_text, entity.label, entity.values)
                         break
@@ -3486,7 +3488,7 @@ class ProductAttributesPredictor():
             while i < (len(inner_table)):
                 tds = inner_table[i]
                 not_empty = [it for it in tds if re.sub('\s', '', it) != ""]
-                if len(set(not_empty))<2 or len(set(tds))<2 or (len(set(tds))==2 and re.search('总计|合计|汇总', tds[0])): # 非空列或者不重复内容小于两列的 继续
+                if len(set(not_empty))<2 or len(set(tds))<2 or (len(set(tds))==2 and re.search('总计|合计|汇总|总价', tds[0])): # 非空列或者不重复内容小于两列的 继续  281415580 合同总价
                     i += 1
                     # print('表格产品提取:非空列或者不重复内容小于两列的 继续', i, tds)
                     continue
@@ -3566,15 +3568,14 @@ class ProductAttributesPredictor():
 
                     if id1!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
                             re.search('备注|汇总|合计|总价|价格|金额|^详见|无$|xxx', tds[id1]) == None:
-                        product = tds[id1]
+                        product = re.sub('\s+', '', tds[id1]) # 去掉空格,避免格式问题无法去重 例 267610200
 
                     if id0!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id0]) and tds[id0] not in self.header_set and \
                             re.search('备注|汇总|合计|总价|价格|金额|^详见|无$|xxx', tds[id0]) == None:
-                        category = tds[id0]
+                        category = re.sub('\s', '', tds[id0])
                         product = "%s_%s"%(category, product) if product!="" and product!=category else category
 
                     if product != "" and product not in ['工程类', '服务类', '货物类', '工程', '服务', '货物']:
-                        # print('匹配产品内容: ', product)
                         if id2 != "":
                             if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
                                 # if re.search('(^\d{,3}(,?\d{3}){2,}(\.\d{2,7},?)$)|万?元', tds[id2]):  # 254816100 这篇数量很大,貌似正常
@@ -3683,8 +3684,6 @@ class ProductAttributesPredictor():
                                             unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
                                         if budget != "":
                                             budget, _money_unit = money_process(budget, header_list2[2])
-                                            if budget > 0:
-                                                budget_list.append(budget)
                                             budget = str(budget) if budget != 0 and budget<50000000000 else ''
                                         if total_price != "":
                                             total_price, _money_unit = money_process(total_price, header_list[6])
@@ -3708,6 +3707,8 @@ class ProductAttributesPredictor():
                                         if (product, specs, unitPrice, quantity) not in product_set:
                                             product_set.add((product, specs, unitPrice, quantity))
                                             product_link.append(link)
+                                            if budget != '' and float(budget) > 0:
+                                                budget_list.append(float(budget))
                                             if link['unitPrice'] != "" and link['quantity'] != '':
                                                 try:
                                                     total_product_money += float(link['unitPrice']) * float(
@@ -3729,8 +3730,6 @@ class ProductAttributesPredictor():
                                     unitPrice = str(unitPrice) if unitPrice != 0 and unitPrice<100000000 else ""
                                 if budget != "":
                                     budget, _money_unit = money_process(budget, header_list2[2])
-                                    if budget > 0:
-                                        budget_list.append(budget)
                                     budget = str(budget) if budget != 0 and budget<50000000000 else ''
                                 if total_price != "":
                                     total_price, _money_unit = money_process(total_price, header_list[6])
@@ -3754,6 +3753,8 @@ class ProductAttributesPredictor():
                                 if (product, unitPrice,) not in product_set: # 2023/09/22 改为只判断产品/单价,只要两个一样就不作为新产品 避免多个表格重复表达有些没数量造成重复提取 353858683
                                     product_set.add((product, unitPrice))
                                     product_link.append(link)
+                                    if budget != '' and float(budget) > 0:
+                                        budget_list.append(float(budget))
                                     if link['unitPrice']:
                                         unit_price_list.append(link['unitPrice'])
                                     if link['unitPrice'] != "" and link['quantity'] != '':