浏览代码

优化多标段提取;开标记录补充开标会;优化Prem结果合并

lsm 1 年之前
父节点
当前提交
71184865a1

+ 1 - 0
BiddingKG/dl/common/Utils.py

@@ -932,6 +932,7 @@ def money_process(money_text, header):
     money = 0
     money_unit = ""
     # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text)
+    money_text = re.sub('\s', '', money_text) # 2024/04/19 修复 457699044 556.46751 万元 金额与单位有空格造成万漏提取
     re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?", money_text)
     if re_price:
         money_text = re_price.group(0)

+ 1 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -2936,7 +2936,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
                     '''下面对公司实体进行清洗'''
                     entity_text = re.sub('\s', '', entity_text)
-                    if re.search('^(\d{4}年)?[\-\d月日份]*\w{2,3}分公司$', entity_text):  # 删除
+                    if re.search('^(\d{4}年)?[\-\d月日份]*\w{2,3}分公司$|^\w{,6}某(部|医院)$', entity_text):  # 删除
                         # print('公司实体不符合规范:', entity_text)
                         continue
                     elif re.match('xx|XX', entity_text):  # 删除

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -351,7 +351,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-04-16'}
+    version_date = {'version_date': '2024-04-19'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
 
     '''最终检查修正招标、中标金额'''

+ 6 - 4
BiddingKG/dl/interface/getAttributes.py

@@ -3976,6 +3976,11 @@ def update_prem(old_prem, new_prem):
                     del_k.append(k)
             for k in del_k:
                 old_prem.pop(k)
+
+        if len(old_prem) == 2 and len(new_prem) == 1 and 'Project' in new_prem and 'Project' in old_prem:
+            k = list(old_prem.keys()-new_prem.keys())[0]
+            new_prem[k] = new_prem.pop('Project')
+
         multi_tendereeMoney = [] # 多包招标金额
         for k, v in new_prem.items():
             if k == 'Project':
@@ -4033,10 +4038,7 @@ def update_prem(old_prem, new_prem):
                         else:
                             other_winner.add(d['role_text'])
             if pro_winner & other_winner != set():
-                # print('过滤掉多包相同中标人在不同包')
-                for d in old_prem['Project']['roleList']:
-                    if d['role_name'] in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
-                        old_prem['Project']['roleList'].remove(d) # 提取到其他包,去掉 project 里面的中标角色
+                old_prem['Project']['roleList'] = [d for d in old_prem['Project']['roleList'] if d['role_name'] not in ['win_tenderer', 'second_tenderer', 'third_tenderer']]
         if multi_tendereeMoney and 'Project' in old_prem and float(old_prem['Project']['tendereeMoney'])!=0: # 表格提取到多标段招标金额,去掉Project包招标金额
             old_prem['Project']['tendereeMoney'] = 0
 

+ 34 - 13
BiddingKG/dl/interface/predictor.py

@@ -2326,11 +2326,11 @@ class RoleGrade():
                         # log('如果org中标人同时为招标人角色,降低中标概率:%s, %s' % (ent.entity_text, ent.label))
                         ent.values[2] = 0.6
                         flag = 1
-            if flag == 0 and company_winner != []:
-                for ent in org_winner:
-                    if ent.label == 2 and ent.values[2] > 0.6:
-                        # log('如果同时包含org和company中标人,降低org中标人概率为0.6:%s, %s' % (ent.entity_text, ent.values[2]))
-                        ent.values[2] = 0.6
+            # if flag == 0 and company_winner != []:  # 2024/04/18 注释掉 避免提取不到 273351465 供应商(乙方:湖南省第二测绘院
+            #     for ent in org_winner:
+            #         if ent.label == 2 and ent.values[2] > 0.6:
+            #             # log('如果同时包含org和company中标人,降低org中标人概率为0.6:%s, %s' % (ent.entity_text, ent.values[2]))
+            #             ent.values[2] = 0.6
 
 
 class MoneyGrade():
@@ -3913,7 +3913,7 @@ class DocChannel():
           '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$', # |开标(记录|信息|情况)
           '资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
           '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
-          '开标记录': '开标记录|截标信息|评委名单公示|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果',
+          '开标记录': '开标记录|截标信息|评委名单公示|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果|开标会',
           '验收合同': '(验收|履约)(公告|公示)|(验收|履约)(结果|报告|意见|单)(公告|公示)'
       }
 
@@ -6060,8 +6060,8 @@ class TablePremExtractor(object):
         '''各要素表头规则'''
         self.head_rule_dic = {
             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
-            'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
-            "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的|^包)(名称?|内容)",
+            'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
+            "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
             "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
@@ -6121,7 +6121,7 @@ class TablePremExtractor(object):
                      'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
                 return flag, contain_header, header_dic
             elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
-                if re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_sort' not in header_dic:  # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
+                if re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_sort' not in header_dic and re.search('(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', header_dic['bid_amount'][1])==None:  # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
                     # print('只有供应商名称 没排名和包号的去掉')
                     return flag, contain_header, dict()
                 return flag,contain_header, header_dic
@@ -6162,7 +6162,7 @@ class TablePremExtractor(object):
             for ner in ners[0]:
                 if ner[2] in ['org', 'company', 'location']:
                     roles.append(ner[3])
-        if roles and len(''.join(roles)) > len(text)*0.8:
+        if roles and (len(''.join(roles)) > len(text)*0.8 or text.startswith(roles[0])):
             return roles[0]
         else:
             return ''
@@ -6177,7 +6177,7 @@ class TablePremExtractor(object):
                           'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
 
         if set(['project_code', 'package_code', 'tenderee', 'tenderer']) & set(headers) == set() and ('project_name' not in headers # 补充没有项目名称或有项目名称且是货物的才过滤掉
-            or re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683;  补充避免423647863采购意向被过滤
+            or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683;  补充避免423647863采购意向被过滤
             # print('没有包号及角色的不要')
             return {}
         for i in df.index:
@@ -6332,6 +6332,25 @@ class TablePremExtractor(object):
                         prem_dic[v] = prem_dic.pop(k)
         return prem_dic
 
+    def update_prem(self, rs_dic, tmp_dic):
+        '''
+        Merge newly extracted prem packages into the accumulated result, in place.
+
+        For a package already present in rs_dic, an existing field is only
+        overwritten when it is empty ('' / 0 / []) or when it is a 'roleList'
+        whose first role carries no money; otherwise the existing value wins.
+        Packages missing from rs_dic are added as-is (same object, not a copy).
+
+        :param rs_dic: accumulated result; mutated in place
+        :param tmp_dic: result to merge into rs_dic
+        :return: None
+        '''
+        for pack in tmp_dic:
+            if pack in rs_dic:
+                # NOTE(review): rs_dic[pack][k] raises KeyError if the existing package
+                # lacks key k -- presumably both sides share one key set; confirm.
+                for k in tmp_dic[pack]:
+                    # Existing value is an empty string or zero: take the incoming value.
+                    if rs_dic[pack][k] in ['', 0]:
+                        rs_dic[pack][k] = tmp_dic[pack][k]
+                    # Existing value is an empty list: take the incoming value.
+                    elif rs_dic[pack][k] == []:
+                        rs_dic[pack][k]  = tmp_dic[pack][k]
+                    # Existing roleList is non-empty but its first role has money == 0
+                    # (or no 'role_money'/'money' entry): replace the whole list.
+                    elif k == 'roleList' and len(rs_dic[pack][k])>0 and rs_dic[pack][k][0].get('role_money', {}).get('money', 0) == 0:
+                        rs_dic[pack][k] = tmp_dic[pack][k]
+            else:
+                # Package not seen before: add it unchanged.
+                rs_dic[pack] = tmp_dic[pack]
+
     def get_prem(self, soup, web_source_name=''):
         tables = soup.find_all('table')
         tables.reverse()
@@ -6373,7 +6392,8 @@ class TablePremExtractor(object):
                         df = pd.DataFrame(table_items)
                         prem_ = self.extract_from_df(df, headers, web_source_name)
                         # rs_dic.update(prem_)
-                        table_prem.update(prem_)
+                        # table_prem.update(prem_)
+                        self.update_prem(table_prem, prem_)
                     i = j - 1
                 i += 1
             if table_prem and len(trs) == 2 and 'package_code' not in headers and '1' in table_prem and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
@@ -6385,7 +6405,8 @@ class TablePremExtractor(object):
                     package_sib = uniform_package_name(package_sib)
                     table_prem[package_sib] = table_prem.pop('1')
             if table_prem:
-                rs_dic.update(table_prem)
+                # rs_dic.update(table_prem)
+                self.update_prem(rs_dic, table_prem)
             table.extract()
         return rs_dic