Selaa lähdekoodia

补充标段包号、产品类包名;修复非招标报错;

lsm 6 kuukautta sitten
vanhempi
commit
fb018b4663

+ 4 - 1
BiddingKG/dl/interface/extract.py

@@ -442,7 +442,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     property_label = predictor.getPredictor('property_label').predict(title, product=','.join(product_list),project_name=codeName[0]['name'], prem=prem,channel_dic=channel_dic)
 
     '''最终验证prem'''
-    getAttributes.confirm_prem(prem[0]['prem'], channel_dic, deposit_project, prem[0]['total_tendereeMoney'], name=codeName[0]['name'])
+    getAttributes.confirm_prem(prem[0]['prem'], channel_dic, deposit_project, prem[0]['total_tendereeMoney'])
+
+    '''通过产品补充标段包名20241203'''
+    getAttributes.add_package_name(prem[0]['prem'], list_entitys[0], product_list, name=codeName[0]['name'])
 
     # 提取拟在建所需字段
     start_time = time.time()

+ 89 - 80
BiddingKG/dl/interface/getAttributes.py

@@ -2753,103 +2753,83 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         packagePointer.pointer_tendereeMoney = entity
         p_entity -= 1            
 
-    '''包名与标段号链接'''
-    l_main = []
-    l_attn = []
-    pack_num_main = 0
-    name_num_main = 0
-    pack_num_attn = 0
-    name_num_attn = 0
-    for entity in list_entity:
-        if entity.entity_type in  ['name', 'package']:
-            if entity.in_attachment:
-                l_attn.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end))
-                if entity.entity_type == 'name':
-                    name_num_attn += 1
-                else:
-                    pack_num_attn += 1
-            else:
-                l_main.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end))
-                if entity.entity_type == 'name':
-                    name_num_main += 1
-                else:
-                    pack_num_main += 1
-    if name_num_main > 0 and pack_num_main > 0:
-        l_main.sort(key=lambda x: [x[2],x[3]])
-        # print('正文名称:',l_main)
-        link_dic = {}
-        i = 1
-        pre_ty = l_main[0][0]
-        while i < len(l_main):
-            if l_main[i][0] != pre_ty:
-                ty1, ent1, s1, b1, e1 = l_main[i-1]
-                ty2, ent2, s2, b2, e2 = l_main[i]
-                if ty1 == 'package':
-                    if ent1 not in link_dic:
-                        link_dic[ent1] = []
-                    if s1 == s2:
-                        dist = abs(b2 - b1)
-                    else:
-                        dist = len(list_sentence[s1].sentence_text) - b1
-                        for id in range(s1+1, s2):
-                            dist += len(list_sentence[id].sentence_text)
-                        dist += b2
-                    link_dic[ent1].append((s2-s1, dist, ent2))
-                elif ty2 == 'package':
-                    if ent2 not in link_dic:
-                        link_dic[ent2] = []
-                    if s1 == s2:
-                        dist = abs(b2 - b1)
-                    else:
-                        dist = len(list_sentence[s1].sentence_text) - b1
-                        for id in range(s1+1, s2):
-                            dist += len(list_sentence[id].sentence_text)
-                        dist += b2
-                    link_dic[ent2].append((s2-s1, dist, ent1))
-            pre_ty = l_main[i][0]
-            i += 1
-        for k, v in link_dic.items():
-            v.sort(key=lambda x: [x[0], x[1]])
-            # print('各包排序后项目名:', k, v)
-            PackDict[k]["name"] = v[0][2]
-    elif name_num_attn > 0 and pack_num_attn > 0:
-        # print("附件名称:", l_attn)
-        l_attn.sort(key=lambda x: [x[2],x[3]])
+    '''标段链接包名包号'''
+    pk_name_l = []
+    pk_code_l = []
+    count_dic = {
+        'package': set(),
+        'name': set(),
+        'code': set()
+    }
+
+    def get_sort_dist(l, max_sent_dist=2):
+        '''
+        计算标段与其他要素距离,并按距离排序返回字典
+        :param l: [(entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end)]
+        :param max_sent_dist: 最大句子距离
+        :return:
+        '''
+        l.sort(key=lambda x: [x[2],x[3]])
         link_dic = {}
         i = 1
-        pre_ty = l_attn[0][0]
-        while i < len(l_attn):
-            if l_attn[i][0] != pre_ty:
-                ty1, ent1, s1, b1, e1 = l_attn[i-1]
-                ty2, ent2, s2, b2, e2 = l_attn[i]
+        while i < len(l):
+            ty1, ent1, s1, b1, e1, in_att1 = l[i - 1]
+            ty2, ent2, s2, b2, e2, in_att2 = l[i]
+            if ty1 != ty2 and in_att1 == in_att2 and s2 - s1 <= max_sent_dist:
                 if ty1 == 'package':
                     if ent1 not in link_dic:
                         link_dic[ent1] = []
                     if s1 == s2:
-                        dist = abs(b2 - b1)
+                        dist = abs(b2 - e1)
                     else:
-                        dist = len(list_sentence[s1].sentence_text) - b1
+                        dist = len(list_sentence[s1].sentence_text) - e1
                         for id in range(s1+1, s2):
                             dist += len(list_sentence[id].sentence_text)
                         dist += b2
-                    link_dic[ent1].append((s2-s1, dist, ent2))
+                    if in_att1:
+                        dist += 100 # 附件的距离加100
+                    link_dic[ent1].append((s2 - s1, dist, ent2))
                 elif ty2 == 'package':
                     if ent2 not in link_dic:
                         link_dic[ent2] = []
                     if s1 == s2:
-                        dist = abs(b2 - b1)
+                        dist = abs(b2 - e1)
                     else:
-                        dist = len(list_sentence[s1].sentence_text) - b1
+                        dist = len(list_sentence[s1].sentence_text) - e1
                         for id in range(s1+1, s2):
                             dist += len(list_sentence[id].sentence_text)
                         dist += b2
-                    link_dic[ent2].append((s2-s1, dist, ent1))
-            pre_ty = l_attn[i][0]
+                    if in_att1:
+                        dist += 100  # 附件的距离加100
+                    dist += 30 # 包号在实体后面距离再加30
+                    link_dic[ent2].append((s2 - s1, dist, ent1))
             i += 1
-        for k, v in link_dic.items():
-            v.sort(key=lambda x: [x[0], x[1]])
-            # print('各包排序后项目名:', k, v)
-            PackDict[k]["name"] = v[0][2]
+        return link_dic
+
+    for entity in list_entity:
+        if entity.entity_type == 'package':
+            pk_name_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment))
+            pk_code_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment))
+            count_dic['package'].add(entity.entity_text)
+        elif entity.entity_type == 'name':
+            pk_name_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment))
+            count_dic['name'].add(entity.entity_text)
+        elif entity.entity_type == 'code':
+            pk_code_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment))
+            count_dic['code'].add(entity.entity_text)
+    if len(count_dic['package']) > 0:
+        if len(count_dic['name'])>0:
+            link_dic = get_sort_dist(pk_name_l)
+            for k, v in link_dic.items():
+                v.sort(key=lambda x: [x[0], x[1]])
+                if v[0][0] < 2 and v[0][1] < 200: # 标段号与包名句子数小于2,字距离小于200的才添加
+                    PackDict[k]["name"] = v[0][2]
+        if len(count_dic['code'])>0:
+            link_dic = get_sort_dist(pk_code_l)
+            for k, v in link_dic.items():
+                v.sort(key=lambda x: [x[0], x[1]])
+                if v[0][0] < 2 and v[0][1] < 200:
+                    PackDict[k]["code"] = v[0][2]
         
     #删除一个机构有多个角色的数据
     #删除重复人、概率不回传
@@ -4801,7 +4781,7 @@ def update_prem(old_prem, new_prem, in_attachment=False):
 
     # return old_prem
 
-def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0, name=""):
+def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0):
     '''
     规则检查纠正prem,如果Project包中标人在其他包中标人,去掉project包中标角色;如果有其他包中标人,去掉roleList为空的包;
     :param prem: prem 字段字典
@@ -4852,7 +4832,36 @@ def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMone
         for k in prem:
             if float(prem[k]['tendereeMoney'])==0:
                 prem[k]['tendereeMoney'] = total_tendereeMoney
-    if name != '' and len(prem)<=2: # 20241129 小于等于两个包且无包名称,取项目名称
+
+def add_package_name(prem, list_entity, product_list, name):
+    '''
+    通过产品、项目名称,补充各标段包名,如果标段无包名,标段后紧接产品,把产品作为包名;如果标段数少于等于2且包名为空,补充项目名称为包名
+    :param prem:
+    :param list_entity:
+    :param product_list:
+    :param name:
+    :return:
+    '''
+    if len(prem)>2 and len(product_list)>2:
+        ent_l = []
+        for entity in list_entity:
+            if entity.entity_type in ['product', 'package']:
+                ent_l.append((entity.entity_type, entity.entity_text, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end, entity.in_attachment))
+        ent_l.sort(key=lambda x: [x[2],x[3]])
+        i = 0
+        pk_dic = {}
+        while i < len(ent_l)-1:
+            ty1, ent1, s1, b1, e1, in_att1 = ent_l[i]
+            ty2, ent2, s2, b2, e2, in_att2 = ent_l[i+1]
+            if in_att1 == in_att2 and ty1 == 'package' and ty2 == 'product' and s1 == s2 and 0<b2-e1<3:
+                pk_dic[ent1] = ent2
+            i += 1
+        if len(pk_dic) > 1:
+            for k, v in prem.items():
+                if k in pk_dic and v.get('name', '') == '':
+                    v['name'] = pk_dic[k]
+
+    elif name != '' and len(prem)<=2: # 20241129 小于等于两个包且无包名称,取项目名称
         for k in prem:
             if prem[k].get('name', '') == '':
                 prem[k]['name'] = name

+ 6 - 9
BiddingKG/dl/interface/predictor.py

@@ -4634,7 +4634,13 @@ class DocChannel():
        119: '候选人公示',
        120: '合同公告'}
 
+      title = re.sub('[^\u4e00-\u9fa5]+|出租车', '', title)
+      if len(title) > 50:
+          title = title[:20] + title[-30:]
+      text = html2text(html)
       self.origin_dic = origin_dic
+      self.title = title
+      self.text = text
 
       if original_docchannel in not_extract_dic:
           return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel], 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '公告类别不在提取范围'
@@ -4643,15 +4649,6 @@ class DocChannel():
       if original_docchannel == 303:
           return {'docchannel': {'docchannel': '处罚公告', 'doctype': '处罚公告', 'life_docchannel': '处罚公告'}}, "源类别为处罚公告"
 
-      title = re.sub('[^\u4e00-\u9fa5]+|出租车', '', title)
-      if len(title) > 50:
-          title = title[:20] + title[-30:]
-
-      text = html2text(html)
-
-      self.title = title
-      self.text = text
-
       result = {'docchannel': {'docchannel': '', 'doctype': ''}}
 
       doc_type, type_kw = get_type(title, text)