Browse Source

修复符号分割多中标人提取

lsm 9 months ago
parent
commit
9ec5b8cc63
1 changed file with 16 additions and 4 deletions
  1. 16 4
      BiddingKG/dl/interface/getAttributes.py

+ 16 - 4
BiddingKG/dl/interface/getAttributes.py

@@ -4326,14 +4326,16 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
                                                 b = b2
                                                 e = e2
                                                 find_joint = 1
-                                            elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。'] or s[e2:e2+3]=='联合体'):
+                                            elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。'] or s[e2:e2+3]=='联合体'):
                                                 join_l.append(behind_entity.entity_text)
                                                 b = b2
                                                 e = e2
+                                            elif e == e2: # 修复重复实体导致中断情况
+                                                continue
                                             else:
                                                 break
                                         if len(join_l)>1:
-                                            d['win_tenderer_joint'] = ''.join(set(join_l))
+                                            d['win_tenderer_joint'] = ','.join(set(join_l))
 
 
 
@@ -4453,15 +4455,17 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                             sentence_text = sentences[ent_bh.sentence_index].sentence_text
                             if sentence_text[e_idx_fr:b_idx_bh] in [';', '、', '&', ',', '/', '//'] and (
                                     len(sentence_text) == e_idx_bh or sentence_text[e_idx_bh] in [';', '、', '&', ',','/', '//','。']):  # 修复多中标人刚好在文末index超出报错,例子 407126558
-                                multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
+                                multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
                                 e_idx_fr = e_idx_bh
                                 i = j + 1
                             else:
                                 break
                         elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr:
-                            multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
+                            multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
                             e_idx_fr = e_idx_bh
                             i = j + 1
+                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and e_idx_fr == e_idx_bh: # 处理 514603520 中国邮政储蓄银行股份有限公司淄博市临淄区支行 实体由于字典匹配重复两次情况
+                            i = j + 1
                         else:
                             break
                     if re.search('入围', pre_text) and re.search('未入围', pre_text)==None:
@@ -4615,8 +4619,16 @@ def  confirm_prem(prem, channel_dic):
                 if d['role_name'] in ['win_tenderer', 'pre_win_tenderer', 'second_tenderer','third_tenderer']:
                     if k == 'Project':
                         pro_winner.add(d['role_text'])
+                        if 'win_tenderer_joint' in d:
+                            pro_winner.update(set(d['win_tenderer_joint'].split(',')))
+                        if 'multi_winner' in d:
+                            pro_winner.update(set(d['multi_winner'].split(',')))
                     else:
                         other_winner.add(d['role_text'])
+                        if 'win_tenderer_joint' in d:
+                            other_winner.update(set(d['win_tenderer_joint'].split(',')))
+                        if 'multi_winner' in d:
+                            other_winner.update(set(d['multi_winner'].split(',')))
         if pro_winner & other_winner != set():
             prem['Project']['roleList'] = [d for d in prem['Project']['roleList'] if
                                                d['role_name'] not in ['win_tenderer', 'second_tenderer',