Explorar el Código

修复无内容公告文末招标人提取报错;新增返回所有金额;新增多中标人

lsm hace 1 año
padre
commit
9cec020919

+ 5 - 2
BiddingKG/dl/interface/extract.py

@@ -308,6 +308,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     # print('msc', msc)
     cost_time["rule_channel"] = round(time.time()-start_time,2)
 
+    '''一包多中标人提取及所有金额提取'''
+    all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences)
+
     start_time = time.time() # 产品名称及废标原因提取  #依赖 docchannel结果
     fail = channel_dic['docchannel']['docchannel'] == "废标公告"
     fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类 #2022/7/29补充返回产品,方便行业分类调用
@@ -342,8 +345,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-01-24'}
-    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
+    version_date = {'version_date': '2024-01-29'}
+    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
 
     '''最终检查修正招标、中标金额'''
     getAttributes.limit_maximum_amount(data_res, list_entitys[0])

+ 79 - 0
BiddingKG/dl/interface/getAttributes.py

@@ -3845,6 +3845,85 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
     except Exception as e:
         print('获取联合体抛出异常', e)
 
+def _collect_money(ent, moneys, moneys_attachment):
+    '''
+    Record the amount of a money entity in the list matching where it was found.
+    :param ent: entity to inspect; ignored unless its entity_type is 'money'
+    :param moneys: amounts found outside attachments (appended in place)
+    :param moneys_attachment: amounts found inside attachments (appended in place)
+    :return: None
+    '''
+    if ent.entity_type != 'money':
+        return
+    money = float(ent.entity_text)
+    if ent.in_attachment:
+        moneys_attachment.append(money)
+    else:
+        moneys.append(money)
+
+
+def _mark_multi_winner(prem, is_target, multi_winner, first_only):
+    '''
+    Write multi_winner onto matching 'win_tenderer' roles found in prem[0].
+    :param prem: list whose first item maps keys to project dicts; non-dict values are skipped
+    :param is_target: predicate applied to a role's role_text
+    :param multi_winner: comma-joined names stored under the 'multi_winner' key
+    :param first_only: stop scanning a roleList after its first matching role
+    :return: None (the role dicts are modified in place)
+    '''
+    for project in prem[0].values():
+        if not isinstance(project, dict):
+            continue
+        for package in project.values():
+            for role in package['roleList']:
+                if role.get('role_name', '') != 'win_tenderer':
+                    continue
+                if is_target(role.get('role_text')):
+                    role['multi_winner'] = multi_winner
+                    if first_only:
+                        break
+
+
+def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
+    '''
+    Collect every money amount of the document (main text and attachments) and,
+    for '中标信息' documents whose prem mentions 'win_tenderer', detect several
+    winners listed side by side and write them to prem as 'multi_winner'.
+    :param channel_dic: dict read as channel_dic['docchannel']['docchannel']
+    :param prem: list; prem[0] holds the project/package dicts with a 'roleList' (updated in place)
+    :param list_entitys: list; list_entitys[0] is the entity sequence of the document
+    :param list_sentences: list; list_sentences[0] is the sentence sequence of the document
+    :return: {'moneys': [...], 'moneys_attachment': [...]} with duplicates removed
+    '''
+    moneys = []
+    moneys_attachment = []
+    entities = list_entitys[0]
+    if channel_dic['docchannel']['docchannel']=='中标信息' and 'win_tenderer' in str(prem):
+        # NOTE(review): sentences are looked up by sentence_index below, which assumes
+        # the indexes are contiguous and start at 0 - confirm against the preprocessing.
+        sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
+        finalists = []  # shortlisted suppliers (text before the entity mentions '入围')
+        i = 0
+        # Visit every entity, including the last one: stopping at len-1 silently
+        # dropped a trailing money entity from the returned amounts.
+        while i < len(entities):
+            ent = entities[i]
+            i += 1
+            _collect_money(ent, moneys, moneys_attachment)
+            # Only confident label-2 organisations start a winner list
+            # (presumably label 2 is the winning bidder - confirm against the role label scheme).
+            if not (ent.entity_type in ['org', 'company'] and ent.label == 2 and ent.values[ent.label]>0.5):
+                continue
+            b_idx_fr = ent.wordOffset_begin
+            e_idx_fr = ent.wordOffset_end
+            multi_winner_l = [ent.entity_text]
+            sentence_text = sentences[ent.sentence_index].sentence_text
+            pre_text = sentence_text[max(0, b_idx_fr-10):b_idx_fr]
+            if re.search('入围', pre_text) and re.search('未入围', pre_text) is None and ent.entity_text not in finalists:
+                finalists.append(ent.entity_text)
+            # Absorb directly following label-5 organisations of the same sentence that
+            # are separated from the previous name by exactly one delimiter character.
+            for j in range(i, len(entities)):
+                ent_bh = entities[j]
+                b_idx_bh = ent_bh.wordOffset_begin
+                e_idx_bh = ent_bh.wordOffset_end
+                if not (ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5
+                        and ent_bh.sentence_index == ent.sentence_index and b_idx_bh-e_idx_fr==1):
+                    break
+                if sentence_text[e_idx_fr:b_idx_bh] not in [';','、']:
+                    break
+                # Slice instead of index: an entity ending at the very end of the sentence
+                # yields '' (treated as "no delimiter") rather than raising IndexError.
+                if sentence_text[e_idx_bh:e_idx_bh+1] not in ['、', ',', '。']:
+                    break
+                multi_winner_l.append(ent_bh.entity_text)
+                e_idx_fr = e_idx_bh
+                i = j + 1  # absorbed entities are not revisited by the outer loop
+            if len(multi_winner_l)>=2:
+                first_winner = multi_winner_l[0]
+                _mark_multi_winner(prem, lambda text: text == first_winner, ','.join(multi_winner_l), True)
+
+        if len(finalists)>=2:
+            _mark_multi_winner(prem, lambda text: text in finalists, ','.join(finalists), False)
+    else:
+        for ent in entities:
+            _collect_money(ent, moneys, moneys_attachment)
+    return {'moneys': list(set(moneys)), 'moneys_attachment': list(set(moneys_attachment))}
+
 def update_prem(old_prem, new_prem):
     '''
     根据新旧对比,更新数据

+ 3 - 1
BiddingKG/dl/interface/predictor.py

@@ -1816,6 +1816,8 @@ class RoleRuleFinalAdd():
         '''
         # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
         main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
+        if len(main_sentences)==0:
+            return 0
         # end_tokens = []
         for sentence in main_sentences[-5:][::-1]:  # 402073799 最后五句由后往前,匹配文末角色,日期
             # end_tokens.extend(sentence.tokens)
@@ -3796,7 +3798,7 @@ class DocChannel():
           '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
           '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务',
           # |竞价 采招/产权都有竞价方式 # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
-          '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)'
+          '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)|行政审批结果'
       }
       self.life_dic = {
           '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',