Parcourir la source

修复只公布一个标段的没提取包号;优化角色规则

lsm il y a 1 an
Parent
commit
995f8ec8c4

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -345,7 +345,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-02-01'}
+    version_date = {'version_date': '2024-03-01'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
 
     '''最终检查修正招标、中标金额'''

+ 4 - 4
BiddingKG/dl/interface/getAttributes.py

@@ -806,7 +806,7 @@ def getPackagesFromArticle(list_sentence, list_entity):
                     else:
                         scope_end = [PackageList_scope[j + 1]["sentence_index"],
                                      PackageList_scope[j + 1]["offsetWords_begin"]]
-                    if PackageList_scope[j - 1]["sentence_index"] == PackageList_scope[j]["sentence_index"] and \
+                    if j>0 and PackageList_scope[j - 1]["sentence_index"] == PackageList_scope[j]["sentence_index"] and \
                             PackageList_scope[j - 1]["offsetWord_begin"] <= PackageList_scope[j]["offsetWord_begin"] and \
                             PackageList_scope[j - 1]["offsetWord_end"] >= PackageList_scope[j]["offsetWord_end"]:
                         continue
@@ -837,8 +837,8 @@ def getPackagesFromArticle(list_sentence, list_entity):
     if len(True_package2) > 2: # 同时包含多标段及多中标人的
         PackageList_scope = PackageList_scope + PackageList_scope2
     PackageList = get_package_scope(PackageList_scope)
-    if len(PackageSet)<2: # 20230922只提取到一个包号的去掉,都放在默认包project
-        return [], set(), {}
+    # if len(PackageSet)<2: # 20230922只提取到一个包号的去掉,都放在默认包project 2024/02/02 注释掉,防止多标段每篇公告只公布一个标段的没法提取标段号
+        # return [], set(), {}
     return PackageList, PackageSet, dict_packageCode
 
 
@@ -3891,7 +3891,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                     e_idx_bh = ent_bh.wordOffset_end
                     if ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh-e_idx_fr==1:
                         sentence_text = sentences[ent_bh.sentence_index].sentence_text
-                        if sentence_text[e_idx_fr:b_idx_bh] in [';','、'] and sentence_text[e_idx_bh] in ['、', ',', '。']:
+                        if sentence_text[e_idx_fr:b_idx_bh] in [';','、'] and (len(sentence_text)==e_idx_bh or sentence_text[e_idx_bh] in ['、', ',', '。']): # 修复多中标人刚好在文末index超出报错,例子 407126558
                             multi_winner_l.append(ent_bh.entity_text)
                             e_idx_fr = e_idx_bh
                             i = j + 1

+ 2 - 1
BiddingKG/dl/interface/modelFactory.py

@@ -96,7 +96,8 @@ class Model_role_classify_word():
         text = re.sub('[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)
         text = re.sub('\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd', text)
         text = re.sub('序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', '', text)
-        text = re.sub('(采购|招标|发布)机构', '发布人', text)
+        text = re.sub('(中标|成交|中选|入围)(工程|项目)', '工程', text)  # 修复易错分为中标人
+        # text = re.sub('(采购|招标|发布)机构', '发布人', text)
         return text.replace('(', '(').replace(')', ')').replace('單', '单').replace('稱','承').replace('標', '标').replace('採購', '采购').replace('機構', '机构')
 
     def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):

+ 2 - 2
BiddingKG/dl/interface/predictor.py

@@ -791,10 +791,10 @@ class PREMPredict():
             elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
                 label = 5
             elif label == 0:
-                if re.search('拟邀请$', front):
+                if re.search('拟邀请$|受邀谈判方', front):
                     label = 2
                     values[label] = 0.501
-                elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?)(名称)?[是为:]+', front) and re.search('(招标|采购|咨询|代理|管理)\w*公司|(采购|交易)(中心|市场)', entity.entity_text):
+                elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?|(采购|招标|发布)机构)(名称)?[是为:]+', front) and re.search('(招标|采购|咨询|代理|管理)\w*公司|(采购|交易)(中心|市场)', entity.entity_text):
                     label = 1
                     values[label] = 0.501
                 elif re.search('采用$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-