|
@@ -806,7 +806,7 @@ def getPackagesFromArticle(list_sentence, list_entity):
|
|
|
else:
|
|
|
scope_end = [PackageList_scope[j + 1]["sentence_index"],
|
|
|
PackageList_scope[j + 1]["offsetWords_begin"]]
|
|
|
- if PackageList_scope[j - 1]["sentence_index"] == PackageList_scope[j]["sentence_index"] and \
|
|
|
+ if j>0 and PackageList_scope[j - 1]["sentence_index"] == PackageList_scope[j]["sentence_index"] and \
|
|
|
PackageList_scope[j - 1]["offsetWord_begin"] <= PackageList_scope[j]["offsetWord_begin"] and \
|
|
|
PackageList_scope[j - 1]["offsetWord_end"] >= PackageList_scope[j]["offsetWord_end"]:
|
|
|
continue
|
|
@@ -837,8 +837,8 @@ def getPackagesFromArticle(list_sentence, list_entity):
|
|
|
if len(True_package2) > 2: # 同时包含多标段及多中标人的
|
|
|
PackageList_scope = PackageList_scope + PackageList_scope2
|
|
|
PackageList = get_package_scope(PackageList_scope)
|
|
|
- if len(PackageSet)<2: # 20230922只提取到一个包号的去掉,都放在默认包project
|
|
|
- return [], set(), {}
|
|
|
+ # if len(PackageSet)<2: # 20230922只提取到一个包号的去掉,都放在默认包project 2024/02/02 注释掉,防止多标段每篇公告只公布一个标段的没法提取标段号
|
|
|
+ # return [], set(), {}
|
|
|
return PackageList, PackageSet, dict_packageCode
|
|
|
|
|
|
|
|
@@ -3891,7 +3891,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
|
|
|
e_idx_bh = ent_bh.wordOffset_end
|
|
|
if ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh-e_idx_fr==1:
|
|
|
sentence_text = sentences[ent_bh.sentence_index].sentence_text
|
|
|
- if sentence_text[e_idx_fr:b_idx_bh] in [';','、'] and sentence_text[e_idx_bh] in ['、', ',', '。']:
|
|
|
+ if sentence_text[e_idx_fr:b_idx_bh] in [';','、'] and (len(sentence_text)==e_idx_bh or sentence_text[e_idx_bh] in ['、', ',', '。']): # 修复多中标人刚好在文末index超出报错,例子 407126558
|
|
|
multi_winner_l.append(ent_bh.entity_text)
|
|
|
e_idx_fr = e_idx_bh
|
|
|
i = j + 1
|