|
@@ -464,6 +464,12 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
|
|
|
win_tenderer_set = set() # 记录所有预测为中标的实体集合
|
|
|
# print(PackageList)
|
|
|
#拿到各个实体的packageName,packageCode
|
|
|
+ main_contain_winner = False # 2024/10/11 判断正文是否包含中标人
|
|
|
+ for entity in list_entity:
|
|
|
+ if entity.entity_type in ['org','company'] and entity.label==2 and entity.values[entity.label]>0.7 and entity.in_attachment==False:
|
|
|
+ main_contain_winner = True
|
|
|
+ break
|
|
|
+
|
|
|
for entity in list_entity:
|
|
|
if entity.entity_type in ['org','company']:
|
|
|
#限制附件里角色values[label]最大概率prob
|
|
@@ -477,6 +483,8 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
|
|
|
values = entity.values
|
|
|
role_prob = float(values[int(entity.label)])
|
|
|
if role_prob>=on_value and str(entity.label)!="5":
|
|
|
+ if main_contain_winner and entity.in_attachment and entity.label in [2,3,4]: # 2024/10/11 正文包含中标人,不再提取附件中标人 避免 例:504046747 附件角色OCR错字变两个标段
|
|
|
+ continue
|
|
|
if str(entity.label) in ["0","1"]:
|
|
|
packageName = "Project"
|
|
|
else:
|
|
@@ -583,7 +591,7 @@ def getPackageScopePattern():
|
|
|
for item in df["list_word"]:
|
|
|
item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
|
|
|
pattern += item+"|"
|
|
|
- pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
|
|
|
+ pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}|##attachment##"
|
|
|
return pattern
|
|
|
|
|
|
pattern_packageScope = getPackageScopePattern()
|
|
@@ -824,11 +832,13 @@ def getPackagesFromArticle(list_sentence, list_entity):
|
|
|
scope_begin = [PackageList_scope[j]["sentence_index"],
|
|
|
PackageList_scope[j]["offsetWords_begin"]]
|
|
|
else:
|
|
|
- if j == 0:
|
|
|
- scope_begin = [0, 0]
|
|
|
- else:
|
|
|
- scope_begin = [PackageList_scope[j - 1]["sentence_index"],
|
|
|
- PackageList_scope[j - 1]["offsetWords_begin"]]
|
|
|
+ scope_begin = [PackageList_scope[j]["sentence_index"], 0] # 2024/10/10 改为包作用域开始位置为包号所在句子开头
|
|
|
+ # if j == 0:
|
|
|
+ # scope_begin = [0, 0]
|
|
|
+ # else:
|
|
|
+ # scope_begin = [PackageList_scope[j - 1]["sentence_index"],
|
|
|
+ # PackageList_scope[j - 1]["offsetWords_begin"]]
|
|
|
+
|
|
|
if j == len(PackageList_scope) - 1:
|
|
|
scope_end = [list_sentence[-1].sentence_index,
|
|
|
changeIndexFromWordToWords(list_sentence[-1].tokens,
|
|
@@ -943,7 +953,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
packDict[packageName]["roleList"][i].ratio = ratio.ratio_value
|
|
|
def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):
|
|
|
for i in range(len(packDict[packageName]["roleList"])):
|
|
|
- if packDict[packageName]["roleList"][i].entity_text==entity:
|
|
|
+ if packDict[packageName]["roleList"][i].entity_text==entity and not packDict[packageName]["roleList"][i].serviceTime:
|
|
|
# packDict[packageName]["roleList"][i].serviceTime = serviceTime.entity_text
|
|
|
packDict[packageName]["roleList"][i].serviceTime = extract_serviceTime(serviceTime.entity_text,"")
|
|
|
|
|
@@ -1591,7 +1601,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
|
|
|
continue
|
|
|
# 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
|
|
|
- if _subject.label in [2,3,4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
|
|
|
+ if _subject.label in [2,3,4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
|
|
|
+ continue
|
|
|
+ # 角色为招标/代理人,排除"纪检|监察"相关的联系人
|
|
|
+ if _subject.label in [0,1] and re.search("纪检|监察",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]):
|
|
|
continue
|
|
|
if _object.sentence_index!=0 and _object.wordOffset_begin<=10:
|
|
|
if _subject.label in [2, 3, 4] and re.search("请.{0,4}联系",
|
|
@@ -2024,7 +2037,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
if entity.label in [2, 3, 4] and distance>=20:
|
|
|
break
|
|
|
# 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
|
|
|
- if entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
|
|
|
+ if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
|
|
|
+ break
|
|
|
+ # 角色为招标/代理人,排除"纪检|监察"相关的联系人
|
|
|
+ if entity.label in [0,1] and re.search("纪检|监察",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
|
|
|
break
|
|
|
if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
|
|
|
if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",
|
|
@@ -2109,7 +2125,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
new_split_list[split_index][1]:
|
|
|
mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
|
|
|
if re.search(key_phone, mid_sentence):
|
|
|
- if entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系",mid_sentence[-8:]):
|
|
|
+ if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系",mid_sentence[-8:]):
|
|
|
pass
|
|
|
else:
|
|
|
distance = 1
|
|
@@ -2162,7 +2178,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
|
|
|
if next_entity.entity_type == 'person' and _phone in p_phone:
|
|
|
pass
|
|
|
- elif entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系", mid_sentence[-8:]):
|
|
|
+ elif entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", mid_sentence[-8:]):
|
|
|
pass
|
|
|
else:
|
|
|
distance = (tokens_num_dict[
|
|
@@ -2913,6 +2929,29 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
if get_tenderee_contacts:
|
|
|
break
|
|
|
|
|
|
+ # 如果同一个电话连到了不同的单位就直接去掉(2024-09-03 新增)
|
|
|
+ get_phone_dict = dict()
|
|
|
+ for k in PackDict.keys():
|
|
|
+ for i in range(len(PackDict[k]["roleList"])):
|
|
|
+ for item in PackDict[k]["roleList"][i].linklist:
|
|
|
+ if item[1]:
|
|
|
+ if item[1] not in get_phone_dict:
|
|
|
+ get_phone_dict[item[1]] = set()
|
|
|
+ get_phone_dict[item[1]].add(PackDict[k]["roleList"][i].entity_text)
|
|
|
+ # print(get_phone_dict)
|
|
|
+ remove_phone = []
|
|
|
+ for phone,role_list in get_phone_dict.items():
|
|
|
+ if len(role_list)>1:
|
|
|
+ remove_phone.append(phone)
|
|
|
+ for k in PackDict.keys():
|
|
|
+ for i in range(len(PackDict[k]["roleList"])):
|
|
|
+ remove_list = []
|
|
|
+ for item in PackDict[k]["roleList"][i].linklist:
|
|
|
+ if item[1] and item[1] in remove_phone:
|
|
|
+ remove_list.append(item)
|
|
|
+ for _item in remove_list:
|
|
|
+ PackDict[k]["roleList"][i].linklist.remove(_item)
|
|
|
+
|
|
|
for pack in PackDict.keys():
|
|
|
for i in range(len(PackDict[pack]["roleList"])):
|
|
|
PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
|
|
@@ -4223,9 +4262,9 @@ def limit_maximum_amount(dic, list_entity):
|
|
|
if l["role_money"]['money_unit'] == '元' and re.search('^\d{1,2}\.\d{4,6}$', str(l["role_money"]['money'])):
|
|
|
# print('单位元小金额且格式类似万元的乘以万倍')
|
|
|
l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) * 10000)
|
|
|
- else:
|
|
|
- # print('中标金额小于限额:%d元 去除' % minximum_amount)
|
|
|
- l["role_money"]['money'] = 0
|
|
|
+ # else: # 20241011 取消小于最低金额改为0 避免小金额不提取 例:520248605
|
|
|
+ # # print('中标金额小于限额:%d元 去除' % minximum_amount)
|
|
|
+ # l["role_money"]['money'] = 0
|
|
|
|
|
|
if float(value['tendereeMoney']) > maximum_amount:
|
|
|
flag = 1
|
|
@@ -4246,9 +4285,9 @@ def limit_maximum_amount(dic, list_entity):
|
|
|
if value['tendereeMoneyUnit'] == '元' and re.search('^\d{1,2}\.\d{4,6}$', str(value['tendereeMoney'])):
|
|
|
# print('单位元小金额且格式类似万元的乘以万倍')
|
|
|
value['tendereeMoney'] = str(Decimal(value['tendereeMoney']) * 10000)
|
|
|
- else:
|
|
|
- # print('招标金额小于限额:%d元 去除' % minximum_amount)
|
|
|
- value['tendereeMoney'] = 0
|
|
|
+ # else: # 20241011 取消小于最低金额改为0 避免小金额不提取 例:520248605
|
|
|
+ # # print('招标金额小于限额:%d元 去除' % minximum_amount)
|
|
|
+ # value['tendereeMoney'] = 0
|
|
|
|
|
|
|
|
|
def limit_maximum_amount_backup(prem, industry):
|
|
@@ -4296,69 +4335,66 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
|
|
|
:return:
|
|
|
'''
|
|
|
try:
|
|
|
- if 'win_tenderer' in str(prem) and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|[,;]成:|(成[),]|与[^,。]{6,100}联合体', list_articles[0].content):
|
|
|
+ if 'win_tenderer' in str(prem[0]['prem']) and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|[,;]成:|(成[),]|与[^,。]{6,100}联合体', list_articles[0].content):
|
|
|
sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
|
|
|
- for project in prem[0].values():
|
|
|
- if not isinstance(project, dict):
|
|
|
- continue
|
|
|
- for v in project.values():
|
|
|
- for d in v['roleList']:
|
|
|
- if d.get('role_name', '') == 'win_tenderer':
|
|
|
- winner = d.get('role_text')
|
|
|
- join_l = [winner]
|
|
|
- for list_entity in list_entitys:
|
|
|
- for i in range(len(list_entity)-1):
|
|
|
- _entity = list_entity[i]
|
|
|
- b = _entity.wordOffset_begin
|
|
|
- e = _entity.wordOffset_end
|
|
|
- if _entity.entity_type in ['org', 'company'] and _entity.label==2\
|
|
|
- and _entity.entity_text==winner:
|
|
|
- s = sentences[_entity.sentence_index].sentence_text
|
|
|
- find_joint = 0 # 是否包含联合体
|
|
|
- for j in range(i+1, len(list_entity)):
|
|
|
- behind_entity = list_entity[j]
|
|
|
- b2 = behind_entity.wordOffset_begin
|
|
|
- e2 = behind_entity.wordOffset_end
|
|
|
- if _entity.sentence_index == behind_entity.sentence_index and behind_entity.entity_type in ['org', 'company'] \
|
|
|
- and b2-e<13 and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[,;]成:|(成)$', s[e:b2]) or \
|
|
|
- re.search('(联合(体|方|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|^(成[),]$', s[e2:e2+10]) and behind_entity.label in [2, 5]:
|
|
|
- join_l.append(behind_entity.entity_text)
|
|
|
- b = b2
|
|
|
- e = e2
|
|
|
- find_joint = 1
|
|
|
- elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。'] or s[e2:e2+3]=='联合体'):
|
|
|
- join_l.append(behind_entity.entity_text)
|
|
|
- b = b2
|
|
|
- e = e2
|
|
|
- elif e == e2: # 修复重复实体导致中断情况
|
|
|
- continue
|
|
|
- else:
|
|
|
- break
|
|
|
- if len(join_l)>1:
|
|
|
- d['win_tenderer_joint'] = ','.join(set(join_l))
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- # behind_entity = list_entity[i + 1]
|
|
|
- # if _entity.sentence_index== behind_entity.sentence_index and _entity.entity_type in ['org', 'company'] and _entity.label==2\
|
|
|
- # and _entity.entity_text==winner and behind_entity.entity_type in ['org', 'company'] and behind_entity.label==5:
|
|
|
- # s = sentences[_entity.sentence_index].sentence_text
|
|
|
- # b = _entity.wordOffset_begin
|
|
|
- # e = _entity.wordOffset_end
|
|
|
- # b2 = behind_entity.wordOffset_begin
|
|
|
- # e2 = behind_entity.wordOffset_end
|
|
|
- # if re.search('(联合体)', s[e2:e2+6]) and b2-e<3:
|
|
|
- # print('联合体:', s[max(0, b-10):e2+10])
|
|
|
- # d['win_tenderer_joint'] = '%s,%s'%(_entity.entity_text, behind_entity.entity_text)
|
|
|
- # break
|
|
|
- # elif re.search('(联合体((牵头|主办)(人|方|单位)|主体)|牵头(人|方|单位))|(联合体)?成员:|特殊普通合伙:', s[e:b2]) and b2-e<10:
|
|
|
- # d['win_tenderer_joint'] = '%s,%s' % (_entity.entity_text, behind_entity.entity_text)
|
|
|
- # print('联合体:', s[max(0, b - 10):e2 + 10])
|
|
|
- # break
|
|
|
+ for v in prem[0]['prem'].values():
|
|
|
+ for d in v['roleList']:
|
|
|
+ if d.get('role_name', '') == 'win_tenderer':
|
|
|
+ winner = d.get('role_text')
|
|
|
+ join_l = [winner]
|
|
|
+ for list_entity in list_entitys:
|
|
|
+ for i in range(len(list_entity)-1):
|
|
|
+ _entity = list_entity[i]
|
|
|
+ b = _entity.wordOffset_begin
|
|
|
+ e = _entity.wordOffset_end
|
|
|
+ if _entity.entity_type in ['org', 'company'] and _entity.label==2\
|
|
|
+ and _entity.entity_text==winner:
|
|
|
+ s = sentences[_entity.sentence_index].sentence_text
|
|
|
+ find_joint = 0 # 是否包含联合体
|
|
|
+ for j in range(i+1, len(list_entity)):
|
|
|
+ behind_entity = list_entity[j]
|
|
|
+ b2 = behind_entity.wordOffset_begin
|
|
|
+ e2 = behind_entity.wordOffset_end
|
|
|
+ if _entity.sentence_index == behind_entity.sentence_index and behind_entity.entity_type in ['org', 'company'] \
|
|
|
+ and b2-e<13 and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[,;]成:|(成)$', s[e:b2]) or \
|
|
|
+ re.search('(联合(体|方|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|^(成[),]$', s[e2:e2+10]) and behind_entity.label in [2, 5]:
|
|
|
+ join_l.append(behind_entity.entity_text)
|
|
|
+ b = b2
|
|
|
+ e = e2
|
|
|
+ find_joint = 1
|
|
|
+ elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。'] or s[e2:e2+3]=='联合体'):
|
|
|
+ join_l.append(behind_entity.entity_text)
|
|
|
+ b = b2
|
|
|
+ e = e2
|
|
|
+ elif e == e2: # 修复重复实体导致中断情况
|
|
|
+ continue
|
|
|
+ else:
|
|
|
+ break
|
|
|
+ if len(join_l)>1:
|
|
|
+ d['win_tenderer_joint'] = ','.join(set(join_l))
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ # behind_entity = list_entity[i + 1]
|
|
|
+ # if _entity.sentence_index== behind_entity.sentence_index and _entity.entity_type in ['org', 'company'] and _entity.label==2\
|
|
|
+ # and _entity.entity_text==winner and behind_entity.entity_type in ['org', 'company'] and behind_entity.label==5:
|
|
|
+ # s = sentences[_entity.sentence_index].sentence_text
|
|
|
+ # b = _entity.wordOffset_begin
|
|
|
+ # e = _entity.wordOffset_end
|
|
|
+ # b2 = behind_entity.wordOffset_begin
|
|
|
+ # e2 = behind_entity.wordOffset_end
|
|
|
+ # if re.search('(联合体)', s[e2:e2+6]) and b2-e<3:
|
|
|
+ # print('联合体:', s[max(0, b-10):e2+10])
|
|
|
+ # d['win_tenderer_joint'] = '%s,%s'%(_entity.entity_text, behind_entity.entity_text)
|
|
|
+ # break
|
|
|
+ # elif re.search('(联合体((牵头|主办)(人|方|单位)|主体)|牵头(人|方|单位))|(联合体)?成员:|特殊普通合伙:', s[e:b2]) and b2-e<10:
|
|
|
+ # d['win_tenderer_joint'] = '%s,%s' % (_entity.entity_text, behind_entity.entity_text)
|
|
|
+ # print('联合体:', s[max(0, b - 10):e2 + 10])
|
|
|
+ # break
|
|
|
except Exception as e:
|
|
|
print('获取联合体抛出异常', e)
|
|
|
|
|
|
-def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
|
|
|
+def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, all_winner=False):
|
|
|
'''
|
|
|
获取多中标人及正文、附件所有金额,多中标人multi_winner写入prem,返回金额列表
|
|
|
:param channel_dic:
|
|
@@ -4369,7 +4405,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
|
|
|
'''
|
|
|
|
|
|
def add_multi_winner(pack_l, winner_l):
|
|
|
- if len(prem[0]) > 1 and len(set([it[0] for it in pack_l])) > 1: # 多标段多中标人处理
|
|
|
+ if len(prem[0]['prem']) > 1 and len(set([it[0] for it in pack_l])) > 1: # 多标段多中标人处理
|
|
|
pk_dic = {}
|
|
|
for ent in winner_l:
|
|
|
for i in range(len(pack_l)):
|
|
@@ -4395,40 +4431,33 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
|
|
|
multi_winner = multi_winner - tenderee_or_agency
|
|
|
if len(multi_winner) < 2:
|
|
|
continue
|
|
|
- for project in prem[0].values():
|
|
|
- if not isinstance(project, dict):
|
|
|
- continue
|
|
|
- for k, v in project.items():
|
|
|
- if pk == k:
|
|
|
- for d in v['roleList']:
|
|
|
- if d.get('role_name', '') == 'win_tenderer':
|
|
|
- if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
|
|
|
- d['multi_winner'] = ','.join(set(multi_winner))
|
|
|
- else:
|
|
|
- multi_winner = set([it[0] for it in winner_l]) - tenderee_or_agency
|
|
|
- if len(multi_winner) > 1:
|
|
|
- for project in prem[0].values():
|
|
|
- if not isinstance(project, dict):
|
|
|
- continue
|
|
|
- for v in project.values():
|
|
|
+ for k, v in prem[0]['prem'].items():
|
|
|
+ if pk == k:
|
|
|
for d in v['roleList']:
|
|
|
if d.get('role_name', '') == 'win_tenderer':
|
|
|
if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
|
|
|
d['multi_winner'] = ','.join(set(multi_winner))
|
|
|
- break
|
|
|
+ elif 0 < len(prem[0]['prem']) < 3: # 修复 单包多中标人 例:285780273
|
|
|
+ multi_winner = set([it[0] for it in winner_l]) - tenderee_or_agency
|
|
|
+ if len(multi_winner) > 1:
|
|
|
+ for v in prem[0]['prem'].values():
|
|
|
+ for d in v['roleList']:
|
|
|
+ if d.get('role_name', '') == 'win_tenderer':
|
|
|
+ if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
|
|
|
+ d['multi_winner'] = ','.join(set(multi_winner))
|
|
|
+ break
|
|
|
|
|
|
moneys = []
|
|
|
moneys_attachment = []
|
|
|
- if channel_dic['docchannel']['docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
|
|
|
+ if channel_dic['docchannel']['life_docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
|
|
|
sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
|
|
|
- entitys = sorted(list_entitys[0], key=lambda x: x.sentence_index)
|
|
|
finalists = [] # 入围供应商
|
|
|
multi_winner_l = [] # 保存中标人名称列表
|
|
|
tenderee_or_agency = set()
|
|
|
package_l = []
|
|
|
i = 0
|
|
|
- while i < len(entitys)-1:
|
|
|
- ent = entitys[i]
|
|
|
+ while i < len(list_entitys[0])-1:
|
|
|
+ ent = list_entitys[0][i]
|
|
|
b_idx_fr = ent.wordOffset_begin
|
|
|
e_idx_fr = ent.wordOffset_end
|
|
|
i += 1
|
|
@@ -4440,19 +4469,18 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
|
|
|
moneys.append(money)
|
|
|
elif ent.entity_type in ['package']:
|
|
|
package_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
|
|
|
- elif ent.entity_type in ['org', 'company'] and ent.label in [0,1] and ent.values[ent.label] > 0.8:
|
|
|
- tenderee_or_agency.add(ent.entity_text)
|
|
|
- elif ent.entity_type in ['org', 'company'] and ent.label == 2:
|
|
|
+ elif ent.entity_type in ['org', 'company']:
|
|
|
sentence_text = sentences[ent.sentence_index].sentence_text
|
|
|
pre_text = sentence_text[max(0, b_idx_fr - 10):b_idx_fr]
|
|
|
- if ent.values[ent.label] > 0.8:
|
|
|
+ if ent.label in [0,1] and ent.values[ent.label] > 0.8:
|
|
|
+ tenderee_or_agency.add(ent.entity_text)
|
|
|
+ elif ent.label == 2 and (ent.values[ent.label] > 0.8 or all_winner):
|
|
|
multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
|
|
|
for j in range(i, len(list_entitys[0])):
|
|
|
ent_bh = list_entitys[0][j]
|
|
|
b_idx_bh = ent_bh.wordOffset_begin
|
|
|
e_idx_bh = ent_bh.wordOffset_end
|
|
|
- if ent_bh.entity_type in ['org','company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh - e_idx_fr in [1, 2]:
|
|
|
- sentence_text = sentences[ent_bh.sentence_index].sentence_text
|
|
|
+ if ent_bh.entity_type in ['org','company'] and ent_bh.label in [2,5] and ent_bh.sentence_index == ent.sentence_index and b_idx_bh - e_idx_fr in [1, 2]:
|
|
|
if sentence_text[e_idx_fr:b_idx_bh] in [';', '、', '&', ',', '/', '//'] and (
|
|
|
len(sentence_text) == e_idx_bh or sentence_text[e_idx_bh] in [';', '、', '&', ',','/', '//','。']): # 修复多中标人刚好在文末index超出报错,例子 407126558
|
|
|
multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
|
|
@@ -4460,7 +4488,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
|
|
|
i = j + 1
|
|
|
else:
|
|
|
break
|
|
|
- elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr:
|
|
|
+ elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr: # 两实体间没符号分割情况
|
|
|
multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
|
|
|
e_idx_fr = e_idx_bh
|
|
|
i = j + 1
|
|
@@ -4470,6 +4498,8 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
|
|
|
break
|
|
|
if re.search('入围', pre_text) and re.search('未入围', pre_text)==None:
|
|
|
finalists.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
|
|
|
+ elif all_winner==1 and ent.label in [3,4,5] and re.search('第[一二三四五六七八九十0-9]+名|候选(人|单位)|入围(单位|供应商)|投标银行', pre_text) and re.search('未', pre_text)==None:
|
|
|
+ multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
|
|
|
|
|
|
if len(multi_winner_l)>=2:
|
|
|
winner_main = [it for it in multi_winner_l if not it[3]]
|
|
@@ -4532,6 +4562,10 @@ def update_prem(old_prem, new_prem, in_attachment=False):
|
|
|
k = list(old_prem.keys()-set(['Project']))[0]
|
|
|
k_new = list(new_prem.keys())[0]
|
|
|
new_prem[k] = new_prem.pop(k_new)
|
|
|
+ elif len(old_prem) == 1 and len(new_prem) == 1 and 'Project' not in old_prem and set(new_prem)&set(old_prem)==set(): # 如果表格提取包与非表格提取都是一个包且不同,把表格提取包名替换为非表格包名
|
|
|
+ k = list(old_prem.keys()-set(['Project']))[0]
|
|
|
+ k_new = list(new_prem.keys())[0]
|
|
|
+ new_prem[k] = new_prem.pop(k_new)
|
|
|
|
|
|
if len(new_prem) == len(old_prem) == 1 and 'Project' not in new_prem and 'Project' in old_prem: # 如果表格提取到包号,非表格没提取到,合并到Project
|
|
|
k = list(new_prem.keys())[0]
|
|
@@ -4552,6 +4586,8 @@ def update_prem(old_prem, new_prem, in_attachment=False):
|
|
|
tmp_l.append(d2)
|
|
|
if d2['role_text'] != "":
|
|
|
d['role_text'] = d2['role_text']
|
|
|
+ if d2['serviceTime'] != "":
|
|
|
+ d['serviceTime'] = d2['serviceTime']
|
|
|
if float(d2['role_money']['money']) != 0: # 如果表格提取的金额不为0才替换
|
|
|
d['role_money']['money'] = d2['role_money']['money']
|
|
|
d['role_money']['money_unit'] = d2['role_money']['money_unit']
|
|
@@ -4585,12 +4621,14 @@ def update_prem(old_prem, new_prem, in_attachment=False):
|
|
|
tmp_l.append(d2)
|
|
|
if d2['role_text'] != "":
|
|
|
d['role_text'] = d2['role_text']
|
|
|
+ if d2['serviceTime'] != "":
|
|
|
+ d['serviceTime'] = d2['serviceTime']
|
|
|
if float(d2['role_money']['money']) != 0: # 如果表格提取的金额不为0才替换
|
|
|
d['role_money']['money'] = d2['role_money']['money']
|
|
|
d['role_money']['money_unit'] = d2['role_money']['money_unit']
|
|
|
- for k in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等
|
|
|
- if d2[k]:
|
|
|
- d[k] = d2[k]
|
|
|
+ for k2 in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等
|
|
|
+ if d2[k2]:
|
|
|
+ d[k2] = d2[k2]
|
|
|
for d2 in v['roleList']:
|
|
|
if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
|
|
|
old_prem[k]['roleList'].append(d2)
|
|
@@ -4601,7 +4639,7 @@ def update_prem(old_prem, new_prem, in_attachment=False):
|
|
|
|
|
|
# return old_prem
|
|
|
|
|
|
-def confirm_prem(prem, channel_dic):
|
|
|
+def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0):
|
|
|
'''
|
|
|
规则检查纠正prem,如果Project包中标人在其他包中标人,去掉project包中标角色;如果有其他包中标人,去掉roleList为空的包;
|
|
|
:param prem: prem 字段字典
|
|
@@ -4610,6 +4648,8 @@ def confirm_prem(prem, channel_dic):
|
|
|
if len(prem) > 1: # 表格提取到中标人的,去掉project包中标人
|
|
|
pro_winner = set()
|
|
|
other_winner = set()
|
|
|
+ other_winner_prob = 0
|
|
|
+ pro_winner_prob = 0
|
|
|
empty_roleList = []
|
|
|
for k in prem:
|
|
|
prem[k]['uuid'] = str(uuid.uuid4()) # 20240627 每个包都添加uuid
|
|
@@ -4623,21 +4663,33 @@ def confirm_prem(prem, channel_dic):
|
|
|
pro_winner.update(set(d['win_tenderer_joint'].split(',')))
|
|
|
if 'multi_winner' in d:
|
|
|
pro_winner.update(set(d['multi_winner'].split(',')))
|
|
|
+ if d['role_name'] == 'win_tenderer' and d.get('role_prob', 0)>0.6:
|
|
|
+ pro_winner_prob = d.get('role_prob', 0)
|
|
|
else:
|
|
|
other_winner.add(d['role_text'])
|
|
|
if 'win_tenderer_joint' in d:
|
|
|
other_winner.update(set(d['win_tenderer_joint'].split(',')))
|
|
|
if 'multi_winner' in d:
|
|
|
other_winner.update(set(d['multi_winner'].split(',')))
|
|
|
- if pro_winner & other_winner != set():
|
|
|
+ if d['role_name'] == 'win_tenderer' and d.get('role_prob', 0)>0.6:
|
|
|
+ other_winner_prob = d.get('role_prob', 0)
|
|
|
+ if pro_winner!=set() and (pro_winner & other_winner != set() or other_winner_prob>pro_winner_prob): # 如果默认包与其他包中标人重复或其他包中标人概率比默认包大,删除默认包中标人
|
|
|
prem['Project']['roleList'] = [d for d in prem['Project']['roleList'] if
|
|
|
d['role_name'] not in ['win_tenderer', 'second_tenderer',
|
|
|
'third_tenderer']]
|
|
|
+ elif other_winner_prob<pro_winner_prob and len(prem)==2: # 两个包情况,如果默认包中标人概率比其他包大,删除其他包
|
|
|
+ rm_k = [k for k in prem if k != 'Project']
|
|
|
+ for k in rm_k:
|
|
|
+ prem.pop(k)
|
|
|
if other_winner and channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告']:
|
|
|
for k in empty_roleList:
|
|
|
prem.pop(k)
|
|
|
elif "Project" in prem:
|
|
|
prem['Project']['uuid'] = str(uuid.uuid4())
|
|
|
+ if is_deposit_project and float(total_tendereeMoney)!=0 and len(prem)==1: #20241107 存款类项目有总投资没招标金额且只有一个标段,把总投资作招标金额
|
|
|
+ for k in prem:
|
|
|
+ if float(prem[k]['tendereeMoney'])==0:
|
|
|
+ prem[k]['tendereeMoney'] = total_tendereeMoney
|
|
|
|
|
|
|
|
|
def fix_single_source(prem, channel_dic, original_docchannel):
|