|
@@ -867,6 +867,7 @@ class EPCPredict():
|
|
|
|
|
|
data_x = []
|
|
data_x = []
|
|
points_entitys = []
|
|
points_entitys = []
|
|
|
|
+ pre_texts = []
|
|
for list_entity,list_sentence in zip(list_entitys,list_sentences):
|
|
for list_entity,list_sentence in zip(list_entitys,list_sentences):
|
|
|
|
|
|
p_entitys = 0
|
|
p_entitys = 0
|
|
@@ -883,6 +884,7 @@ class EPCPredict():
|
|
item_x = self.model_person.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
|
|
item_x = self.model_person.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
|
|
data_x.append(item_x)
|
|
data_x.append(item_x)
|
|
points_entitys.append(entity)
|
|
points_entitys.append(entity)
|
|
|
|
+ pre_texts.append(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=20))
|
|
|
|
|
|
p_entitys += 1
|
|
p_entitys += 1
|
|
|
|
|
|
@@ -890,13 +892,14 @@ class EPCPredict():
|
|
return None
|
|
return None
|
|
|
|
|
|
# return [data_x,points_entitys,dianhua]
|
|
# return [data_x,points_entitys,dianhua]
|
|
- return [data_x,points_entitys]
|
|
|
|
|
|
+ return [data_x,points_entitys, pre_texts]
|
|
|
|
|
|
def predict_person(self,list_sentences, list_entitys):
|
|
def predict_person(self,list_sentences, list_entitys):
|
|
datas = self.search_person_data(list_sentences, list_entitys)
|
|
datas = self.search_person_data(list_sentences, list_entitys)
|
|
if datas is None:
|
|
if datas is None:
|
|
return
|
|
return
|
|
points_entitys = datas[1]
|
|
points_entitys = datas[1]
|
|
|
|
+ pre_texts = datas[2]
|
|
# phone = datas[2]
|
|
# phone = datas[2]
|
|
if USE_PAI_EAS:
|
|
if USE_PAI_EAS:
|
|
_data = datas[0]
|
|
_data = datas[0]
|
|
@@ -922,6 +925,11 @@ class EPCPredict():
|
|
for i in range(len(predict_y)):
|
|
for i in range(len(predict_y)):
|
|
entity = points_entitys[i]
|
|
entity = points_entitys[i]
|
|
label = np.argmax(predict_y[i])
|
|
label = np.argmax(predict_y[i])
|
|
|
|
+ pre_text = ''.join(pre_texts[i][0])
|
|
|
|
+ # print('pre_text', pre_text)
|
|
|
|
+ if label==0 and re.search('(谈判|磋商|询价|资格审查|评审专家|(评选|议标|评标|评审)委员会?|专家|评委)(小?组|小?组成员)?(成员|名单)[:,](\w{2,4}((组长)|(成员))?[、,,])*$', pre_text):
|
|
|
|
+ # print(entity.entity_text, re.search('(谈判|磋商|询价|资格审查|评审专家|(评选|议标|评标|评审)委员会?|专家|评委)(小?组|小?组成员)?(成员|名单)[:,](\w{2,4}((组长)|(成员))?[、,,])*$', pre_text).group(0))
|
|
|
|
+ label = 4
|
|
values = []
|
|
values = []
|
|
for item in predict_y[i]:
|
|
for item in predict_y[i]:
|
|
values.append(item)
|
|
values.append(item)
|
|
@@ -4821,9 +4829,9 @@ class TablePremExtractor(object):
|
|
def __init__(self):
|
|
def __init__(self):
|
|
'''各要素表头规则'''
|
|
'''各要素表头规则'''
|
|
self.head_rule_dic = {
|
|
self.head_rule_dic = {
|
|
- 'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])(编号|编码)",
|
|
|
|
|
|
+ 'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
- "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
|
|
|
|
|
|
+ "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
|
|
"win_sort": "是否(中标|成交)|排名|排序|名次|未(中标|成交)原因",
|
|
"win_sort": "是否(中标|成交)|排名|排序|名次|未(中标|成交)原因",
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
@@ -4878,7 +4886,7 @@ class TablePremExtractor(object):
|
|
return flag, contain_header, header_dic
|
|
return flag, contain_header, header_dic
|
|
elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
|
|
elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
|
|
return flag,contain_header, header_dic
|
|
return flag,contain_header, header_dic
|
|
- elif len(set(td_list) & self.headerset) >= 2 or (len(set(td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
|
|
|
|
+ elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
contain_header = True
|
|
contain_header = True
|
|
return flag, contain_header, dict()
|
|
return flag, contain_header, dict()
|
|
|
|
|
|
@@ -5039,8 +5047,9 @@ class TablePremExtractor(object):
|
|
"role_text": tenderer,
|
|
"role_text": tenderer,
|
|
"serviceTime": ""
|
|
"serviceTime": ""
|
|
})
|
|
})
|
|
- if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃
|
|
|
|
|
|
+ if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0: # 只有项目编号和名称的 丢弃 并不再继续往下匹配
|
|
prem_dic.pop(package)
|
|
prem_dic.pop(package)
|
|
|
|
+ break
|
|
if multi_same_package:
|
|
if multi_same_package:
|
|
for k, v in package_fix2raw.items():
|
|
for k, v in package_fix2raw.items():
|
|
if k in prem_dic:
|
|
if k in prem_dic:
|
|
@@ -5164,7 +5173,7 @@ class CandidateExtractor(object):
|
|
return flag, contain_header, dict()
|
|
return flag, contain_header, dict()
|
|
if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
|
|
if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
|
|
return flag, contain_header, header_dic
|
|
return flag, contain_header, header_dic
|
|
- elif len(set(td_list) & self.headerset) >= 2 or (len(set(td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
|
|
|
|
+ elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(fix_td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
contain_header = True
|
|
contain_header = True
|
|
return flag, contain_header, dict()
|
|
return flag, contain_header, dict()
|
|
|
|
|