|
@@ -60,6 +60,7 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
|
|
"district": {"predictor": None, "Lock": RLock()},
|
|
"district": {"predictor": None, "Lock": RLock()},
|
|
'tableprem': {"predictor": None, "Lock": RLock()},
|
|
'tableprem': {"predictor": None, "Lock": RLock()},
|
|
'candidate': {"predictor": None, "Lock": RLock()},
|
|
'candidate': {"predictor": None, "Lock": RLock()},
|
|
|
|
+ 'websource_tenderee': {"predictor": None, "Lock": RLock()},
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -107,6 +108,8 @@ def getPredictor(_type):
|
|
dict_predictor[_type]["predictor"] = TablePremExtractor()
|
|
dict_predictor[_type]["predictor"] = TablePremExtractor()
|
|
if _type == 'candidate':
|
|
if _type == 'candidate':
|
|
dict_predictor[_type]["predictor"] = CandidateExtractor()
|
|
dict_predictor[_type]["predictor"] = CandidateExtractor()
|
|
|
|
+ if _type == 'websource_tenderee':
|
|
|
|
+ dict_predictor[_type]['predictor'] = WebsourceTenderee()
|
|
return dict_predictor[_type]["predictor"]
|
|
return dict_predictor[_type]["predictor"]
|
|
raise NameError("no this type of predictor")
|
|
raise NameError("no this type of predictor")
|
|
|
|
|
|
@@ -1564,7 +1567,7 @@ class RoleRulePredictor():
|
|
break
|
|
break
|
|
if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
|
|
if str(_span[0][-len(str(_name)):]+_span[1] + _span[2][:len(str(_name))]).find(
|
|
_name) >= 0:
|
|
_name) >= 0:
|
|
- if p_entity.entity_text in agency_set: # 在代理人集合的作为代理人
|
|
|
|
|
|
+ if p_entity.entity_text in agency_set or re.search('(代理|管理|咨询|招投?标|采购)\w{,6}公司', p_entity.entity_text): # 在代理人集合的作为代理人
|
|
find_flag = True
|
|
find_flag = True
|
|
_label = 1
|
|
_label = 1
|
|
p_entity.label = _label
|
|
p_entity.label = _label
|
|
@@ -1575,6 +1578,8 @@ class RoleRulePredictor():
|
|
_label = 0
|
|
_label = 0
|
|
p_entity.label = _label
|
|
p_entity.label = _label
|
|
p_entity.values[int(_label)] = on_value
|
|
p_entity.values[int(_label)] = on_value
|
|
|
|
+ if 6<len(p_entity.entity_text) < 20: # 标题中角色长度在一定范围内的加分 优化类似367720967 标题中两个实体选择错误问题
|
|
|
|
+ p_entity.values[int(_label)] += 0.005
|
|
break
|
|
break
|
|
if p_entity.sentence_index >= 4:
|
|
if p_entity.sentence_index >= 4:
|
|
break
|
|
break
|
|
@@ -5322,7 +5327,10 @@ class TableTag2List():
|
|
try:
|
|
try:
|
|
if text_process != None:
|
|
if text_process != None:
|
|
# text = [re.sub('\xa0', '', text_process(cell, final=False)), 0]
|
|
# text = [re.sub('\xa0', '', text_process(cell, final=False)), 0]
|
|
- td_text = re.sub('\xa0', '', text_process(cell, final=False))
|
|
|
|
|
|
+ # td_text = re.sub('\xa0', '', text_process(cell, final=False))
|
|
|
|
+ td_text = re.sub('\s|\xa0', '', str(cell.get_text())) # 修复 370835008 td 内公司被p标签拆分为两半情况
|
|
|
|
+ if len(td_text)>30:
|
|
|
|
+ td_text = re.sub('\xa0', '', text_process(cell, final=False))
|
|
if td_text == "":
|
|
if td_text == "":
|
|
td_text = ' '
|
|
td_text = ' '
|
|
text = [td_text,0]
|
|
text = [td_text,0]
|
|
@@ -5472,6 +5480,7 @@ class TablePremExtractor(object):
|
|
'''
|
|
'''
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
, ',', text)
|
|
, ',', text)
|
|
|
|
+ text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
if text in nlp_enterprise:
|
|
if text in nlp_enterprise:
|
|
return text
|
|
return text
|
|
if len(text) > 50 or len(text)<4:
|
|
if len(text) > 50 or len(text)<4:
|
|
@@ -5794,6 +5803,7 @@ class CandidateExtractor(object):
|
|
'''
|
|
'''
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
, ',', text)
|
|
, ',', text)
|
|
|
|
+ text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
if text in nlp_enterprise:
|
|
if text in nlp_enterprise:
|
|
return text
|
|
return text
|
|
if len(text) > 50 or len(text)<4:
|
|
if len(text) > 50 or len(text)<4:
|
|
@@ -6056,6 +6066,57 @@ def role_special_predictor(web_source_name, content, nlp_enterprise):
|
|
if ser and ser.group(1) in nlp_enterprise:
|
|
if ser and ser.group(1) in nlp_enterprise:
|
|
return ser.group(1)
|
|
return ser.group(1)
|
|
|
|
|
|
|
|
class WebsourceTenderee():
    """Fill in a missing tenderee from a per-web-source lookup table.

    ``webno2ree`` maps a web source number to the tenderee name used for
    that source. It is loaded once, at construction time, from
    ``websource_tenderee.pkl`` located next to this module.
    """

    def __init__(self):
        # Despite the ``.pkl`` extension, the file is opened as UTF-8 text and
        # parsed with ``json.load``.
        # ``abspath`` keeps the lookup next to this module even when
        # ``__file__`` has no directory part; plain string concatenation would
        # then yield '/websource_tenderee.pkl' (the filesystem root).
        module_dir = os.path.dirname(os.path.abspath(__file__))
        table_path = os.path.join(module_dir, 'websource_tenderee.pkl')
        with open(table_path, 'r', encoding='utf-8') as f:
            self.webno2ree = json.load(f)

    @staticmethod
    def _new_tenderee_role(role_text):
        """Build a fresh tenderee entry for a project's ``roleList``."""
        return {
            'role_name': 'tenderee',
            'role_text': str(role_text),
            'role_money': {
                'money': 0,
                'money_unit': '',
                'floating_ratio': '',
                'downward_floating_ratio': '',
                'discount_ratio': '',
            },
            'linklist': [],
            'serviceTime': '',
            'address': '',
        }

    def get_websource_tenderee(self, web_source_no, prem):
        '''
        Adjust the tenderee in ``prem`` using the tenderee registered for the
        web source.

        Only ``prem[0]['prem']['Project']`` is touched:

        * no ``Project`` entry: one is created whose ``roleList`` holds only
          the registered tenderee;
        * a tenderee role with empty ``role_text``: the text is filled in;
        * a tenderee role with non-empty ``role_text``: left untouched;
        * no tenderee role: one is appended to ``roleList``.

        :param web_source_no: key looked up in ``self.webno2ree``.
        :param prem: list whose first element carries a ``'prem'`` dict;
            modified in place.
        :return: the same ``prem`` object. It is returned unchanged when the
            web source has no registered tenderee or ``prem`` is empty.
        '''
        web_ree = self.webno2ree.get(web_source_no, '')
        if not web_ree or not prem:
            return prem

        extracted = prem[0]['prem']
        if 'Project' not in extracted:
            extracted['Project'] = {
                'code': '',
                'tendereeMoney': 0,
                'roleList': [self._new_tenderee_role(web_ree)],
            }
            return prem

        role_list = extracted['Project']['roleList']
        for role in role_list:
            if role['role_name'] == 'tenderee':
                if role['role_text'] == "":
                    role['role_text'] = web_ree
                # NOTE: rules that would overwrite a short, non-empty tenderee
                # (e.g. when the registered tenderee ends in hospital/school
                # or "limited company" and the extracted one does not) are
                # intentionally disabled; an existing name always wins.
                break
        else:
            # No tenderee role at all: add the registered one.
            role_list.append(self._new_tenderee_role(web_ree))
        return prem
|
|
|
|
+
|
|
|
|
|
|
def getSavedModel():
|
|
def getSavedModel():
|
|
#predictor = FormPredictor()
|
|
#predictor = FormPredictor()
|