|
@@ -277,8 +277,8 @@ class CodeNamePredict():
|
|
|
|
|
|
def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
|
|
def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
|
|
#@summary: 获取每篇文章的code和name
|
|
#@summary: 获取每篇文章的code和name
|
|
- pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
|
|
|
|
-
|
|
|
|
|
|
+ # pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
|
|
|
|
+ pattern_score = re.compile('建设项目|服务项目|工程项目|工程施工|建设工程|服务中心|基础设施|物业管理|工程设计|妇幼保健|咨询服务|管理系统|管理中心|改建工程|配套工程|公安局|幼儿园|管理局|使用权|办公楼|教育局|管理处|图书馆|经营权|项目|采购|工程|改造|服务|设备|中心|医院|系统|建设|监理|施工|维修|学院|安装|设计|关于|标段|招标|技术|询价|管理|学校|小学|中学|平台|提升|设施|检测|整治|社区|装修|政府|绿化|物资|租赁|地块|医疗|编制|公开|规划|监控|教育|维护|校区|治理|升级|安置|竞价|购置|评估|勘察|承包|实验|大学|材料|生产|耗材|招租|硬化|维保|用地|消防|审计|拍卖|物业|入围|养护|机关|企业|用房|出让|资产|分局|验收|宣传|处置|校园|研究|咨询|修缮|更换|装饰|劳务|保养|物流|出租|局|院')
|
|
result = []
|
|
result = []
|
|
index_unk = self.word2index.get("<unk>")
|
|
index_unk = self.word2index.get("<unk>")
|
|
# index_pad = self.word2index.get("<pad>")
|
|
# index_pad = self.word2index.get("<pad>")
|
|
@@ -393,20 +393,40 @@ class CodeNamePredict():
|
|
|
|
|
|
#add code to entitys
|
|
#add code to entitys
|
|
list_entity.append(temp_entitys[h])
|
|
list_entity.append(temp_entitys[h])
|
|
-
|
|
|
|
- if the_code not in code_set:
|
|
|
|
|
|
+ if re.search(',|/|;|、|,', the_code) and len(the_code)>25:
|
|
|
|
+ for it in re.split(',|/|;|、|,', the_code):
|
|
|
|
+ if len(it) > 8:
|
|
|
|
+ if it not in code_set:
|
|
|
|
+ code_set.add(it)
|
|
|
|
+ item['code'].append(it)
|
|
|
|
+ elif len(item['code']) > 0:
|
|
|
|
+ new_it = item['code'][-1] + re.search(',|/|;|、|,', the_code).group(0) + it
|
|
|
|
+ if new_it not in code_set:
|
|
|
|
+ code_set.add(new_it)
|
|
|
|
+ item['code'][-1] = new_it
|
|
|
|
+ else:
|
|
|
|
+ if the_code not in code_set:
|
|
|
|
+ code_set.add(the_code)
|
|
|
|
+ item['code'].append(the_code)
|
|
|
|
+ break
|
|
|
|
+ elif the_code not in code_set:
|
|
code_set.add(the_code)
|
|
code_set.add(the_code)
|
|
- item['code'] = list(code_set)
|
|
|
|
|
|
+ item['code'].append(the_code)
|
|
|
|
+
|
|
|
|
+ # if the_code not in code_set:
|
|
|
|
+ # code_set.add(the_code)
|
|
|
|
+ # item['code'] = list(code_set)
|
|
for iter in re.finditer(self.PN_pattern,join_predict):
|
|
for iter in re.finditer(self.PN_pattern,join_predict):
|
|
_name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
|
|
_name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
|
|
|
|
|
|
#add name to entitys
|
|
#add name to entitys
|
|
_entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
|
|
_entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
|
|
list_entity.append(_entity)
|
|
list_entity.append(_entity)
|
|
- w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
|
|
|
|
|
|
+ # w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
|
|
|
|
+ w = 1 if re.search('(项目|工程|招标|采购|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题|项目)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
|
|
if _name not in dict_name_freq_score:
|
|
if _name not in dict_name_freq_score:
|
|
# dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
|
|
# dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
|
|
- dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w]
|
|
|
|
|
|
+ dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w+(5-sentence.sentence_index)*0.2]
|
|
else:
|
|
else:
|
|
dict_name_freq_score[_name][0] += 1
|
|
dict_name_freq_score[_name][0] += 1
|
|
'''
|
|
'''
|
|
@@ -423,18 +443,21 @@ class CodeNamePredict():
|
|
_begin_index += _LEN
|
|
_begin_index += _LEN
|
|
|
|
|
|
list_name_freq_score = []
|
|
list_name_freq_score = []
|
|
|
|
+ # print('模型预测项目名称:', dict_name_freq_score)
|
|
|
|
|
|
# 2020/11/23 大网站规则调整
|
|
# 2020/11/23 大网站规则调整
|
|
if len(dict_name_freq_score) == 0:
|
|
if len(dict_name_freq_score) == 0:
|
|
- name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
|
|
|
|
|
|
+ # name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
|
|
|
|
+ name_re1 = '(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题)[::\s]+(?P<name>[^,。:;]{2,60})[,。]'
|
|
for sentence in list_sentence:
|
|
for sentence in list_sentence:
|
|
# pad_sentence = sentence.sentence_text
|
|
# pad_sentence = sentence.sentence_text
|
|
othername = re.search(name_re1, sentence.sentence_text)
|
|
othername = re.search(name_re1, sentence.sentence_text)
|
|
if othername != None:
|
|
if othername != None:
|
|
- project_name = othername.group(3)
|
|
|
|
|
|
+ project_name = othername.group('name')
|
|
beg = find_index([project_name], sentence.sentence_text)[0]
|
|
beg = find_index([project_name], sentence.sentence_text)[0]
|
|
end = beg + len(project_name)
|
|
end = beg + len(project_name)
|
|
_name = self.fitDataByRule(sentence.sentence_text[beg:end])
|
|
_name = self.fitDataByRule(sentence.sentence_text[beg:end])
|
|
|
|
+ # print('规则召回项目名称:', _name)
|
|
# add name to entitys
|
|
# add name to entitys
|
|
_entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
|
|
_entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
|
|
sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
|
|
sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
|
|
@@ -444,7 +467,7 @@ class CodeNamePredict():
|
|
w = 1
|
|
w = 1
|
|
if _name not in dict_name_freq_score:
|
|
if _name not in dict_name_freq_score:
|
|
# dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
|
|
# dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
|
|
- dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
|
|
|
|
|
|
+ dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w+(5-sentence.sentence_index)*0.2]
|
|
else:
|
|
else:
|
|
dict_name_freq_score[_name][0] += 1
|
|
dict_name_freq_score[_name][0] += 1
|
|
# othername = re.search(name_re1, sentence.sentence_text)
|
|
# othername = re.search(name_re1, sentence.sentence_text)
|
|
@@ -461,6 +484,8 @@ class CodeNamePredict():
|
|
if len(list_name_freq_score)>0:
|
|
if len(list_name_freq_score)>0:
|
|
list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
|
|
list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
|
|
item['name'] = list_name_freq_score[0][0]
|
|
item['name'] = list_name_freq_score[0][0]
|
|
|
|
+ # for it in list_name_freq_score:
|
|
|
|
+ # print('项目名称及分值:',it[0],it[1], it[1][0]*it[1][1])
|
|
# if list_name_freq_score[0][1][0]>1:
|
|
# if list_name_freq_score[0][1][0]>1:
|
|
# item[1]['name'] = list_name_freq_score[0][0]
|
|
# item[1]['name'] = list_name_freq_score[0][0]
|
|
# else:
|
|
# else:
|
|
@@ -474,9 +499,10 @@ class CodeNamePredict():
|
|
# if othercode != None:
|
|
# if othercode != None:
|
|
# item[1]['code'].append(othercode.group(2))
|
|
# item[1]['code'].append(othercode.group(2))
|
|
# 2020/11/23 大网站规则调整
|
|
# 2020/11/23 大网站规则调整
|
|
- othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text)
|
|
|
|
|
|
+ othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P<code>[^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。\u4e00-\u9fa5]', sentence.sentence_text)
|
|
if othercode != None:
|
|
if othercode != None:
|
|
- item['code'].append(othercode.group(3))
|
|
|
|
|
|
+ item['code'].append(othercode.group('code'))
|
|
|
|
+ # print('规则召回项目编号:', othercode.group('code'))
|
|
item['code'] = [code for code in item['code'] if len(code)<500]
|
|
item['code'] = [code for code in item['code'] if len(code)<500]
|
|
item['code'].sort(key=lambda x:len(x),reverse=True)
|
|
item['code'].sort(key=lambda x:len(x),reverse=True)
|
|
result.append(item)
|
|
result.append(item)
|
|
@@ -692,6 +718,7 @@ class PREMPredict():
|
|
text_tup = text_list[i]
|
|
text_tup = text_list[i]
|
|
front, middle, behind = text_tup
|
|
front, middle, behind = text_tup
|
|
whole = "".join(text_tup)
|
|
whole = "".join(text_tup)
|
|
|
|
+ # print('模型预测角色:', front, entity.entity_text, label, values)
|
|
if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
|
|
if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
|
|
label = 5
|
|
label = 5
|
|
elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
|
|
elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
|
|
@@ -770,6 +797,7 @@ class PREMPredict():
|
|
whole = "".join(text_tup)
|
|
whole = "".join(text_tup)
|
|
# print('金额: ', entity.entity_text, label, values, front, middle, behind)
|
|
# print('金额: ', entity.entity_text, label, values, front, middle, behind)
|
|
if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额,让后面的规则召回重新判断
|
|
if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额,让后面的规则召回重新判断
|
|
|
|
+ # print('模型预测金额: ', entity.entity_text, label, values, front, middle, behind)
|
|
label = 2
|
|
label = 2
|
|
elif label == 1: # 错误中标金额处理
|
|
elif label == 1: # 错误中标金额处理
|
|
if re.search('[::,。](总金额|总价|单价)((万?元))?:?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
|
|
if re.search('[::,。](总金额|总价|单价)((万?元))?:?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
|
|
@@ -1435,6 +1463,7 @@ class RoleRulePredictor():
|
|
self.pattern_money_other, _span[0]) is None:
|
|
self.pattern_money_other, _span[0]) is None:
|
|
p_entity.values[0] = 0.8 + p_entity.values[0] / 10
|
|
p_entity.values[0] = 0.8 + p_entity.values[0] / 10
|
|
p_entity.label = 0
|
|
p_entity.label = 0
|
|
|
|
+ # print('规则召回预算金额:', p_entity.entity_text, _span[0])
|
|
if re.search(self.pattern_money_tenderer, _span[0]) is not None:
|
|
if re.search(self.pattern_money_tenderer, _span[0]) is not None:
|
|
if re.search(self.pattern_money_other, _span[0]) is not None:
|
|
if re.search(self.pattern_money_other, _span[0]) is not None:
|
|
if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
|
|
if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
|
|
@@ -1453,6 +1482,7 @@ class RoleRulePredictor():
|
|
, _sentence.sentence_text[:p_entity.wordOffset_begin]): # 处理几个标段金额相邻情况 例子:191705231
|
|
, _sentence.sentence_text[:p_entity.wordOffset_begin]): # 处理几个标段金额相邻情况 例子:191705231
|
|
p_entity.values[0] = 0.8 + p_entity.values[0] / 10
|
|
p_entity.values[0] = 0.8 + p_entity.values[0] / 10
|
|
p_entity.label = 0
|
|
p_entity.label = 0
|
|
|
|
+ # print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
|
|
|
|
|
|
# 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额
|
|
# 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额
|
|
list_p = []
|
|
list_p = []
|