|
@@ -676,10 +676,10 @@ class PREMPredict():
|
|
for i in range(len(predict_y)):
|
|
for i in range(len(predict_y)):
|
|
entity = points_entitys[i]
|
|
entity = points_entitys[i]
|
|
label = np.argmax(predict_y[i])
|
|
label = np.argmax(predict_y[i])
|
|
- values = []
|
|
|
|
- for item in predict_y[i]:
|
|
|
|
- values.append(item)
|
|
|
|
- entity.set_Money(label,values)
|
|
|
|
|
|
+ values = predict_y[i]
|
|
|
|
+ if label ==0 and entity.notes=="投资":
|
|
|
|
+ values[label] = 0.49
|
|
|
|
+ entity.set_Money(label, values)
|
|
|
|
|
|
def predict(self,list_sentences,list_entitys):
|
|
def predict(self,list_sentences,list_entitys):
|
|
self.predict_role(list_sentences,list_entitys)
|
|
self.predict_role(list_sentences,list_entitys)
|
|
@@ -1065,25 +1065,25 @@ class FormPredictor():
|
|
class RoleRulePredictor():
|
|
class RoleRulePredictor():
|
|
|
|
|
|
def __init__(self):
|
|
def __init__(self):
|
|
- self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|转让|招租|甲|议标|合同主体|比选)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|业主名称|需方|询价单位)(是|为|信息|:|:|\s*$))"
|
|
|
|
|
|
+ self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|转让|招租|甲|议标|合同主体|比选)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|业主名称|需方|询价单位)(是|为|信息|:|:|\s*)$)"
|
|
self.pattern_tenderee_center = "(?P<tenderee_center>(受.{,20}委托))"
|
|
self.pattern_tenderee_center = "(?P<tenderee_center>(受.{,20}委托))"
|
|
- self.pattern_tenderee_right = "(?P<tenderee_right>(\((以下简称)?[\"”]?(招标|采购)(人|单位|机构)\)?)|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)"
|
|
|
|
|
|
+ self.pattern_tenderee_right = "(?P<tenderee_right>^(\((以下简称)?[\"”]?(招标|采购)(人|单位|机构)\)?))" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
|
|
|
|
|
|
- self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|招标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*$)|(受.{,20}委托))"
|
|
|
|
- self.pattern_agency_right = "(?P<agency_right>(\((以下简称)?[\"”]?(代理)(人|单位|机构)\))|受.*委托)"
|
|
|
|
|
|
+ self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|招标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{,20}委托))"
|
|
|
|
+ self.pattern_agency_right = "(?P<agency_right>^(\((以下简称)?[\"”]?(代理)(人|单位|机构)\))|受.{,15}委托)"
|
|
# 2020/11/24 large-website rule: added winning-bidder keywords 选定单位|指定的中介服务机构
|
|
# 2020/11/24 large-website rule: added winning-bidder keywords 选定单位|指定的中介服务机构
|
|
- self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[^必须]{,4}[::是为]|(供应商|供货商|服务商|选定单位|指定的中介服务机构))[^必须]{,4}[::是为].{,2}|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)|((中标|成交)(结果|信息))(是|为|:|:|\s*$)|(单一来源采购(供应商|供货商|服务商))|((分包|标包).*供应商|供应商名称|服务机构|供方[::]))"
|
|
|
|
- self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[^必须]{,4}[::是为])"
|
|
|
|
- self.pattern_winTenderer_right = "(?P<winTenderer_right>[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))"
|
|
|
|
- self.pattern_winTenderer_whole = "(?P<winTenderer_whole>贵公司.*以.*中标|最终由.*竞买成功|经.*[以由].*中标|成交供应商,成交供应商名称:|谈判结果:由.{5,20}供货)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
|
|
|
+ self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|(选定单位|指定的中介服务机构))[::是为,]+$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))[::是为]+$|((评审结果|名次|排名)[::]第?[一1]名?)$|单一来源(采购)?方式向$|((中标|成交)(结果|信息))(是|为|:|:)$|(单一来源采购(供应商|供货商|服务商))$|[^候选]((分包|标包){,5}供应商|供货商|服务商|供应商名称|服务机构|供方)[::]$)"
|
|
|
|
+ # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
|
|
|
|
+ self.pattern_winTenderer_right = "(?P<winTenderer_right>^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))"
|
|
|
|
+ self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果:由.{5,20}供货)" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
|
|
|
|
|
|
- self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[^必须]{,4}[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
|
|
|
|
|
|
+ # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
|
|
|
|
|
|
- self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(是|为|:|:|\s*$))|((评审结果|名次|排名)[::]第?[二2]名?))"
|
|
|
|
- self.pattern_secondTenderer_right = "(?P<secondTenderer_right>[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
|
|
|
|
+ self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
|
|
|
|
+ self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
|
|
|
|
- self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))|((评审结果|名次|排名)[::]第?[三3]名?))"
|
|
|
|
- self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
|
|
|
|
+ self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
|
|
|
|
+ self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
|
|
|
|
|
|
self.dict_list_pattern = {"0":[["L",self.pattern_tenderee_left],
|
|
self.dict_list_pattern = {"0":[["L",self.pattern_tenderee_left],
|
|
["C",self.pattern_tenderee_center],
|
|
["C",self.pattern_tenderee_center],
|
|
@@ -1091,7 +1091,7 @@ class RoleRulePredictor():
|
|
"1":[["L",self.pattern_agency_left],
|
|
"1":[["L",self.pattern_agency_left],
|
|
["R",self.pattern_agency_right]],
|
|
["R",self.pattern_agency_right]],
|
|
"2":[["L",self.pattern_winTenderer_left],
|
|
"2":[["L",self.pattern_winTenderer_left],
|
|
- ["C",self.pattern_winTenderer_center],
|
|
|
|
|
|
+ # ["C",self.pattern_winTenderer_center],
|
|
["R",self.pattern_winTenderer_right],
|
|
["R",self.pattern_winTenderer_right],
|
|
["W",self.pattern_winTenderer_whole]],
|
|
["W",self.pattern_winTenderer_whole]],
|
|
"3":[["L",self.pattern_secondTenderer_left],
|
|
"3":[["L",self.pattern_secondTenderer_left],
|
|
@@ -1183,7 +1183,7 @@ class RoleRulePredictor():
|
|
|
|
|
|
#使用正则+距离解决冲突
|
|
#使用正则+距离解决冲突
|
|
# 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
|
|
# 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
|
|
- list_spans = [spans[0][-30:],spans[0][-20:]+spans[1],spans[2]]
|
|
|
|
|
|
+ list_spans = [spans[0][-30:],spans[0][-10:]+spans[1]+spans[2][:10],spans[2]]
|
|
for _i_span in range(len(list_spans)):
|
|
for _i_span in range(len(list_spans)):
|
|
# print(list_spans[_i_span],p_entity.entity_text)
|
|
# print(list_spans[_i_span],p_entity.entity_text)
|
|
for _pattern in self.pattern_whole:
|
|
for _pattern in self.pattern_whole:
|
|
@@ -1710,11 +1710,14 @@ class ProductAttributesPredictor():
|
|
order_begin = "%s-%s-01" % (year, month)
|
|
order_begin = "%s-%s-01" % (year, month)
|
|
order_end = "%s-%s-%s" % (year, month, num)
|
|
order_end = "%s-%s-%s" % (year, month, num)
|
|
return order_begin, order_end
|
|
return order_begin, order_end
|
|
- if re.search('^(\d{4})(年|/|.|-)(\d{1,2})(月|/|.|-)\d{1,2}日?$', text):
|
|
|
|
- text = re.sub('年|月|/|-', '-', text)
|
|
|
|
- text = text.replace('日', '')
|
|
|
|
- order_begin = text
|
|
|
|
- order_end = text
|
|
|
|
|
|
+ t2 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})(月|/|.|-)(\d{1,2})日?$', text)
|
|
|
|
+ if t2:
|
|
|
|
+ y = t2.group(1)
|
|
|
|
+ m = t2.group(3)
|
|
|
|
+ d = t2.group(5)
|
|
|
|
+ m = '0'+ m if len(m)<2 else m
|
|
|
|
+ d = '0'+d if len(d)<2 else d
|
|
|
|
+ order_begin = order_end = "%s-%s-%s"%(y,m,d)
|
|
return order_begin, order_end
|
|
return order_begin, order_end
|
|
all_match = re.finditer('^(?P<y1>\d{4})(年|/|.)(?P<m1>\d{1,2})(?:(月|/|.)(?:(?P<d1>\d{1,2})日)?)?'
|
|
all_match = re.finditer('^(?P<y1>\d{4})(年|/|.)(?P<m1>\d{1,2})(?:(月|/|.)(?:(?P<d1>\d{1,2})日)?)?'
|
|
'(到|至|-)(?:(?P<y2>\d{4})(年|/|.))?(?P<m2>\d{1,2})(?:(月|/|.)'
|
|
'(到|至|-)(?:(?P<y2>\d{4})(年|/|.))?(?P<m2>\d{1,2})(?:(月|/|.)'
|
|
@@ -1743,9 +1746,10 @@ class ProductAttributesPredictor():
|
|
y2 = y1 if y2 == "" else y2
|
|
y2 = y1 if y2 == "" else y2
|
|
d1 = '1' if d1 == "" else d1
|
|
d1 = '1' if d1 == "" else d1
|
|
d2 = self.get_monthlen(y2, m2) if d2 == "" else d2
|
|
d2 = self.get_monthlen(y2, m2) if d2 == "" else d2
|
|
- for it in (m1,d1,m2,d2):
|
|
|
|
- if len(it)<2:
|
|
|
|
- it = '0'+it
|
|
|
|
|
|
+ m1 = '0' + m1 if len(m1) < 2 else m1
|
|
|
|
+ m2 = '0' + m2 if len(m2) < 2 else m2
|
|
|
|
+ d1 = '0' + d1 if len(d1) < 2 else d1
|
|
|
|
+ d2 = '0' + d2 if len(d2) < 2 else d2
|
|
order_begin = "%s-%s-%s"%(y1,m1,d1)
|
|
order_begin = "%s-%s-%s"%(y1,m1,d1)
|
|
order_end = "%s-%s-%s"%(y2,m2,d2)
|
|
order_end = "%s-%s-%s"%(y2,m2,d2)
|
|
return order_begin, order_end
|
|
return order_begin, order_end
|
|
@@ -2101,6 +2105,8 @@ class DocChannel():
|
|
doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
|
|
doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
|
|
else:
|
|
else:
|
|
doc_sens = ' '.join(doc_word_list[:self.sequen_len])
|
|
doc_sens = ' '.join(doc_word_list[:self.sequen_len])
|
|
|
|
+ # print('标题:',segword_title)
|
|
|
|
+ # print('正文:',segword_content)
|
|
datas.append(doc_sens.split())
|
|
datas.append(doc_sens.split())
|
|
datas_title.append(segword_title.split())
|
|
datas_title.append(segword_title.split())
|
|
# print('完成预处理')
|
|
# print('完成预处理')
|
|
@@ -2131,7 +2137,10 @@ class DocChannel():
|
|
tokens = [it for l in token_l for it in l]
|
|
tokens = [it for l in token_l for it in l]
|
|
content = ' '.join(tokens[:500])
|
|
content = ' '.join(tokens[:500])
|
|
|
|
|
|
- data_content, data_title = self.predict_process(docid='', doctitle=title[:50], dochtmlcon=content) # 标题最多取50字
|
|
|
|
|
|
+ title = re.sub('[^\u4e00-\u9fa5]', '', title)
|
|
|
|
+ if len(title)>50:
|
|
|
|
+ title = title[:20]+title[-30:]
|
|
|
|
+ data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content) # 标题最多取50字
|
|
text_len = len(data_content[0]) if len(data_content[0])<self.sequen_len else self.sequen_len
|
|
text_len = len(data_content[0]) if len(data_content[0])<self.sequen_len else self.sequen_len
|
|
title_len = len(data_title[0]) if len(data_title[0])<self.title_len else self.title_len
|
|
title_len = len(data_title[0]) if len(data_title[0])<self.title_len else self.title_len
|
|
|
|
|
|
@@ -2147,6 +2156,7 @@ class DocChannel():
|
|
)
|
|
)
|
|
id = np.argmax(pred, axis=1)[0]
|
|
id = np.argmax(pred, axis=1)[0]
|
|
prob = pred[0][id]
|
|
prob = pred[0][id]
|
|
|
|
+ # print('公告类别:', self.id2type[id], '概率:',prob)
|
|
if id == 0:
|
|
if id == 0:
|
|
pred = self.lift_sess.run(self.lift_softmax,
|
|
pred = self.lift_sess.run(self.lift_softmax,
|
|
feed_dict={
|
|
feed_dict={
|
|
@@ -2158,6 +2168,7 @@ class DocChannel():
|
|
)
|
|
)
|
|
id = np.argmax(pred, axis=1)[0]
|
|
id = np.argmax(pred, axis=1)[0]
|
|
prob = pred[0][id]
|
|
prob = pred[0][id]
|
|
|
|
+ # print('生命周期:',self.id2life[id], '概率:',prob)
|
|
if id == 6:
|
|
if id == 6:
|
|
if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
|
|
if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
|
|
# return '候选人公示', prob
|
|
# return '候选人公示', prob
|