|
@@ -4,7 +4,6 @@ Created on 2019年1月4日
|
|
@author: User
|
|
@author: User
|
|
'''
|
|
'''
|
|
import os
|
|
import os
|
|
-
|
|
|
|
from bs4 import BeautifulSoup, Comment
|
|
from bs4 import BeautifulSoup, Comment
|
|
import copy
|
|
import copy
|
|
import re
|
|
import re
|
|
@@ -24,10 +23,11 @@ import BiddingKG.dl.interface.Preprocessing as Preprocessing
|
|
import BiddingKG.dl.interface.getAttributes as getAttributes
|
|
import BiddingKG.dl.interface.getAttributes as getAttributes
|
|
import BiddingKG.dl.complaint.punish_predictor as punish_rule
|
|
import BiddingKG.dl.complaint.punish_predictor as punish_rule
|
|
import json
|
|
import json
|
|
|
|
+from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
|
|
|
|
+from BiddingKG.dl.ratio.re_ratio import extract_ratio
|
|
|
|
|
|
|
|
|
|
-
|
|
|
|
-#自定义jsonEncoder
|
|
|
|
|
|
+# 自定义jsonEncoder
|
|
class MyEncoder(json.JSONEncoder):
|
|
class MyEncoder(json.JSONEncoder):
|
|
def default(self, obj):
|
|
def default(self, obj):
|
|
if isinstance(obj, np.ndarray):
|
|
if isinstance(obj, np.ndarray):
|
|
@@ -41,39 +41,40 @@ class MyEncoder(json.JSONEncoder):
|
|
return obj
|
|
return obj
|
|
return json.JSONEncoder.default(self, obj)
|
|
return json.JSONEncoder.default(self, obj)
|
|
|
|
|
|
|
|
+
|
|
def predict(doc_id,text,title="",page_time="",**kwargs):
|
|
def predict(doc_id,text,title="",page_time="",**kwargs):
|
|
cost_time = dict()
|
|
cost_time = dict()
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
- log("start process doc %s"%(str(doc_id)))
|
|
|
|
|
|
+ # log("start process doc %s"%(str(doc_id)))
|
|
list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time]],useselffool=True)
|
|
list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time]],useselffool=True)
|
|
- log("get preprocessed done of doc_id%s"%(doc_id))
|
|
|
|
|
|
+ # log("get preprocessed done of doc_id%s"%(doc_id))
|
|
cost_time["preprocess"] = round(time.time()-start_time,2)
|
|
cost_time["preprocess"] = round(time.time()-start_time,2)
|
|
cost_time.update(_cost_time)
|
|
cost_time.update(_cost_time)
|
|
|
|
|
|
- #依赖句子顺序
|
|
|
|
|
|
+ # 依赖句子顺序
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
|
|
list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
|
|
cost_time["channel"] = round(time.time()-start_time,2)
|
|
cost_time["channel"] = round(time.time()-start_time,2)
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
|
|
codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
|
|
- log("get codename done of doc_id%s"%(doc_id))
|
|
|
|
|
|
+ # log("get codename done of doc_id%s"%(doc_id))
|
|
cost_time["codename"] = round(time.time()-start_time,2)
|
|
cost_time["codename"] = round(time.time()-start_time,2)
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
predictor.getPredictor("prem").predict(list_sentences,list_entitys)
|
|
predictor.getPredictor("prem").predict(list_sentences,list_entitys)
|
|
- log("get prem done of doc_id%s"%(doc_id))
|
|
|
|
|
|
+ # log("get prem done of doc_id%s"%(doc_id))
|
|
cost_time["prem"] = round(time.time()-start_time,2)
|
|
cost_time["prem"] = round(time.time()-start_time,2)
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
predictor.getPredictor("product").predict(list_sentences,list_entitys)
|
|
predictor.getPredictor("product").predict(list_sentences,list_entitys)
|
|
- log("get product done of doc_id%s"%(doc_id))
|
|
|
|
|
|
+ # log("get product done of doc_id%s"%(doc_id))
|
|
cost_time["product"] = round(time.time()-start_time,2)
|
|
cost_time["product"] = round(time.time()-start_time,2)
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
|
|
product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
|
|
- log("get product attributes done of doc_id%s"%(doc_id))
|
|
|
|
|
|
+ # log("get product attributes done of doc_id%s"%(doc_id))
|
|
cost_time["product_attrs"] = round(time.time()-start_time,2)
|
|
cost_time["product_attrs"] = round(time.time()-start_time,2)
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
@@ -82,12 +83,12 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
predictor.getPredictor("epc").predict(list_sentences,list_entitys)
|
|
predictor.getPredictor("epc").predict(list_sentences,list_entitys)
|
|
- log("get epc done of doc_id%s"%(doc_id))
|
|
|
|
|
|
+ # log("get epc done of doc_id%s"%(doc_id))
|
|
cost_time["person"] = round(time.time()-start_time,2)
|
|
cost_time["person"] = round(time.time()-start_time,2)
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
predictor.getPredictor("time").predict(list_sentences, list_entitys)
|
|
predictor.getPredictor("time").predict(list_sentences, list_entitys)
|
|
- log("get time done of doc_id%s"%(doc_id))
|
|
|
|
|
|
+ # log("get time done of doc_id%s"%(doc_id))
|
|
cost_time["time"] = round(time.time()-start_time,2)
|
|
cost_time["time"] = round(time.time()-start_time,2)
|
|
|
|
|
|
# 需在getPredictor("prem")后 getAttributes.getPREMs 前
|
|
# 需在getPredictor("prem")后 getAttributes.getPREMs 前
|
|
@@ -104,11 +105,46 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
|
|
_entity.values[1] = 0.51
|
|
_entity.values[1] = 0.51
|
|
_entity.set_Money(1, _entity.values)
|
|
_entity.set_Money(1, _entity.values)
|
|
|
|
|
|
- #依赖句子顺序
|
|
|
|
|
|
+ # 2021-12-08新增:提取:总价,单价,比率
|
|
|
|
+ total_money_list = []
|
|
|
|
+ unit_money_list = []
|
|
|
|
+ ratio_list = []
|
|
|
|
+ for i in range(len(list_entitys)):
|
|
|
|
+ list_entity = list_entitys[i]
|
|
|
|
+
|
|
|
|
+ # 总价单价
|
|
|
|
+ for _entity in list_entity:
|
|
|
|
+ if _entity.entity_type == 'money':
|
|
|
|
+ word_of_sentence = list_sentences[i][_entity.sentence_index].sentence_text
|
|
|
|
+ # 总价在中投标金额中
|
|
|
|
+ if _entity.label == 1:
|
|
|
|
+ result = extract_total_money(word_of_sentence,
|
|
|
|
+ _entity.entity_text,
|
|
|
|
+ [_entity.wordOffset_begin, _entity.wordOffset_end])
|
|
|
|
+ if result:
|
|
|
|
+ total_money_list.append(result)
|
|
|
|
+
|
|
|
|
+ # 单价在普通金额中
|
|
|
|
+ else:
|
|
|
|
+ result = extract_unit_money(word_of_sentence,
|
|
|
|
+ _entity.entity_text,
|
|
|
|
+ [_entity.wordOffset_begin, _entity.wordOffset_end])
|
|
|
|
+ if result:
|
|
|
|
+ unit_money_list.append(result)
|
|
|
|
+
|
|
|
|
+ # 比率
|
|
|
|
+ all_sentence = ""
|
|
|
|
+ for sentence in list_sentences[i]:
|
|
|
|
+ all_sentence += sentence.sentence_text + ","
|
|
|
|
+ result = extract_ratio(all_sentence)
|
|
|
|
+ if result:
|
|
|
|
+ ratio_list.append(result)
|
|
|
|
+
|
|
|
|
+ # 依赖句子顺序
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
entityLink.link_entitys(list_entitys)
|
|
entityLink.link_entitys(list_entitys)
|
|
prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
|
|
prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
|
|
- log("get attributes done of doc_id%s"%(doc_id))
|
|
|
|
|
|
+ # log("get attributes done of doc_id%s"%(doc_id))
|
|
cost_time["attrs"] = round(time.time()-start_time,2)
|
|
cost_time["attrs"] = round(time.time()-start_time,2)
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
@@ -121,13 +157,17 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
|
|
if product in d['project_name']:
|
|
if product in d['project_name']:
|
|
d['product'].append(product) #把产品在项目名称中的添加进需求要素中
|
|
d['product'].append(product) #把产品在项目名称中的添加进需求要素中
|
|
|
|
|
|
- #print(prem)
|
|
|
|
|
|
+ # print(prem)
|
|
# data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
|
|
# data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
|
|
# data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
|
|
# data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
|
|
data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1])
|
|
data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1])
|
|
data_res["cost_time"] = cost_time
|
|
data_res["cost_time"] = cost_time
|
|
data_res["success"] = True
|
|
data_res["success"] = True
|
|
|
|
|
|
|
|
+ data_res["total_money"] = total_money_list
|
|
|
|
+ data_res["unit_money"] = unit_money_list
|
|
|
|
+ data_res["ratio"] = ratio_list
|
|
|
|
+
|
|
# for _article in list_articles:
|
|
# for _article in list_articles:
|
|
# log(_article.content)
|
|
# log(_article.content)
|
|
#
|
|
#
|