Prechádzať zdrojové kódy

Merge remote-tracking branch 'origin/master'

fangjiasheng 2 rokov pred
rodič
commit
bd48e7f132

+ 4 - 3
BiddingKG/dl/interface/extract.py

@@ -189,9 +189,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
 
-    start_time = time.time() #失信数据要素提取
-    list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
-    cost_time["punish"] = round(time.time()-start_time,2)
+    #暂时不执行
+    # start_time = time.time() #失信数据要素提取
+    # list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
+    # cost_time["punish"] = round(time.time()-start_time,2)
 
 
     '''修正采购公告表格形式多种采购产品中标价格;中标金额小于所有产品总金额则改为总金额'''

+ 10 - 10
BiddingKG/dl/interface/modelFactory.py

@@ -41,7 +41,7 @@ class Model_role_classify():
             return self.getModel().predict([x[0],x[1]])
     
 class Model_role_classify_word():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         if USE_PAI_EAS:
             lazyLoad = True
         #self.model_role_file = os.path.abspath("../role/log/ep071-loss0.107-val_loss0.122-f10.956.h5")
@@ -49,7 +49,7 @@ class Model_role_classify_word():
         #self.model_role_file = os.path.abspath("../role/log/textcnn_ep017-loss0.088-val_loss0.125-f10.955.h5")
         self.model_role = None
         
-        self.sess_role = tf.Session(graph=tf.Graph())
+        self.sess_role = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
         
@@ -94,12 +94,12 @@ class Model_role_classify_word():
         
     
 class Model_money_classify():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         if USE_PAI_EAS:
             lazyLoad = True
         self.model_money_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
         self.model_money = None
-        self.sess_money = tf.Session(graph=tf.Graph())
+        self.sess_money = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
         
@@ -345,12 +345,12 @@ class Model_relation_extraction():
 
     
 class Model_person_classify():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         if USE_PAI_EAS:
             lazyLoad = True
         self.model_person_file = os.path.dirname(__file__)+"/../person/models/model_person.model.hdf5"
         self.model_person = None
-        self.sess_person = tf.Session(graph=tf.Graph())
+        self.sess_person = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
         
@@ -436,10 +436,10 @@ class Model_form_line():
             return self.getModel().predict(x)
     
 class Model_form_item():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         self.model_file = os.path.dirname(__file__)+"/../form/log/ep039-loss0.038-val_loss0.064-f10.9783.h5"
         self.model_form = None
-        self.sess_form = tf.Session(graph=tf.Graph())
+        self.sess_form = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
 
@@ -485,9 +485,9 @@ class Model_form_item():
         '''
 
 class Model_form_context():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         self.model_form = None
-        self.sess_form = tf.Session(graph=tf.Graph())
+        self.sess_form = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
 

+ 34 - 27
BiddingKG/dl/interface/predictor.py

@@ -27,6 +27,13 @@ import calendar
 import datetime
 # import fool   # 统一用 selffool ,阿里云上只有selffool 包
 
+cpu_num = int(os.environ.get("CPU_NUM",0))
+sess_config = tf.ConfigProto(
+                        inter_op_parallelism_threads = cpu_num,
+                        intra_op_parallelism_threads = cpu_num,
+                        log_device_placement=True)
+sess_config = None
+
 from threading import RLock
 dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
               "prem":{"predictor":None,"Lock":RLock()},
@@ -51,11 +58,11 @@ def getPredictor(_type):
         with dict_predictor[_type]["Lock"]:
             if dict_predictor[_type]["predictor"] is None:
                 if _type == "codeName":
-                    dict_predictor[_type]["predictor"] = CodeNamePredict()
+                    dict_predictor[_type]["predictor"] = CodeNamePredict(config=sess_config)
                 if _type == "prem":
-                    dict_predictor[_type]["predictor"] = PREMPredict()
+                    dict_predictor[_type]["predictor"] = PREMPredict(config=sess_config)
                 if _type == "epc":
-                    dict_predictor[_type]["predictor"] = EPCPredict()
+                    dict_predictor[_type]["predictor"] = EPCPredict(config=sess_config)
                 if _type == "roleRule":
                     dict_predictor[_type]["predictor"] = RoleRulePredictor()
                 if _type == "roleRuleFinal":
@@ -63,17 +70,17 @@ def getPredictor(_type):
                 if _type == "tendereeRuleRecall":
                     dict_predictor[_type]["predictor"] = TendereeRuleRecall()
                 if _type == "form":
-                    dict_predictor[_type]["predictor"] = FormPredictor()
+                    dict_predictor[_type]["predictor"] = FormPredictor(config=sess_config)
                 if _type == "time":
-                    dict_predictor[_type]["predictor"] = TimePredictor()
+                    dict_predictor[_type]["predictor"] = TimePredictor(config=sess_config)
                 if _type == "punish":
                     dict_predictor[_type]["predictor"] = Punish_Extract()
                 if _type == "product":
-                    dict_predictor[_type]["predictor"] = ProductPredictor()
+                    dict_predictor[_type]["predictor"] = ProductPredictor(config=sess_config)
                 if _type == "product_attrs":
                     dict_predictor[_type]["predictor"] = ProductAttributesPredictor()
                 if _type == "channel":
-                    dict_predictor[_type]["predictor"] = DocChannel()
+                    dict_predictor[_type]["predictor"] = DocChannel(config=sess_config)
                 if _type == 'deposit_payment_way':
                     dict_predictor[_type]["predictor"] = DepositPaymentWay()
                 if _type == 'total_unit_money':
@@ -87,7 +94,7 @@ def getPredictor(_type):
 # 编号名称模型
 class CodeNamePredict():
     
-    def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad()):
+    def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad(),config=None):
         
         self.model = None
         self.MAX_LEN = None
@@ -123,8 +130,8 @@ class CodeNamePredict():
         
         self.inputs = None
         self.outputs = None
-        self.sess_codename = tf.Session(graph=tf.Graph())
-        self.sess_codesplit = tf.Session(graph=tf.Graph())
+        self.sess_codename = tf.Session(graph=tf.Graph(),config=config)
+        self.sess_codesplit = tf.Session(graph=tf.Graph(),config=config)
         self.inputs_code = None
         self.outputs_code = None
         if not lazyLoad:
@@ -535,11 +542,11 @@ class CodeNamePredict():
 class PREMPredict():
 
     
-    def __init__(self):
+    def __init__(self,config=None):
         #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
         self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
-        self.model_role = Model_role_classify_word()
-        self.model_money = Model_money_classify()
+        self.model_role = Model_role_classify_word(config=config)
+        self.model_money = Model_money_classify(config=config)
         
         return
     
@@ -734,8 +741,8 @@ class PREMPredict():
 #联系人模型    
 class EPCPredict():
     
-    def __init__(self):
-        self.model_person = Model_person_classify()
+    def __init__(self,config=None):
+        self.model_person = Model_person_classify(config=config)
 
 
     
@@ -1074,13 +1081,13 @@ class EPCPredict():
 #表格预测
 class FormPredictor():
     
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5"
         self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5"
-        self.model_form_item = Model_form_item()
-        self.model_form_context = Model_form_context()
+        self.model_form_item = Model_form_item(config=config)
         self.model_dict = {"line":[None,self.model_file_line]}
-        
+        self.model_form_context = Model_form_context(config=config)
+
         
     def getModel(self,type):
         if type=="item":
@@ -1690,8 +1697,8 @@ class TendereeRuleRecall():
 
 # 时间类别
 class TimePredictor():
-    def __init__(self):
-        self.sess = tf.Session(graph=tf.Graph())
+    def __init__(self,config=None):
+        self.sess = tf.Session(graph=tf.Graph(),config=config)
         self.inputs_code = None
         self.outputs_code = None
         self.input_shape = (2,40,128)
@@ -1795,11 +1802,11 @@ class TimePredictor():
 
 # 产品字段提取
 class ProductPredictor():
-    def __init__(self):
+    def __init__(self,config=None):
         vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
         self.vocab = load(vocabpath)
         self.word2index = dict((w, i) for i, w in enumerate(np.array(self.vocab)))
-        self.sess = tf.Session(graph=tf.Graph())
+        self.sess = tf.Session(graph=tf.Graph(),config=config)
         self.load_model()
 
     def load_model(self):
@@ -2515,9 +2522,9 @@ class ProductAttributesPredictor():
 
 # docchannel类型提取
 class DocChannel():
-  def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):
+  def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb',config=None):
     self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
-    self.mask, self.mask_title = self.load_life(life_model)
+    self.mask, self.mask_title = self.load_life(life_model,config)
     self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
     self.type_mask, self.type_mask_title = self.load_type(type_model)
     self.sequen_len = 200  # 150 200
@@ -2578,7 +2585,7 @@ class DocChannel():
           '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
       }
 
-  def load_life(self,life_model):
+  def load_life(self,life_model,config):
     with tf.Graph().as_default() as graph:
       output_graph_def = graph.as_graph_def()
       with open(os.path.dirname(__file__)+life_model, 'rb') as f:
@@ -2586,7 +2593,7 @@ class DocChannel():
         tf.import_graph_def(output_graph_def, name='')
         # print("%d ops in the final graph" % len(output_graph_def.node))
         del output_graph_def
-        sess = tf.Session(graph=graph)
+        sess = tf.Session(graph=graph,config=config)
         sess.run(tf.global_variables_initializer())
         inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
         prob = sess.graph.get_tensor_by_name('inputs/dropout:0')

+ 6 - 0
BiddingKG/dl_dev/test/test4.py

@@ -109,6 +109,12 @@ def run_one():
     # test(12,content)
     # test(12,text)
     print("takes",time.time()-a)
+    print("start")
+    _time1 = time.time()
+    print(predict("12", content,"打印机",original_docchannel=52))
+    # test(12,content)
+    # test(12,text)
+    print("takes",time.time()-a)
     pass
 
 if __name__=="__main__":

+ 2 - 6
BiddingKG/readme/start.md

@@ -3,21 +3,17 @@
 #项目路径在/data/python/BiddingKG
 
 #11022启动要素提取接口
-#激活环境
-source activate py37
 #切换目录
 cd /data/python
 #关闭接口
 ps -ef | grep run_extract_server | grep -v grep | cut -c 9-16| xargs kill -9
 #启动接口
-nohup /data/anaconda3/envs/py37/bin/gunicorn -w 15 --limit-request-fields 0 --limit-request-line 0 -t 1000 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+nohup /data/anaconda3/envs/py37/bin/gunicorn -w 17 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
 
 #19022启动要素提取接口
-#激活环境
-source activate py37
 #切换目录
 cd /data/python
 #关闭接口
 ps -ef | grep run_extract_server | grep -v grep | cut -c 9-16| xargs kill -9
 #启动接口
-nohup /data/anaconda3/envs/py37/bin/gunicorn -w 6 --limit-request-fields 0 --limit-request-line 0 -t 1000 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+nohup /data/anaconda3/envs/py37/bin/gunicorn -w 5 --limit-request-fields 0 --limit-request-line 0 -t 1000  --keep-alive 600 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &

+ 8 - 1
BiddingKG/run_extract_server.py

@@ -17,6 +17,13 @@ os.environ["KERAS_BACKEND"] = "tensorflow"
 app = Flask(__name__)
 app.config['JSON_AS_ASCII'] = False
 
+limit_num = "4"
+os.environ["OMP_NUM_THREADS"] = limit_num # 1为一个核,设置为5的时候,系统显示用了10个核,不太清楚之间的具体数量关系
+os.environ["OMP_NUM_THREADS"] = limit_num # export OMP_NUM_THREADS=1
+os.environ["OPENBLAS_NUM_THREADS"] = limit_num # export OPENBLAS_NUM_THREADS=1
+os.environ["MKL_NUM_THREADS"] = limit_num # export MKL_NUM_THREADS=1
+os.environ["VECLIB_MAXIMUM_THREADS"] = limit_num # export VECLIB_MAXIMUM_THREADS=1
+os.environ["NUMEXPR_NUM_THREADS"] = limit_num # export NUMEXPR_NUM_THREADS=1
 
 import time
 import uuid
@@ -30,7 +37,7 @@ import traceback
 import json
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 sys.path.append(os.path.abspath("."))