View Source

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	BiddingKG/dl/interface/extract.py
lsm committed 2 years ago
parent commit adf15d43a1

+ 15 - 13
BiddingKG/dl/entityLink/entityLink.py

@@ -76,17 +76,19 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
             if _entity.entity_type in ["org","company"]:
                 range_entity.append(_entity)
         range_entity = range_entity[:1000]
-        for first_i in range(len(range_entity)):
-            _entity = range_entity[first_i]
-            for second_i in range(first_i+1,len(range_entity)):
-                _ent = range_entity[second_i]
-                # 2021/5/21 update: skip when the two entity labels are mutually exclusive (one tenderee, one agent) and their entity_text differs
-                if _entity.entity_text != _ent.entity_text and _entity.label != _ent.label and _entity.label in [0,1] and _ent.label in [0, 1]:
-                    continue
-                _score = jaccard_score(re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_entity.entity_text), re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_ent.entity_text))
-                if _entity.entity_text!=_ent.entity_text and _score>=on_value:
-                    _entity.linked_entitys.append(_ent)
-                    _ent.linked_entitys.append(_entity)
+        # the company-replacement logic is buggy; disabled for now
+        # for first_i in range(len(range_entity)):
+        #     _entity = range_entity[first_i]
+        #     for second_i in range(first_i+1,len(range_entity)):
+        #         _ent = range_entity[second_i]
+        #         # 2021/5/21 update: skip when the two entity labels are mutually exclusive (one tenderee, one agent) and their entity_text differs
+        #         if _entity.entity_text != _ent.entity_text and _entity.label != _ent.label and _entity.label in [0,1] and _ent.label in [0, 1]:
+        #             continue
+        #         _score = jaccard_score(re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_entity.entity_text), re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_ent.entity_text))
+        #         if _entity.entity_text!=_ent.entity_text and _score>=on_value:
+        #             _entity.linked_entitys.append(_ent)
+        #             _ent.linked_entitys.append(_entity)
+        #             print("=-===",_entity.entity_text,_ent.entity_text,_score)
        # replace company names
         for _entity in range_entity:
             if re.search("公司",_entity.entity_text) is None:
@@ -433,6 +435,6 @@ if __name__=="__main__":
     # print(match_enterprise_max_first(sentences))
     #
     # print("takes %d s"%(time.time()-_time))
-    fix_LEGAL_ENTERPRISE()
-    # print(jaccard_score("中国南方航空股份有限公司上海分公司","南方航空上海分公司"))
+    # fix_LEGAL_ENTERPRISE()
+    print(jaccard_score("吉林省九台","吉林省建苑设计集团有限公司"))
     # print(match_enterprise_max_first("中国南方航空股份有限公司黑龙江分公司"))
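
For context, a minimal sketch of the character-level Jaccard comparison the disabled loop performed, with company suffixes and place names stripped first. The set-based jaccard_score_sketch and the two-element place_pattern are assumptions for illustration; the real jaccard_score and place_pattern in entityLink.py may differ:

import re

place_pattern = "吉林省|上海市"  # assumption: the real place_pattern enumerates region names

def jaccard_score_sketch(a, b):
    # character-set Jaccard similarity: |A & B| / |A | B|
    set_a, set_b = set(a), set(b)
    if not set_a or not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)

# strip generic company suffixes and place names before comparing, as link_entitys did
strip = "%s|%s" % ("股份|责任|有限|公司", place_pattern)
print(jaccard_score_sketch(re.sub(strip, "", "中国南方航空股份有限公司上海分公司"),
                           re.sub(strip, "", "南方航空上海分公司")))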

+ 18 - 15
BiddingKG/dl/interface/Preprocessing.py

@@ -343,7 +343,7 @@ def tableToText(soup):
             same_value = inner_table[h][0][0]
             for w in range(width):
                 if last_head is not None:
-                    if inner_table[h-1][w][0]!=fix_value and inner_table[h-1][w][1] == 0:
+                    if inner_table[h-1][w][0] != fix_value and inner_table[h-1][w][0] != "" and inner_table[h-1][w][1] == 0:
                         is_all_key = False
 
                     if inner_table[h][w][0]==1:
@@ -362,18 +362,26 @@ def tableToText(soup):
 
             last_head = h
 
+            # print("h", h)
+            # print("last_is_same_value", last_is_same_value)
+            # print("is_same_value", is_same_value)
+            # print("is_all_key", is_all_key)
+            # print("is_same_with_lastHead", is_same_with_lastHead)
             if last_is_same_value:
                 last_is_same_value = is_same_value
                 continue
 
             if is_same_value:
-                head_list.append(h)
-                last_is_same_value = is_same_value
-                continue
+                # a block consisting of only the header row is invalid
+                if h - head_list[-1] > 1:
+                    head_list.append(h)
+                    last_is_same_value = is_same_value
+                    continue
             if not is_all_key:
                 if not is_same_with_lastHead:
-                    head_list.append(h)
-
+                    # a block consisting of only the header row is invalid
+                    if h - head_list[-1] > 1:
+                        head_list.append(h)
 
         head_list.append(height)
         return head_list
@@ -420,6 +428,7 @@ def tableToText(soup):
         return inner_table,head_list
 
     def set_head_model(inner_table):
+        origin_inner_table = copy.deepcopy(inner_table)
         for i in range(len(inner_table)):
             for j in range(len(inner_table[i])):
                # strip symbols around each cell so they don't skew header prediction
@@ -430,18 +439,11 @@ def tableToText(soup):
 
        # predict table heads with the model
         predict_list = predict(inner_table)
-        # with open(r"C:\Users\Administrator\Desktop\table_head_test.txt", "a") as f:
-        #     for i in range(len(predict_list)):
-        #         f.write(str(i) + " " + str(inner_table[i]) + "\n")
-        #         f.write(str(i) + " " + str(predict_list[i]) + "\n")
-        #     f.write("\n")
-
-        # print("table_list", inner_table)
-        # print("predict_list", predict_list)
 
+        # combine results: original cell text + predicted head labels
         for i in range(len(inner_table)):
             for j in range(len(inner_table[i])):
-                inner_table[i][j] = [inner_table[i][j], int(predict_list[i][j])]
+                inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
         head_list = sliceTable(inner_table)
         return inner_table, head_list
 
@@ -1019,6 +1021,7 @@ def tableToText(soup):
             inner_table, head_list = set_head_model(inner_table)
             # inner_table,head_list = setHead_incontext(inner_table,pat_head)
             # print("table_head", inner_table)
+            # print("head_list", head_list)
             # for begin in range(len(head_list[:-1])):
             #     for item in inner_table[head_list[begin]:head_list[begin+1]]:
             #         print(item)
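
The new guard in sliceTable only accepts a split at row h when the block it closes spans more than the header row itself. A minimal sketch of the effect; the candidate boundary rows are made up for illustration:

head_list = [0]                    # starts at the table's first row
candidate_boundaries = [1, 4, 5]   # hypothetical rows flagged as new table heads
for h in candidate_boundaries:
    # a block consisting of only the header row is invalid
    if h - head_list[-1] > 1:
        head_list.append(h)
print(head_list)  # [0, 4]: the splits at rows 1 and 5 are rejected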

+ 4 - 3
BiddingKG/dl/interface/extract.py

@@ -192,9 +192,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     '''Extract consortium (joint-bidder) information'''
     getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles)
 
-    start_time = time.time() # extraction of dishonesty-record elements
-    list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
-    cost_time["punish"] = round(time.time()-start_time,2)
+    # temporarily disabled
+    # start_time = time.time() # extraction of dishonesty-record elements
+    # list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
+    # cost_time["punish"] = round(time.time()-start_time,2)
 
 
     '''Fix the winning-bid price for procurement notices that list multiple products in table form; if the award amount is less than the total of all products, use the total instead'''

+ 10 - 10
BiddingKG/dl/interface/modelFactory.py

@@ -41,7 +41,7 @@ class Model_role_classify():
             return self.getModel().predict([x[0],x[1]])
     
 class Model_role_classify_word():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         if USE_PAI_EAS:
             lazyLoad = True
         #self.model_role_file = os.path.abspath("../role/log/ep071-loss0.107-val_loss0.122-f10.956.h5")
@@ -49,7 +49,7 @@ class Model_role_classify_word():
         #self.model_role_file = os.path.abspath("../role/log/textcnn_ep017-loss0.088-val_loss0.125-f10.955.h5")
         self.model_role = None
         
-        self.sess_role = tf.Session(graph=tf.Graph())
+        self.sess_role = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
         
@@ -94,12 +94,12 @@ class Model_role_classify_word():
         
     
 class Model_money_classify():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         if USE_PAI_EAS:
             lazyLoad = True
         self.model_money_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
         self.model_money = None
-        self.sess_money = tf.Session(graph=tf.Graph())
+        self.sess_money = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
         
@@ -345,12 +345,12 @@ class Model_relation_extraction():
 
     
 class Model_person_classify():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         if USE_PAI_EAS:
             lazyLoad = True
         self.model_person_file = os.path.dirname(__file__)+"/../person/models/model_person.model.hdf5"
         self.model_person = None
-        self.sess_person = tf.Session(graph=tf.Graph())
+        self.sess_person = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
         
@@ -436,10 +436,10 @@ class Model_form_line():
             return self.getModel().predict(x)
     
 class Model_form_item():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         self.model_file = os.path.dirname(__file__)+"/../form/log/ep039-loss0.038-val_loss0.064-f10.9783.h5"
         self.model_form = None
-        self.sess_form = tf.Session(graph=tf.Graph())
+        self.sess_form = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
 
@@ -485,9 +485,9 @@ class Model_form_item():
         '''
 
 class Model_form_context():
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         self.model_form = None
-        self.sess_form = tf.Session(graph=tf.Graph())
+        self.sess_form = tf.Session(graph=tf.Graph(),config=config)
         if not lazyLoad:
             self.getModel()
 
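
Every wrapper class now threads an optional config straight into its tf.Session. A hedged sketch of the intended use, with illustrative thread counts (TF1 API):

import tensorflow as tf

# one shared ConfigProto caps thread pools for every model session
config = tf.ConfigProto(inter_op_parallelism_threads=4,
                        intra_op_parallelism_threads=4)
model_role = Model_role_classify_word(config=config)
model_money = Model_money_classify(config=config)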

+ 34 - 27
BiddingKG/dl/interface/predictor.py

@@ -27,6 +27,13 @@ import calendar
 import datetime
 # import fool   # use selffool across the board; Aliyun only has the selffool package
 
+cpu_num = int(os.environ.get("CPU_NUM",0))
+sess_config = tf.ConfigProto(
+                        inter_op_parallelism_threads = cpu_num,
+                        intra_op_parallelism_threads = cpu_num,
+                        log_device_placement=True)
+sess_config = None
+
 from threading import RLock
 dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
               "prem":{"predictor":None,"Lock":RLock()},
@@ -51,11 +58,11 @@ def getPredictor(_type):
         with dict_predictor[_type]["Lock"]:
             if dict_predictor[_type]["predictor"] is None:
                 if _type == "codeName":
-                    dict_predictor[_type]["predictor"] = CodeNamePredict()
+                    dict_predictor[_type]["predictor"] = CodeNamePredict(config=sess_config)
                 if _type == "prem":
-                    dict_predictor[_type]["predictor"] = PREMPredict()
+                    dict_predictor[_type]["predictor"] = PREMPredict(config=sess_config)
                 if _type == "epc":
-                    dict_predictor[_type]["predictor"] = EPCPredict()
+                    dict_predictor[_type]["predictor"] = EPCPredict(config=sess_config)
                 if _type == "roleRule":
                     dict_predictor[_type]["predictor"] = RoleRulePredictor()
                 if _type == "roleRuleFinal":
@@ -63,17 +70,17 @@ def getPredictor(_type):
                 if _type == "tendereeRuleRecall":
                     dict_predictor[_type]["predictor"] = TendereeRuleRecall()
                 if _type == "form":
-                    dict_predictor[_type]["predictor"] = FormPredictor()
+                    dict_predictor[_type]["predictor"] = FormPredictor(config=sess_config)
                 if _type == "time":
-                    dict_predictor[_type]["predictor"] = TimePredictor()
+                    dict_predictor[_type]["predictor"] = TimePredictor(config=sess_config)
                 if _type == "punish":
                     dict_predictor[_type]["predictor"] = Punish_Extract()
                 if _type == "product":
-                    dict_predictor[_type]["predictor"] = ProductPredictor()
+                    dict_predictor[_type]["predictor"] = ProductPredictor(config=sess_config)
                 if _type == "product_attrs":
                     dict_predictor[_type]["predictor"] = ProductAttributesPredictor()
                 if _type == "channel":
-                    dict_predictor[_type]["predictor"] = DocChannel()
+                    dict_predictor[_type]["predictor"] = DocChannel(config=sess_config)
                 if _type == 'deposit_payment_way':
                     dict_predictor[_type]["predictor"] = DepositPaymentWay()
                 if _type == 'total_unit_money':
@@ -87,7 +94,7 @@ def getPredictor(_type):
 # project code & name model
 class CodeNamePredict():
     
-    def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad()):
+    def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad(),config=None):
         
         self.model = None
         self.MAX_LEN = None
@@ -123,8 +130,8 @@ class CodeNamePredict():
         
         self.inputs = None
         self.outputs = None
-        self.sess_codename = tf.Session(graph=tf.Graph())
-        self.sess_codesplit = tf.Session(graph=tf.Graph())
+        self.sess_codename = tf.Session(graph=tf.Graph(),config=config)
+        self.sess_codesplit = tf.Session(graph=tf.Graph(),config=config)
         self.inputs_code = None
         self.outputs_code = None
         if not lazyLoad:
@@ -535,11 +542,11 @@ class CodeNamePredict():
 class PREMPredict():
 
     
-    def __init__(self):
+    def __init__(self,config=None):
         #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
         self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
-        self.model_role = Model_role_classify_word()
-        self.model_money = Model_money_classify()
+        self.model_role = Model_role_classify_word(config=config)
+        self.model_money = Model_money_classify(config=config)
         
         return
     
@@ -737,8 +744,8 @@ class PREMPredict():
 # contact-person model
 class EPCPredict():
     
-    def __init__(self):
-        self.model_person = Model_person_classify()
+    def __init__(self,config=None):
+        self.model_person = Model_person_classify(config=config)
 
 
     
@@ -1077,13 +1084,13 @@ class EPCPredict():
 # table/form prediction
 class FormPredictor():
     
-    def __init__(self,lazyLoad=getLazyLoad()):
+    def __init__(self,lazyLoad=getLazyLoad(),config=None):
         self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5"
         self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5"
-        self.model_form_item = Model_form_item()
-        self.model_form_context = Model_form_context()
+        self.model_form_item = Model_form_item(config=config)
         self.model_dict = {"line":[None,self.model_file_line]}
-        
+        self.model_form_context = Model_form_context(config=config)
+
         
     def getModel(self,type):
         if type=="item":
@@ -1692,8 +1699,8 @@ class TendereeRuleRecall():
 
 # time category
 class TimePredictor():
-    def __init__(self):
-        self.sess = tf.Session(graph=tf.Graph())
+    def __init__(self,config=None):
+        self.sess = tf.Session(graph=tf.Graph(),config=config)
         self.inputs_code = None
         self.outputs_code = None
         self.input_shape = (2,40,128)
@@ -1797,11 +1804,11 @@ class TimePredictor():
 
 # product field extraction
 class ProductPredictor():
-    def __init__(self):
+    def __init__(self,config=None):
         vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
         self.vocab = load(vocabpath)
         self.word2index = dict((w, i) for i, w in enumerate(np.array(self.vocab)))
-        self.sess = tf.Session(graph=tf.Graph())
+        self.sess = tf.Session(graph=tf.Graph(),config=config)
         self.load_model()
 
     def load_model(self):
@@ -2517,9 +2524,9 @@ class ProductAttributesPredictor():
 
 # docchannel type extraction
 class DocChannel():
-  def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):
+  def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb',config=None):
     self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
-    self.mask, self.mask_title = self.load_life(life_model)
+    self.mask, self.mask_title = self.load_life(life_model,config)
     self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
     self.type_mask, self.type_mask_title = self.load_type(type_model)
     self.sequen_len = 200  # 150 200
@@ -2580,7 +2587,7 @@ class DocChannel():
           '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
       }
 
-  def load_life(self,life_model):
+  def load_life(self,life_model,config):
     with tf.Graph().as_default() as graph:
       output_graph_def = graph.as_graph_def()
       with open(os.path.dirname(__file__)+life_model, 'rb') as f:
@@ -2588,7 +2595,7 @@ class DocChannel():
         tf.import_graph_def(output_graph_def, name='')
         # print("%d ops in the final graph" % len(output_graph_def.node))
         del output_graph_def
-        sess = tf.Session(graph=graph)
+        sess = tf.Session(graph=graph,config=config)
         sess.run(tf.global_variables_initializer())
         inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
         prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
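
Note that sess_config is constructed and then immediately reset to None, so all predictors currently run with default session settings; the config plumbing is in place but disabled. A hedged sketch of the lazy singleton access pattern:

from BiddingKG.dl.interface import predictor

# the first call builds CodeNamePredict(config=sess_config) under its RLock;
# later calls return the cached instance
codename_predictor = predictor.getPredictor("codeName")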

BIN
BiddingKG/dl/table_head/best_tiny.hdf5


+ 119 - 1
BiddingKG/dl/table_head/models/model.py

@@ -73,6 +73,124 @@ def model_1(input_shape, output_shape):
     return model
 
 
+def model_1_small(input_shape, output_shape):
+    # Input (batch, 10, 60)
+    input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_3 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_4 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_5 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_6 = layers.Input(shape=input_shape[1:], dtype="float32")
+
+    # ----------- Three box sequence -----------
+    # Concat (batch, 30, 60)
+    concat_1 = layers.concatenate([input_1, input_2, input_3], axis=-2, name='seq_concat')
+    concat_2 = layers.concatenate([input_4, input_5, input_6], axis=-2)
+
+    # Bi-LSTM (batch, 30, 128)
+    bi_lstm_1 = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(concat_1)
+    bi_lstm_2 = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(concat_2)
+
+    # Self-Attention (batch, 30, 128)
+    self_attention_1 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_1)
+    self_attention_2 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_2)
+
+    # Dense (batch, 30, 1)
+    dense_1 = layers.Dense(output_shape[0], activation="relu")(self_attention_1)
+    dense_2 = layers.Dense(output_shape[0], activation="relu")(self_attention_2)
+
+    # Squeeze (batch, 30)
+    squeeze_1 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense_1)
+    squeeze_2 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense_2)
+
+    # ----------- One box feature -----------
+    # Bi-LSTM (batch, 10, 128)
+    bi_lstm = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(input_2)
+
+    # Self-Attention (batch, 10, 128)
+    self_attention = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm)
+
+    # mask mean pooling
+    # pool_1 = MyAveragePooling1D(axis=-1)(self_attention_1)
+
+    # Dense (batch, 10, 1)
+    dense = layers.Dense(output_shape[0], activation="relu")(self_attention)
+
+    # Squeeze (batch, 10) - one box feature
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
+
+    # ----------- Three box sequence & One box feature -----------
+    # Dense (batch, 1)
+    concat = layers.concatenate([squeeze, squeeze_1, squeeze_2])
+    output = layers.Dense(32, activation='relu')(concat)
+    output = layers.Dense(1, activation="sigmoid", name='output')(output)
+
+    model = models.Model(inputs=[input_1, input_2, input_3, input_4, input_5, input_6],
+                         outputs=output)
+
+    # model.summary()
+    return model
+
+
+def model_1_tiny(input_shape, output_shape):
+    # Input (batch, 10, 60)
+    input_1 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_2 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_3 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_4 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_5 = layers.Input(shape=input_shape[1:], dtype="float32")
+    input_6 = layers.Input(shape=input_shape[1:], dtype="float32")
+
+    # ----------- Three box sequence -----------
+    # Concat (batch, 30, 60)
+    concat_1 = layers.concatenate([input_1, input_2, input_3], axis=-2, name='seq_concat')
+    concat_2 = layers.concatenate([input_4, input_5, input_6], axis=-2)
+
+    # Bi-LSTM (batch, 30, 128)
+    bi_lstm_1 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(concat_1)
+    bi_lstm_2 = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(concat_2)
+
+    # Self-Attention (batch, 30, 128)
+    self_attention_1 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_1)
+    self_attention_2 = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm_2)
+
+    # Dense (batch, 30, 1)
+    dense_1 = layers.Dense(output_shape[0], activation="relu")(self_attention_1)
+    dense_2 = layers.Dense(output_shape[0], activation="relu")(self_attention_2)
+
+    # Squeeze (batch, 30)
+    squeeze_1 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense_1)
+    squeeze_2 = Lambda(lambda x: K.squeeze(x, axis=-1))(dense_2)
+
+    # ----------- One box feature -----------
+    # Bi-LSTM (batch, 10, 128)
+    bi_lstm = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(input_2)
+
+    # Self-Attention (batch, 10, 128)
+    self_attention = SeqSelfAttention(attention_activation='sigmoid')(bi_lstm)
+
+    # mask mean pooling
+    # pool_1 = MyAveragePooling1D(axis=-1)(self_attention_1)
+
+    # Dense (batch, 10, 1)
+    dense = layers.Dense(output_shape[0], activation="relu")(self_attention)
+
+    # Squeeze (batch, 10) - one box feature
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=-1))(dense)
+
+    # ----------- Three box sequence & One box feature -----------
+    # Dense (batch, 1)
+    concat = layers.concatenate([squeeze, squeeze_1, squeeze_2])
+    output = layers.Dense(16, activation='relu')(concat)
+    output = layers.Dense(1, activation="sigmoid", name='output')(output)
+
+    model = models.Model(inputs=[input_1, input_2, input_3, input_4, input_5, input_6],
+                         outputs=output)
+
+    # model.summary()
+    return model
+
+
 def model_2(input_shape, output_shape):
     # input_shape = (None, None, 10, 60)
     # (batch_size, row_num, col_num, character_num, character_embedding)
@@ -266,7 +384,7 @@ def model_3(input_shape, output_shape):
 
 def get_model(input_shape, output_shape, model_id):
     if model_id == 1:
-        return model_1(input_shape, output_shape)
+        return model_1_tiny(input_shape, output_shape)
     elif model_id == 2:
         return model_2(input_shape, output_shape)
     elif model_id == 3:
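
model_1_tiny halves the Bi-LSTM width of model_1_small (16 units per direction instead of 32) and narrows the final dense layer to 16, and get_model now routes model_id == 1 to it. A hedged size comparison, assuming the shapes used when model_id == 1 and run inside models/model.py:

for build in (model_1, model_1_small, model_1_tiny):
    m = build((None, 10, 60), (1,))
    print(build.__name__, m.count_params())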

+ 31 - 8
BiddingKG/dl/table_head/pre_process.py

@@ -441,9 +441,7 @@ def my_data_loader(data_list, data_label_list, batch_size, is_train=True):
 
     else:
         new_data_list = []
-        for j in range(batch_size):
-            if i >= data_num:
-                i = 0
+        for j in range(len(data_list)):
             # map Chinese characters to embeddings
             data = data_list[i]
             data = embedding_word(data, output_shape)
@@ -451,11 +449,36 @@ def my_data_loader(data_list, data_label_list, batch_size, is_train=True):
                 new_data_list.append(data)
             i += 1
 
-        new_data_list = np.array(new_data_list)
-        X = new_data_list
-        X = np.transpose(X, (1, 0, 2, 3))
-        yield {'input_1': X[0], 'input_2': X[1], 'input_3': X[2],
-               'input_4': X[3], 'input_5': X[4], 'input_6': X[5], }
+        for j in range(0, len(data_list), batch_size):
+            sub_data_list = np.array(new_data_list[j: j+batch_size])
+            X = sub_data_list
+            X = np.transpose(X, (1, 0, 2, 3))
+            # print(X)
+            # return X
+            yield {'input_1': X[0], 'input_2': X[1], 'input_3': X[2],
+                   'input_4': X[3], 'input_5': X[4], 'input_6': X[5], }
+
+
+def my_data_loader_predict(data_list, data_label_list, batch_size):
+    data_num = len(data_list)
+
+    # define the embedding output shape
+    output_shape = (6, 20, 60)
+
+    i = 0
+    new_data_list = []
+    for j in range(len(data_list)):
+        # map Chinese characters to embeddings
+        data = data_list[i]
+        data = embedding_word(data, output_shape)
+        if data.shape == output_shape:
+            new_data_list.append(data)
+        i += 1
+
+    sub_data_list = np.array(new_data_list)
+    X = sub_data_list
+    X = np.transpose(X, (1, 0, 2, 3))
+    return X
 
 
 def my_data_loader_2(table_list, table_label_list, batch_size, is_train=True):
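
Unlike the generator path, my_data_loader_predict embeds everything up front and returns one numpy array of shape (6, n_samples, 20, 60), so the caller can index the six model inputs directly. A hedged usage sketch mirroring the call in predict.py:

X = my_data_loader_predict(data_list, [], batch_size=len(data_list))
result = model.predict([X[0], X[1], X[2], X[3], X[4], X[5]], batch_size=256)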

+ 18 - 16
BiddingKG/dl/table_head/predict.py

@@ -1,6 +1,7 @@
 #coding:utf-8
 import copy
 import json
+import math
 import os
 import sys
 import time
@@ -11,7 +12,7 @@ from flask import Flask
 sys.path.append(os.path.abspath(os.path.dirname(__file__)))
 from models.model import get_model
 from post_process import table_post_process, table_post_process_2
-from pre_process import my_data_loader, table_pre_process, table_pre_process_2, my_data_loader_2
+from pre_process import my_data_loader, table_pre_process, table_pre_process_2, my_data_loader_2, my_data_loader_predict
 
 # from BiddingKG.dl.interface.Preprocessing import tableToText, segment
 
@@ -23,9 +24,12 @@ if model_id == 1:
 else:
     input_shape = (None, None, 20, 60)
     output_shape = (None, None)
-keras_model_path = os.path.abspath(os.path.dirname(__file__)) + "/best.hdf5"
+keras_model_path = os.path.abspath(os.path.dirname(__file__)) + "/best_tiny.hdf5"
 # load and predict with the Keras model in one shared session and graph, enabling multi-process inference
-sess = tf.Session(graph=tf.Graph())
+session_conf = tf.ConfigProto(
+    intra_op_parallelism_threads=5,
+    inter_op_parallelism_threads=5)
+sess = tf.Session(graph=tf.Graph(), config=session_conf)
 # graph = tf.get_default_graph()
 
 # tf_model_path = os.path.abspath(os.path.dirname(__file__)) + '/best_pb/1'
@@ -49,7 +53,6 @@ sess = tf.Session(graph=tf.Graph())
 
 def predict(table_text_list, model_id=1):
     start_time = time.time()
-
     if globals().get("model") is None:
         print("="*15, "init table_head model", "="*15)
         with sess.as_default():
@@ -58,7 +61,6 @@ def predict(table_text_list, model_id=1):
                 # load weights
                 model.load_weights(keras_model_path)
         globals()["model"] = model
-        # print("="*15, "finish init", "="*15)
     else:
         model = globals().get("model")
 
@@ -69,34 +71,34 @@ def predict(table_text_list, model_id=1):
         data_list = table_pre_process(table_text_list_copy, [], 0, is_train=False)
     else:
         data_list = table_pre_process_2(table_text_list_copy, [], 0, is_train=False, padding=False)
-    batch_size = len(data_list)
-    # print("batch_size", batch_size)
-    # print("data_list", data_list)
 
     # data preprocessing
+    batch_size = len(data_list)
     if model_id == 1:
-        predict_x = my_data_loader(data_list, [], batch_size, is_train=False)
+        predict_x = my_data_loader_predict(data_list, [], batch_size)
     else:
         predict_x = my_data_loader_2(data_list, [], 1, is_train=False)
-    # print(time.time()-start_time)
 
     # predict
-    # with graph.as_default():
+    # start_time = time.time()
     with sess.as_default():
         with sess.graph.as_default():
-            predict_result = model.predict_generator(predict_x, steps=1)
-    # print("predict_result", predict_result)
-    # print(time.time()-start_time)
-    # print("predict_result", predict_result.shape)
+            # predict_result = model.predict_generator(predict_x, steps=1)
+            # setting the batch size explicitly is fastest; the default of 32 is very slow
+            predict_result = model.predict([predict_x[0], predict_x[1], predict_x[2],
+                                            predict_x[3], predict_x[4], predict_x[5]],
+                                           batch_size=256)
+    # print("table head predict time", time.time()-start_time, predict_x.shape)
 
     # data post-processing
     if model_id == 1:
         table_label_list = table_post_process(table_text_list_copy, predict_result)
     else:
         table_label_list = table_post_process_2(table_text_list_copy, predict_result)
-    # print(time.time()-start_time)
+
     # print and save the results
     # save_print_result(table_text_list, table_label_list)
+    # print("table_head predict cost", str(time.time()-start_time))
     return table_label_list
 
 

+ 3 - 3
BiddingKG/dl/table_head/train.py

@@ -19,10 +19,10 @@ if model_id == 1:
     output_shape = (1,)
     batch_size = 128
     epochs = 1000
-    PRETRAINED = True
+    PRETRAINED = False
     CHECKPOINT = False
     # use the GPU
-    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 else:
     input_shape = (None, None, 20, 60)
     output_shape = (None, None)
@@ -92,7 +92,7 @@ def train():
                   # loss_weights={"output": 0.5},
                   metrics=['acc', precision, recall, f1])
 
-    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.1, patience=10,
+    rlu = ReduceLROnPlateau(monitor='val_f1', factor=0.5, patience=10,
                             verbose=1, mode='max', cooldown=0, min_lr=0)
 
     model.fit_generator(train_data_loader,

+ 9 - 3
BiddingKG/dl_dev/test/test4.py

@@ -53,9 +53,9 @@ def test(name,content,_url=None):
     # _resp = requests.post(list_url[_i], json=user, headers=myheaders, verify=True)
 
     # _url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
-    _url = "http://192.168.2.102:15030/test"
-    _url = "http://192.168.2.102:15030/industry_extract"
-    _url = "http://192.168.2.102:15030/content_extract"
+    _url = "http://127.0.0.1:15030/content_extract"
+    # _url = "http://192.168.2.102:15030/industry_extract"
+    # _url = "http://192.168.2.102:15030/content_extract"
 
     _resp = session.post(_url, json=user,verify=True,timeout=1000)
     # _resp = requests.post("http://192.168.2.102:15000" + '/article_extract', json=user, headers=myheaders, verify=True)
@@ -103,6 +103,12 @@ def run_one():
     # text = '''
     # 购安装工程二标段,第一中标候选人,投标人名称,南阳市宝琛装饰工程有限责任公司,投标报价:147892
     # '''
+    # print("start")
+    # _time1 = time.time()
+    # print(predict("12", content,"打印机",original_docchannel=52))
+    # # test(12,content)
+    # # test(12,text)
+    # print("takes",time.time()-a)
     print("start")
     _time1 = time.time()
     print(predict("12", content,"打印机",original_docchannel=52))

+ 6 - 0
BiddingKG/hello.html

@@ -0,0 +1,6 @@
+<title>Hello from Flask</title>
+{% if name %}
+<h1>Hello {{ name }}!</h1>
+{% else %}
+<h1>Hello World!</h1>
+{% endif %}

+ 5 - 7
BiddingKG/readme/start.md

@@ -3,21 +3,19 @@
 #The project lives at /data/python/BiddingKG
 
 #Start the element-extraction API on 11022
-#Activate the environment
-source activate py37
 #Change directory
 cd /data/python
 #Stop the API
 ps -ef | grep run_extract_server | grep -v grep | cut -c 9-16| xargs kill -9
 #Start the API
-nohup /data/anaconda3/envs/py37/bin/gunicorn -w 15 --limit-request-fields 0 --limit-request-line 0 -t 1000 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+#nohup /data/anaconda3/envs/py37/bin/gunicorn -w 15 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+nohup /data/anaconda3/envs/py37/bin/python run_extract_server.py >> extract.log port=15030 worker=14 &
 
 #Start the element-extraction API on 19022
-#Activate the environment
-source activate py37
 #Change directory
-cd /data/python
+cd /data/python 
 #Stop the API
 ps -ef | grep run_extract_server | grep -v grep | cut -c 9-16| xargs kill -9
 #Start the API
-nohup /data/anaconda3/envs/py37/bin/gunicorn -w 6 --limit-request-fields 0 --limit-request-line 0 -t 1000 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+#nohup /data/anaconda3/envs/py37/bin/gunicorn -w 5 --limit-request-fields 0 --limit-request-line 0 -t 1000  --keep-alive 600 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+nohup /data/anaconda3/envs/py37/bin/python run_extract_server.py >> extract.log port=15030 worker=7 &

+ 42 - 5
BiddingKG/run_extract_server.py

@@ -17,11 +17,16 @@ os.environ["KERAS_BACKEND"] = "tensorflow"
 app = Flask(__name__)
 app.config['JSON_AS_ASCII'] = False
 
+limit_num = "4"
+os.environ["OMP_NUM_THREADS"] = limit_num # 1为一个核,设置为5的时候,系统显示用了10个核,不太清楚之间的具体数量关系
+os.environ["OMP_NUM_THREADS"] = limit_num # export OMP_NUM_THREADS=1
+os.environ["OPENBLAS_NUM_THREADS"] = limit_num # export OPENBLAS_NUM_THREADS=1
+os.environ["MKL_NUM_THREADS"] = limit_num # export MKL_NUM_THREADS=1
+os.environ["VECLIB_MAXIMUM_THREADS"] = limit_num # export VECLIB_MAXIMUM_THREADS=1
+os.environ["NUMEXPR_NUM_THREADS"] = limit_num # export NUMEXPR_NUM_THREADS=1
 
 import time
 import uuid
-from BiddingKG.dl.common.Utils import log
-from BiddingKG.dl.interface.extract import predict
 import numpy as np
 import ctypes
 import inspect
@@ -98,6 +103,9 @@ def run_thread(data,list_result):
 
 @app.route("/test",methods=['POST'])
 def test():
+    from BiddingKG.dl.common.Utils import log
+    from BiddingKG.dl.interface.extract import predict
+    global predict,log
     _time = time.time()
     a = request.form.get("content")
     log("get form takes %.2fs"%(time.time()-_time))
@@ -107,7 +115,9 @@ def test():
 
 @app.route('/content_extract', methods=['POST'])
 def text_predict():
-
+    from BiddingKG.dl.common.Utils import log
+    from BiddingKG.dl.interface.extract import predict
+    global predict,log
     _time = time.time()
     data = request.json
 
@@ -136,6 +146,7 @@ def text_predict():
 
 def getPort(argv):
     port = 15030
+    print(argv)
     for item in argv:
         _l = str(item).split("port=")
         if len(_l)>1:
@@ -143,8 +154,34 @@ def getPort(argv):
             break
     return port
 
-if __name__ == '__main__':
+def getWorkers(argv):
+    worker = 15
+    for item in argv:
+        _l = str(item).split("worker=")
+        if len(_l)>1:
+            worker = int(_l[-1])
+            break
+    return worker
+
+def start_with_tornado(port,process_num):
+    from tornado.wsgi import WSGIContainer
+    from tornado.httpserver import HTTPServer
+    from tornado.ioloop import IOLoop
+
+    http_server = HTTPServer(WSGIContainer(app))
+    # http_server.listen(port) #shortcut for bind and start
+    http_server.bind(port)
+    http_server.start(process_num)
+    IOLoop.instance().start()
+
+def start_with_flask():
     port = getPort(argv=sys.argv)
     app.run(host='0.0.0.0', port=port, threaded=True, debug=False)
     log("ContentExtractor running")
-    # app.run()
+    # app.run()
+
+if __name__ == '__main__':
+    port = getPort(argv=sys.argv)
+    workers = getWorkers(argv=sys.argv)
+    start_with_tornado(port,workers)
+    pass
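
The BLAS/OpenMP thread caps only take effect if they are set before numpy and TensorFlow are first imported, which is presumably why the log/predict imports moved inside the request handlers, after the os.environ assignments. A minimal sketch of the ordering constraint:

import os
os.environ["OMP_NUM_THREADS"] = "4"   # must run before the first numpy/TF import
import numpy as np                    # numpy's thread pool is now capped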

+ 35 - 0
BiddingKG/test_deployment.py

@@ -0,0 +1,35 @@
+
+
+from flask import Flask,render_template
+from flask import request
+
+app = Flask(__name__)
+app.config['JSON_AS_ASCII'] = False
+
+@app.route("/test")
+def test():
+    data = request.json
+    j = 0
+    for i in range(10000):
+        j += i**2
+
+    return render_template("hello.html")
+
+@app.route("/render")
+def render():
+    return render_template("hello.html")
+
+
+def test_with_tornado():
+    from tornado.httpserver import HTTPServer
+    from tornado.wsgi import WSGIContainer
+    from tornado.ioloop import IOLoop
+
+    httpserver = HTTPServer(WSGIContainer(app))
+    httpserver.bind(15000)
+    httpserver.start(1)
+    IOLoop.instance().start()
+
+if __name__ == '__main__':
+    test_with_tornado()
+