
Procurement intention time format optimization

znj 3 years ago
parent commit 747a9a1c8f

+ 4 - 0
BiddingKG/dl/common/Utils.py

@@ -25,6 +25,10 @@ lock_model_w2v = RLock()
 USE_PAI_EAS = False
 
 Lazy_load = False
+# API_URL = "http://192.168.2.103:8802"
+API_URL = "http://127.0.0.1:888"
+# USE_API = True
+USE_API = False
 
 def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
     _time = time.strftime(format,time.localtime())
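
The new USE_API / API_URL pair replaces the retired PAI-EAS path: when USE_API is on, the predictors in this commit POST their tensors to a local HTTP model server instead of running the in-process TensorFlow session. The serving process itself is not part of this diff; the sketch below only mirrors the shared client pattern, with the endpoint name taken from the call sites that follow.

import json
import requests

API_URL = "http://127.0.0.1:888"  # as configured above

def post_predict(endpoint, payload):
    # Minimal sketch of the client pattern; the server behind API_URL is
    # assumed to exist and to return a JSON body with a "result" key.
    resp = requests.post(API_URL + endpoint, json=payload, verify=True)
    return json.loads(resp.text)

# e.g. post_predict("/predict_tokens", {"inputs": inputs.tolist(), "lengths": lengths})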

+ 4 - 25
BiddingKG/dl/foolnltk/selffool/predictor.py

@@ -73,30 +73,9 @@ class Predictor(object):
             inputs.append(sent_ids)
         inputs = np.array(inputs, dtype=np.int32)
         
-        if USE_PAI_EAS and self.url and self.authorization:
-            request = tf_predict_pb2.PredictRequest()
-            request.inputs["char_inputs"].dtype = tf_predict_pb2.DT_INT32
-            request.inputs["char_inputs"].array_shape.dim.extend(np.shape(inputs))
-            request.inputs["char_inputs"].int_val.extend(np.array(inputs,dtype=np.int32).reshape(-1))
-            request.inputs["lengths"].dtype = tf_predict_pb2.DT_INT32
-            request.inputs["lengths"].array_shape.dim.extend(np.shape(lengths))
-            request.inputs["lengths"].int_val.extend(np.array(lengths,dtype=np.int32).reshape(-1))
-            request.inputs["dropout"].dtype = tf_predict_pb2.DT_FLOAT
-            request.inputs["dropout"].float_val.extend([1.0])
-            request_data = request.SerializeToString()
-            list_outputs = ["logits","trans"]
-            result = vpc_requests(self.url, self.authorization, request_data, list_outputs)
-            if result is not None:
-                logits = result["logits"]
-                trans = result["trans"]
-            else:
-                feed_dict = {
-                self.input_x: inputs,
-                self.lengths: lengths,
-                self.dropout: 1.0
-                }
-                
-                logits, trans = self.sess.run([self.logits, self.trans], feed_dict=feed_dict)
+        if USE_API and self.url and self.authorization:
+            requests_result = requests.post(API_URL + "/predict_tokens",json={"inputs": inputs.tolist(), 'lengths': lengths}, verify=True)
+            path = json.loads(requests_result.text)['result']
         else:
         
             feed_dict = {
@@ -105,6 +84,6 @@ class Predictor(object):
                 self.dropout: 1.0
             }
             logits, trans = self.sess.run([self.logits, self.trans], feed_dict=feed_dict)
-        path = decode(logits, trans, lengths, self.num_class)
+            path = decode(logits, trans, lengths, self.num_class)
         labels = [[self.id_to_tag.get(l) for l in p] for p in path]
         return labels
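
Note the indentation change in the second hunk: decode() now runs only on the local-session branch, because the "/predict_tokens" endpoint returns an already-decoded path; leaving decode() at function level would raise a NameError on the API branch, where logits and trans are never defined. The same fix is applied in selffool_ner.py below. A stubbed, runnable skeleton of the corrected flow:

USE_API = True

def call_api(inputs, lengths):
    # stand-in for requests.post(API_URL + "/predict_tokens", ...);
    # the server returns the decoded label-id paths directly
    return [[1, 2, 3]]

def run_local(inputs, lengths):
    # stand-in for sess.run([logits, trans], ...) followed by decode(...)
    return [[1, 2, 3]]

inputs, lengths = [[7, 8, 9]], [3]
if USE_API:
    path = call_api(inputs, lengths)   # already decoded server-side
else:
    path = run_local(inputs, lengths)  # decode() belongs only on this branch
id_to_tag = {1: "B", 2: "I", 3: "E"}
labels = [[id_to_tag.get(l) for l in p] for p in path]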

+ 4 - 25
BiddingKG/dl/foolnltk/selffool/selffool_ner.py

@@ -90,30 +90,9 @@ class SelfNer():
             sent_ids += padding
             inputs.append(sent_ids)
         inputs = np.array(inputs, dtype=np.int32)
-        if USE_PAI_EAS:
-            request = tf_predict_pb2.PredictRequest()
-            request.inputs["char_inputs"].dtype = tf_predict_pb2.DT_INT32
-            request.inputs["char_inputs"].array_shape.dim.extend(np.shape(inputs))
-            request.inputs["char_inputs"].int_val.extend(np.array(inputs,dtype=np.int32).reshape(-1))
-            request.inputs["lengths"].dtype = tf_predict_pb2.DT_INT32
-            request.inputs["lengths"].array_shape.dim.extend(np.shape(lengths))
-            request.inputs["lengths"].int_val.extend(np.array(lengths,dtype=np.int32).reshape(-1))
-            request.inputs["dropout"].dtype = tf_predict_pb2.DT_FLOAT
-            request.inputs["dropout"].float_val.extend([1.0])
-            request_data = request.SerializeToString()
-            list_outputs = ["logits","trans"]
-            result = vpc_requests(selffool_url, selffool_authorization, request_data, list_outputs)
-            if result is not None:
-                logits = result["logits"]
-                trans = result["trans"]
-            else:
-                feed_dict = {
-                self.char_inputs: inputs,
-                self.lengths: lengths,
-                self.dropout: 1.0
-                }
-                
-                logits, trans = sess.run([self.logits, self.trans], feed_dict=feed_dict)
+        if USE_API:
+            requests_result = requests.post(API_URL + "/predict_selfNer", json={"inputs": inputs.tolist(),'lengths':lengths}, verify=True)
+            path = json.loads(requests_result.text)['result']
         else:
             feed_dict = {
                 self.char_inputs: inputs,
@@ -123,7 +102,7 @@ class SelfNer():
             
             
             logits, trans = sess.run([self.logits, self.trans], feed_dict=feed_dict)
-        path = decode(logits, trans, lengths, self.num_tags)
+            path = decode(logits, trans, lengths, self.num_tags)
         labels = [[self.id_to_tag.get(l) for l in p] for p in path]
         return labels
     

+ 3 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -1855,9 +1855,10 @@ def article_limit(soup,limit_words=30000):
                 attachment_skip = False
                 for part in attachment_part.find_all(recursive=False):
                     if not attachment_skip:
-                        attachment_text_nums += len(re.sub(sub_space, "", part.get_text()))
+                        last_attachment_text_nums = attachment_text_nums
+                        attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
                         if attachment_text_nums>=limit_words:
-                            part.string = str(part.get_text())[:attachment_text_nums-limit_words]
+                            part.string = str(part.get_text())[:limit_words-last_attachment_text_nums]
                             attachment_skip = True
                     else:
                         part.decompose()
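
This is the actual bug fix in article_limit(): the old slice length was the overflow amount (attachment_text_nums - limit_words), not the room left under the limit. Tracking the running total before adding the current part makes the arithmetic come out right; a quick check with hypothetical numbers:

limit_words = 30000
last_attachment_text_nums = 29990                   # running total before this part
part_len = 25                                       # length of this part's text
attachment_text_nums = last_attachment_text_nums + part_len  # 30015, over the limit

old_keep = attachment_text_nums - limit_words       # 15 chars: wrong, keeps the overflow
new_keep = limit_words - last_attachment_text_nums  # 10 chars: exactly the remaining budget
assert last_attachment_text_nums + new_keep == limit_words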

+ 2 - 1
BiddingKG/dl/interface/extract.py

@@ -232,6 +232,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     # predictor.getPredictor("product").predict(list_sentences, list_entitys)
     log("get product done of doc_id%s"%(doc_id))
     cost_time["product"] = round(time.time()-start_time,2)
+    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0]))
 
 '''Procurement-intention prediction when the announcement has no table format'''  # depends on the docchannel result
     if channel_dic['docchannel']['docchannel']=="采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
@@ -245,7 +246,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    data_res = dict(codeName[0], **prem[0],**getAttributes.getOtherAttributes(list_entitys[0]), **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason)
+    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise
     # number of extracted elements
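
getOtherAttributes() now merges into prem[0] right away instead of only in the final dict(...) expansion, presumably so the steps that follow (the procurement-intention prediction among them) already see the complete prem. A stand-in illustration of the two merge styles (keys are hypothetical):

prem0 = {"prem": {}}
other = {"moneysource": ""}              # hypothetical key from getOtherAttributes
prem0.update(other)                      # now visible to every later step reading prem

code_name = {"code": [], "name": ""}
data_res = dict(code_name, **prem0)      # the final merge no longer re-expands **other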

+ 2 - 16
BiddingKG/dl/interface/getAttributes.py

@@ -1,6 +1,6 @@
 
 
-from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date
+from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL
 from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
 from decimal import Decimal
 import re
@@ -1378,30 +1378,16 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
             else:
                 if temp_data:
                     deal_data += len(temp_data)
-                    if deal_data>3:
+                    if deal_data>4:
                         break
                     for _text_data, _pre_data in temp_data:
                         relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
                     temp_data = []
             start = start + maxlen - 120
         # print("预测数据:",len(temp_data))
-        # if len(temp_data)<=6:
-        #     for _text_data,_pre_data in temp_data:
-        #         relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
-        # else:
-        #     relation_list = []
         # deduplicate the results
         relation_list = list(set(relation_list))
     # print(relation_list)
-    # tokens_num_dict = dict()
-    # last_tokens_num = 0
-    # for sentence in list_sentence:
-    #     _index = sentence.sentence_index
-    #     if _index == 0:
-    #         tokens_num_dict[_index] = 0
-    #     else:
-    #         tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
-    #     last_tokens_num = len(sentence.tokens)
     right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
     linked_company = set()
     linked_person = set()
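
For context, the surrounding loop slides a window of maxlen characters with a 120-character overlap (start = start + maxlen - 120), and the cap on accumulated windows loosens from three to four before relation extraction bails out on very long documents. The windowing in isolation (window size chosen here only for illustration):

text = "x" * 1000
maxlen, overlap = 512, 120
start, chunks = 0, []
while start < len(text):
    chunks.append(text[start:start + maxlen])
    start = start + maxlen - overlap
# windows begin at 0, 392, 784, ... each overlapping the previous one by 120 chars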

+ 40 - 18
BiddingKG/dl/interface/modelFactory.py

@@ -14,7 +14,7 @@ from keras.preprocessing.sequence import pad_sequences
 from keras import optimizers,losses,metrics
 from BiddingKG.dl.common.Utils import *
 import tensorflow as tf
-
+import json
 
     
 class Model_role_classify():
@@ -277,18 +277,39 @@ class Model_relation_extraction():
         if company_relation < 2 and person_relation < 2:
             return False
         return True
+    def predict_by_api(self,text_in,words,sentence_vetor):
+        status_code = 0
+        # save([words,sentence_vetor.tolist()],"C:/Users/Administrator/Desktop/test_data.pk")
+        try:
+            requests_result = requests.post(API_URL + "/predict_relation", json={"sentence_vetor": sentence_vetor.tolist(), "words": words},
+                                            verify=True)
+            status_code = requests_result.status_code
+            triple_index_list = json.loads(requests_result.text)['triple_list']
+            # print("triple_list:",json.loads(requests_result.text)['triple_list'])
+            print("cost_time:",json.loads(requests_result.text)['cost_time'])
+            triple_list = [(text_in[triple[0]], triple[1], text_in[triple[2]]) for triple in triple_index_list]
+            return triple_list,status_code
+        except Exception as e:
+            print(e)
+            return [],status_code
 
     def predict(self,text_in, words, rate=0.5):
-        # no link attributes to predict, return directly
-        # if self.check_data(words):
-        #     return []
+        _t2 = np.zeros((len(words), self.words_size))
+        for i in range(len(words)):
+            _t2[i] = np.array(get_words_matrix(words[i]))
+
+        # a = time.time()
+        # triple_list, status_code = self.predict_by_api(text_in, words,_t2)
+        # print('time',time.time()-a)
+        # print("status_code",status_code)
+        # if status_code==200:
+        #     return triple_list
+        # else:
         # predict with the local model
         triple_list = []
         # print("tokens:",words)
         # _t2 = [self.words2id.get(c, 1) for c in words]
-        _t2 = np.zeros((len(words), self.words_size))
-        for i in range(len(words)):
-            _t2[i] = np.array(get_words_matrix(words[i]))
+
         _t2 = np.array([_t2])
         _t3 = [1 for _ in words]
         _t3 = np.array([_t3])
@@ -448,19 +469,20 @@ class Model_form_item():
         return encodeInput_form(data)
 
     def predict(self,x):
+        if USE_API:
+            requests_result = requests.post(API_URL+"/predict_form_item",json={"inputs":x.tolist()}, verify=True)
+            list_result = json.loads(requests_result.text)['result']
+        else:
+            model_form = self.getModel()
 
-      model_form = self.getModel()
-
-      list_result = limitRun(self.sess_form,[model_form[1]],feed_dict={model_form[0][0]:x})[0]
-      return list_result
-      #return self.sess_form.run(model_form[1],feed_dict={model_form[0][0]:x})
-
-
+            list_result = limitRun(self.sess_form,[model_form[1]],feed_dict={model_form[0][0]:x})[0]
+        return list_result
+        # return self.sess_form.run(model_form[1],feed_dict={model_form[0][0]:x})
 
-      '''
-      with self.graph.as_default():
-          return self.getModel().predict(x)
-      '''
+        '''
+          with self.graph.as_default():
+              return self.getModel().predict(x)
+        '''
 
 class Model_form_context():
     def __init__(self,lazyLoad=getLazyLoad()):
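
predict_by_api() keeps the relation-extraction payload small by having the server return index triples into text_in, which the client maps back onto entity objects; note that its call site in predict() is still commented out, so the API path stays opt-in for now. A toy version of the mapping, with a hypothetical response:

text_in = ["entity_org", "entity_person", "entity_phone"]        # stand-in entity list
triple_index_list = [[0, "rel_person", 1], [1, "rel_phone", 2]]  # hypothetical API reply
triple_list = [(text_in[t[0]], t[1], text_in[t[2]]) for t in triple_index_list]
# -> [('entity_org', 'rel_person', 'entity_person'),
#     ('entity_person', 'rel_phone', 'entity_phone')]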

+ 84 - 31
BiddingKG/dl/interface/predictor.py

@@ -250,7 +250,6 @@ class CodeNamePredict():
     
     def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
         #@summary: get the code and name of each article
-        
         pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
 
         result = []
@@ -291,20 +290,11 @@ class CodeNamePredict():
                 x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
                 x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
 
-                if USE_PAI_EAS:
-                    request = tf_predict_pb2.PredictRequest()
-                    request.inputs["inputs"].dtype = tf_predict_pb2.DT_INT32
-                    request.inputs["inputs"].array_shape.dim.extend(np.shape(x))
-                    request.inputs["inputs"].int_val.extend(np.array(x,dtype=np.int32).reshape(-1))
-                    request_data = request.SerializeToString()
-                    list_outputs = ["outputs"]
-                    _result = vpc_requests(codename_url, codename_authorization, request_data, list_outputs)
-                    if _result is not None:
-                        predict_y = _result["outputs"]
-                    else:
-                        with self.sess_codename.as_default():
-                            t_input,t_output = self.getModel()
-                            predict_y = self.sess_codename.run(t_output,feed_dict={t_input:x})
+                if USE_API:
+                    requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},verify=True)
+                    predict_y = json.loads(requests_result.text)['result']
+                    # print("cost_time:", json.loads(requests_result.text)['cost_time'])
+                    # print(MAX_LEN,_LEN,_begin_index)
                 else:
                     with self.sess_codename.as_default():
                         t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel()
@@ -1816,12 +1806,18 @@ class ProductPredictor():
                 if fail and list_articles!=[]:
                     text_list = [list_articles[0].content[:MAX_AREA]]
                     chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in text] for text in text_list]
-                    lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
-                                                      feed_dict={
-                                                          self.char_input: np.asarray(chars),
-                                                          self.dropout: 1.0
-                                                      })
-                    batch_paths = self.decode(scores, lengths, tran_)
+                    if USE_API:
+                        requests_result = requests.post(API_URL + "/predict_product",
+                                               json={"inputs": chars}, verify=True)
+                        batch_paths = json.loads(requests_result.text)['result']
+                        lengths = json.loads(requests_result.text)['lengths']
+                    else:
+                        lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
+                                                          feed_dict={
+                                                              self.char_input: np.asarray(chars),
+                                                              self.dropout: 1.0
+                                                          })
+                        batch_paths = self.decode(scores, lengths, tran_)
                     for text, path, length in zip(text_list, batch_paths, lengths):
                         tags = ''.join([str(it) for it in path[:length]])
                         for it in re.finditer("12*3", tags):
@@ -1867,12 +1863,18 @@ class ProductPredictor():
                         chars = [sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                         chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in l] for l in chars]
                         chars = pad_sequences(chars, maxlen=MAX_LEN, padding="post", truncating="post")
-                        lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
-                                                          feed_dict={
-                                                                    self.char_input: np.asarray(chars),
-                                                                    self.dropout: 1.0
-                                                                    })
-                        batch_paths = self.decode(scores, lengths, tran_)
+                        if USE_API:
+                            requests_result = requests.post(API_URL + "/predict_product",
+                                                   json={"inputs": chars.tolist()}, verify=True)
+                            batch_paths = json.loads(requests_result.text)['result']
+                            lengths = json.loads(requests_result.text)['lengths']
+                        else:
+                            lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
+                                                              feed_dict={
+                                                                        self.char_input: np.asarray(chars),
+                                                                        self.dropout: 1.0
+                                                                        })
+                            batch_paths = self.decode(scores, lengths, tran_)
                         for sentence, path, length in zip(list_sentence[_begin_index:_begin_index+_LEN],batch_paths, lengths):
                             tags = ''.join([str(it) for it in path[:length]])
                             for it in re.finditer("12*3", tags):
@@ -2067,7 +2069,7 @@ class ProductAttributesPredictor():
                 order_end = "%s-%s-%s" % (y, m, num)
             return order_begin, order_end
 
-        t1 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})月?$', text)
+        t1 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})月?$', text)
         if t1:
             year = t1.group(1)
             month = t1.group(3)
@@ -2079,7 +2081,7 @@ class ProductAttributesPredictor():
             order_begin = "%s-%s-01" % (year, month)
             order_end = "%s-%s-%s" % (year, month, num)
             return order_begin, order_end
-        t2 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})(月|/|.|-)(\d{1,2})日?$', text)
+        t2 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})(月|/|\.|-)(\d{1,2})日?$', text)
         if t2:
             y = t2.group(1)
             m = t2.group(3)
@@ -2088,8 +2090,31 @@ class ProductAttributesPredictor():
             d = '0'+d if len(d)<2 else d
             order_begin = order_end = "%s-%s-%s"%(y,m,d)
             return order_begin, order_end
-        all_match = re.finditer('^(?P<y1>\d{4})(年|/|.)(?P<m1>\d{1,2})(?:(月|/|.)(?:(?P<d1>\d{1,2})日)?)?'
-                                '(到|至|-)(?:(?P<y2>\d{4})(年|/|.))?(?P<m2>\d{1,2})(?:(月|/|.)'
+        # time format like "202105"
+        t3 = re.search("^(20\d{2})(\d{1,2})$",text)
+        if t3:
+            year = t3.group(1)
+            month = t3.group(2)
+            if int(month)>0 and int(month)<=12:
+                num = self.get_monthlen(year, month)
+                if len(month) < 2:
+                    month = '0' + month
+                if len(num) < 2:
+                    num = '0' + num
+                order_begin = "%s-%s-01" % (year, month)
+                order_end = "%s-%s-%s" % (year, month, num)
+                return order_begin, order_end
+        # time format like "20210510"
+        t4 = re.search("^(20\d{2})(\d{2})(\d{2})$", text)
+        if t4:
+            year = t4.group(1)
+            month = t4.group(2)
+            day = t4.group(3)
+            if int(month) > 0 and int(month) <= 12 and int(day)>0 and int(day)<=31:
+                order_begin = order_end = "%s-%s-%s"%(year,month,day)
+                return order_begin, order_end
+        all_match = re.finditer('^(?P<y1>\d{4})(年|/|\.)(?P<m1>\d{1,2})(?:(月|/|\.)(?:(?P<d1>\d{1,2})日)?)?'
+                                '(到|至|-)(?:(?P<y2>\d{4})(年|/|\.))?(?P<m2>\d{1,2})(?:(月|/|\.)'
                                 '(?:(?P<d2>\d{1,2})日)?)?$', text)
         y1 = m1 = d1 = y2 = m2 = d2 = ""
         found_math = False
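
The two new branches are the commit's namesake: compact dates such as "202105" and "20210510" previously fell through every pattern. A standalone check mirroring the added logic, with the project's get_monthlen() stubbed by calendar.monthrange:

import re
from calendar import monthrange

def parse_compact(text):
    t3 = re.search(r"^(20\d{2})(\d{1,2})$", text)        # e.g. "202105"
    if t3:
        year, month = t3.group(1), t3.group(2)
        if 0 < int(month) <= 12:
            num = str(monthrange(int(year), int(month))[1])   # days in month
            month = month.zfill(2)
            return "%s-%s-01" % (year, month), "%s-%s-%s" % (year, month, num.zfill(2))
    t4 = re.search(r"^(20\d{2})(\d{2})(\d{2})$", text)   # e.g. "20210510"
    if t4:
        year, month, day = t4.groups()
        if 0 < int(month) <= 12 and 0 < int(day) <= 31:
            d = "%s-%s-%s" % (year, month, day)
            return d, d
    return None

print(parse_compact("202105"))    # ('2021-05-01', '2021-05-31')
print(parse_compact("20210510"))  # ('2021-05-10', '2021-05-10')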
@@ -3373,3 +3398,31 @@ if __name__=="__main__":
     #     y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
     #     print(np.argmax(y,-1))
     '''
+
+    MAX_LEN = 1000
+    vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
+    vocab = load(vocabpath)
+    word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
+    index_unk = word2index.get("<unk>")
+    sentence = "招标人:广州市重点公共建设项目管理中心,联系人:李工,联系方式:020-22905689,招标代理:广东重工建设监理有限公司," \
+               "代理联系人:薛家伟,代理联系方式:13535014481,招标监督机构:广州市重点公共建设项目管理中心,监督电话:020-22905690," \
+               "备注:以上为招标公告简要描述,招标公告详细信息请查看“招标公告”附件,"
+    sentence = sentence*5
+    list_sentence = [sentence]*200
+    # print(list_sentence)
+    x = [[word2index.get(word, index_unk) for word in sentence] for sentence in
+         list_sentence]
+    x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
+    # print(x_len)
+    x = pad_sequences(x, maxlen=MAX_LEN, padding="post", truncating="post")
+
+    requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},
+                                    verify=True)
+    # predict_y = json.loads(requests_result.text)['result']
+    print("cost_time:", json.loads(requests_result.text)['cost_time'])
+    print(MAX_LEN, len(sentence), len(list_sentence))
+    requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},
+                                    verify=True)
+    # predict_y = json.loads(requests_result.text)['result']
+    print("cost_time:", json.loads(requests_result.text)['cost_time'])
+    print(MAX_LEN, len(sentence), len(list_sentence))

+ 1 - 1
BiddingKG/dl/time/re_servicetime.py

@@ -11,7 +11,7 @@ def re_serviceTime(text):
     text_index_list = []
 
     before = '(?P<before>' \
-             '工期/交货期/服务期|工期,\(日历天\)|工期\(交货期\)|合格工期\(天\)|服务期限\(年\)|工期\(天\)' \
+             '合同期限|工期/交货期/服务期|工期,\(日历天\)|工期\(交货期\)|合格工期\(天\)|服务期限\(年\)|工期\(天\)' \
              '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
              '|合格工期|计划工期\(服务期\)|服务期\(日历天\)|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
              '|交货时间|工期\(日历天\)' \