Browse Source

Fix product field extraction issues

rogel 4 years ago
parent
commit
44b04b7e29

+ 1 - 0
.gitignore

@@ -6,3 +6,4 @@
 /BiddingKG/dl/projectCode/traindata/
 /BiddingKG/dl/role/traindata/
 /BiddingKG/dl/test/traindata/
+/BiddingKG/dl/product/data/

+ 1 - 0
.idea/encodings.xml

@@ -2,5 +2,6 @@
 <project version="4">
   <component name="Encoding">
     <file url="file://$PROJECT_DIR$/BiddingKG/dl/form/websource_67000_table.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/BiddingKG/dl/product/test/2021-01-29-2021-01-29公告信息.xlsx" charset="GBK" />
   </component>
 </project>

+ 22 - 102
BiddingKG/app.py

@@ -7,15 +7,13 @@ Created on 2019-12-03
 import allspark
 import sys
 import os
+os.environ["KERAS_BACKEND"] = "tensorflow"
 import json
 import re
 import time
 import uuid
 from BiddingKG.dl.common.Utils import log
-import BiddingKG.dl.interface.predictor as predictor
-import BiddingKG.dl.interface.Preprocessing as Preprocessing
-import BiddingKG.dl.interface.getAttributes as getAttributes
-import BiddingKG.dl.entityLink.entityLink as entityLink
+from BiddingKG.dl.interface.extract import predict
 import numpy as np
 import ctypes
 import inspect
@@ -129,7 +127,7 @@ class MyProcessor(allspark.BaseProcessor):
                         print(entity.entity_text,entity.entity_type,entity.sentence_index,entity.begin_index,entity.label,entity.values)
                 '''
                 #print(prem)
-                data_res = Preprocessing.union_result(codeName, prem)[0][1]
+                data_res = predict(doc_id=_doc_id, text=content, title=_title)
                 data_res["cost_time"] = cost_time
                 data_res["success"] = True
                 #return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
@@ -151,10 +149,6 @@ class MyProcessor(allspark.BaseProcessor):
             do service initialization and load models in this function.
         """'''
         '''
-        self.codeNamePredict = predictor.CodeNamePredict()
-        self.premPredict = predictor.PREMPredict()
-        self.epcPredict = predictor.EPCPredict()
-        self.roleRulePredict = predictor.RoleRulePredictor()
         self.timeout = 60
         self.status_types = 5
         self.timeOfType = self.timeout//self.status_types
@@ -176,102 +170,28 @@ class MyProcessor(allspark.BaseProcessor):
         """
         data = data.decode("utf8")
         data = json.loads(data,encoding="utf8")
-        # k = str(uuid.uuid4())
-        # cost_time = dict()
-        # if "doc_id" in data:
-        #   _doc_id = data['doc_id']
-        # else:
-        #   _doc_id = ""
-        # if "title" in data:
-        #   _title = data["title"]
-        # else:
-        #   _title = ""
-        # data_res = ""
-        # try:
-        #     if "content" in data:
-        #         log("get request of doc_id:%s"%(_doc_id))
-        #         k = str(uuid.uuid4())
-        #         cost_time = dict()
-        #         content = data['content']
-        #         start_time = time.time()
-        #         list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_articles_processed([[k,content,"",_doc_id,_title]],useselffool=True)
-        #         log("get preprocessed done of doc_id%s"%(_doc_id))
-        #         cost_time["preprocess"] = time.time()-start_time
-        #         cost_time.update(_cost_time)
-        #         '''
-        #         for articles in list_articles:
-        #             print(articles.content)
-        #
-        #         '''
-        #         start_time = time.time()
-        #         codeName = self.codeNamePredict.predict(list_articles,MAX_AREA=2000)
-        #         log("get codename done of doc_id%s"%(_doc_id))
-        #         cost_time["codename"] = time.time()-start_time
-        #
-        #         start_time = time.time()
-        #         self.premPredict.predict(list_sentences,list_entitys)
-        #         log("get prem done of doc_id%s"%(_doc_id))
-        #         cost_time["prem"] = time.time()-start_time
-        #         start_time = time.time()
-        #         self.roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
-        #         cost_time["rule"] = time.time()-start_time
-        #         start_time = time.time()
-        #         self.epcPredict.predict(list_sentences,list_entitys)
-        #         log("get epc done of doc_id%s"%(_doc_id))
-        #         cost_time["person"] = time.time()-start_time
-        #         start_time = time.time()
-        #         entityLink.link_entitys(list_entitys)
-        #         '''
-        #         for list_entity in list_entitys:
-        #             for _entity in list_entity:
-        #                 for _ent in _entity.linked_entitys:
-        #                     print(_entity.entity_text,_ent.entity_text)
-        #         '''
-        #         prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
-        #         log("get attributes done of doc_id%s"%(_doc_id))
-        #         cost_time["attrs"] = time.time()-start_time
-        #
-        #
-        #         '''
-        #
-        #
-        #         for entitys in list_entitys:
-        #             for entity in entitys:
-        #                 print(entity.entity_text,entity.entity_type,entity.sentence_index,entity.begin_index,entity.label,entity.values)
-        #         '''
-        #         #print(prem)
-        #         data_res = Preprocessing.union_result(codeName, prem)[0][1]
-        #         data_res["cost_time"] = cost_time
-        #         data_res["success"] = True
-        #         #return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
-        #     else:
-        #         data_res = {"success":False,"msg":"content not passed"}
-        #
-        #
-        # except Exception as e:
-        #     data_res = {"success":False,"msg":str(e)}
-        # # return the result as JSON
-        # _resp = json.dumps(data_res,cls=MyEncoder)
-        # #log(str(data["flag"])+str(data))
-        # log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
-        _timeout = self.timeout
+
+        _doc_id = data.get("doc_id","")
+        _title = data.get("title","")
+        _content = data.get("content","")
 
         status_code = 200
-        if "timeout" in data:
-            _timeout = data["timeout"]
+        # if "timeout" in data:
+        #     _timeout = data["timeout"]
         list_result = []
-        t = Thread(target=self.run_thread,args=(data,list_result))
-        start_time = time.time()
-        t.start()
-        t.join(_timeout)
-        if t.is_alive():
-            stop_thread(t)
-            status_code = 302  # killed due to timeout
-            data_res = {"success":False,"msg":"timeout"}
-        else:
-            status_code += int((time.time()-start_time)//self.timeOfType+1)
-            data_res = list_result[0]
-        _resp = json.dumps(data_res,cls=MyEncoder)
+        # t = Thread(target=self.run_thread,args=(data,list_result))
+        # start_time = time.time()
+        # t.start()
+        # t.join(_timeout)
+        # if t.is_alive():
+        #     stop_thread(t)
+        #     status_code = 302  # killed due to timeout
+        #     data_res = {"success":False,"msg":"timeout"}
+        # else:
+        #     status_code += int((time.time()-start_time)//self.timeOfType+1)
+        #     data_res = list_result[0]
+        # _resp = json.dumps(data_res,cls=MyEncoder)
+        _resp = predict(doc_id=_doc_id,text=_content,title=_title)
 
         return self.post_process(_resp),status_code
         

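Note: the handler now hands extraction off to BiddingKG.dl.interface.extract.predict and the threaded-timeout path is commented out, so status_code stays 200 unless post_process changes it. A minimal sketch of the resulting request/response contract (payload keys mirror the diff; host and port are placeholders):

    # sketch: calling the slimmed-down service (illustrative URL)
    import requests, json

    payload = {"doc_id": "demo-001", "title": "", "content": "<html>...</html>"}
    resp = requests.post("http://host:port", json=payload,
                         headers={"Content-Type": "application/json"})
    result = json.loads(resp.content.decode("utf8"))  # predict() returns serialized JSON
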
+ 3 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -1060,6 +1060,9 @@ def segment(soup,final=True):
 
     # replace consecutive punctuation
 
+    if final:
+        text = re.sub("##space##"," ",text)
+
     punc_pattern = "(?P<del>[。,;::,\s]+)"
 
     list_punc = re.findall(punc_pattern,text)
@@ -1098,8 +1101,6 @@ def segment(soup,final=True):
             LOOP_BEGIN += LOOP_LEN
         text = _text
 
-    if final:
-        text = re.sub("##space##"," ",text)
 
     return text
 
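Note: this hunk moves the ##space## restoration from the end of segment() to just before the consecutive-punctuation pass, so the restored spaces are visible to the whitespace-aware pattern instead of surviving as literal placeholder text. A toy illustration of the ordering (the collapse step below is a stand-in, not the repo's actual loop):

    import re

    text = "项目名称:##space####space##XX设备采购。。"
    text = re.sub("##space##", " ", text)  # restore placeholder spaces first
    # stand-in collapse: reduce each punctuation/whitespace run to its first character
    text = re.sub(r"[。,;::,\s]+", lambda m: m.group()[0], text)
    print(text)  # 项目名称:XX设备采购。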

+ 82 - 0
BiddingKG/dl/interface/extract.py

@@ -0,0 +1,82 @@
+'''
+Created on 2019-01-04
+
+@author: User
+'''
+
+from bs4 import BeautifulSoup, Comment
+import copy
+import re
+import sys
+import os
+import codecs
+import requests
+import time
+
+_time1 = time.time()
+sys.path.append(os.path.abspath("../.."))
+from BiddingKG.dl.common.Utils import *
+import BiddingKG.dl.interface.predictor as predictor
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+import BiddingKG.dl.interface.getAttributes as getAttributes
+import BiddingKG.dl.entityLink.entityLink as entityLink
+import BiddingKG.dl.complaint.punish_predictor as punish_rule
+import json
+import numpy as np  # used by MyEncoder below
+
+
+
+''''''
+codeNamePredict = predictor.CodeNamePredict()
+premPredict = predictor.PREMPredict()
+epcPredict = predictor.EPCPredict()
+roleRulePredict = predictor.RoleRulePredictor()
+timePredict = predictor.TimePredictor()
+punish = punish_rule.Punish_Extract()
+productPredict = predictor.ProductPredictor()
+
+# custom JSON encoder for numpy and bytes types
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32,
+                              np.float64)):
+            return float(obj)
+        elif isinstance(obj,str):
+            return obj
+        return json.JSONEncoder.default(self, obj)
+
+def predict(doc_id,text,title=""):
+    # preprocessing: build article/sentence/entity structures
+    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",title]],useselffool=True)
+
+    # project code and project name
+    codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
+
+    # PREM (role/money) classifier and the new product field predictor
+    premPredict.predict(list_sentences,list_entitys)
+    productPredict.predict(list_sentences,list_entitys)
+
+    # remaining predictors, entity linking and attribute aggregation
+    roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    epcPredict.predict(list_sentences,list_entitys)
+    timePredict.predict(list_sentences, list_entitys)
+    entityLink.link_entitys(list_entitys)
+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
+    list_punish_dic = punish.get_punish_extracts(list_articles,list_sentences, list_entitys)
+
+    # union the partial results and serialize with MyEncoder
+    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+
+
+def test(name,content):
+    user = {
+        "content": content,
+        "id":name
+    }
+    myheaders = {'Content-Type': 'application/json'}
+    _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
+    resp_json = _resp.content.decode("utf-8")
+    print(resp_json)
+    return resp_json
+
+
+if __name__=="__main__":
+    pass

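Note: extract.py consolidates the model pipeline that app.py and test4.py previously each assembled by hand. A minimal local usage sketch (doc id and content are made-up placeholders):

    from BiddingKG.dl.interface.extract import predict
    import json

    result_json = predict(doc_id="demo-001",
                          text="<div>某公司XX设备采购项目招标公告...</div>",
                          title="招标公告")
    result = json.loads(result_json)  # predict() returns a MyEncoder-serialized string
    print(result)
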
+ 38 - 32
BiddingKG/dl/interface/predictor.py

@@ -17,7 +17,7 @@ from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.modelFactory import *
 import tensorflow as tf
 from tensorflow.python.framework import graph_util
-from BiddingKG.dl.product.data_util import decode, process_data, result_to_json
+from BiddingKG.dl.product.data_util import decode, process_data
 from BiddingKG.dl.interface.Entitys import Entity
 
 from threading import RLock
@@ -524,7 +524,9 @@ class PREMPredict():
         data_x = []
         points_entitys = []
         for list_entity,list_sentence in zip(list_entitys,list_sentences):
-            
+
+            list_entity.sort(key=lambda x:x.sentence_index)
+            list_sentence.sort(key=lambda x:x.sentence_index)
             p_entitys = 0
             p_sentences = 0
             while(p_entitys<len(list_entity)):
@@ -559,7 +561,9 @@ class PREMPredict():
         data_x = []
         points_entitys = []
         for list_entity,list_sentence in zip(list_entitys,list_sentences):
-            
+
+            list_entity.sort(key=lambda x:x.sentence_index)
+            list_sentence.sort(key=lambda x:x.sentence_index)
             p_entitys = 0
     
             while(p_entitys<len(list_entity)):
@@ -585,10 +589,12 @@ class PREMPredict():
     
     def predict_role(self,list_sentences, list_entitys):
         datas = self.search_role_data(list_sentences, list_entitys)
+
         if datas is None:
             return
         points_entitys = datas[1]
-        
+
+
         if USE_PAI_EAS:
             _data = datas[0]
             _data = np.transpose(np.array(_data),(1,0,2))
@@ -1038,36 +1044,36 @@ class RoleRulePredictor():
 
 
                 # high-confidence special-case adjustments
-                for s_index in range(len(list_sentence)):
-                    if p_entity.doc_id==list_sentence[s_index].doc_id and p_entity.sentence_index==list_sentence[s_index].sentence_index:
-                        tokens = list_sentence[s_index].tokens
-                        begin_index = p_entity.begin_index
-                        end_index = p_entity.end_index
-                        size = 15
-                        spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False)
-                        # distance
-                        list_distance = [100,100,100,100,100]
-                        _flag = False
-                        for _key in self.dict_list_pattern.keys():
-                            for pattern in self.dict_list_pattern[_key]:
-                                if pattern[0]=="W":
-                                    spans = spanWindow(tokens, begin_index, end_index, size=30, center_include=True, word_flag=True, use_text=False)
-                                    for _iter in re.finditer(pattern[1], spans[0][-10:]+spans[1]+spans[2]):
-                                        _flag = True
-                                        if _iter.span()[0]<list_distance[int(_key)]:
-                                            list_distance[int(_key)] = _iter.span()[0]
-                        # derive the result
-                        _label = np.argmin(list_distance)
-                        if _flag:
-                            if _label==2 and min(list_distance[3:])<100:
-                                _label += np.argmin(list_distance[3:])+1
-                            if _label in [2,3,4]:
-                                if p_entity.entity_type in ["company","org"]:
+                if p_entity.entity_type in ["company","org"]:
+                    for s_index in range(len(list_sentence)):
+                        if p_entity.doc_id==list_sentence[s_index].doc_id and p_entity.sentence_index==list_sentence[s_index].sentence_index:
+                            tokens = list_sentence[s_index].tokens
+                            begin_index = p_entity.begin_index
+                            end_index = p_entity.end_index
+                            size = 15
+                            spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False)
+                            # distance
+                            list_distance = [100,100,100,100,100]
+                            _flag = False
+                            for _key in self.dict_list_pattern.keys():
+                                for pattern in self.dict_list_pattern[_key]:
+                                    if pattern[0]=="W":
+                                        spans = spanWindow(tokens, begin_index, end_index, size=30, center_include=True, word_flag=True, use_text=False)
+                                        for _iter in re.finditer(pattern[1], spans[0][-10:]+spans[1]+spans[2]):
+                                            _flag = True
+                                            if _iter.span()[0]<list_distance[int(_key)]:
+                                                list_distance[int(_key)] = _iter.span()[0]
+                            # derive the result
+                            _label = np.argmin(list_distance)
+                            if _flag:
+                                if _label==2 and min(list_distance[3:])<100:
+                                    _label += np.argmin(list_distance[3:])+1
+                                if _label in [2,3,4]:
+                                    p_entity.label = _label
+                                    p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
+                                else:
                                     p_entity.label = _label
                                     p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
-                            else:
-                                p_entity.label = _label
-                                p_entity.values[int(_label)] = on_value+p_entity.values[int(_label)]/10
                 if p_entity.entity_type in ["money"]:
                     if str(p_entity.label)=="2":
                         for _sentence in list_sentence:

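Note: the two sort calls added in PREMPredict matter because its search methods walk list_entity and list_sentence with forward-only cursors; if either list is out of sentence order, the cursors desynchronize and entities are silently skipped. The RoleRulePredictor hunk additionally hoists the entity_type check in front of the sentence scan, so non-company/org entities skip the pattern matching entirely. A toy sketch of the alignment assumption (names illustrative):

    # both lists must be ordered by sentence_index for a two-pointer walk
    def align(entities, sentences):
        entities.sort(key=lambda x: x.sentence_index)
        sentences.sort(key=lambda x: x.sentence_index)
        p_e = p_s = 0
        pairs = []
        while p_e < len(entities) and p_s < len(sentences):
            if entities[p_e].sentence_index == sentences[p_s].sentence_index:
                pairs.append((entities[p_e], sentences[p_s]))
                p_e += 1  # several entities may share one sentence
            elif entities[p_e].sentence_index > sentences[p_s].sentence_index:
                p_s += 1  # advance to the entity's sentence
            else:
                p_e += 1  # entity has no matching sentence; skip it
        return pairs
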
+ 5 - 3
BiddingKG/dl/product/data_util.py

@@ -5,7 +5,6 @@
 import re
 import math
 import random
-import psycopg2
 import numpy as np
 from tensorflow.contrib.crf import viterbi_decode
 from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word
@@ -15,10 +14,12 @@ word_model = getModel_word()
 vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
 word2id = {k: v for v, k in enumerate(vocab)}
 max_id = len(vocab)
-conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.101')
-cursor = conn.cursor()
+
 
 def get_label_data():
+    import psycopg2
+    conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.101')
+    cursor = conn.cursor()
     sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 \
       and creation_date > to_timestamp('2021-01-14 00:00:00','yyyy-MM-dd HH24:mi:ss');"
     cursor.execute(sql)
@@ -77,6 +78,7 @@ def input_from_line(line):
     ids = [word2id.get(k, max_id) for k in string]
     tags = []
     return [[string], [ids], [tags]]
+
 def process_data(sentences):
     '''
     convert strings to id sequences and pad to a uniform length

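Note: importing psycopg2 and opening the connection inside get_label_data keeps data_util importable on inference machines that have no training database (predictor.py only needs decode and process_data from it). A hedged variant of the same pattern that also releases the connection, which the diff leaves open (query trimmed for brevity):

    def get_label_data_sketch():
        import psycopg2  # deferred import: only the labeling path needs the driver
        conn = psycopg2.connect(dbname='iepy_product', user='postgres',
                                password='postgres', host='192.168.2.101')
        try:
            with conn.cursor() as cursor:
                cursor.execute("select human_identifier, text from corpus_iedocument "
                               "where edittime NOTNULL and jump_signal=0")
                return cursor.fetchall()
        finally:
            conn.close()
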
+ 0 - 0
BiddingKG/dl/product/test/__init__.py


+ 23 - 0
BiddingKG/dl/product/test/testProduct.py

@@ -0,0 +1,23 @@
+
+
+import requests
+import json
+import numpy as np
+
+def predictProduct():
+    url = "http://192.168.2.101:15030"
+    myheaders = {'Content-Type': 'application/json'}
+    doc_id = "12"
+    title = ""
+    content = "123123"
+
+    data = {"doc_id":doc_id,"title":title,"content":content}
+
+    resp = requests.post(url,json=data,headers=myheaders, verify=True)
+
+    print(resp.content.decode("utf8"))
+    print(json.loads(resp.content.decode("utf8")))
+
+
+if __name__=="__main__":
+    predictProduct()

+ 1 - 82
BiddingKG/dl/test/test4.py

@@ -16,91 +16,10 @@ import time
 _time1 = time.time()
 sys.path.append(os.path.abspath("../.."))
 import fool
-from BiddingKG.dl.interface.Connection import *
 from BiddingKG.dl.common.Utils import *
-from BiddingKG.dl.interface.Connection import getConnection
-import BiddingKG.dl.interface.predictor as predictor
-import BiddingKG.dl.interface.Preprocessing as Preprocessing
-import BiddingKG.dl.interface.getAttributes as getAttributes
-import BiddingKG.dl.entityLink.entityLink as entityLink
-# import BiddingKG.dl.complaint.punish_rule as punish_rule
-import BiddingKG.dl.complaint.punish_predictor as punish_rule
+from BiddingKG.dl.interface.extract import predict
 import json
 
-
-'''
-doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
-
-conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
-
-cursor = conn.cursor()
-
-cursor.execute(" select content from articles where id='"+doc_id+"' ")
-
-row = cursor.fetchall()[0]
-
-
-#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
-
-#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
-'''
-
-'''''' 
-codeNamePredict = predictor.CodeNamePredict()
-premPredict = predictor.PREMPredict()
-epcPredict = predictor.EPCPredict()
-roleRulePredict = predictor.RoleRulePredictor()
-timePredict = predictor.TimePredictor()
-punish = punish_rule.Punish_Extract()
-productPredict = predictor.ProductPredictor()
-
-#自定义jsonEncoder
-class MyEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, np.ndarray):
-            return obj.tolist()
-        elif isinstance(obj, bytes):
-            return str(obj, encoding='utf-8')
-        elif isinstance(obj, (np.float_, np.float16, np.float32, 
-        np.float64)):
-            return float(obj)
-        elif isinstance(obj,str):
-            return obj
-        return json.JSONEncoder.default(self, obj)
-
-
-def predict(doc_id,text,title=""):
-    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",title]],useselffool=True)
-    for articles in list_articles:
-        print(articles.content)
-
-
-    ''''''
-        
-    codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
-    print(codeName)
-    premPredict.predict(list_sentences,list_entitys)
-    productPredict.predict(list_sentences,list_entitys)
-    # roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
-    print("epcPredict")
-    epcPredict.predict(list_sentences,list_entitys)
-    print("entityLink")
-    timePredict.predict(list_sentences, list_entitys)
-    print("timePredict")
-    entityLink.link_entitys(list_entitys)
-    print("getPREMs")
-    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
-    print("getPREMs")
-    list_punish_dic = punish.get_punish_extracts(list_articles,list_sentences, list_entitys)
-
-
-    for entitys in list_entitys:
-        for entity in entitys:
-            print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index,entity.wordOffset_begin,entity.wordOffset_end)
-    #print(prem)
-    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
-
-         
 def test(name,content):
     user = {
             "content": content,