# 3.py

'''
Created on 2019-01-03
@author: User
'''
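# Ad-hoc test client for the article_extract HTTP service: a single-request
# smoke test in __main__ plus a multi-threaded load test (test_highConcurrency).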
import sys
import os
import json
import re
import pickle
import requests
import codecs
from bs4 import BeautifulSoup
import time
import shutil
from threading import Thread
import jpype

sys.path.append(os.path.abspath("../.."))


def save(object_to_save, path):
    '''
    Save an object with pickle.
    @Args:
        object_to_save: the object to save
    @Return:
        the path the object was saved to
    '''
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)


def load(path):
    '''
    Load a pickled object.
    @Args:
        path: the path to read from
    @Return:
        the loaded object
    '''
    with open(path, 'rb') as f:
        object1 = pickle.load(f)
        return object1


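# POST one document to the article_extract endpoint; return the raw JSON body
# when the service answers 201, otherwise return None.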
def test(name, content):
    user = {
        # "content": "XXXXXXXXXXXXXXXXXXX",
        "content": content,
        "id": name,
        "doc_id": "1234"
    }
    myheaders = {
        'Content-Type': 'application/json',
        "Authorization": "NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg==",
        "appKey": "203780894",
        "appSecret": "3rwyr0b8djsn6l3o4i8mplxe4giiy2ke"
    }
    try:
        # _resp = requests.post('http://pai-eas-vpc.cn-hangzhou.aliyuncs.com/api/predict/content_extract', json=user, headers=myheaders, verify=True)
        _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
        # _resp = requests.post("http://192.168.2.101:15030" + '/article_extract', json=user, headers=myheaders, verify=True)
        # _resp = requests.post("http://127.0.0.1:15013" + '/content_extract', json=user, headers=myheaders, verify=True)
        # print("http://127.0.0.1:15014")
        resp_json = _resp.content.decode("utf-8")
        print("##", _resp.status_code)
        print("==", resp_json)
        print(json.loads(resp_json))
        if _resp.status_code == 201:
            print(json.loads(resp_json))
            return resp_json
        else:
            print(resp_json)
            return None
    except Exception as e:
        print(str(e))
        return None


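# Copy one source file from the local data directory into the getfile folder.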
def getFile(filename):
    path = "C:\\Users\\User\\Desktop\\数据20191014\\"
    file = path + filename
    dest_dir = "C:\\Users\\User\\Desktop\\getfile"
    shutil.copy(file, dest_dir)


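# Worker thread for the load test: replays up to 100 saved documents against
# the service and accumulates the per-stage cost_time reported in each response.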
class MyThread(Thread):
    # Class-level attributes, shared by every MyThread instance.
    cost_time = dict()
    list_result = []
    num_200 = []
    num_other = []
    data = load("list_contents.pk")

    def run(self):
        for item in self.data[:100]:
            filename = item[0]
            content = item[1]
            result = test(filename, content)
            if result is not None:
                self.num_200.append("")
                self.list_result.append(result)
                result = json.loads(result)
                _time = result["cost_time"]
                for _key in _time.keys():
                    if _key not in self.cost_time:
                        self.cost_time[_key] = 0
                    self.cost_time[_key] += _time[_key]
            else:
                self.num_other.append("")


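# Start thread_num worker threads, wait for them all, then report the total
# wall time, the success/failure counts and the accumulated cost_time.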
def test_highConcurrency():
    thread_num = 10
    list_thread = []
    cost_time = dict()
    _start_time = time.time()
    for i in range(thread_num):
        t = MyThread()
        list_thread.append(t)
    for t in list_thread:
        t.start()
    for t in list_thread:
        t.join()
    # cost_time, num_200 and num_other are class attributes of MyThread, so any
    # single instance sees the totals accumulated by all threads.
    t = list_thread[0]
    _time = t.cost_time
    for _key in _time.keys():
        if _key not in cost_time:
            cost_time[_key] = 0
        cost_time[_key] += _time[_key]
    num_200 = len(t.num_200)
    num_other = len(t.num_other)
    print("==================")
    print("cost:", time.time() - _start_time)
    print("num_200:", num_200, "num_other:", num_other)
    print(cost_time)


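# Single-request smoke test: extract the <div id="pcontent"> block from a local
# HTML file and post it to the service.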
if __name__ == "__main__":
    text = codecs.open("C:\\Users\\User\\Desktop\\2.html", "r", encoding="utf8").read()
    start_time = time.time()
    content = str(BeautifulSoup(text, 'lxml').find("div", id="pcontent"))
    print(content)
    test("12", content)
    print("takes %d" % (time.time() - start_time))
    '''
    data = load("list_contents.pk")
    for item in data[:100]:
        filename = item[0]
        content = item[1]
        a = time.time()
        # predict("12", content)
        test("12", content)
        print("takes", time.time() - a)
        break
    test_highConcurrency()
    '''