'''
Created on 2018-12-26

Source repository: luojiehua/BIDI_ML_INFO_EXTRACTION

@author: User
'''

import sys
import os
import codecs
import re
sys.path.append(os.path.abspath("../.."))
import requests
from BiddingKG.dl.common.Connection import *
import time
import psycopg2
import glob
from BiddingKG.dl.common.Utils import *
import json


if __name__=="__main__":

    #测试接口的代码
    #conn = getConnection()
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    
    #validation
    #sql = " select content,id from articles where id in(select doc_id from articles_validation where exists(select 1 from articles_processed where id=doc_id)) order by id"
    #training
    #sql = " select content,id from articles where not exists(select 1 from articles_validation where doc_id=articles.id) order by id limit 5000"
    
    sql = " select id from articles_processed "
    cursor.execute(sql)
    
    rows = cursor.fetchall()
    
    ids = []
    for row in rows:
        ids.append(row[0])
        
    
    # 添加对应headers 及 tonken 用于数据传参和登录认证使用
    myheaders = {'Content-Type': 'application/json'}
    # 接口测试数据
    
    #guardian_base = 'http://47.110.128.185:15015'
    guardian_base1 = 'http://127.0.0.1:15013'
    guardian_base2 = 'http://192.168.2.101:15015'
    # 使用requests的post方法进行请求路由
    result = []
    #content = row[0]
    #content = "<div> <div> 110kV龙台变～净 化厂35kV电力线路(外电部分) </div> <a>附件下载，文件请用音频播放器或者360浏览器打开</a> <br> <a>公告来源：http://bulletin.cebpubservice.com/biddingBulletin/2019-02-12/1058505.html</a> </div>"
    
    i = 0
    a1 = time.time()
    same_flag = None
    # files = glob.glob("C:\\Users\\User\\Desktop\\测试数据20200312\\*.html")
    files = data = load("../test/label_0_1197.pk")
    for file in files:
        # name = file.split("\\")[-1]
        name = file["filename"]
        # content = codecs.open(file,"r",encoding="utf8").read()
        content = file["content"]
        i += 1
        
        print(i,len(files))
        if i>=2000:
            break
        # print(file)
        user = {
            "content": content,
            "title":"XXXXXX",
            "doc_id":"1234555"
            }
        a = time.time()
        # _resp = requests.post(guardian_base2 + '/article_extract', json=user, headers=myheaders, verify=True)
        # resp_json = _resp.content.decode("utf-8")
        # print(resp_json)
        resp_json = {"code":file["code"],"name":file["name"]}
        _resp1 = requests.post(guardian_base2 + '/content_extract', json=user, headers=myheaders, verify=True)
        resp_json1 = _resp1.content.decode("utf-8")
        resp_json1 = json.loads(resp_json1)
        resp_json1 = {"code":resp_json1["code"],"name":resp_json1["name"]}
        resp_json = str(resp_json).replace("（","(").replace("）",")")
        print(resp_json)
        print(resp_json1)
        if resp_json==resp_json1:
            same_flag = 0
        else:
            same_flag = 1
        result.append([name,same_flag,resp_json,resp_json1])
    
    ''''''
    #将结果输出到文件方便查看
    result.sort(key=lambda x:x[1],reverse=True)
    i = 0
    with codecs.open("testInterface.html", "w", encoding="utf8") as f:
        f.write('<html><head>\
        <meta http-equiv="Content-Type"\
        content="text/html; charset=UTF-8">\
        </head>\
        <body bgcolor="#FFFFFF">\
        <table border="1">\
        <tr>\
        <td>序号</td>\
        <td>doc_id</td>\
        <td>same</td>\
        <td width=40%>before</td>\
        <td width=40%>重新训练</td>\
        </tr>')
        for item in result:
            i += 1
            f.write("<tr>"+"<td>"+str(i)+"</td>"+"<td>"+str(item[0])+"</td>"+"<td>"+str(item[1])+"</td>"+"<td>"+str(item[2])+"</td>"+"<td>"+str(item[3])+"</td>"+"</tr>")
        f.write("</table></body>")