luojiehua 2 years ago
parent
commit
fa2b64d986
38 changed files with 4724 additions and 0 deletions
  1. BiddingKG/dl_dev/test/10.py (+92 -0)
  2. BiddingKG/dl_dev/test/11.py (+3 -0)
  3. BiddingKG/dl_dev/test/12.py (+4 -0)
  4. BiddingKG/dl_dev/test/1598545223446.jpg (BIN)
  5. BiddingKG/dl_dev/test/2.py (+3 -0)
  6. BiddingKG/dl_dev/test/20201111181124.png (BIN)
  7. BiddingKG/dl_dev/test/3.py (+159 -0)
  8. BiddingKG/dl_dev/test/5.py (+68 -0)
  9. BiddingKG/dl_dev/test/6.py (+26 -0)
  10. BiddingKG/dl_dev/test/7.py (+88 -0)
  11. BiddingKG/dl_dev/test/9.py (+59 -0)
  12. BiddingKG/dl_dev/test/__init__.py (+0 -0)
  13. BiddingKG/dl_dev/test/article_extract.py (+276 -0)
  14. BiddingKG/dl_dev/test/compare.txt (+122 -0)
  15. BiddingKG/dl_dev/test/compare1.txt (+0 -0)
  16. BiddingKG/dl_dev/test/data/t10k-images-idx3-ubyte.gz (BIN)
  17. BiddingKG/dl_dev/test/data/t10k-labels-idx1-ubyte.gz (BIN)
  18. BiddingKG/dl_dev/test/data/train-images-idx3-ubyte.gz (BIN)
  19. BiddingKG/dl_dev/test/data/train-labels-idx1-ubyte.gz (BIN)
  20. BiddingKG/dl_dev/test/list_sentence_entity.pk (BIN)
  21. BiddingKG/dl_dev/test/model_person_classify.model.hdf5 (BIN)
  22. BiddingKG/dl_dev/test/model_person_classify_fjs.model.hdf5 (BIN)
  23. BiddingKG/dl_dev/test/person_savedmodel_new/saved_model.pb (BIN)
  24. BiddingKG/dl_dev/test/person_savedmodel_new/variables/variables.data-00000-of-00001 (BIN)
  25. BiddingKG/dl_dev/test/person_savedmodel_new/variables/variables.index (BIN)
  26. BiddingKG/dl_dev/test/t2/1.py (+25 -0)
  27. BiddingKG/dl_dev/test/t2/__init__.py (+0 -0)
  28. BiddingKG/dl_dev/test/test4.py (+116 -0)
  29. BiddingKG/dl_dev/test/test_data_fjs.py (+1621 -0)
  30. BiddingKG/dl_dev/test/test_model_fjs.py (+635 -0)
  31. BiddingKG/dl_dev/test/testocr.py (+93 -0)
  32. BiddingKG/dl_dev/test/testp.py (+3 -0)
  33. BiddingKG/dl_dev/test/val_fromiepy.py (+121 -0)
  34. BiddingKG/dl_dev/test/val_multi.py (+67 -0)
  35. BiddingKG/dl_dev/test/validation.py (+362 -0)
  36. BiddingKG/dl_dev/test/vocab_word.pk (BIN)
  37. BiddingKG/dl_dev/test/测试所有提取信息.py (+304 -0)
  38. BiddingKG/dl_dev/test/测试整个要素提取流程.py (+477 -0)

+ 92 - 0
BiddingKG/dl_dev/test/10.py

@@ -0,0 +1,92 @@
+#coding:UTF8
+import logging
+import json
+import time,re
+time_pattern = r"\d{4}-\d{2}-\d{2}.*"
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+def process(extractjson,otherjson):
+    if extractjson is not None:
+        _extract = json.loads(extractjson)
+    else:
+        _extract = {}
+    if otherjson is not None:
+        _other = json.loads(otherjson)
+    else:
+        _other = {}
+    project_code = ""
+    project_name = ""
+    tenderee = ""
+    agency = ""
+    win_tenderer = ""
+    bidding_budget = ""
+    win_bid_price = ""
+    page_time_stamp = 0
+    docchannel = 0
+    extract_count = 0
+    page_time = _other.get("pageTime",time.strftime('%Y-%m-%d',time.localtime()))
+    doctitle = _other.get("doctitle","")
+    doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '',  doctitle)
+    area = _other.get("area","")
+    province = _other.get("province","")
+    city = _other.get("city","")
+    district = _other.get("district","")
+    web_source_no = _other.get("webSourceNo","")
+    docchannel = _other.get("docchannel",0)
+    if re.search(time_pattern,page_time) is not None:
+        timeArray = time.strptime(page_time[:10], "%Y-%m-%d")  # only the "YYYY-MM-DD" part; trailing characters would make strptime raise
+        page_time_stamp = int(time.mktime(timeArray))
+    list_code = _extract.get("code",[])
+    if len(list_code)>0:
+        project_code = list_code[0]
+    project_name = _extract.get("name","")
+    dict_pack = _extract.get("prem",{})
+    logging.info(dict_pack)
+    for _key in dict_pack.keys():
+        if dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
+            extract_count += 1
+            if bidding_budget=="":
+                bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
+        for _role in dict_pack[_key]["roleList"]:
+            extract_count += 1
+            if _role[2]!='' and float(_role[2])>0:
+                extract_count += 1
+            if _role[0]=="tenderee":
+                tenderee = _role[1]
+            if _role[0]=="win_tenderer":
+                if  win_tenderer=="":
+                    win_tenderer = _role[1]
+                if _role[2]!='' and float(_role[2])>0:
+                    if win_bid_price=="":
+                        win_bid_price = str(float(_role[2]))
+            if _role[0]=="agency":
+                agency = _role[1]
+
+
+    if project_code!="":
+        extract_count += 1
+    if project_name!="":
+        extract_count += 1
+    logging.info('page_time=%s,doctitle=%s,doctitle_refine=%s,area=%s,province=%s,city=%s,'
+                 'district=%s,web_source_no=%s,project_code=%s,project_name=%s,tenderee=%s,agency=%s,win_tenderer=%s,bidding_budget=%s,win_bid_price=%s'%(page_time,doctitle,doctitle_refine,area,province,city,
+                 district,web_source_no,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price))
+
+
+if __name__=="__main__":
+    extractjson = '''
+    { "bidway": "竞争性谈判", "code": [ "SDGP370883202102000014" ], "cost_time": { "attrs": 0.012673616409301758, "codename": 0.17389655113220215, "nerToken": 0.47509217262268066, "person": 0.015163421630859375, "prem": 0.026870250701904297, "preprocess": 0.5857865810394287, "product": 0.19810962677001953, "punish": 0.16147398948669434, "rule": 0.004196882247924805, "tableToText": 0.06637310981750488, "time": 0.0062367916107177734 }, "docid": "", "moneysource": "", "name": "邹城市葛山摩崖石刻本体保护工程", "person_review": [], "prem": { "A": { "code": "", "roleList": [ [ "win_tenderer", "成都市屹华建筑工程公司", "2063787.0000", [ [ "袁华阳", "" ] ] ] ], "tendereeMoney": 0 }, "Project": { "code": "", "roleList": [ [ "tenderee", "邹城市文化和旅游局", 0, [ [ "王主任", "18653740099" ], [ "解华英", "" ] ] ], [ "agency", "山东省建设工程招标中心有限公司", 0, [ [ "张旋", "18653791560" ] ] ], [ "win_tenderer", "甘肃中铁建设工程有限公司", 2063787.0, [] ] ], "tendereeMoney": 0 } }, "product": [ "摩崖石刻本体保护工程", "石刻本体保护工程" ], "punish": {}, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_release": "2021-02-01" }
+    '''
+    otherjson = '''
+    {"city":"济南","pageTme":"2021-02-19","industry":"通用设备","uuid":"68c90d8b-7287-11eb-b5b7-c81f66ef0810","crtime":"2021-02-19 15:52:27","infoType":"机械设备","province":"山东","cruser":"superbxkcadmin","webSourceNo":"00049-1","docstatus":20,"area":"华东","webSourceName":"山东省政府采购信息公开平台","doccontent":"邹城市葛山摩崖石刻本体保护工程中标公告 邹城市葛山摩崖石刻本体保护工程中标公告 详细信息 邹城市葛山摩崖石刻本体保护工程成交公告 一、采购人:邹城市文化和旅游局 地址:邹城市太平东路2669号(邹城市文化和旅游局) 联系方式:3235508(邹城市文化和旅游局) 采购代理机构:山东省建设工程招标中心有限公司 地址:山东省济南市市中区县(区)经六路小纬四路46-1号 联系方式:0537-523","docid":134554287,"opertime":"2021-02-19 15:53:08","doctitle":"邹城市葛山摩崖石刻本体保护工程中标公告","infoSource":"政府采购","dockeywords":"邹城市:0.2218,葛山:0.1629,摩崖:0.1340,石刻:0.1287,本体:0.0945,较低:0.0931,得分:0.0917,屹:0.0863,占优势:0.0839,巴人:0.0817,偏低:0.0662,工程公司:0.0640,旅游局:0.0628,成都市:0.0543,保护:0.0526","partitionkey":288,"district":"市中","docchannel":101,"publishtime":"2021-02-19 15:53:08","status":8}
+    '''
+    process(extractjson,otherjson)
+
+    extractjson = '''
+    { "bidway": "磋商", "code": [ "SDGP370883202102000014" ], "cost_time": { "attrs": 0.010054588317871094, "codename": 0.07792425155639648, "nerToken": 0.14783811569213867, "person": 0.01098489761352539, "prem": 0.027380943298339844, "preprocess": 0.21127796173095703, "product": 0.08798742294311523, "punish": 0.0709075927734375, "rule": 0.0015246868133544922, "tableToText": 0.05526423454284668, "time": 0.0064449310302734375 }, "docid": "", "moneysource": "", "name": "邹城市葛山摩崖石刻本体保护工程", "person_review": [], "prem": { "A": { "code": "ZRGC20150476", "roleList": [ [ "win_tenderer", "成都市屹华建筑工程公司", "2063787", [ [ "袁华阳", "" ] ] ] ], "tendereeMoney": 0 }, "Project": { "code": "", "roleList": [ [ "tenderee", "邹城市文化和旅游局", 0, [ [ "王主任", "18653740099" ] ] ], [ "agency", "山东省建设工程招标中心有限公司", 0, [ [ "张旋", "18653791560" ] ] ] ], "tendereeMoney": 0 } }, "product": [ "石刻本体保护工程" ], "punish": {}, "serviceTime": "180日历天", "success": true, "time_bidclose": "", "time_bidopen": "", "time_release": "" }
+    '''
+    otherjson = '''
+    {"city":"济南","pageTime":"2021-02-19","industry":"通用设备","uuid":"03d1a62d-7285-11eb-b5b7-c81f66ef0810","crtime":"2021-02-19 15:35:18","infoType":"机械设备","province":"山东","cruser":"superbxkcadmin","webSourceNo":"03758-2","docstatus":20,"area":"华东","webSourceName":"山东省公共资源交易信息网","doccontent":"邹城市葛山摩崖石刻本体保护工程成交公告 邹城市葛山摩崖石刻本体保护工程成交公告 一、采购项目名称:邹城市葛山摩崖石刻本体保护工程 二、采购项目编号:SDGP370883202102000014 三、公告发布日期:2021年2月1日 四、成交日期:2021年2月19日 五、采购方式:竞争性谈判( ),竞争性磋商(√),询价( ),单一来源( )。 六、成交情况: 包 号 预中标供应商名称 工期 质量","docid":134551869,"opertime":"2021-02-19 15:36:06","doctitle":"邹城市葛山摩崖石刻本体保护工程成交公告","infoSource":"政府采购","dockeywords":"邹城市:0.1956,葛山:0.1437,得分:0.1213,摩崖:0.1182,石刻:0.1135,较低:0.0958,占优势:0.0863,巴人:0.0841,本体:0.0834,屹:0.0710,偏低:0.0681,成交:0.0545,文物:0.0535,工程公司:0.0527,旅游局:0.0517","partitionkey":370,"district":"市中","docchannel":101,"publishtime":"2021-02-19 15:36:06","status":8}
+    '''
+    process(extractjson,otherjson)
+
+    print(899463/8491933)
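
Note on 10.py: a minimal way to exercise process() with hand-rolled payloads (the JSON below is made up for illustration and is not part of the commit; process() falls back to today's date when "pageTime" is missing and leaves the role fields empty when "prem" has no entries):

    import json
    extractjson = json.dumps({"code": ["XY-2021-001"], "name": "示例项目", "prem": {}}, ensure_ascii=False)
    otherjson = json.dumps({"doctitle": "示例项目招标公告", "docchannel": 52}, ensure_ascii=False)
    process(extractjson, otherjson)   # logs the flattened fields via logging.info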

+ 3 - 0
BiddingKG/dl_dev/test/11.py

@@ -0,0 +1,3 @@
+a = "%d-"
+
+print(a%2)

+ 4 - 0
BiddingKG/dl_dev/test/12.py

@@ -0,0 +1,4 @@
+print("243705217")
+
+
+

BIN
BiddingKG/dl_dev/test/1598545223446.jpg


+ 3 - 0
BiddingKG/dl_dev/test/2.py

@@ -0,0 +1,3 @@
+a = [1,2,3]
+
+print(a/10)  # raises TypeError: a list cannot be divided by an int

BIN
BiddingKG/dl_dev/test/20201111181124.png


+ 159 - 0
BiddingKG/dl_dev/test/3.py

@@ -0,0 +1,159 @@
+'''
+Created on 2019年1月3日
+
+@author: User
+'''
+import sys
+import os
+import json
+import re
+import pickle
+import requests
+import codecs
+from bs4 import BeautifulSoup
+import time
+import shutil
+from threading import Thread
+import jpype
+
+sys.path.append(os.path.abspath("../.."))
+
+
+def save(object_to_save, path):
+    '''
+    Save an object with pickle
+    @Args:
+        object_to_save: the object to save
+        path: the file path to write to
+    @Return:
+        None
+    '''
+    with open(path, 'wb') as f:
+        pickle.dump(object_to_save, f)
+
+def load(path):
+    '''
+    Load a pickled object
+    @Args:
+        path: the file path to read from
+
+    @Return:
+        the loaded object
+    '''
+    with open(path, 'rb') as f:
+        object1 = pickle.load(f)
+        return object1
+def test(name,content):
+    user = {
+        #"content": "XXXXXXXXXXXXXXXXXXX",
+        "content": content,
+        "id":name,
+        "doc_id":"1234"
+    }
+    myheaders = {'Content-Type': 'application/json',"Authorization": "NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg==","appKey": "203780894","appSecret": "3rwyr0b8djsn6l3o4i8mplxe4giiy2ke"}
+    try:
+        #_resp = requests.post('http://pai-eas-vpc.cn-hangzhou.aliyuncs.com/api/predict/content_extract', json=user, headers=myheaders, verify=True)
+        _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
+        # _resp = requests.post("http://192.168.2.101:15030" + '/article_extract', json=user, headers=myheaders, verify=True)
+        # _resp = requests.post("http://127.0.0.1:15013" + '/content_extract', json=user, headers=myheaders, verify=True)
+        # print("http://127.0.0.1:15014")
+        resp_json = _resp.content.decode("utf-8")
+        print("##",_resp.status_code)
+        print("==",resp_json)
+        print(json.loads(resp_json))
+        if _resp.status_code==201:
+            print(json.loads(resp_json))
+            return resp_json
+        else:
+            print(resp_json)
+            return None
+    except Exception as e:
+        print(str(e))
+        return None
+
+def getFile(filename):
+    path = "C:\\Users\\User\\Desktop\\数据20191014\\"
+    file = path+filename
+    dest_dir = "C:\\Users\\User\\Desktop\\getfile"
+    shutil.copy(file,dest_dir)
+  
+''' '''
+class MyThread(Thread):
+    
+    cost_time = dict()
+    list_result = []
+    num_200 = []
+    num_other = []
+    
+    data = load("list_contents.pk")
+    def run(self):
+
+        for item in self.data[:100]:
+            filename = item[0]
+            content = item[1]
+            result = test(filename,content)
+            if result is not None:
+                self.num_200.append("")
+                self.list_result.append(result)
+                result = json.loads(result)
+                _time = result["cost_time"]
+                for _key in _time.keys():
+                    if _key not in self.cost_time:
+                        self.cost_time[_key] = 0
+                    self.cost_time[_key] += _time[_key]
+            else:
+                self.num_other.append("")
+    
+        
+def test_highConcurrency():
+    thread_num = 10
+    list_thread = []
+    cost_time = dict()
+    _start_time = time.time()
+    for i in range(thread_num):
+        t = MyThread()
+        list_thread.append(t)
+    for t in list_thread:
+        t.start()
+    for t in list_thread:
+        t.join()
+    t = list_thread[0]
+    _time = t.cost_time
+    for _key in _time.keys():
+        if _key not in cost_time:
+            cost_time[_key] = 0
+        cost_time[_key] += _time[_key]
+    num_200 = len(t.num_200)
+    num_other = len(t.num_other)
+
+    print("==================")
+    print("cost:",time.time()-_start_time)
+    print("num_200:",num_200,"num_other:",num_other)
+    print(cost_time)
+
+
+if __name__=="__main__":
+    import os
+    os.environ
+
+    text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
+    start_time = time.time()
+    content = str(BeautifulSoup(text,'lxml').find("div",id="pcontent"))
+    print(content)
+    test("12",content)
+    print("takes %d"%(time.time()-start_time))
+    
+    '''
+    data = load("list_contents.pk")
+    for item in data[:100]:
+        filename = item[0]
+        content = item[1]
+        a = time.time()
+        #predict("12",content)
+        test("12",content)
+        print("takes",time.time()-a)
+        break
+
+
+    test_highConcurrency()
+    '''

+ 68 - 0
BiddingKG/dl_dev/test/5.py

@@ -0,0 +1,68 @@
+'''
+Created on 2019年1月15日
+
+@author: User
+'''
+
+import tensorflow as tf
+from tensorflow.contrib.crf import crf_log_likelihood
+
+path = "D://Anaconda3.4//envs//dl_nlp//fool//pos.pb"
+
+
+def loss_layer(project_logits,y_target,trans,max_steps):
+    with tf.variable_scope("crf_loss1"):
+        log_likelihood, trans = crf_log_likelihood(inputs=project_logits, tag_indices=y_target,
+                                                   transition_params=trans, sequence_lengths=max_steps)
+    return tf.reduce_mean(-log_likelihood)
+
+def load_graph(path):
+    with tf.gfile.GFile(path, mode='rb') as f:
+        graph_def = tf.GraphDef()
+        graph_def.ParseFromString(f.read())
+        for i,n in enumerate(graph_def.node):
+            print("Name of the node - %s" % n.name)
+    with tf.Graph().as_default() as graph:
+        tf.import_graph_def(graph_def, name="prefix")
+    return graph
+'''
+with tf.gfile.GFile(path, mode='rb') as f:
+    graph_def = tf.GraphDef()
+    graph_def.ParseFromString(f.read())
+    for i,n in enumerate(graph_def.node):
+        print("Name of the node - %s" % n.name)
+with tf.Graph().as_default() as graph:
+    tf.import_graph_def(graph_def)
+    trans = graph.get_tensor_by_name("prefix/crf_loss/transitions:0")
+    logits = graph.get_tensor_by_name("prefix/project/logits:0")
+    y_target = tf.placeholder()
+    loss = loss_layer(logits, y_target, trans, 100)
+    summaryWriter = tf.summary.FileWriter('log/', graph)
+    #tf.Graph().get_operations()
+    
+'''
+
+def buildModel():
+    graph = load_graph(path)
+    with graph.as_default():
+        trans = graph.get_tensor_by_name("prefix/crf_loss/transitions:0")
+        lengths = graph.get_tensor_by_name("prefix/lengths:0")
+        logits = graph.get_tensor_by_name("prefix/project/logits:0")
+        print(logits)
+        print(trans)
+        y_target = tf.placeholder(dtype=tf.int32, shape=[None, None], name='y_target')
+        #loss = loss_layer(logits, y_target, trans, lengths)
+        summaryWriter = tf.summary.FileWriter('log/', graph)
+    
+if __name__=="__main__":
+    # import fool
+    # a = fool.LEXICAL_ANALYSER
+    # a._load_ner_model()
+    # _dict = a.ner_model.id_to_tag
+    # for _key in _dict.keys():
+    #     print(_key,_dict[_key])
+    # load_graph(path)
+    a = [1,2,3,44]
+    print(a[-100:])
+        
+    

+ 26 - 0
BiddingKG/dl_dev/test/6.py

@@ -0,0 +1,26 @@
+'''
+Created on 2019年10月24日
+
+@author: User
+'''
+
+import pyhdfs
+import os
+import codecs
+
+
+fs = pyhdfs.HdfsClient(hosts="192.168.48.178:50070",user_name='root1')  # NameNode as host:port; a comma here would be parsed as two separate hosts
+print(fs.get_home_directory())
+print(fs.get_active_namenode())
+file = fs.get_home_directory()+"/ContentExtract.rar"
+localfile = os.path.abspath("./ContentExtract.rar")
+print(localfile)
+print(fs.exists(file))
+if not os.path.exists(localfile):
+    with codecs.open(localfile,"w") as f:
+        f.flush()
+#fs.copy_to_local(file,localfile)
+fs.delete(file)
+fs.copy_from_local(localfile,file)
+print(fs.listdir(fs.get_home_directory()))
+print(fs.exists(file))

+ 88 - 0
BiddingKG/dl_dev/test/7.py

@@ -0,0 +1,88 @@
+#coding:UTF-8
+a = 1<<10
+b = bin(a)
+print(int(b[2:],2)>>10)
+c = a | (1<<2)
+print(b)
+print(a)
+
+a = "1234"
+
+print("-",a[3:])
+
+import math
+print(2**32)
+print(math.pow(2,32))
+
+print(hex(ord('g')))
+
+def get_s16(val):  # despite the name, this reinterprets an unsigned 32-bit value as signed
+    if val < 0x80000000:
+        return val
+    else:
+        return (val - 0x100000000)
+print(hex(336860180))
+
+print(0xf0551700)
+print(0xe8c81900)
+
+print([int(math.floor(abs(math.sin(i + 1)) * (2 ** 32))) for i in range(64)])
+
+import re
+_pattern = "(?P<projectDigest>项目概况.{10,1000})"
+# _pattern = "(建筑面积[约为是]*[\d,]+(\.\d+)?[十百千万]*(㎡|平方米))"
+
+text = '''
+项目名称:淮上区2020年市政道路维修及绿化养护施工工程监理 
+'''
+text = text.replace("\r","").replace("\n",'')
+
+
+
+def extract_proportion(content):
+    _pattern = "(?P<proportion>((建筑|建设|区域)?面积|全长|项目规模)[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万]*([\]】平方kK千万公㎡mM米里]*))"
+    _pattern_search = re.search(_pattern,content)
+    _proportion = ""
+    if _pattern_search is not None:
+        _proportion = _pattern_search.groupdict().get("proportion","")
+    return _proportion
+
+def extract_projectDigest(content):
+    _pattern = "(?P<projectDigest>(项目|工程|标的|需求|建设|招标|采购|内容)(概况|规模|简介|信息|范围|内容|说明|摘要).{10,300})"
+    _pattern_search = re.search(_pattern,content)
+    _projectDigest = ""
+    _find = ""
+    if _pattern_search is not None:
+        _find = _pattern_search.groupdict().get("projectDigest","")
+    if len(_find)>0:
+        _projectDigest = "。".join(_find.split("。")[0:3])
+    return _projectDigest
+
+print(extract_proportion(text))
+
+print(re.findall(_pattern,text))
+print(extract_projectDigest(text))
+
+import uuid
+
+print(uuid.uuid4())
+
+def extract_legal_stage(content,_pattern):
+    dict_stage = {"设计阶段":"设计",
+                  "环评阶段":"环评",
+                  "施工准备":"监理",
+                  "施工在建":"施工"}
+    list_stage_v = []
+    for k,v in dict_stage.items():
+        list_stage_v.append("(?P<%s>%s)"%(k,v))
+    stage_pattern = "|".join(list_stage_v)
+    list_stage = []
+    for stage_search in re.finditer(stage_pattern,content):
+        for k,v in stage_search.groupdict().items():
+            if v is not None:
+                list_stage.append(k)
+    if len(list_stage)>0:
+        return list_stage[-1]
+    return None
+
+print(extract_legal_stage(text,"工程"))
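
Note on 7.py: extract_proportion() only returns a non-empty string when the text contains one of the area/length keywords (面积, 全长, 项目规模); the sample text in this script has none. A quick check on a made-up sentence:

    sample = "本项目建筑面积约为1200平方米。"
    print(extract_proportion(sample))   # -> 建筑面积约为1200平方米
    print(extract_proportion(text))     # -> '' (no matching keyword in the script's sample text)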

+ 59 - 0
BiddingKG/dl_dev/test/9.py

@@ -0,0 +1,59 @@
+import requests
+import re
+
+import simhash
+
+url = "https://www.qimao.com/shuku/a-a-a-a-a-a-a-click-1/"
+
+def get_hexxor(s1, _0x4e08d8):
+    _0x5a5d3b = ''
+
+    for i in range(len(s1)):
+        if i % 2 != 0: continue
+        _0x401af1 = int(s1[i: i+2], 16)
+        _0x105f59 = int(_0x4e08d8[i: i+2], 16)
+        _0x189e2c_10 = (_0x401af1 ^ _0x105f59)
+        print("i==",_0x401af1,_0x105f59,_0x189e2c_10)
+        print("i==",hex(_0x401af1),hex(_0x105f59),hex(_0x189e2c_10))
+        _0x189e2c = hex(_0x189e2c_10)[2:]
+        print("_0x189e2c",_0x189e2c)
+        if len(_0x189e2c) == 1:
+            _0x189e2c = '0' + _0x189e2c
+            print("_0x189e2c",_0x189e2c)
+        _0x5a5d3b += _0x189e2c
+    return _0x5a5d3b
+
+def get_unsbox(arg1):
+    _0x4b082b = [0xf, 0x23, 0x1d, 0x18, 0x21, 0x10, 0x1, 0x26, 0xa, 0x9, 0x13, 0x1f, 0x28, 0x1b, 0x16, 0x17, 0x19, 0xd,
+                 0x6, 0xb, 0x27, 0x12, 0x14, 0x8, 0xe, 0x15, 0x20, 0x1a, 0x2, 0x1e, 0x7, 0x4, 0x11, 0x5, 0x3, 0x1c,
+                 0x22, 0x25, 0xc, 0x24]
+    print(_0x4b082b)
+    _0x4da0dc = []
+    _0x12605e = ''
+    for i in _0x4b082b:
+        print('i--',i,i-1)
+        _0x4da0dc.append(arg1[i-1])
+    _0x12605e = "".join(_0x4da0dc)
+    return _0x12605e
+
+
+# first request: fetch the page that contains the js challenge
+headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"}
+
+r = requests.get(url, headers=headers)
+# extract arg1 from the returned js
+arg1 = re.findall("arg1=\'(.*?)\'", r.text)[0]
+
+# generate the cookie value
+s1 = get_unsbox(arg1)
+print("===",arg1,s1)
+_0x4e08d8 = "3000176000856006061501533003690027800375"
+_0x12605e = get_hexxor(s1, _0x4e08d8)
+
+print(s1, _0x12605e)
+# second request, carrying the cookie, to fetch the real html
+headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36",
+           "cookie": "acw_sc__v2=%s" % _0x12605e}
+
+r = requests.get(url, headers=headers)
+# print(r.text)
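
Note on 9.py: the two helpers reproduce the page's acw_sc__v2 anti-crawler challenge: get_unsbox() rearranges the 40-character arg1 with a fixed permutation table, and get_hexxor() XORs the result byte-pair-wise with a constant hex key, zero-padding single-digit results. A one-line worked example of that XOR/padding step (values picked for illustration):

    print(hex(int("3f", 16) ^ int("30", 16)))   # -> 0xf, which get_hexxor appends as '0f'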

+ 0 - 0
BiddingKG/dl_dev/test/__init__.py


+ 276 - 0
BiddingKG/dl_dev/test/article_extract.py

@@ -0,0 +1,276 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2020/4/24 0024 15:20 
+
+# coding=utf-8
+# evaluate is the entry function of this UDF and must keep exactly this name
+from odps.udf import annotate
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+
+
+def recall(y_true, y_pred):
+    '''
+    Compute recall
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: labels predicted by the model
+
+    @Return
+        recall
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
+    if c3 == 0:
+        return 0
+    recall = c1 / c3
+    return recall
+
+
+def f1_score(y_true, y_pred):
+    '''
+    Compute the F1 score
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: labels predicted by the model
+
+    @Return
+        F1 score
+    '''
+
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
+    precision = c1 / c2
+    if c3 == 0:
+        recall = 0
+    else:
+        recall = c1 / c3
+    f1_score = 2 * (precision * recall) / (precision + recall)
+    return f1_score
+
+
+def precision(y_true, y_pred):
+    '''
+    Compute precision
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: labels predicted by the model
+
+    @Return
+        precision
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    precision = c1 / c2
+    return precision
+
+# add a cached archive (the python environment with pandas etc.) to sys.path
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+    sys.path.append(dir_names[0])
+    
+    return os.path.dirname(dir_names[0])
+
+
+# initialise the business data packages; because of upload size limits and inconsistent python/archive versions they have to be unpacked and imported manually
+def init_env(list_files, package_name):
+    import os, sys
+
+    if len(list_files) == 1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip %s -d %s" % (cmd_line, package_name))
+    elif len(list_files) > 1:
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " " + os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip temp.zip -d %s" % (package_name))
+    sys.path.append(os.path.abspath(package_name))
+
+
+# main UDF class
+@annotate("string->string")
+class Extractor(object):
+    def __init__(self):
+        global log  # declare the global before the import binds the name (a later global statement is a SyntaxError on python 3.6+)
+        import logging as log
+        import os
+        log.basicConfig(level=log.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        logger = log.getLogger(__name__)
+
+        model_path = os.path.abspath(get_cache_file('attentiongruacc0.932.model').name)
+        log.info('model_path %s', model_path)
+        log.info(os.path.exists(model_path))
+
+        # init_env(['pyhanlp.z01', 'pyhanlp.z02','pyhanlp.z03','pyhanlp.z04'], 'pyhanlp')
+        init_env(['pyhanlp.z01', 'pyhanlp.z02'], 'pyhanlp')
+        # init_env(['envs_py37.zip.env'], 'envs_py37')
+        include_package_path("envs_py37.env.zip")
+        init_env(['so.env'], '.')
+        init_env(['pkl_csv.z01'], '.')
+        import pickle
+        
+        import csv
+        # declare globals before the imports below bind the names (a global statement after
+        # the assignment is a SyntaxError on python 3.6+); K is also used by the module-level metrics
+        global json, re, np, tf, K
+        import re as re
+        import tensorflow as tf
+        import numpy as np
+        import keras.backend as K
+        from keras import models
+        from keras.engine.topology import Layer
+        
+        import json as json
+
+
+        
+        log.info('import package done------------------')
+        # dirpath = os.path.abspath('pyhanlp')
+        # path = dirpath+'/pyhanlp/static/__init__.py'        # return dirpath
+        # dirpath = os.path.dirname(os.path.abspath(get_cache_file('pyhanlp.z01').name))
+        # return '; '.join([a for a in os.listdir(os.listdir(dirpath)[0])])
+        # path2 = os.path.abspath(get_cache_file('hanlpinit.txt').name)
+        # content = []
+        # with open(path2, encoding='utf-8') as f:
+        #     for line in f:
+        #         content.append(line)
+        # # return '; '.join(content)
+        # with open(path, 'w', encoding='utf-8') as f:
+        #     f.writelines(content)
+        # log.info('rewrite hanlp path done--------------------')
+        # archive_files = get_cache_archive('token_stopwds.zip')
+        # names = [os.path.dirname(os.path.normpath(f.name)) for f in archive_files]
+        # with open(names[0]+'/bidi_classify_stop_words.csv', 'r', encoding='utf-8') as f:
+        #     self.stopwords = [row[0] for row in csv.reader(f)]
+        # with open(names[0]+'/word_index_955871.pk', 'rb') as f:
+        #     self.word_index = pickle.load(f)
+
+        from pyhanlp import HanLP, JClass
+        HanLP.Config = JClass('com.hankcs.hanlp.HanLP$Config')
+        HanLP.Config.ShowTermNature = False
+        self.hanlp = HanLP
+        log.info('import hanlp done---------------------')
+
+        class Attention(Layer):
+            log.info('******attention****************')
+            print('-------attention------------------')
+
+            def __init__(self, **kwargs):
+                super(Attention, self).__init__(**kwargs)
+
+            def build(self, input_shape):
+                # W: (EMBED_SIZE, 1)
+                # b: (MAX_TIMESTEPS, 1)
+                # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
+                self.W = self.add_weight(name="W_{:s}".format(self.name),
+                                         shape=(input_shape[-1], 1),
+                                         initializer="normal")
+                self.b = self.add_weight(name="b_{:s}".format(self.name),
+                                         shape=(input_shape[1], 1),
+                                         initializer="zeros")
+                self.u = self.add_weight(name="u_{:s}".format(self.name),
+                                         shape=(input_shape[1], input_shape[1]),
+                                         initializer="normal")
+                super(Attention, self).build(input_shape)
+
+            def call(self, x, mask=None):
+                # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+                # et: (BATCH_SIZE, MAX_TIMESTEPS)
+                et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
+                # at: (BATCH_SIZE, MAX_TIMESTEPS)
+                at = K.dot(et, self.u)
+                at = K.exp(at)
+                if mask is not None:
+                    at *= K.cast(mask, K.floatx())
+                # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+                at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+                atx = K.expand_dims(at, axis=-1)
+                ot = atx * x
+                # output: (BATCH_SIZE, EMBED_SIZE)
+                return K.sum(ot, axis=1)
+
+            def compute_mask(self, input, input_mask=None):
+                # do not pass the mask to the next layers
+                return None
+
+            def compute_output_shape(self, input_shape):
+                # output shape: (BATCH_SIZE, EMBED_SIZE)
+                return (input_shape[0], input_shape[-1])
+
+            def get_config(self):
+                return super(Attention, self).get_config()
+
+        
+        self.model = models.load_model(model_path,
+                                       custom_objects={'precision': precision,
+                                                       'recall': recall,
+                                                       'f1_score': f1_score,
+                                                       'Attention': Attention})
+        log.info('init model end  --')
+
+        pk_path = os.path.abspath('pkl_csv')
+        with open(pk_path + '/label_mapping210.pkl', 'rb') as f:
+            self.label_map = pickle.load(f)
+        print('load label_map done')
+        with open(pk_path + '/bidi_classify_stop_words.csv', 'r', encoding='utf-8') as f:
+            self.stopwords = [row[0] for row in csv.reader(f)]
+        with open(pk_path + '/word_index_955871.pk', 'rb') as f:
+            self.word_index = pickle.load(f)
+        with open(pk_path + '/class_subclass_dic211.pk', 'rb') as f:
+            self.class_dic = pickle.load(f)
+        log.info('classs init done ----')
+
+    def evaluate(self, text):
+        # strip html tags
+        text = re.sub('\s', '', str(text))
+        text = re.sub('<\s*script[^>]*>.*?<\s*/\s*script\s*>', '', text)
+        text = re.sub('<\s*style[^>]*>.*?<\s*/\s*style\s*>', '', text)
+        text = re.sub('</?\w+[^>]*>', '', text)
+        # remove noise characters (latin letters, dates, digits, punctuation) and keep the first 500 characters
+        text = re.sub('\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]', '', text)[:500]
+        # segment with HanLP
+        result = self.hanlp.segment(text)
+        text_list = [str(result.get(i)) for i in range(result.size())]
+        # filter out stopwords
+        text_list = [word for word in text_list if word not in self.stopwords and len(word) > 1]
+        # de-duplicate while preserving order
+        l2 = []
+        [l2.append(i) for i in text_list if i not in l2]
+        # map words to integer ids
+        text_list = [str(self.word_index.get(word, 0)) for word in l2]
+        # padding and trans to array
+        text_list = text_list[:100] if len(text_list) > 100 else text_list + ['0'] * (100 - len(text_list))
+        features = np.array([text_list[:100] if len(text_list) > 100 else text_list + [0] * (100 - len(text_list))])
+        log.info('数字化结束-------------------')
+        # features = np.array([s.split(',')[:100] if len(s.split(','))>100 else s.split(',')+[0]*(100-len(s.split(',')))])
+        with tf.get_default_graph().as_default():
+            log.info('准备预测-------------------')
+            logits = self.model.predict(features)
+            # return ','.join(logits[0])
+            # result = self.label_map(np.argmax(logits[0]))
+            # return result
+            log.info('预测结束-------------------')
+            top3 = np.argsort(-logits[0], axis=-1)[:3]
+            prob = ['%.4f' % (logits[0][i]) for i in top3]
+            pre = [self.label_map[i] for i in top3]
+            rd = {}
+            i = 1
+            for a in pre:
+                sub, father = self.class_dic[a].split(',')
+                rd['top' + str(i)] = {'subclass': sub, 'class_name': a, 'class': father}
+                i += 1
+
+            log.info('准备返回字符串')
+            return json.dumps(rd,ensure_ascii=False)    
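
Note on article_extract.py: evaluate() is hard to run outside ODPS because it depends on cached archives and model files, but its text preprocessing can be reproduced in isolation. A distilled, standalone sketch (the tiny word_index and the token list standing in for HanLP segmentation are made up for illustration):

    import re
    import numpy as np

    word_index = {"设备": 7, "采购": 12}                     # stand-in vocabulary
    raw = "<div><p>某单位办公设备采购项目 2021</p></div>"
    text = re.sub(r'</?\w+[^>]*>', '', raw)                  # strip html tags
    text = re.sub(r'[^\u4e00-\u9fa5]', '', text)[:500]       # keep only Chinese characters, first 500
    tokens = ["某", "单位", "办公", "设备", "采购", "项目"]     # stand-in for HanLP output after stopword filtering
    ids = [str(word_index.get(w, 0)) for w in tokens]         # unknown words map to id 0
    ids = ids[:100] if len(ids) > 100 else ids + ['0'] * (100 - len(ids))
    features = np.array([ids])                                # shape (1, 100), the input to model.predict
    print(features.shape)                                     # -> (1, 100)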

+ 122 - 0
BiddingKG/dl_dev/test/compare.txt

@@ -0,0 +1,122 @@
+['比地_101_61298868.html', 7, set()]---['比地_101_61298868.html', 7, {'云南区域公司'}]---['比地_101_61298868.html', 7, set()]
+['比地_101_61299979.html', 15, {'吐哈油田'}]---['比地_101_61299979.html', 15, {'吐哈油田'}]---['比地_101_61299979.html', 15, {'吐哈油田'}]
+['比地_101_61299982.html', 16, {'鄯善县联华超市'}]---['比地_101_61299982.html', 16, {'鄯善县联华超市'}]---['比地_101_61299982.html', 16, {'鄯善县联华超市'}]
+['比地_101_61299983.html', 17, {'吐哈油田'}]---['比地_101_61299983.html', 17, {'吐哈油田'}]---['比地_101_61299983.html', 17, {'吐哈油田'}]
+['比地_101_61300008.html', 18, set()]---['比地_101_61300008.html', 18, {'南京信息工程大学'}]---['比地_101_61300008.html', 18, {'南京信息工程大学', '深圳雅学资讯有限公司公司'}]
+['比地_101_61300847.html', 22, set()]---['比地_101_61300847.html', 22, {'河南理工大学'}]---['比地_101_61300847.html', 22, {'河南理工大学'}]
+['比地_101_61301640.html', 27, {'湖南理工职业技术学院'}]---['比地_101_61301640.html', 27, {'湖南理工职业技术学院'}]---['比地_101_61301640.html', 27, {'湖南理工职业技术学院'}]
+['比地_101_61302111.html', 30, set()]---['比地_101_61302111.html', 30, set()]---['比地_101_61302111.html', 30, {'临沭县创展电脑销售有限公司'}]
+['比地_101_61302147.html', 31, {'联想京东'}]---['比地_101_61302147.html', 31, set()]---['比地_101_61302147.html', 31, {'联想京东'}]
+['比地_101_61304530.html', 42, {'电力工程公司'}]---['比地_101_61304530.html', 42, set()]---['比地_101_61304530.html', 42, set()]
+['比地_101_61305197.html', 45, {'好孩子(中国)商贸有限公司'}]---['比地_101_61305197.html', 45, {'好孩子(中国)商贸有限公司'}]---['比地_101_61305197.html', 45, {'好孩子(中国)商贸有限公司'}]
+['比地_101_61307182.html', 48, {'漳浦县景秀园艺场'}]---['比地_101_61307182.html', 48, {'漳浦县景秀园艺场'}]---['比地_101_61307182.html', 48, {'漳浦县景秀园艺场'}]
+['比地_101_61307601.html', 58, {'北京城建'}]---['比地_101_61307601.html', 58, {'北京城建'}]---['比地_101_61307601.html', 58, {'北京城建'}]
+['比地_101_61309234.html', 61, {'南通大学'}]---['比地_101_61309234.html', 61, {'南通大学'}]---['比地_101_61309234.html', 61, {'南通大学'}]
+['比地_101_61313070.html', 68, {'中粮昌吉'}]---['比地_101_61313070.html', 68, {'中粮昌吉'}]---['比地_101_61313070.html', 68, {'中粮昌吉'}]
+['比地_101_61316516.html', 70, set()]---['比地_101_61316516.html', 70, {'网络信息与现代教育技术中心'}]---['比地_101_61316516.html', 70, {'网络信息与现代教育技术中心'}]
+['比地_101_61316555.html', 71, {'党委宣传部'}]---['比地_101_61316555.html', 71, set()]---['比地_101_61316555.html', 71, {'党委宣传部'}]
+['比地_101_61320731.html', 77, {'北京师范大学厦门海沧附属学校'}]---['比地_101_61320731.html', 77, set()]---['比地_101_61320731.html', 77, set()]
+['比地_101_61333318.html', 103, {'澳柯玛股份有限公司', '青岛杰诚机械制造厂'}]---['比地_101_61333318.html', 103, {'青岛杰诚机械制造厂'}]---['比地_101_61333318.html', 103, {'青岛杰诚机械制造厂'}]
+['比地_101_61335978.html', 108, set()]---['比地_101_61335978.html', 108, set()]---['比地_101_61335978.html', 108, {'中粮昌吉'}]
+['比地_101_61338377.html', 118, set()]---['比地_101_61338377.html', 118, {'中铁二十五局集团有限公司'}]---['比地_101_61338377.html', 118, set()]
+['比地_101_61343884.html', 123, set()]---['比地_101_61343884.html', 123, set()]---['比地_101_61343884.html', 123, {'新院区儿童医院'}]
+['比地_101_61344251.html', 124, {'南昌航空大学'}]---['比地_101_61344251.html', 124, {'南昌航空大学'}]---['比地_101_61344251.html', 124, {'南昌航空大学'}]
+['比地_101_61347152.html', 132, {'蚌埠三院'}]---['比地_101_61347152.html', 132, {'蚌埠三院'}]---['比地_101_61347152.html', 132, {'蚌埠三院', '蚌埠三院采招办'}]
+['比地_101_61348179.html', 137, {'上海商学院商务部国际商务官员研修基地'}]---['比地_101_61348179.html', 137, set()]---['比地_101_61348179.html', 137, {'上海商学院商务部国际商务官员研修基地'}]
+['比地_101_61348276.html', 141, {'重庆尚尚广告传媒策划有限公司'}]---['比地_101_61348276.html', 141, set()]---['比地_101_61348276.html', 141, set()]
+['比地_101_61348597.html', 147, {'苏州工业设备安装集团', '江苏元田建设集团'}]---['比地_101_61348597.html', 147, {'苏州工业设备安装集团', '江苏元田建设集团'}]---['比地_101_61348597.html', 147, {'江苏元田建设集团', '苏州工业设备安装集团'}]
+['比地_101_61349073.html', 151, {'龙沙区恩启电子产品经销处'}]---['比地_101_61349073.html', 151, {'龙沙区恩启电子产品经销处'}]---['比地_101_61349073.html', 151, {'龙沙区恩启电子产品经销处'}]
+['比地_101_61349507.html', 156, set()]---['比地_101_61349507.html', 156, {'姑苏区优策贸易商行'}]---['比地_101_61349507.html', 156, set()]
+['比地_101_61350445.html', 167, {'南京市江宁区圣村塑料加工厂'}]---['比地_101_61350445.html', 167, set()]---['比地_101_61350445.html', 167, {'南京市江宁区圣村塑料加工厂'}]
+['比地_101_61350661.html', 171, {'好孩子(中国)商贸有限公司'}]---['比地_101_61350661.html', 171, {'好孩子(中国)商贸有限公司'}]---['比地_101_61350661.html', 171, {'好孩子(中国)商贸有限公司'}]
+['比地_101_61351066.html', 180, set()]---['比地_101_61351066.html', 180, set()]---['比地_101_61351066.html', 180, {'重庆展翼机械设备有限公司'}]
+['比地_101_61351596.html', 183, {'暨南大学'}]---['比地_101_61351596.html', 183, {'暨南大学'}]---['比地_101_61351596.html', 183, {'暨南大学'}]
+['比地_101_61352528.html', 190, set()]---['比地_101_61352528.html', 190, {'中铁九局集团电务工程有限公司'}]---['比地_101_61352528.html', 190, set()]
+['比地_101_61353124.html', 199, set()]---['比地_101_61353124.html', 199, {'沈阳体育学院'}]---['比地_101_61353124.html', 199, set()]
+['比地_101_61353778.html', 206, {'学前儿童卫生与保健实训室'}]---['比地_101_61353778.html', 206, {'学前儿童卫生与保健实训室'}]---['比地_101_61353778.html', 206, {'学前儿童卫生与保健实训室'}]
+['比地_101_61354532.html', 217, set()]---['比地_101_61354532.html', 217, {'中国船舶重工集团应急预警与救援装备股份有限公司赤壁分公司'}]---['比地_101_61354532.html', 217, {'中国船舶重工集团应急预警与救援装备股份有限公司赤壁分公司'}]
+['比地_101_61354855.html', 221, set()]---['比地_101_61354855.html', 221, {'材料科学与工程学院'}]---['比地_101_61354855.html', 221, {'材料科学与工程学院'}]
+['比地_101_61360907.html', 245, {'物美集团店'}]---['比地_101_61360907.html', 245, {'物美集团店'}]---['比地_101_61360907.html', 245, {'物美集团店'}]
+['比地_101_61363731.html', 262, {'温州医科大学'}]---['比地_101_61363731.html', 262, {'温州医科大学'}]---['比地_101_61363731.html', 262, {'温州医科大学'}]
+['比地_101_61364116.html', 268, set()]---['比地_101_61364116.html', 268, {'同济大学'}]---['比地_101_61364116.html', 268, set()]
+['比地_101_61364302.html', 273, {'徐州医科大学'}]---['比地_101_61364302.html', 273, {'徐州医科大学'}]---['比地_101_61364302.html', 273, {'徐州医科大学'}]
+['比地_101_61369404.html', 283, {'格实业集团有限公司'}]---['比地_101_61369404.html', 283, set()]---['比地_101_61369404.html', 283, set()]
+['比地_101_61373738.html', 290, set()]---['比地_101_61373738.html', 290, {'安徽省中西医结合医院'}]---['比地_101_61373738.html', 290, set()]
+['比地_101_61375900.html', 292, {'西园饭店', '会议中心集中采购办'}]---['比地_101_61375900.html', 292, {'会议中心集中采购办'}]---['比地_101_61375900.html', 292, {'会议中心集中采购办'}]
+['比地_101_61376472.html', 297, {'河南求实工程造价咨询有限公司', '新乡职业技术学院', '新乡市华成土石方工程有限公司'}]---['比地_101_61376472.html', 297, {'河南求实工程造价咨询有限公司', '新乡市华成土石方工程有限公司', '新乡职业技术学院'}]---['比地_101_61376472.html', 297, {'新乡市华成土石方工程有限公司', '新乡职业技术学院', '河南求实工程造价咨询有限公司'}]
+['比地_101_61377475.html', 303, {'江西财经大学教务处'}]---['比地_101_61377475.html', 303, set()]---['比地_101_61377475.html', 303, {'江西财经大学教务处'}]
+['比地_101_61377656.html', 306, {'中国石油天然气股份有限公司广东广州销售分公司'}]---['比地_101_61377656.html', 306, {'中国石油天然气股份有限公司广东广州销售分公司'}]---['比地_101_61377656.html', 306, {'中国石油天然气股份有限公司广东广州销售分公司'}]
+['比地_101_61378233.html', 315, {'国营云南机器三厂烟机配件分厂'}]---['比地_101_61378233.html', 315, set()]---['比地_101_61378233.html', 315, {'国营云南机器三厂烟机配件分厂'}]
+['比地_101_61378514.html', 316, set()]---['比地_101_61378514.html', 316, {'湘潭大学'}]---['比地_101_61378514.html', 316, set()]
+['比地_101_61378568.html', 317, {'咸宁职业教育(集团)学校'}]---['比地_101_61378568.html', 317, {'咸宁职业教育(集团)学校'}]---['比地_101_61378568.html', 317, {'咸宁职业教育(集团)学校'}]
+['比地_101_61379248.html', 320, {'四川护理职业学院附属医院'}]---['比地_101_61379248.html', 320, set()]---['比地_101_61379248.html', 320, set()]
+['比地_101_61380435.html', 329, set()]---['比地_101_61380435.html', 329, {'中国有色金属长沙勘察设计研究院有限公司'}]---['比地_101_61380435.html', 329, {'中国有色金属长沙勘察设计研究院有限公司'}]
+['比地_101_61380932.html', 338, {'江苏商贸职业学院'}]---['比地_101_61380932.html', 338, {'江苏商贸职业学院'}]---['比地_101_61380932.html', 338, set()]
+['比地_101_61381606.html', 350, set()]---['比地_101_61381606.html', 350, set()]---['比地_101_61381606.html', 350, {'融泰博通商贸(天津)有限公司'}]
+['比地_101_61383553.html', 363, {'渤海大学'}]---['比地_101_61383553.html', 363, {'渤海大学'}]---['比地_101_61383553.html', 363, {'渤海大学'}]
+['比地_101_61383968.html', 368, {'天诚工程咨询有限公司'}]---['比地_101_61383968.html', 368, {'天诚工程咨询有限公司'}]---['比地_101_61383968.html', 368, {'天诚工程咨询有限公司'}]
+['比地_101_61391321.html', 378, {'清华大学'}]---['比地_101_61391321.html', 378, {'清华大学'}]---['比地_101_61391321.html', 378, {'清华大学'}]
+['比地_101_61393719.html', 381, set()]---['比地_101_61393719.html', 381, set()]---['比地_101_61393719.html', 381, set()]
+['比地_102_61347611.html', 407, {'北京大学'}]---['比地_102_61347611.html', 407, {'北京大学'}]---['比地_102_61347611.html', 407, set()]
+['比地_105_61337616.html', 411, set()]---['比地_105_61337616.html', 411, set()]---['比地_105_61337616.html', 411, {'解放军总医院海南医院'}]
+['比地_51_61322698.html', 437, set()]---['比地_51_61322698.html', 437, set()]---['比地_51_61322698.html', 437, {'广州塔旅游文化发展股份有限公司经营管理分公司'}]
+['比地_51_61331086.html', 442, set()]---['比地_51_61331086.html', 442, set()]---['比地_51_61331086.html', 442, {'东风汽车集团股份有限公司技术中心'}]
+['比地_52_61298731.html', 486, set()]---['比地_52_61298731.html', 486, {'合海橡塑装备有限公司'}]---['比地_52_61298731.html', 486, {'合海橡塑装备有限公司'}]
+['比地_52_61299179.html', 493, set()]---['比地_52_61299179.html', 493, {'金正大诺泰尔化学有限公司'}]---['比地_52_61299179.html', 493, {'金正大诺泰尔化学有限公司'}]
+['比地_52_61299536.html', 495, set()]---['比地_52_61299536.html', 495, {'张家港市给排水公司'}]---['比地_52_61299536.html', 495, set()]
+['比地_52_61302458.html', 519, set()]---['比地_52_61302458.html', 519, {'无锡职业技术学院'}]---['比地_52_61302458.html', 519, set()]
+['比地_52_61304211.html', 544, {'好孩子(中国)商贸有限公司'}]---['比地_52_61304211.html', 544, {'好孩子(中国)商贸有限公司'}]---['比地_52_61304211.html', 544, {'好孩子(中国)商贸有限公司'}]
+['比地_52_61308400.html', 567, {'澳柯玛股份有限公司'}]---['比地_52_61308400.html', 567, set()]---['比地_52_61308400.html', 567, set()]
+['比地_52_61308978.html', 570, {'中粮昌吉'}]---['比地_52_61308978.html', 570, {'中粮昌吉'}]---['比地_52_61308978.html', 570, {'中粮昌吉'}]
+['比地_52_61310152.html', 577, set()]---['比地_52_61310152.html', 577, {'柳州职业技术学院'}]---['比地_52_61310152.html', 577, set()]
+['比地_52_61314031.html', 583, {'海南大学'}]---['比地_52_61314031.html', 583, {'海南大学'}]---['比地_52_61314031.html', 583, {'海南大学'}]
+['比地_52_61314343.html', 586, {'首都医科大学'}]---['比地_52_61314343.html', 586, {'首都医科大学'}]---['比地_52_61314343.html', 586, {'首都医科大学'}]
+['比地_52_61318909.html', 604, {'山西北方建设工程招标代理中心(有限公司)', '太原公共交通控股'}]---['比地_52_61318909.html', 604, {'山西北方建设工程招标代理中心(有限公司)', '太原公共交通控股'}]---['比地_52_61318909.html', 604, {'太原公共交通控股', '山西北方建设工程招标代理中心(有限公司)'}]
+['比地_52_61324152.html', 624, {'中纺湛江'}]---['比地_52_61324152.html', 624, set()]---['比地_52_61324152.html', 624, set()]
+['比地_52_61326631.html', 637, {'对外经济贸易大学'}]---['比地_52_61326631.html', 637, {'对外经济贸易大学'}]---['比地_52_61326631.html', 637, {'对外经济贸易大学'}]
+['比地_52_61331133.html', 648, {'文一投资控股集团'}]---['比地_52_61331133.html', 648, set()]---['比地_52_61331133.html', 648, set()]
+['比地_52_61332341.html', 660, {'东风汽车集团股份有限公司'}]---['比地_52_61332341.html', 660, set()]---['比地_52_61332341.html', 660, set()]
+['比地_52_61334852.html', 676, set()]---['比地_52_61334852.html', 676, {'中铁航空港集团第一工程有限公司'}]---['比地_52_61334852.html', 676, set()]
+['比地_52_61340390.html', 711, set()]---['比地_52_61340390.html', 711, {'宁夏第三人民医院'}]---['比地_52_61340390.html', 711, set()]
+['比地_52_61341149.html', 725, set()]---['比地_52_61341149.html', 725, set()]---['比地_52_61341149.html', 725, {'红塔烟草(集团)有限责任公司'}]
+['比地_52_61341315.html', 730, {'深圳市龙华区滢水山庄二区小区第四届业主委员会'}]---['比地_52_61341315.html', 730, {'深圳市龙华区滢水山庄二区小区第四届业主委员会'}]---['比地_52_61341315.html', 730, set()]
+['比地_52_61342011.html', 750, {'汕头职业技术学院'}]---['比地_52_61342011.html', 750, {'汕头职业技术学院'}]---['比地_52_61342011.html', 750, {'汕头职业技术学院'}]
+['比地_52_61343430.html', 776, {'江苏省连云港中医药高等职业技术学药事管理'}]---['比地_52_61343430.html', 776, set()]---['比地_52_61343430.html', 776, {'江苏省连云港中医药高等职业技术学药事管理'}]
+['比地_52_61343829.html', 785, set()]---['比地_52_61343829.html', 785, {'淮海工学院'}]---['比地_52_61343829.html', 785, set()]
+['比地_52_61344108.html', 789, {'中化公司'}]---['比地_52_61344108.html', 789, set()]---['比地_52_61344108.html', 789, set()]
+['比地_52_61344330.html', 794, set()]---['比地_52_61344330.html', 794, {'长城钻探四川页岩气项目部'}]---['比地_52_61344330.html', 794, {'长城钻探四川页岩气项目部'}]
+['比地_52_61345292.html', 829, {'江西中烟工业有限责任公司赣州卷烟厂'}]---['比地_52_61345292.html', 829, set()]---['比地_52_61345292.html', 829, set()]
+['比地_52_61345450.html', 832, {'辽东学院后勤服务总公司'}]---['比地_52_61345450.html', 832, set()]---['比地_52_61345450.html', 832, {'辽东学院后勤服务总公司'}]
+['比地_52_61345451.html', 833, {'中国农业大学118万'}]---['比地_52_61345451.html', 833, {'中国农业大学118万'}]---['比地_52_61345451.html', 833, {'中国农业大学118万'}]
+['比地_52_61346085.html', 845, set()]---['比地_52_61346085.html', 845, {'江西理工大学'}]---['比地_52_61346085.html', 845, set()]
+['比地_52_61346340.html', 854, {'安徽医科大学'}]---['比地_52_61346340.html', 854, {'安徽医科大学'}]---['比地_52_61346340.html', 854, set()]
+['比地_52_61346866.html', 863, set()]---['比地_52_61346866.html', 863, {'济宁医学院'}]---['比地_52_61346866.html', 863, set()]
+['比地_52_61347324.html', 883, {'成都市第二人民医院工会'}]---['比地_52_61347324.html', 883, set()]---['比地_52_61347324.html', 883, {'成都市第二人民医院工会'}]
+['比地_52_61355899.html', 899, {'中泰证券'}]---['比地_52_61355899.html', 899, set()]---['比地_52_61355899.html', 899, {'中泰证券'}]
+['比地_52_61356671.html', 908, {'南京江南公交客运有限公司检测中心'}]---['比地_52_61356671.html', 908, set()]---['比地_52_61356671.html', 908, set()]
+['比地_52_61356972.html', 916, set()]---['比地_52_61356972.html', 916, {'江南大学'}]---['比地_52_61356972.html', 916, set()]
+['比地_52_61358314.html', 938, {'广发期货'}]---['比地_52_61358314.html', 938, set()]---['比地_52_61358314.html', 938, {'广发期货'}]
+['比地_52_61358560.html', 946, {'雷沃重工'}]---['比地_52_61358560.html', 946, set()]---['比地_52_61358560.html', 946, {'雷沃重工'}]
+['比地_52_61358678.html', 947, {'东市汇龙中学'}]---['比地_52_61358678.html', 947, {'东市汇龙中学'}]---['比地_52_61358678.html', 947, {'东市汇龙中学'}]
+['比地_52_61358782.html', 951, {'南园小学'}]---['比地_52_61358782.html', 951, {'南园小学'}]---['比地_52_61358782.html', 951, set()]
+['比地_52_61361232.html', 960, {'甘肃警察职业学院皋兰校区'}]---['比地_52_61361232.html', 960, set()]---['比地_52_61361232.html', 960, {'甘肃警察职业学院皋兰校区'}]
+['比地_52_61361833.html', 968, {'一重新能源发展集团有限公司'}]---['比地_52_61361833.html', 968, {'一重新能源发展集团有限公司'}]---['比地_52_61361833.html', 968, {'一重新能源发展集团有限公司'}]
+['比地_52_61362784.html', 973, set()]---['比地_52_61362784.html', 973, set()]---['比地_52_61362784.html', 973, {'中国对外贸易中心'}]
+['比地_52_61365676.html', 991, set()]---['比地_52_61365676.html', 991, set()]---['比地_52_61365676.html', 991, {'河池学院数统学院'}]
+['比地_52_61365701.html', 992, {'华南师范大学石牌校区'}]---['比地_52_61365701.html', 992, set()]---['比地_52_61365701.html', 992, {'华南师范大学石牌校区'}]
+['比地_52_61366313.html', 1004, {'上海天马'}]---['比地_52_61366313.html', 1004, {'上海天马'}]---['比地_52_61366313.html', 1004, set()]
+['比地_52_61367613.html', 1022, {'洛阳分院'}]---['比地_52_61367613.html', 1022, set()]---['比地_52_61367613.html', 1022, set()]
+['比地_52_61369010.html', 1040, {'好孩子(中国)商贸有限公司'}]---['比地_52_61369010.html', 1040, {'好孩子(中国)商贸有限公司'}]---['比地_52_61369010.html', 1040, {'好孩子(中国)商贸有限公司'}]
+['比地_52_61370450.html', 1071, {'巢湖运众汽车销售服务有限公司'}]---['比地_52_61370450.html', 1071, set()]---['比地_52_61370450.html', 1071, set()]
+['比地_52_61371104.html', 1081, {'公路开发公司'}]---['比地_52_61371104.html', 1081, {'公路开发公司'}]---['比地_52_61371104.html', 1081, {'公路开发公司'}]
+['比地_52_61371109.html', 1083, {'西华师范大学'}]---['比地_52_61371109.html', 1083, set()]---['比地_52_61371109.html', 1083, set()]
+['比地_52_61372733.html', 1101, set()]---['比地_52_61372733.html', 1101, {'广州医科大学附属第三医院荔湾医院'}]---['比地_52_61372733.html', 1101, set()]
+['比地_52_61373301.html', 1108, {'泰州机电高等职业技术学校'}]---['比地_52_61373301.html', 1108, set()]---['比地_52_61373301.html', 1108, set()]
+['比地_52_61374204.html', 1120, set()]---['比地_52_61374204.html', 1120, set()]---['比地_52_61374204.html', 1120, {'融泰博通商贸(天津)有限公司'}]
+['比地_52_61374282.html', 1121, {'大秦铁路股份有限公司'}]---['比地_52_61374282.html', 1121, set()]---['比地_52_61374282.html', 1121, set()]
+['比地_52_61375054.html', 1135, {'吐哈油田公司招投标部'}]---['比地_52_61375054.html', 1135, set()]---['比地_52_61375054.html', 1135, {'吐哈油田公司招投标部'}]
+['比地_52_61383175.html', 1140, {'澳柯玛股份有限公司智能制造中心'}]---['比地_52_61383175.html', 1140, set()]---['比地_52_61383175.html', 1140, {'澳柯玛股份有限公司智能制造中心'}]
+['比地_52_61383195.html', 1143, {'山东大学第二医院'}]---['比地_52_61383195.html', 1143, set()]---['比地_52_61383195.html', 1143, set()]
+['比地_52_61387342.html', 1162, {'明溪县总医院'}]---['比地_52_61387342.html', 1162, {'明溪县总医院'}]---['比地_52_61387342.html', 1162, {'明溪县总医院'}]
+['比地_52_61393329.html', 1173, {'中国铁塔股份有限公司'}]---['比地_52_61393329.html', 1173, {'中国铁塔股份有限公司'}]---['比地_52_61393329.html', 1173, {'中国铁塔股份有限公司'}]
+['比地_52_61402956.html', 1192, {'四川省宜宾市第一中学校'}]---['比地_52_61402956.html', 1192, set()]---['比地_52_61402956.html', 1192, {'四川省宜宾市第一中学校'}]

+ 0 - 0
BiddingKG/dl_dev/test/compare1.txt


BIN
BiddingKG/dl_dev/test/data/t10k-images-idx3-ubyte.gz


BIN
BiddingKG/dl_dev/test/data/t10k-labels-idx1-ubyte.gz


BIN
BiddingKG/dl_dev/test/data/train-images-idx3-ubyte.gz


BIN
BiddingKG/dl_dev/test/data/train-labels-idx1-ubyte.gz


BIN
BiddingKG/dl_dev/test/list_sentence_entity.pk


BIN
BiddingKG/dl_dev/test/model_person_classify.model.hdf5


BIN
BiddingKG/dl_dev/test/model_person_classify_fjs.model.hdf5


BIN
BiddingKG/dl_dev/test/person_savedmodel_new/saved_model.pb


BIN
BiddingKG/dl_dev/test/person_savedmodel_new/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl_dev/test/person_savedmodel_new/variables/variables.index


+ 25 - 0
BiddingKG/dl_dev/test/t2/1.py

@@ -0,0 +1,25 @@
+
+import sys,os
+
+print(os.path.dirname(os.path.dirname(__file__)))
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+
+def getFileFromSysPath(filename):
+    print(sys.path)
+    for _path in sys.path:
+        if os.path.isdir(_path):
+            print(_path)
+            print(os.listdir(_path))
+            for _file in os.listdir(_path):
+                _abspath = os.path.join(_path,_file)
+                if os.path.isfile(_abspath):
+                    print(_file)
+                    if _file==filename:
+                        return _abspath
+    return None
+# print(os.path.exists("1598545223446.jpg"))
+print(getFileFromSysPath("LEGAL_ENTERPRISE.txt"))
+# for _file in  os.listdir("F:/Workspace2016/BiddingKG/BiddingKG/dl/test/t2/../"):
+#     if os.path.isfile(_file):
+#         print(_file)
+print(os.path.split("/BiddingKG/dl_dev/test/t2"))

+ 0 - 0
BiddingKG/dl_dev/test/t2/__init__.py


+ 116 - 0
BiddingKG/dl_dev/test/test4.py

@@ -0,0 +1,116 @@
+#coding:UTF8
+'''
+Created on 2019年1月4日
+
+@author: User
+'''
+
+from bs4 import BeautifulSoup, Comment
+import copy
+import re
+import sys
+import os
+import codecs
+import requests
+import time
+
+import logging
+import json
+global json,logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+import json
+import random
+
+session = requests.Session()
+
+def test(name,content,_url=None):
+    # _times = 2
+    # _content = ""
+    # for _ in range(_times):
+    #     _content += content
+    # content = _content
+    print(len(content))
+    user = {
+            "content": content,
+            "doc_id":name,
+            "timeout":2000,
+            "original_docchannel":101
+            }
+    # print(user)
+    myheaders = {'Content-Type': 'application/json',"Authorization":"NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg=="}
+
+    list_url = ["http://127.0.0.1:15030/content_extract",
+                "http://127.0.0.1:15031/content_extract",
+                "http://127.0.0.1:15032/content_extract",
+                "http://127.0.0.1:15033/content_extract",
+                "http://127.0.0.1:15034/content_extract",
+                "http://127.0.0.1:15035/content_extract",
+                "http://127.0.0.1:15036/content_extract",
+                "http://127.0.0.1:15037/content_extract",
+                ]
+    # _i = random.randint(0,len(list_url)-1)
+    # _resp = requests.post(list_url[_i], json=user, headers=myheaders, verify=True)
+
+    # _url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
+    _url = "http://192.168.2.102:15030/test"
+    _url = "http://192.168.2.102:15030/industry_extract"
+    _url = "http://192.168.2.102:15030/content_extract"
+
+    _resp = session.post(_url, json=user,verify=True,timeout=1000)
+    # _resp = requests.post("http://192.168.2.102:15000" + '/article_extract', json=user, headers=myheaders, verify=True)
+    resp_json = _resp.content.decode("utf-8")
+    logging.info("%d===%s"%(_resp.status_code,resp_json[:100]))
+
+    return resp_json
+
+def presure_test():
+
+    from BiddingKG.dl.common.multiThread import MultiThreadHandler
+    from queue import Queue
+    text = codecs.open("2.html","r",encoding="utf8").read()
+    content = str(BeautifulSoup(text).find("div",id="pcontent"))
+
+
+    start_time = time.time()
+    task_queue = Queue()
+    for i in range(300):
+        task_queue.put(content)
+    def _handle(item,result_queue):
+        test("",item)
+    mt = MultiThreadHandler(task_queue,_handle,None,3)
+    mt.run()
+    end_time = time.time()
+    print("all takes :%ds"%(end_time-start_time))
+
+def runlocal(content):
+    import sys
+    import os
+    sys.path.append(os.path.abspath("../.."))
+    import fool
+    from BiddingKG.dl.interface.extract import predict
+
+    predict("12", content,"打印机",original_docchannel=101)
+
+def run_one():
+    from BiddingKG.dl.interface.extract import predict
+    # filename = "比地_52_79929693.html"
+    #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
+    # text = codecs.open("2.html","r",encoding="utf8").read()
+    content = str(BeautifulSoup(text).find("div",id="pcontent"))
+    a = time.time()
+    # text = '''
+    # 购安装工程二标段,第一中标候选人,投标人名称,南阳市宝琛装饰工程有限责任公司,投标报价:147892
+    # '''
+    print("start")
+    _time1 = time.time()
+    print(predict("12", content,"打印机",original_docchannel=52))
+    # test(12,content)
+    # test(12,text)
+    print("takes",time.time()-a)
+    pass
+
+if __name__=="__main__":
+    # presure_test()
+    run_one()

+ 1621 - 0
BiddingKG/dl_dev/test/test_data_fjs.py

@@ -0,0 +1,1621 @@
+#coding:UTF-8
+# !/usr/bin/python
+# -*- coding: <utf-8> -*-
+
+import ast
+import copy
+import re
+import sys
+import os
+import time
+import codecs
+from datetime import datetime
+
+import psycopg2
+import pandas as pd
+
+sys.setrecursionlimit(1000000)
+sys.path.append(os.path.abspath("../.."))
+sys.path.append(os.path.abspath("../../dl"))
+
+# data preprocessing: convert the annotations into BIO-tagged training data
+
+
+def Postgre2Data():
+    # connect to the postgresql database
+    connect = psycopg2.connect(database="iepy", user="iepy_read", password="iepy_read", host="192.168.2.101",
+                               port="5432")
+    cursor = connect.cursor()
+    cursor1 = connect.cursor()
+
+    # query: first filter out the users and time ranges that have passed review
+    # cursor1.execute("SELECT a.user, begin_time, end_time"
+    #                 " FROM corpus_payroll a"
+    #                 " ORDER BY a.user")
+    #
+    # rows1 = cursor1.fetchall()
+    # # 循环,根据筛选条件循环查另一条SQL,并保存结果
+    # result = []
+    # for row in rows1:
+    #     # 执行语句:取语料库中的文章id,分词结果,句子分割index
+    #     cursor.execute("select human_identifier, tokens, sentences, text, edittime, edituser"
+    #                    " from corpus_iedocument"
+    #                    " where date(edittime) <= '" + row[2] + "'" +
+    #                    " and date(edittime) >= '" + row[1] + "'" +
+    #                    " and edituser = '" + row[0] + "'")
+    #                    # + " limit 5")
+    #     # cursor.execute("select human_identifier, tokens, sentences, text, edittime, edituser"
+    #     #                " from corpus_iedocument"
+    #     #                " where date(edittime) >= '" + "2020-08-01" + "'" +
+    #     #                " and date(edittime) <= '" + "2020-08-31" + "'" +
+    #     #                " and edituser = '" + "test1" + "'" +
+    #     #                " limit 10")
+    #
+    #     # 获取SELECT 返回的元组
+    #     rows = cursor.fetchall()
+    #     for row in rows:
+    #         result.append(row)
+
+    result = []
+    cursor.execute("SELECT human_identifier, tokens, sentences, text, edittime, edituser"
+                   " FROM corpus_iedocument"
+                   " where edituser is not NULL")
+                   # + " limit 30")
+    rows = cursor.fetchall()
+    for row in rows:
+        result.append(row)
+    print(len(result))
+
+    human_identifier = []
+    tokens = []
+    sentences = []
+    text = []
+    corpus_iedocument = []
+    for row in result:
+        human_identifier.append(row[0])
+        s = row[1]
+        s = s.replace("[", "").replace("]", "").replace("\'", "")
+        ss = s.split(", ")
+        sss = []
+        for s1 in ss:
+            sss.append(s1)
+        tokens.append(sss)
+        sentences.append(row[2])
+        text.append(row[3])
+    corpus_iedocument.append(human_identifier)
+    corpus_iedocument.append(tokens)
+    corpus_iedocument.append(sentences)
+    corpus_iedocument.append(text)
+    # print(corpus_iedocument[0])
+
+    # 循环每个documentid,取出对应标注结果。
+    # 返回二维列表,第一维是document,第二维是document_id和value
+    brat_labeledbratannotation = []
+    for i in range(len(corpus_iedocument[0])):
+
+        document = []
+        document_id = []
+        value = []
+        # 执行语句,取brat人工标注库中的文章id,标注的结果
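+        # Note: the query below builds SQL by string concatenation; psycopg2 also supports
+        # parameterized queries, e.g.
+        # cursor.execute("select document_id, value from brat_bratannotation"
+        #                " where document_id = %s", (corpus_iedocument[0][i],))
+        # which avoids manual quoting of the id.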
+        cursor.execute('select document_id, value from brat_bratannotation '
+                       + 'where document_id = \'' + corpus_iedocument[0][i] + '\'')
+        rows = cursor.fetchall()
+        for row in rows:
+            if (row[1][0] != 'T'):
+                continue
+            # print(row[1][0])
+            document_id.append(row[0])
+            value.append(row[1])
+        document.append(document_id)
+        document.append(value)
+
+        brat_labeledbratannotation.append(document)
+
+    # 关闭游标
+    cursor.close()
+    # cursor1.close()
+    # 关闭数据库连接
+    connect.close()
+    return corpus_iedocument, brat_labeledbratannotation
+
+
+def Text2Csv():
+    corpus_iedocument, brat_labeledbratannotation = Postgre2Data()
+
+    # text_df = pd.DataFrame(columns=("document_id", "text", "value"))
+    text_list = []
+    document_id_list = []
+    manual_BIO_list = []
+    category_list = []
+    word_list = []
+    # 循环:每篇Document
+    for index in range(len(corpus_iedocument[3])):
+        text = corpus_iedocument[3][index]
+        document_id = brat_labeledbratannotation[index][0]
+        manual_BIO = brat_labeledbratannotation[index][1]
+
+        # 循环:处理人工标注的数据,结构化,取联系人类型和单词index,并对数组按单词index排序
+        for j in range(len(manual_BIO)):
+            categoryAndIndex = manual_BIO[j].replace("\t", " ").split(" ")[1:]
+            category = categoryAndIndex[0]
+            word = categoryAndIndex[-1]
+
+            document_id_list.append(document_id[j])
+            text_list.append(text)
+            category_list.append(category)
+            word_list.append(word)
+            manual_BIO_list.append(categoryAndIndex)
+
+    text_dict = {'document_id': document_id_list, 'text': text_list, 'word': word_list, 'category': category_list, 'categoryAndIndex': manual_BIO_list}
+    text_df = pd.DataFrame(text_dict)
+    # text_df.columns = ['document_id', 'text', 'word', 'category', 'categoryAndIndex']
+    text_df.to_csv("C:\\Users\\admin\\Desktop\\text.csv")
+    return
+
+
+def Csv2BidwayText():
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\text.csv")
+    df = df[df["category"] == "bidway"]
+    df.columns = ["index", "category", "categoryAndIndex", "document_id", "text",  "word"]
+    df = df.reset_index()
+    df = df[["document_id", "text", "categoryAndIndex", "word", "category"]]
+    df.to_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
+
+
+def Csv2ServiceTimeText():
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\text.csv")
+    df = df[df["category"] == "serviceTime"]
+    df.columns = ["index", "category", "categoryAndIndex", "document_id", "text",  "word"]
+    df = df.reset_index()
+    df = df[["document_id", "text", "categoryAndIndex", "word", "category"]]
+    df.to_csv("C:\\Users\\admin\\Desktop\\serviceTime_text.csv")
+
+
+def data2BIOData():
+    corpus_iedocument, brat_labeledbratannotation = Postgre2Data()
+
+    # 单词list
+    words_list = [0]
+    words_list_all = []
+    # 单词的BIO标注列表
+    word_BIO_list_all = []
+    # 句子列表
+    sentences_list_all = []
+    manual_BIO_list = []
+    # 单词在句子中的index列表
+    wordInSentencesIndex_list_all = []
+    # 单词对应的句子的编号:0~句子条数
+    wordInSentencesNumber_list_all = []
+    # 单词对应的句子分词token列表
+    wordInSentencesTokens_list_all = []
+
+    # 循环:documment篇数
+    for i in range(len(corpus_iedocument[0])):
+        categoryAndIndex_list = []
+        words_list[0] = corpus_iedocument[1][i]
+        words_list_all.append(corpus_iedocument[1][i])
+        manual_BIO_list = brat_labeledbratannotation[i][1]
+
+        # 循环:处理人工标注的数据,结构化,取联系人类型和单词index,并对数组按单词index排序
+        for data in manual_BIO_list:
+            categoryAndIndex = data.replace("\t", " ").split(" ")[1:4]
+            categoryAndIndex_list.append(categoryAndIndex)
+            categoryAndIndex_list = sorted(categoryAndIndex_list, key=lambda c: int(c[1]), reverse=False)
+
+        # 循环:将该篇Document的句子分出来
+        index_begin = 0
+        formatted_sentence_index = corpus_iedocument[2][i][1:-1].split(",")
+        sentences_list = []
+        for index in range(1, len(formatted_sentence_index)):
+            s = corpus_iedocument[3][i][index_begin: int(formatted_sentence_index[index])]
+            index_begin = int(formatted_sentence_index[index])
+            sentences_list.append(s)
+        sentences_list_all.append(sentences_list)
+
+        # 处理数据,成为BIO标注类型,即每个单词都有一个对应的标注
+        # 对每个人工标注循环找,并对index跨单词进行标注
+        # 单个单词多个标注就用列表全部存储
+        # 并对单词输出其所在句子的index,和所在句子的编号
+        word_BIO_list = [[0] for _ in range(len(words_list[0]))]
+        # 循环:一篇document中所有Label和下标
+        for index in range(len(categoryAndIndex_list)):
+            word_index = 0
+            # 标识上一个标注是否为B,记录上个标识,并记录最后位置
+            tag_flag = ""
+            tag_index = 0
+            # 单词index标识
+            word_flag = 0
+            # 循环:对一个Label和Index循环所有单词
+            for word in words_list[0]:
+
+                if word_index == int(categoryAndIndex_list[index][1]):
+                    # 如果原来有标注的类,就添加;没有则赋值
+                    if word_BIO_list[word_flag][0] != 0 \
+                            and ("B-" + categoryAndIndex_list[index][0]) not in word_BIO_list[word_flag]:
+                        word_BIO_list[word_flag].append("B-" + categoryAndIndex_list[index][0])
+                    else:
+                        word_BIO_list[word_flag] = ["B-" + categoryAndIndex_list[index][0]]
+
+                    tag_flag = categoryAndIndex_list[index][0]
+                    tag_index = int(categoryAndIndex_list[index][2])
+                    # print(word, " ", "B-"+categoryAndIndex_list[index][0])
+
+                elif word_index < tag_index - 1 and tag_flag != "":
+                    if word_BIO_list[word_flag][0] != 0 \
+                            and ("I-" + tag_flag) not in word_BIO_list[word_flag]:
+                        word_BIO_list[word_flag].append("I-" + tag_flag)
+                    else:
+                        word_BIO_list[word_flag] = ["I-" + tag_flag]
+
+                word_flag += 1
+                word_index += len(word)
+                # 有些空白word
+                if word is None or word == "":
+                    word_index += 1
+
+        # 循环:将其余Label置为O
+        for index in range(len(word_BIO_list)):
+            if word_BIO_list[index][0] == 0:
+                word_BIO_list[index] = ["O"]
+        word_BIO_list_all.append(word_BIO_list)
+
+        # 输出每个单词在句子中的index和在第几条句子;之前的单词index是全文的index。
+        # 并输出每个单词对应的句子的分词Tokens
+        wordInSentencesIndex_list = []
+        wordInSentencesNumber_list = []
+        wordInSentencesTokens_list = []
+        sentence_number = 0
+        sentences_index_list = corpus_iedocument[2][i][1:-1].split(", ")
+        word_index = 0
+        # 循环:所有单词
+        for index in range(len(words_list[0])):
+            # 判断在第几个句子
+            # print("word_index", word_index, sentence_number, len(sentences_index_list))
+            if sentence_number + 1 >= len(sentences_index_list) or word_index < int(
+                    sentences_index_list[sentence_number + 1]):
+                wordInSentencesNumber_list.append(sentence_number)
+            else:
+                sentence_number += 1
+                if sentence_number >= len(sentences_index_list):
+                    break
+                wordInSentencesNumber_list.append(sentence_number)
+
+            # 输出该单词在该句子的index
+            if sentences_index_list[sentence_number] == "":
+                continue
+            wordInSentence_begin_index = word_index - int(sentences_index_list[sentence_number])
+            if words_list[0][index] is None or words_list[0][index] == "":
+                wordInSentence_end_index = wordInSentence_begin_index + 1
+            else:
+                wordInSentence_end_index = wordInSentence_begin_index + len(words_list[0][index])
+            wordInSentencesIndex_list.append(str(wordInSentence_begin_index) + "," + str(wordInSentence_end_index))
+
+            # 根据句子编号输出句子Tokens
+            if wordInSentencesNumber_list[index] < len(sentences_list):
+                wordInSentencesTokens_list.append(sentences_list[wordInSentencesNumber_list[index]])
+            else:
+                wordInSentencesTokens_list.append(sentences_list[-1])
+
+            # # 输出该单词在该句子的index
+            # if sentences_index_list[sentence_number] == "":
+            #     # print("句子序号为'': ")
+            #     # print(sentences_index_list, len(sentences_index_list), sentence_number)
+            #     continue
+            #
+            # for j in range(len(sentences_list[wordInSentencesNumber_list[index]])):
+            #
+            # wordInSentence_begin_index = word_index - int(sentences_index_list[sentence_number])
+            # if words_list[0][index] is None or words_list[0][index] == "":
+            #     wordInSentence_end_index = wordInSentence_begin_index + 1
+            # else:
+            #     wordInSentence_end_index = wordInSentence_begin_index + len(words_list[0][index])
+            # wordInSentencesIndex_list.append(str(wordInSentence_begin_index) + "," + str(wordInSentence_end_index))
+
+
+            word_index += len(words_list[0][index])
+            # 有些空白word
+            if words_list[0][index] is None or words_list[0][index] == "":
+                word_index += 1
+
+        wordInSentencesIndex_list_all.append(wordInSentencesIndex_list)
+        wordInSentencesNumber_list_all.append(wordInSentencesNumber_list)
+        wordInSentencesTokens_list_all.append(wordInSentencesTokens_list)
+        # print("wordInSentencesTokens_list", wordInSentencesTokens_list)
+
+    return words_list_all, word_BIO_list_all, wordInSentencesIndex_list_all, wordInSentencesTokens_list_all
+
+
+def BIOData2TXT():
+    words_list_all, word_BIO_list_all, \
+    wordInSentencesIndex_list_all, wordInSentencesTokens_list_all = data2BIOData()
+
+    print(words_list_all)
+    print(type(word_BIO_list_all))
+    print(len(wordInSentencesIndex_list_all))
+    print(len(wordInSentencesTokens_list_all))
+
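+    # Note: the nested lists are serialized with str() here and read back with ast.literal_eval()
+    # in TXT2BIOData(); pickle or json would be common alternatives for this round-trip.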
+    file = open('C:\\Users\\admin\\Desktop\\BIOData_list.txt', 'w', encoding='utf-8')
+    file.write(str([words_list_all, word_BIO_list_all, wordInSentencesIndex_list_all, wordInSentencesTokens_list_all]))
+    file.close()
+    return
+
+
+def TXT2BIOData():
+    start_time = time.time()
+
+    file = open('C:\\Users\\admin\\Desktop\\BIOData_list.txt', 'r', encoding='utf-8')
+    str1 = file.read()
+    list1 = ast.literal_eval(str1)
+    file.close()
+
+    # print(list1[0])
+    # print(type(list1[1]))
+    # print(len(list1[2]))
+    # print(len(list1[3]))
+
+    end_time = time.time()
+    print("耗时:", end_time-start_time)
+
+    return list1[0], list1[1], list1[2], list1[3]
+
+
+def BIOData2DataFrame():
+    words_list_all, word_BIO_list_all, _, _ = data2BIOData()
+    # print(words_list_all)
+    # print(word_BIO_list_all)
+    df = pd.DataFrame([words_list_all[0], word_BIO_list_all[0]])
+    df = df.T
+    for index in range(len(words_list_all)):
+        if index == 0:
+            continue
+        df = df.append(pd.DataFrame([words_list_all[index], word_BIO_list_all[index]]).T)
+
+    # print(df)
+    df.columns = ["Word", "BIO"]
+    df.to_csv("C:\\Users\\admin\\Desktop\\BIO.csv")
+
+
+def PersonBIOData2BIO_Sentence():
+    words_list_all, word_BIO_list_all, _, _ = data2BIOData()
+    # words_list_all, word_BIO_list_all, _, _ = TXT2BIOData()
+    # df = pd.DataFrame([words_list_all[0], word_BIO_list_all[0]])
+    # df = df.T
+    df = pd.DataFrame()
+    # 对每个Document
+
+    for index in range(len(words_list_all)):
+        list1 = word_BIO_list_all[index]
+        new_list = []
+
+        # 对每个BIO对
+        for i in range(len(list1)):
+            str1 = ""
+            for j in range(len(list1[i])):
+                if list1[i][j][2:8] == "person":
+                    if str1 == "":
+                        str1 = list1[i][j]
+                    elif str1 != "O":
+                        str1 = str1 + "," + list1[i][j]
+                else:
+                    str1 = "O"
+
+            new_list.append(str1)
+        df = df.append(pd.DataFrame([words_list_all[index], new_list]).T)
+
+    df.columns = ["Word", "BIO"]
+    # 将I-person转为B-person,因为一个模型只判断一类
+    # df["BIO"] = df["BIO"].apply(lambda x: "B" + x[1:] if x[0] == "I" else x)
+    # print(df[df["BIO"]])
+    # print(df)
+    # df.to_csv("C:\\Users\\admin\\Desktop\\Person_BIO.csv")
+
+    # 合并B-person和I-person为B-person
+    tag_flag = ""
+    delete_index_list = []
+    df = df.reset_index()
+    df = df[["Word", "BIO"]]
+    for index, row in df.iterrows():
+        if row["BIO"][0] == "B":
+            tag_flag = row["BIO"]
+        elif row["BIO"][0] == "I" and tag_flag != "":
+            df["Word"].iloc[index-1] = df["Word"].iloc[index-1] + df["Word"].iloc[index]
+            # df1["end_index"].iloc[index-1] = int(df1["end_index"].iloc[index-1]) + len(df["Word"].iloc[index])
+            delete_index_list.append(index)
+        else:
+            tag_flag = ""
+    df = df.drop(delete_index_list)
+    # df1 = df1.drop(delete_index_list)
+
+
+    # 取有person类标注的词的前60个词、后60个词,拼接成一个句子
+    sentences = []
+    for index in range(len(df["BIO"])):
+        sentence = ""
+        if df["BIO"].iloc[index] != "O":
+            sentence1 = ""
+            sentence2 = ""
+            if index > 60 and len(df["BIO"]) - index > 60:
+                for i in range(60, 0, -1):
+                    sentence1 = sentence1 + df["Word"].iloc[index - i] + " "
+                for i in range(1, 61):
+                    sentence2 = sentence2 + df["Word"].iloc[index + i] + " "
+                sentence = sentence + sentence1 + "||" + df["Word"].iloc[index] + "||" + sentence2
+            else:
+                sentence = None
+        else:
+            sentence = None
+
+        sentences.append(sentence)
+
+    df["Sentence"] = sentences
+
+    # 舍弃BIO为O的行
+    df = df.reset_index()
+    df = df[["Word", "Sentence", "BIO"]]
+    delete_index_list = []
+    for index, row in df.iterrows():
+        if row["BIO"] == "O":
+            delete_index_list.append(index)
+    df = df.drop(delete_index_list)
+    df = df.reset_index()
+    df = df[["Word", "Sentence", "BIO"]]
+
+    # 判断类标签,0为人名,1为联系人,2为招标联系人,3为代理联系人,4为评审专家,5为其他非联系人
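+    # e.g. "B-person_person" -> 1, "B-person_tendereePerson" -> 2, "B-person_agencyPerson" -> 3,
+    # a plain "B-person" -> 0, "O" -> 5, any other person sub-label (e.g. review experts) -> 4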
+    df["Label"] = df["BIO"].apply(lambda x: 5 if x == "O" else (1 if x[9:] == "person" else (
+        2 if x[9:] == "tendereePerson" else (3 if x[9:] == "agencyPerson" else (0 if x[2:] == "person" else 4)))))
+
+    df = df[["Word", "Label", "Sentence", "BIO"]]
+    df.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_all_60.csv")
+
+    # df["Sentence"] = df["BIO"].apply(lambda x: x if x[9:] == "person" else x)
+
+
+def BIOData2PersonData():
+    words_list_all, word_BIO_list_all, \
+    wordInSentencesIndex_list_all, wordInSentencesTokens_list_all = data2BIOData()
+
+    df = pd.DataFrame()
+    # 循环:对每个Document
+    for index in range(len(words_list_all)):
+        list1 = word_BIO_list_all[index]
+        new_list = []
+
+        # 循环:一篇Document中的每个BIO对,判断Label是person的
+        for i in range(len(list1)):
+            str1 = ""
+            for j in range(len(list1[i])):
+                if list1[i][j][2:8] == "person":
+                    # print("==", list1[i][j])
+                    if str1 == "":
+                        str1 = list1[i][j]
+                    elif str1 != "O":
+                        str1 = str1 + "," + list1[i][j]
+                else:
+                    str1 = "O"
+            new_list.append(str1)
+        df = df.append(pd.DataFrame([words_list_all[index], new_list]).T)
+    df.columns = ["Word", "BIO"]
+
+    # 循环:对每个Document
+    df1 = pd.DataFrame()
+    for index in range(len(words_list_all)):
+        # 循环:一篇Document中的单词的begin_index,end_index,tokens
+        begin_index = []
+        end_index = []
+        tokens = []
+        for i in range(len(wordInSentencesIndex_list_all[index])):
+            ss = wordInSentencesIndex_list_all[index][i].split(",")
+            begin_index.append(ss[0])
+            end_index.append(ss[1])
+            tokens.append(wordInSentencesTokens_list_all[index][i])
+        df1 = df1.append(pd.DataFrame([tokens, begin_index, end_index]).T)
+    df1.columns = ["tokens", "begin_index", "end_index"]
+    # print("df1.shape ", df1.shape)
+    # print("df.shape ", df.shape)
+
+    # 将I-person转为B-person,因为一个模型只判断一类
+    # df["BIO"] = df["BIO"].apply(lambda x: "B" + x[1:] if x[0] == "I" else x)
+    # 判断类标签,0为人名,1为联系人,2为招标联系人,3为代理联系人,4为评审专家,5为其他非联系人
+    df["Label"] = df["BIO"].apply(lambda x: 5 if x == "O" else (1 if x[9:] == "person" else (
+        2 if x[9:] == "tendereePerson" else (3 if x[9:] == "agencyPerson" else (0 if x[2:] == "person" else 4)))))
+
+
+    # 重置索引
+    df = df.reset_index()
+    df1 = df1.reset_index()
+    # 合并B-person和I-person为B-person
+    tag_flag = ""
+    delete_index_list = []
+    for index, row in df.iterrows():
+        if row["BIO"][0] == "B":
+            tag_flag = row["BIO"]
+        elif row["BIO"][0] == "I" and tag_flag != "":
+            df["Word"].iloc[index-1] = df["Word"].iloc[index-1] + df["Word"].iloc[index]
+            df1["end_index"].iloc[index-1] = int(df1["end_index"].iloc[index-1]) + len(df["Word"].iloc[index])
+            delete_index_list.append(index)
+        else:
+            tag_flag = ""
+    df = df.drop(delete_index_list)
+    df1 = df1.drop(delete_index_list)
+
+    # 重置索引
+    df = df.reset_index()
+    df1 = df1.reset_index()
+    df1 = pd.concat([df["Word"], df["Label"], df1["tokens"], df1["begin_index"], df1["end_index"]], axis=1)
+    df1.columns = ["Word", "Label", "tokens", "begin_index", "end_index"]
+
+    # 舍弃Label为5的行
+    delete_index_list = []
+    for index, row in df1.iterrows():
+        if row["Label"] == 5:
+            delete_index_list.append(index)
+    df1 = df1.drop(delete_index_list)
+    df1.reset_index()
+
+    # 拼接列begin_index,end_index,tokens
+    # begin_index = []
+    # end_index = []
+    # for index in range(len(wordInSentencesIndex_list_all)):
+    #     ss = wordInSentencesIndex_list_all[index].split(",")
+    #     begin_index.append(ss[0])
+    #     end_index.append(ss[1])
+    # df["begin_index"] = pd.DataFrame(begin_index)
+    # df
+
+    # print(df1)
+    df1.to_csv("C:\\Users\\admin\\Desktop\\Person_Data_all.csv")
+
+
+def BIOData2Bidway():
+    words_list_all, word_BIO_list_all, \
+    wordInSentencesIndex_list_all, wordInSentencesTokens_list_all = TXT2BIOData()
+
+    df = pd.DataFrame()
+    # 循环:对每个Document
+    for index in range(len(words_list_all)):
+        list1 = word_BIO_list_all[index]
+        new_list = []
+
+        # 循环:一篇Document中的每个BIO对,判断Label是bidway的
+        for i in range(len(list1)):
+            str1 = ""
+            for j in range(len(list1[i])):
+                if list1[i][j][2:8] == "bidway":
+                    # print("==", list1[i][j])
+                    if str1 == "":
+                        str1 = list1[i][j]
+                    elif str1 != "O":
+                        str1 = str1 + "," + list1[i][j]
+                else:
+                    str1 = "O"
+            new_list.append(str1)
+        df = df.append(pd.DataFrame([words_list_all[index], new_list]).T)
+    df.columns = ["Word", "BIO"]
+    df.to_csv("C:\\Users\\admin\\Desktop\\Bidway_BIO.csv")
+    return
+
+
+def BIOData2ServiceTime():
+    words_list_all, word_BIO_list_all, \
+    wordInSentencesIndex_list_all, wordInSentencesTokens_list_all = TXT2BIOData()
+
+    df = pd.DataFrame()
+    # 循环:对每个Document
+    for index in range(len(words_list_all)):
+        list1 = word_BIO_list_all[index]
+        new_list = []
+
+        # 循环:一篇Document中的每个BIO对,判断Label是bidway的
+        for i in range(len(list1)):
+            str1 = ""
+            for j in range(len(list1[i])):
+                if list1[i][j][2:] == "serviceTime":
+                    # print("==", list1[i][j])
+                    if str1 == "":
+                        str1 = list1[i][j]
+                    elif str1 != "O":
+                        str1 = str1 + "," + list1[i][j]
+                else:
+                    str1 = "O"
+            new_list.append(str1)
+        df = df.append(pd.DataFrame([words_list_all[index], new_list]).T)
+    df.columns = ["Word", "BIO"]
+    df.to_csv("C:\\Users\\admin\\Desktop\\ServiceTime_BIO.csv")
+    return
+
+
+def duplicateData(label, sample_rate):
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Data_all_OverSample.csv")
+    print(df.shape)
+    df1 = df[df["Label"] == label]
+    df1 = df1.sample(frac=sample_rate)
+    df = df.append(df1)
+    df.to_csv("C:\\Users\\admin\\Desktop\\Person_Data_all_OverSample.csv")
+    print(df.shape)
+
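+# Usage sketch: duplicateData(3, 0.5) appends a random 50% sample of the rows labelled 3 back
+# onto the dataset (simple oversampling of a minority class) and rewrites the CSV in place.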
+
+def resetAndShuffleData():
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
+    df = df.sample(frac=1).reset_index(drop=True)
+    df = df.reset_index()
+    # df = df[["Word", "Label", "tokens", "begin_index", "end_index"]]
+    df = df[["Word", "Label", "Sentence", "BIO"]]
+    df.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
+
+
+def re_bidway():
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
+
+    reg = re.compile(u'(采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式'
+                     u'|发包方式|发包类型|开展方式|招标类型)(.*)'
+                     u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
+                     u'|电子书面竞投|邀请招标|定向公开|询价采购|抽签摇号'
+                     u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
+                     u'|网上招标|其他'
+                     u'|竞谈竞价|网上直购|公开竞谈'
+                     u'|库内邀请|库内公开发包)')
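+    # Illustrative behaviour (hypothetical snippet): for the text "招标方式:公开招标" the pattern
+    # above captures the keyword, the separator and the method name; the post-processing further
+    # down then reduces the match to the standardized label "公开招标".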
+
+    # reg = re.compile(u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源|网络竞价'
+    #                  u'|竞争性谈判|公开询价|邀请招标|公开招募|公开询比价|电子书面竞投'
+    #                  u'|网上电子投标|比质比价|定向询单|国内比选|电子竞价'
+    #                  u'|公开招租|公开竞标方式|网上招标|公开招标|国内竞争性谈判'
+    #                  u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
+    #                  u'|库内邀请|询价采购|询比采购|分散采购|单一来源采购)')
+
+    reg2 = re.compile(u'(采用|以|)'
+                      u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
+                      u'|竞争性谈判|询价|电子书面竞投|电子竞价'
+                      u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
+                      u'|网上招标|分散采购'
+                      u'|竞谈竞价|网上直购|公开竞谈'
+                      u'|库内邀请)'
+                      u'(采购方式|方式)')
+
+    reg1 = re.compile(
+                      # u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源采购|网络竞价|公开招商方式'
+                      # u'|竞争性谈判|公开询价|询价采购|邀请招标|公开招募|公开询比|电子书面竞投'
+                      # u'|网上电子投标|比质比价|定向询单|询比采购|国内比选|单一来源|公开选取|库内公开发包'
+                      # u'|公开招租|公开竞标方式|网上招标|公开招标|竞争性谈判|公开招投标'
+                      # u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
+                      # u'|国际公开竞争性招标)'
+                        u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
+                        u'|竞争性谈判|询价|电子书面竞投'
+                        u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
+                        u'|网上招标|分散采购'
+                        u'|竞谈竞价|网上直购|公开竞谈'
+                        u'|库内邀请)'
+                      )
+
+    reg1_not = re.compile(u'及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录')
+
+    reg3 = re.compile(u'(采购方式:邀请|采购方式:公开|采购方式:询价|分散采购|公开招标|竞价|磋商|询比|竞标|邀请招标|公开招募|公开招租)')
+
+
+    reg_standard = re.compile(u'(公开招标|竞争性磋商|竞争性谈判|单一来源'
+                              u'|竞争性谈判|询价|邀请招标|公开招募|询比|电子书面竞投'
+                              u'|网上电子投标|比质比价|询单|比选'
+                              u'|公开招租|网上招标|分散采购'
+                              u'|网上直购|公开竞谈|采购方式:邀请|采购方式:公开|采购方式:询价)'
+                              )
+
+    text_list = df["text"].to_list()
+    output_list = []
+    for index in range(len(text_list)):
+        input_str = text_list[index]
+
+        # 把一些混淆的词先替换掉
+        input_str = re.sub(reg1_not, "", input_str)
+
+        match = reg.search(input_str)
+        output_str = None
+        # 根据正则表达式匹配
+        if match:
+            # 判断长度,截断
+            if len(match.group()) >= 15:
+                ss = re.split(",|\.|,|。|;|;", match.group())
+                # 判断所需的字符串在哪一段
+                for i in range(len(ss)):
+                    if re.search(reg1, ss[i]):
+                        output_str = ss[i]
+                        break
+            else:
+                output_str = match.group()
+
+        else:
+            match2 = re.search(reg2, input_str)
+            if match2:
+                output_str = match2.group()
+
+            else:
+                match1 = re.search(reg1, input_str)
+                if match1:
+                    output_str = match1.group()
+
+        # 再判断一次长度
+        if output_str is not None:
+            if len(output_str) >= 15:
+                match2 = re.search(reg2, input_str)
+                if match2:
+                    output_str = match2.group()
+            if len(output_str) >= 15:
+                match1 = re.search(reg1, input_str)
+                if match1:
+                    output_str = match1.group()
+
+        # 最后输出还为空,匹配一些易混淆的词
+        if output_str is None:
+            match3 = re.search(reg3, input_str)
+            if match3:
+                output_str = match3.group()
+
+
+        if output_str is not None:
+            if not re.search("分散采购|采购方式:邀请", output_str):
+                # 公开采购转为公开招标
+                output_str = re.sub("公开采购", "公开招标", output_str)
+                # 去掉第一个字符冒号
+                ss = re.split("::|:|:", output_str)
+                output_str = ss[-1]
+                # 去掉采购、方式、采用
+                output_str = re.sub("(采购|方式|采用|出售|进行|直接(|现就本次|招标为)", "", output_str)
+
+            # 使用标准标签过滤
+            match4 = re.search(reg_standard, output_str)
+            if match4:
+                output_str = match4.group()
+
+        output_list.append(output_str)
+
+    df["re"] = pd.DataFrame(output_list)
+    df.to_csv("C:\\Users\\admin\\Desktop\\bidway_text1.csv")
+
+
+def re_serviceTime():
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\serviceTime_text.csv")
+
+    # reg = re.compile(u'(周期|工期|服务期|服务时限|交货时间|履行期限|服务周期|交货期|供货期|合格工期'
+    #                  u'|投标工期|设计工期|合格服务周期|总工期|施工工期|服务时间|流转期限|维护期限'
+    #                  u'|完成时间|交付|服务期限|中标工期|项目周期|计划工期'
+    #                  u')'
+    #                  u'(.*)(日止|日内|年|年度|天|月|日|周内|年内)')
+
+    reg0 = re.compile(u'(服务时间:|服务期限:)'
+                      u'([^至到]*)'
+                      u'(至|到)'
+                      u'([^日时]*)'
+                      u'(日|时)'
+                      )
+
+    reg = re.compile(u'(周期|工期|服务期|服务时限|履行期限|服务周期|交货期|供货期|合格工期'
+                     u'|投标工期|设计工期|合格服务周期|总工期|施工工期|服务时间|流转期限|维护期限'
+                     u'|完成时间|交付|服务期限|中标工期|项目周期|计划工期'
+                     u')'
+                     # u'([^日止|日内|年|年度|月|日|周内|年内|\d+]*)'
+                     u'([^年月日\d+]*)'
+                     u'([\d+|一|二|三|四|五|六|七|八|九|十])'
+                     u'(日止|日内|年|年度|月|日|周内|年内|日历天|工作日|\d+日|\d+|起)'
+                     u'(个月|\(日历天\)|)')
+
+    reg_not = re.compile(u'(工期延误|工期节点|工期管理|合同履行日期:见|服务期限截止|交付使用'
+                         u'|服务期限:1、|工期\(交货期\):|工期、)')
+
+
+    reg1 = re.compile(u'(合同签订|签订合同|合同履行日期)'
+                      u'([^\d]*)'
+                      u'(\d+|一|二|三|四|五|六|七|八|九|十)'
+                      u'(个|)'
+                      u'(日止|日内|年|年度|月|日历天|日|周内|年内|工作日)'
+                      )
+
+    reg2 = re.compile(u'(服务期限|履行期限|工期|服务期|维护期限|服务周期|工期,\(日历天\),'
+                      u'|服务期\(日历天\)|预定工期\(日历天\)|期限要求)'
+                      u'(:|:|)+'
+                      u'(\d+|一|二|三|四|五|六|七|八|九|十|两|贰|叁)'
+                      u'(日止|日内|年|年度|月|日历天|日|周内|年内|工作日|天|)'
+                      )
+
+    text_list = df["text"].to_list()
+    output_list = []
+
+    for index in range(len(text_list)):
+        input_str = text_list[index]
+        input_str = re.sub(reg_not, "", input_str)
+        output_str = ""
+        unit = ""
+
+        match0 = re.findall(reg0, input_str)
+        if match0:
+            ss = ""
+            for i in range(len(match0)):
+                s = ""
+
+                for j in range(len(match0[i])):
+                    s = s + match0[i][j]
+                ss = ss + s
+                if i < len(match0)-1:
+                    ss = ss + " "
+            output_str = ss
+
+            # 太长的裁剪
+            if len(output_str) >= 40:
+                sss = output_str.split(",")
+                output_str = sss[0]
+
+            print("0: ", output_str)
+        else:
+            match = reg.findall(input_str)
+            if match:
+                ss = ""
+                for i in range(len(match)):
+                    s = ""
+                    if "天" in match[i]:
+                        unit = "天"
+                    if "月" in match[i]:
+                        unit = "月"
+                    for j in range(2, len(match[i])):
+                        s = s + match[i][j] + unit
+                    ss = ss + s
+                    if i < len(match)-1:
+                        ss = ss + " "
+                output_str = ss
+                print(output_str)
+
+            else:
+                match1 = re.findall(reg1, input_str)
+                if match1:
+                    ss = ""
+                    for i in range(len(match1)):
+                        s = ""
+                        if "天" in match[i]:
+                            unit = "天"
+                        if "月" in match[i]:
+                            unit = "月"
+                        for j in range(2, len(match1[i])):
+                            s = s + match1[i][j] + unit
+                        ss = ss + s
+                        if i < len(match1)-1:
+                            ss = ss + " "
+                    output_str = ss
+                    print("1: ", output_str)
+
+                else:
+                    match2 = re.findall(reg2, input_str)
+                    if match2:
+                        ss = ""
+                        for i in range(len(match2)):
+                            s = ""
+                            for j in range(2, len(match2[i])):
+                                s = s + match2[i][j]
+                            ss = ss + s
+                            if i < len(match2)-1:
+                                ss = ss + " "
+                        output_str = ss
+                        print("2: ", output_str)
+
+        output_list.append(output_str)
+    # for index in range(len(text_list)):
+    #     input_str = text_list[index]
+    #     match = reg.search(input_str)
+    #     output_str = None
+    #     # 根据正则表达式匹配
+    #     if match:
+    #         # 匹配成功,先匹配冒号,再分割冒号后的第一个标点
+    #         match2 = re.search(u':|:', match.group())
+    #         if match2:
+    #             ss = re.split(",|\.|,|。|;|;", match.group()[match2.span()[0]:])
+    #             output_str = match.group()[:match2.span()[0]] + ss[0]
+    #         else:
+    #             ss = re.split(",|\.|,|。|;|;", match.group())
+    #             output_str = ss[0]
+    #
+    #         # 再匹配一些特殊情况
+    #         # 匹配出太长的,就是需要截断
+    #         if len(output_str) >= 40:
+    #             ss = re.split(",|\.|,|。|;|;", output_str)
+    #             output_str = ss[0]
+    #         # 错误分类的:服务期限:1、资金来源:自筹资金
+    #         if re.search(u"来源|1、|资金", output_str):
+    #             output_str = None
+    #         # 有完成、交货这些字眼分割
+    #         if output_str is not None:
+    #             ss = re.split("完工|质量", output_str)
+    #             if len(ss) > 1:
+    #                 output_str = ss[0]
+    #     else:
+    #         match1 = re.search(reg1, input_str)
+    #         if match1:
+    #             # 匹配成功,先匹配冒号,再分割冒号后的第一个标点
+    #             match2 = re.search(u':|:', match1.group())
+    #             if match2:
+    #                 ss = re.split(",|\.|,|。|;|;", match1.group()[match2.span()[0]:])
+    #                 output_str = match1.group()[:match2.span()[0]] + ss[0]
+    #             else:
+    #                 ss = re.split(",|\.|,|。|;|;", match1.group())
+    #                 output_str = ss[0]
+    #             # 再匹配一些特殊情况
+    #             # 匹配出太长的,就是需要截断
+    #             if len(output_str) >= 40:
+    #                 ss = re.split(",|\.|,|。|;|;", output_str)
+    #                 output_str = ss[0]
+
+
+    df["re"] = pd.DataFrame(output_list)
+    df = df[["document_id", "text", "categoryAndIndex", "word", "category", "re"]]
+    df.to_csv("C:\\Users\\admin\\Desktop\\serviceTime_text1.csv")
+
+    return
+
+
+def re_serviceTime2():
+
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\serviceTime_text.csv")
+
+    text_list = df["text"].to_list()
+    output_list = []
+
+    keyword = u'('                                                               \
+              u'工期/交货期/服务期|项目周期|工期\(交货期\)|计划工期|工期要求:|服务期|服务时限|履行期限|服务周期|供货期|合格工期'        \
+              u'|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期'     \
+              u'|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期:'     \
+              u')'
+
+    # 替换 易混淆关键词
+    reg_not = re.compile(u'(工期延误|工期节点|工期管理|合同履行日期:见|服务期限截止|交付使用'
+                         u'|服务期限:1、|工期、)|截止|合同签订日期:|保证金在合同签订'
+                         u'|工期情况|签订合同前,|计划工期内|服务期内|服务期限应按')
+
+    # 匹配 特定词 + 数字
+    # reg0 = re.compile(u'(工期/交货期/服务期|服务期限|服务期)'
+    #                   u'(:)'
+    #                   u'(\d+)')
+
+    # 匹配 匹配 关键词 + 年月日时至年月日时止|年月日至年月日
+    reg0 = re.compile(u'(服务期|服务期限|服务周期|服务时间)'
+                      u'([^至]*)'
+                      u'(至)'
+                      u'([^日天止]*)'
+                      u'(日|天|止)')
+
+    # 匹配 特定词 + 数字 + 年月周天
+    reg1 = re.compile(u'(工期/交货期/服务期|服务期限|服务期|工期,|工期要求|中介服务时限)'
+                      u'([^天年月日]*[\d+一二三四五六七两叁贰壹肆伍])'
+                      u'(天|个月|个日历天|年|日历天|日|\(日历天\)|\(天\))')
+
+
+
+    # 匹配 特定词 + 数字 + 年月周天
+    reg2 = re.compile(u'(合同签订|签订合同|合同履行日期)'
+                      u'([^\d年]*)'
+                      u'(\d+|一|二|三|四|五|六|七|八|九|十)'
+                      u'(个|)'
+                      u'(日止|日内|年|年度|月|日历天|日|周内|年内|工作日|天内)'
+                      )
+
+    # 匹配 特定词 + (天/日历天) + 数字
+    reg3 = re.compile(u'(工期,|工期|服务时间|服务期)'
+                      u'(\(日历天\),|\(日历天\)|\(天\))'
+                      u'([^\d+]*)'
+                      u'(\d+)')
+
+    # 匹配 特定词 + (年) + 数字
+    reg6 = re.compile(u'(服务期限)'
+                      u'(\(年\))'
+                      u'([^\d+]*)'
+                      u'(\d+)')
+
+    # 匹配 关键词 + 数字 + 年/月/天
+    reg4 = re.compile(keyword +
+                      u'([^天年月日]*)'
+                      u'([\d+一二三四五六七两叁贰壹肆伍])'
+                      u'(,|)'
+                      u'(天|个月|年|个日历天|日历天|日|\(日历天\)|\(天\))')
+
+    # 匹配 关键词 + 年月日时至年月日时止
+    # reg5 = re.compile(keyword +
+    #                   u'([^至]*)'
+    #                   u'(至)'
+    #                   u'([^止]*)'
+    #                   u'(止)')
+
+    # 匹配 关键词 + 年月日至年月日
+    # reg6 = re.compile(keyword +
+    #                   u'([^至]*)'
+    #                   u'(至)'
+    #                   u'([^日天]*)'
+    #                   u'(日|天)')
+
+    # 匹配 优先级低的词 + 年月日
+    reg5 = re.compile(u'(服务要求|服务时限)'
+                      u'([^年日]*)'
+                      u'(年|日)')
+
+    for index in range(len(text_list)):
+        # 初始化
+        output_str = ""
+        input_str = text_list[index]
+        # 替换
+        input_str = re.sub(reg_not, "", input_str)
+        # 匹配
+        if output_str == "":
+            output_str = re_findAllResult(reg3, input_str, unit="天", index=2)
+        if output_str == "":
+            output_str = re_findAllResult(reg6, input_str, unit="年", index=2)
+        if output_str == "":
+            output_str0 = re_findAllResult(reg0, input_str, index=1)
+            output_str1 = re_findAllResult(reg1, input_str, index=1)
+            # 同时匹配两个表达式,如果一个是空就选另一个,两个皆不为空,判断长度
+            if output_str0 == "" and output_str1 == "":
+                output_str = ""
+            elif output_str0 == "":
+                output_str = output_str1
+            elif output_str1 == "":
+                output_str = output_str0
+            else:
+                if len(output_str0) >= 100:
+                    output_str = output_str1
+                elif len(output_str0) >= len(output_str1):
+                    output_str = output_str0
+                else:
+                    output_str = output_str1
+
+        if output_str == "":
+            output_str = re_findAllResult(reg2, input_str, index=2)
+
+        if output_str == "":
+            output_str = re_findAllResult(reg4, input_str, index=1)
+        if output_str == "":
+            output_str = re_findAllResult(reg5, input_str, index=1)
+
+
+        # 将冒号删掉
+        output_str = re.sub(":|:|限|交货期/服务期|,|\)|\(", "", output_str)
+
+        # 字符串中包含断句符号,裁剪
+        ss = re.split("。|,|;", output_str)
+        output_str = ss[0]
+
+        # 添加
+        output_list.append(output_str)
+
+    df["re"] = pd.DataFrame(output_list)
+    df = df[["document_id", "text", "categoryAndIndex", "word", "category", "re"]]
+    df.to_csv("C:\\Users\\admin\\Desktop\\serviceTime_text2.csv")
+
+
+def re_serviceTime3():
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\serviceTime_text.csv")
+
+    text_list = df["text"].to_list()
+
+    # 初始化
+    output_list = []
+    text_index_list = []
+
+    before = '(?P<before>'\
+             '工期/交货期/服务期|工期,\(日历天\)|工期\(交货期\)|合格工期\(天\)|服务期限\(年\)|工期\(天\)' \
+             '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
+             '|合格工期|计划工期\(服务期\)|服务期\(日历天\)|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
+             '|交货时间|工期\(日历天\)' \
+             '|服务期限为|计划工期|工期要求|服务期限|服务期' \
+             '|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期|服务要求' \
+             '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期|供货期|合同履行日期|计划周期' \
+             ')'
+
+    before2 = '(?P<before2>' \
+              '合同签订后|合同签订之日起|约|自合同签订之日起|开工后|不超过|签订合同后|系统开发' \
+              '|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起' \
+              '|自合同签订生效之日起|自合同签订后不超过|中选后|均为|合同签订日至|本项目合同期|' \
+              ')'
+
+    charac = '(?P<charac>' \
+             '[::,,]*' \
+             ')'
+
+    center = '(?P<center>' \
+             '[自]?\d+年\d+月\d+日至\d+年\d+月\d+日|\d+年\d+月\d+日|[\d一二三四五六七两叁贰壹肆伍]+' \
+             ')'
+
+    after = '(?P<after>' \
+            '天|个月|年|个日历天|日历天|日|\(日历天\)|\(天\)|周内|,日历天|' \
+            ')'
+
+    reg = re.compile(before + charac + before2 + center + after)
+
+    reg1 = re.compile(before + charac + '(.*?止)')
+
+    reg_not = re.compile(u'(工期延误|工期节点|工期管理|交付使用'
+                         u'|工期、)'
+                         u'|工期情况|划工期内|服务期内')
+
+    reg_not1 = re.compile(u'(履行日期:见|服务期限应按|签订合同前,|服务期限应按'
+                          u'|务期限:1、|同签订日期:|证金在合同签|服务期限截止'
+                          u')')
+
+    reg_not2 = re.compile(u'截止|1\.|1、')
+
+    for index in range(len(text_list)):
+        # 初始化
+        output_str = ""
+        input_str = text_list[index]
+
+        # 替换混淆词
+        input_str = re.sub(reg_not, "####", input_str)
+        input_str = re.sub(reg_not1, "######", input_str)
+        input_str = re.sub(reg_not2, "##", input_str)
+
+        output_str, text_index = re_findAllResult(reg, input_str)
+        if len(text_index) == 0:
+            output_str, text_index = re_findAllResult(reg1, input_str)
+
+        # 添加
+        output_list.append(output_str)
+        text_index_list.append(str(text_index))
+
+    df["text_index"] = pd.DataFrame(text_index_list)
+    index_to_word = []
+    for index, row in df.iterrows():
+        i_list = ast.literal_eval(row["text_index"])
+        word = ""
+        for i in range(len(i_list)):
+            word = word + row["text"][i_list[i][0]:i_list[i][1]]
+            if i != len(i_list) - 1:
+                word = word + " "
+        if len(word) >= 120:
+            word = ""
+            df["text_index"].iloc[index] = []
+        index_to_word.append(word)
+    df["re"] = pd.DataFrame(index_to_word)
+
+    df = df[["document_id", "text", "categoryAndIndex", "word", "category", "re", "text_index"]]
+    df.to_csv("C:\\Users\\admin\\Desktop\\serviceTime_text4.csv")
+
+
+def re_findAllResult(reg, input, unit="", index=0):
+    '''
+
+    :param reg: 正则表达式
+    :param input: 待匹配句子
+    :param unit: 需要加的单位
+    :param index: 字符串拼接的开始位置
+    :return: (拼接后的匹配字符串, 每个匹配在原文中的 [start, end] 下标列表)
+    '''
+    match = re.findall(reg, input)
+    output = ""
+
+    if match:
+        ss = ""
+        for i in range(len(match)):
+            s = ""
+            for j in range(index, len(match[i])):
+                s = s + match[i][j]
+                if unit != "" and j == len(match[i])-1:
+                    s = s + unit
+            ss = ss + s
+            if i < len(match)-1:
+                ss = ss + " "
+        output = ss
+
+    # 全文下标
+    text_index = []
+    match1 = re.finditer(reg, input)
+    for i in match1:
+        d = i.groupdict()
+        print(d)
+        if d.get("before") is not None:
+            front_len = len(d.get("before")) + len(d.get("charac"))
+        else:
+            front_len = 0
+        text_index.append([i.start()+front_len, i.end()])
+
+    return output, text_index
+
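+# Usage sketch (hypothetical input): with the named-group pattern built in re_serviceTime3(),
+# re_findAllResult(reg, "工期:30天,质量:合格") would return ("工期:30天", [[3, 6]]):
+# the concatenated group text of every match, plus [start, end] offsets whose start is advanced
+# past the leading keyword/separator groups.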
+
+def calculateLen(ss, i):
+    front_len = 0
+    back_len = 0
+    print("------")
+    print(i)
+    print(ss)
+    for index in range(i):
+        print(ss[index], len(ss[index]))
+        front_len += len(ss[index])
+    for index in range(i+1, len(ss)):
+        back_len += len(ss[index])
+    return front_len, back_len
+
+
+def test_re():
+    keyword = u'(' \
+              u'工期/交货期/服务期|项目周期|工期\(交货期\)|计划工期|工期要求:|服务期|服务时限|履行期限|服务周期|供货期|合格工期' \
+              u'|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期' \
+              u'|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期:' \
+              u')'
+
+    reg0 = re.compile(u'(服务时间:|服务期限:)'
+                      u'([^至到]*)'
+                      u'(至|到)'
+                      u'([^日时]*)'
+                      u'(日|时)'
+                      )
+
+    reg = re.compile(u'(周期|工期|服务期|服务时限|履行期限|服务周期|交货期|供货期|合格工期'
+                     u'|投标工期|设计工期|合格服务周期|总工期|施工工期|服务时间|流转期限|维护期限'
+                     u'|完成时间|交付|服务期限|中标工期|项目周期|计划工期'
+                     u')'
+                     # u'([^日止|日内|年|年度|月|日|周内|年内|\d+]*)'
+                     u'([^年月日\d+]*)'
+                     u'([\d+|一|二|三|四|五|六|七|八|九|十])'
+                     u'(日止|日内|年|年度|月|日|周内|年内|日历天|工作日|\d+日|\d+|起*)'
+                     u'(个月|\(日历天\)|)')
+
+    reg1 = re.compile(u'(工期/交货期/服务期:|服务期限|服务期|工期,|工期要求|中介服务时限)'
+                      u'([^天年月日]*[\d+一二三四五六七两叁贰壹肆伍])'
+                      u'(天|个月|个日历天|年|日历天|日|\(日历天\)|\(天\))')
+
+    reg2 = re.compile(u'(服务期限|履行期限|工期|服务期|维护期限|服务周期|工期,\(日历天\),)'
+                      u'(:|:)+'
+                      u'(\d+|一|二|三|四|五|六|七|八|九|十|两|贰|叁)'
+                      u'(日止|日内|年|年度|月|日历天|日|周内|年内|工作日)'
+                      )
+
+    s = u'(项目周期|周期|工期/交货期/服务期|服务期|服务时限|履行期限|服务周期|交货期|供货期|合格工期'         \
+        u'|投标工期|设计工期|合格服务周期|总工期|施工工期|服务时间|流转期限|维护期限'    \
+        u'|完成时间|交付|服务期限|中标工期|项目周期|计划工期)'
+    reg3 = re.compile(s +
+                      u'([^天年月日]*)'
+                      u'([\d+一二三四五六七两叁贰壹肆伍])'
+                      u'(,|)'
+                      u'(天|个月|年|日历天|日|\(日历天\)|\(天\))')
+
+    reg_00 = re.compile(u'(服务期限|工期|服务时间)'
+                        u'([^至]*)'
+                        u'(至)'
+                        u'([^止]*)'
+                        u'(止)')
+
+    reg_01 = re.compile(u'(服务期限|工期|服务时间)'
+                        u'([^至]*)'
+                        u'(至)'
+                        u'([^日]*)'
+                        u'(日)')
+
+    reg4 = re.compile(keyword +
+                      u'([^天年月日]*)'
+                      u'([\d+一二三四五六七两叁贰壹肆伍])'
+                      u'(,|)'
+                      u'(天|个月|年|个日历天|日历天|日|\(日历天\)|\(天\))')
+
+    reg5 = re.compile(u'(服务要求|服务时限)'
+               u'([^年日]*)'
+               u'(年|日)')
+
+
+    test_text0 = "保险服务期限:自2020年1月1日零时起至2021年12月31日24时止的自然年度" \
+                 " 服务时间:2020年05月25日至2020年08月08日"
+
+
+
+    test_text = ",中标候选人公示快照。北京北方车辆集团有限公司原试验工段改扩建工程中标候选人公示,原试验工段改扩建工程,(招标项目编号:C1100000096007025006),于2020年05月21日在北京市市辖区西城区西便门内大街79号4号楼409进行了开标、评标等工作,现将本次评标结果推荐中标候选人公示如下:" \
+                "标段(包)编号:C1100000096007025006001,标段(包)名称:原试验工段改扩建工程,第一名:北京永兴丰源建筑工程有限公司,投标报价(元):2,010,700.02,质量标准:合格工期(天):90,项目负责人姓名:周秋红相关证书名称:二级建造师相关证书编号:京211141545754,建筑工程施工总承包壹级,建筑装修装饰工程专业承包贰级," \
+                "钢结构工程专业承包叁级,第二名:北京市九方弘业建筑工程有限责任公司,投标报价(元):1,988,322.19,质量标准:合格工期(天):90,项目负责人姓名:任敬科相关证书名称:二级建造师相关证书编号:01453994,建筑工程施工总承包叁级,钢结构工程专业承包叁级,第三名:河南德恒建设工程有限公司,投标报价(元):1,996,228.17,质量" \
+                "标准:合格工期(天):90,项目负责人姓名:张献军相关证书名称:二级建造师相关证书编号:豫241141449543,建筑工程施工总承包贰级,公示期:2020年05月26日-2020年05月28日,特此公示!,对评标结果如有异议,请于2020年05月28日前在中国兵器电子招标投标交易平台上进行提出。联系人:" \
+                "李茜,联系电话:13910533516,北京五环国际工程管理有限公司,2020年05月25日,"
+
+    test_text1 = "服务时间:合同签订之日起90日历天,联系电话:13910533516,北京五环国际工程管理有限公司,2020年05月25日"
+
+    test_text2 = "服务期限:两日  服务要求:1年 服务时限:中选后15个工作日完成"
+
+    test_text3 = "工期/交货期/服务期:30天 标准:合格工期(天) 服务期限:两年。具体采购内容和要求详见招标文件年 项目周期:40日历天"
+
+    test_text4 = u'''
+    ,大庆禾工煤炭分质清洁利用项目-临时用电二期工程设备、物资采购中标候选人公示,更多咨询报价请点击:,大庆禾工煤炭分质清洁利用顶目-临时用电二期工程设备、物资釆购中标候选人,(招标编号:XYwZ-20200309-5),公示结束时间:2020年04月03日,、评标情况,标段(包)[001大庆禾工煤嶽分质清洁利用项目-临时用屯二期工程设备、物资采购,中标候选人基本情况,
+    中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天,中标候选人第2名:
+    哈尔滨昊龙电气没备制造有限公司,投标报价:19.87万元,质,量:合格,工期/交货期/服务期:30天,
+    中标侯选人第3名:江南电气有限公司,投标报价:20.13万元,质量:合格,工期,交货期/服务期:30天:2、中标候选人按照招标文件要求承诘的项目伉责人情况,中标侯选人(哈尔滨龙网电力设备有限公司)的项目负贵人:宋环宇身份证,10398309240912;,中标候选人(哈尔滨昊龙电气设各制造有限公司)的项目负贵人:尹水生身份证,2:0227197902120112,中标候选人(江南电气有限公司)的项目负贵人:秦世亮身份证,230104197410012334;,3、中标候选人响应招标文
+    件要求的资格能力条件,中标候选人(哈尔滨龙网电力设备有限公司)的资格能力条件:完全响应招标公告;中标选人(哈尔滨昊龙电气没备制造有公司)的资格能力条件:完伞响应招标公,告,中标候选人(江南电气有限公司)的资格能力条件:完仝响应招标公告,、提出异议的渠道和方式,以上结果公示三日,公示期间投标人或者其他利害关系人如有异议请以书面形式向招标,人提出;如无异议,预中标人即为中标人。三、其他,项目编号:-20200309-5,项目名称:大庆禾工煤炭分质清
+    沽划用项目-临时电二期工程设备、物资采购,计划供货期:合同签订后30日内供货,交货地点:施工现场地面交货,质量标准:符合国家及国家电网行业合格标准,招邡方式:公开招标,开标时间:2020华3月3日9时30分,公示起止日期:2020年4月1日至2020年±月3日,经评标委员会评审,中标候选人由高到低排序前三名为:第一名:晗尔滨龙网电力设备有限公司,第二名:晗尔滨昊龙电气设备制造有限公司,第三名:江南电气有限公司,点标有,经评标委员会评审,依法确定排名第一的
+    中标候选人为预中标人。预中标人为:晗尔滨龙网电力设备有限公司,颀中标价:¥199,800.00元,以上结果公示三日,公小期间投标人或者其他利害关系人如有异议请以书面形式向招标入提,出;如无异议,预中标人即为中标人。监督部门及联系方式:黑龙江北星电力有跟公罰、0459-6504811,四、监督部门,本招标项目的监督部门为黑龙江北星电力有限公司。五、联系方式,招标人:黑龙江北星电力有限公司,地址:大庆市让胡路区中买大街南段28号,联系人:卜先生,电话:0459-
+    6604811,电子邮件:418864qgq.com,招标代理机构:黑龙江省信亿招标有限公司,地址:哈尔滨市香坊区红滨大街1号516室,联系人:张海洋,电话;0451-55151625,电子邮件:xyzb5164163.com,招标人或其招标代理机构主要负责人(项目负贲人,(签名),1,招标人或其招标代理机构:与,盖章),
+    '''
+
+    s = re.finditer(reg4, test_text4)
+    # s = re.sub(reg5, "", test_text2)
+    # print(s)
+    # print(s.span())
+    # s = re.match("交货期/服务期:", "交货期/服务期:365天")
+    # print(s.span())
+    # if s:
+    #     print(s)
+
+    # print("计划工期:3个月 工期:3个月".split(" "))
+    for ss in s:
+        # sss = (0, 0)
+        print(ss.group())
+        print(ss.span())
+        # print(sss[1])
+
+
+def re_Accuracy(filename):
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\"+filename+".csv")
+    flag = []
+    flag_1 = 0
+    flag_0 = 0
+    for index, row in df.iterrows():
+        if row["word"] == row["re"]:
+            flag.append(1)
+            flag_1 += 1
+        elif str(row["re"]) in row["word"] or row["word"] in str(row["re"]):
+            flag.append(1)
+            flag_1 += 1
+        else:
+            flag.append(0)
+            flag_0 += 1
+
+    print("Accuracy: ", flag_1/(flag_1+flag_0))
+    df["correct"] = flag
+    df.to_csv("C:\\Users\\admin\\Desktop\\"+filename+".csv")
+
+
+def getTestData():
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_all_re_washed2.csv")
+
+    number0 = 500
+    number1 = 1500
+    number2 = 600
+    number3 = 600
+    number4 = 500
+
+    df0 = df[df["Label"] == 0][:number0]
+    df0_deleted = df[df["Label"] == 0][number0:]
+
+    df1 = df[df["Label"] == 1][:number1]
+    df1_deleted = df[df["Label"] == 1][number1:]
+
+    df2 = df[df["Label"] == 2][:number2]
+    df2_deleted = df[df["Label"] == 2][number2:]
+
+    df3 = df[df["Label"] == 3][:number3]
+    df3_deleted = df[df["Label"] == 3][number3:]
+
+    df4 = df[df["Label"] == 4][:number4]
+    df4_deleted = df[df["Label"] == 4][number4:]
+
+    df_test = pd.concat([df0, df1, df2, df3, df4])
+    df_deleted = pd.concat([df0_deleted, df1_deleted, df2_deleted, df3_deleted, df4_deleted])
+
+    df_test.columns = ["index", "Word", "Label", "Sentence", "BIO"]
+    df_test = df_test.reset_index()
+    df_test = df_test[["Word", "Label", "Sentence", "BIO"]]
+    df_deleted.columns = ["index", "Word", "Label", "Sentence", "BIO"]
+    df_deleted = df_deleted.reset_index()
+    df_deleted = df_deleted[["Word", "Label", "Sentence", "BIO"]]
+
+    df_test.to_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
+    df_deleted.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
+
+
+def washData():
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_all_re_washed2.csv")
+
+    # Agency_person
+    # reg1 = re.compile(u'(代理 机构 :|代理 机构 名称 :|代理 机构 联系 方式 :|采购 代理 机构|询价 代理 :'
+    #                   u'|代理 公司 :|招标 代理 :).*(联系人).*(\|\|)')
+    # # reg1 = re.compile(u'(代理 机构 联系 方式 :|招标 代理 单位 :|代理 机构 名称 :|采购 代理 机构 信息'
+    # #                   u'|交易 代理 机构).*(\|\|)')
+    # reg2 = re.compile(u'招标 人|质疑|采购|报名|监督|发布|出售|技术|中标|项目 联系人|项目 负责人'
+    #                   u'|招标 联系人|招标 单位 联系人')
+    # reg3 = re.compile(u'地址。*地址')
+
+    # Tenderee_person
+    # reg1 = re.compile(u'(采购 人 :|招标 人 :|采购 单位 :|采购 单位 名称 :|采购 人 名称 :|采购 单位 联系 方式 :'
+    #                   u').*(联系人|联系 方式 :).*(\|\|)')
+    # reg1 = re.compile(u'(招标 联系人 :|招标 人 联系 方式|招标 联系人 及 地址 :|招标 人 联系人 :'
+    #                   u'|招标 单位 :|采购 人 信息|采购 人 名称 :|采购 单位 联系人 :|采购 人 联系人).*(\|\|)')
+
+    # reg1 = re.compile(u'(技术 部分|商务 部分).*(\|\|)')
+    # reg2 = re.compile(u'代理|质疑|供应商|法人|发布|监督|项目|技术|投诉|服务 中心|文件 编制|部门|组织')
+    # # 表格型的数据被压成一行,前后分别为招标联系人、代理联系人
+    # reg3 = re.compile(u'(联系人 :).*(联系人 :)')
+
+    # 评审专家
+    # reg1 = re.compile(u'(评审 专家 :|评审 专家 名单|专家 名单 :|专家 :|评审 委员会 成员 名单 :'
+    #                   u'|评委 姓名 :|评审 委员会).*(\|\|)')
+    # reg2 = re.compile(u'招标 人|质疑')
+
+    # person_person
+    # reg1 = re.compile(u'(项目 联系人 :|监督 管理 部门|出让 人 :|监督 :|中标 单位 :|竞价 开启|质疑|商务 咨询 :|项目 名称 :'
+    #                   u'|招标 管理 办公室|负责人 姓名 :|技术 负责人|项目 负责人|法定 代表人|发布人 :|招标 人员 :'
+    #                   u'|项目 负责人 联系 电话 :|项目 经理 :|代理 人员 :|商务 联系人|法人|咨询 电话 :|投诉 电话 :'
+    #                   u'|受理人 :|收件人 :|联络人 :|项目 咨询 联系人 :|项目 报名 联系 :|收货人 :|交易 单位 :'
+    #                   u'|质疑 答复 联系人 :|现场 联系人|项目 总监 :|质疑 联系人|联系 确认|标的 查看|接收人|联系人 :'
+    #                   u'|技术 支持|项目 总工|审核 人|监理 工程师 :).*(\|\|)')
+    # reg1 = re.compile(u'(项目 联系人 :|项目 单位 :|监督 管理 部门 名称 :|质疑 答复 联系人 :|成交 单位 :'
+    #                   u'|项目 负责人|供应商 地址 :|机构 联络人 :|技术 负责人 :|采购 管理 机构 :'
+    #                   u'|项目 联系人).*(\|\|)')
+    # reg1 = re.compile(u'(项目 单位 :|招标 服务 中心|采购 管理 办公室|项目 名称 :|采购 管理 机构 :'
+    #                   u'|发包 单位 :).*(联系人).*(\|\|)')
+    # reg1 = re.compile(u'(招标 组织 单位 :|审核 人 :|采管 办 联系 方式 :|采购 项目 联系 方式'
+    #                   u'|询价 书 名称|疑问|资格 审查|提出|采购 文件|公众 号|项目 联系人 :|技术 负责人'
+    #                   u'|发布 人 :|联系 确认).*(\|\|)')
+    # reg1 = re.compile(u'(法定 代表人 :|委托 代理人 :).*(\|\|)')
+    # reg1 = re.compile(u'(备注 :).*(\|\|)')
+    # reg2 = re.compile(u'磋商|编 制|公证|审核|谈判|评委|代理 机构 名称|代理 机构'
+    #                   u'|采购 人 :|招标 人|采购 单位|采购 单位 名称 :|采购 人 名称 :|采购 单位 联系 方式 :|招标 单位 :'
+    #                   u'|采购 人|招标 代理|从业|施工员|资料员|公证员|受让方|采购员|招标 单位|招标 联系人|釆购 单位'
+    #                   u'|姓名|习近平|开户 名称')
+    # reg1 = re.compile(u'(联系人 :).*(联系人 :).*(\|\|)')
+    reg1 = re.compile(u'(联系人 :).*(\|\|).*(联系人 :)')
+    reg2 = re.compile(u'代理|公司|地址|采购|电话|商务|招标|技术|项目|联系 方式|监督')
+
+    # person
+    # reg1 = re.compile(u'(备注 :|受让方|受让 单位 :|从业 人员 :|姓名 :|施工员|资料员|公证员 :|采购员 :|开户 名称).*(\|\|)')
+    # reg1 = re.compile(u'(安全员|施工员|材料员|质量员|质量检查员|质检员|造价员|资料员).*(\|\|)')
+    # reg2 = re.compile(u'招标|项目|负责')
+
+
+    ## 从其他类筛选出该类
+    # 查看筛选出的数据
+    # df = df[df["Label"] == 2]
+    # wash_list = []
+    # for index, row in df.iterrows():
+    #     match = reg1.search(row["Sentence"])
+    #     if match:
+    #         match2 = reg2.search(match.group())
+    #         # if not match2:
+    #         if not match2:
+    #             wash_list.append(row)
+    # df1 = pd.DataFrame(wash_list)
+    # df1.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_all_temp.csv")
+
+    # 改标签
+    for index, row in df.iterrows():
+        if row["Label"] == 2:
+            match = reg1.search(row["Sentence"])
+            if match:
+                match2 = reg2.search(match.group())
+                if not match2:
+                    # row["Label"] = 3
+                    df["Label"].iloc[index] = 1
+
+    df = df[["Word", "Label", "Sentence", "BIO"]]
+    df.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_all_re_washed2.csv")
+
+
+    ## 从该类筛选出不属于该类的
+    # 查看筛选出的数据
+    # df = df[df["Label"] == 1]
+    # wash_list = []
+    # for index, row in df.iterrows():
+    #     match = reg1.search(row["Sentence"])
+    #     if match:
+    #         match2 = reg2.search(match.group())
+    #         # if not match2:
+    #         if not match2:
+    #             # match3 = reg3.search(match.group())
+    #             match3 = reg3.search(row["Sentence"])
+    #             if not match3:
+    #                 wash_list.append(row)
+    # df1 = pd.DataFrame(wash_list)
+    # df1.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_all_temp.csv")
+
+    # 改标签
+    # for index, row in df.iterrows():
+    #     if row["Label"] == 1:
+    #         match = reg1.search(row["Sentence"])
+    #         if match:
+    #             match2 = reg2.search(match.group())
+    #             if not match2:
+    #                 # match3 = reg3.search(match.group())
+    #                 match3 = reg3.search(row["Sentence"])
+    #                 if not match3:
+    #                     df["Label"].iloc[index] = 3
+    #
+    # df = df[["Word", "Label", "Sentence", "BIO"]]
+    # df.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_all_re_washed2.csv")
+
+
+def relabel():
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
+    # df = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
+    df1 = df
+    for index, row in df.iterrows():
+        if row["Label"] == 1:
+            df1["Label"][index] == 3
+        if row["Label"] == 2:
+            df1["Label"][index] == 1
+        if row["Label"] == 3:
+            df1["Label"][index] == 2
+
+    df2 = df1
+    for index, row in df1.iterrows():
+        if row["Label"] == 1:
+            ss = row["Sentence"].split("||")
+            forward = ss[0][-30:]
+            if "。 联系人" in forward or ", 联系人" in forward \
+                    or ", 联系 方式" in forward or "。 联系 方式" in forward:
+                df2["Label"][index] = 3
+
+        if row["Label"] == 2:
+            ss = row["Sentence"].split("||")
+            forward = ss[0][-30:]
+            if "。 联系人" in forward or ", 联系人" in forward \
+                    or ", 联系 方式" in forward or "。 联系 方式" in forward:
+                df2["Label"][index] = 3
+
+    df2 = df2[["Word", "Label", "Sentence", "BIO"]]
+    df2.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
+    # df2.to_csv("C:\\Users\\admin\\Desktop\\test2000_new.csv")
+
+
+def relabel2():
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
+    # df = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
+    df1 = df
+    for index, row in df1.iterrows():
+        if row["Label"] == 3:
+            ss = row["Sentence"].split("||")
+            forward = ss[0][-20:]
+            if "采购 " in forward and "窗口" not in forward and "公司" not in forward \
+                    and "窗口" not in forward and "文件" not in forward \
+                    and "质疑" not in forward and "中心" not in forward\
+                    and "处" not in forward:
+            # if "招标 " in forward:
+                print(forward)
+                df1["Label"][index] = 1
+    df1 = df1[["Word", "Label", "Sentence", "BIO"]]
+    # print(df1)
+    # df1.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
+
+
+if __name__ == "__main__":
+    # Postgre2Data()
+    # data2BIOData()
+    # BIOData2DataFrame()
+
+    # start_time = time.time()
+    # print("开始:", start_time)
+    # PersonBIOData2BIO_Sentence()
+    # end_time = time.time()
+    # print("耗时:", end_time-start_time)
+
+    # start_time = time.time()
+    # print("开始:", start_time)
+    # BIOData2PersonData()
+    # end_time = time.time()
+    # print("耗时:", end_time-start_time)
+
+    # print(datetime.strptime("2018-02-02", '%Y-%m-%d'))
+    # print(len("二、公示期:2020年05月25日至2020年06月03日,三、该宗地双方已签订成交确认书,在30日内签订出让合同,"
+    #           "相关事宜在合同中约定,四、联系方式,联系单位:惠州市公共资源交易中心仲恺分中心,单位地址:惠州仲恺高新区和畅五"
+    #           "路人才服务大厦10楼,邮政编码:联系电话:0752-3278419,联系人:"))
+
+    # duplicateData(3, 0.5)
+    # resetAndShuffleData()
+
+    # start_time = time.time()
+    # BIOData2TXT()
+    # end_time = time.time()
+    # print("耗时:", end_time-start_time)
+
+    # TXT2BIOData()
+
+    # BIOData2Bidway()
+    # BIOData2ServiceTime()
+
+    # Text2Csv()
+    # Csv2ServiceTimeText()
+    # Csv2BidwayText()
+    # re_serviceTime()
+    # re_bidway()
+
+    # Postgre2Data()
+
+    # getTestData()
+    # washData()
+
+    # re_serviceTime2()
+    # re_Accuracy("serviceTime_text1")
+    # test_re()
+    # re_serviceTime3()
+    # relabel()
+    relabel2()

+ 635 - 0
BiddingKG/dl_dev/test/test_model_fjs.py

@@ -0,0 +1,635 @@
+import sys
+import os
+
+import psycopg2
+from keras.models import Model
+from keras.layers import Input, LSTM, Dense
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot
+
+from BiddingKG.dl.common.models import *
+from sklearn.metrics import classification_report
+from BiddingKG.dl.interface.predictor import h5_to_graph
+
+
+sys.path.append(os.path.abspath("../.."))
+model_file = "model_person_classify_fjs.model.hdf5"
+
+
+
+def getSeq2seqModel():
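+    # Character-level seq2seq (English -> French) following the standard Keras
+    # lstm_seq2seq example: build one-hot encoder/decoder tensors from
+    # fra-eng/fra.txt, train an encoder-decoder LSTM, save it to s2s.h5 and
+    # decode a few training samples as a sanity check.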
+    # Batch size for training.
+    batch_size = 64
+    # Number of epochs to train for.
+    epochs = 100
+    # Latent dimensionality of the encoding space.
+    latent_dim = 256
+    # Number of samples to train on.
+    num_samples = 10000
+    # Path to the data txt file on disk.
+    data_path = 'fra-eng/fra.txt'
+
+    # Vectorize the data.
+    input_texts = []
+    target_texts = []
+    # use sets for easy de-duplication
+    input_characters = set()
+    target_characters = set()
+    with open(data_path, 'r', encoding='utf-8') as f:
+        lines = f.read().split('\n')
+    for line in lines[: min(num_samples, len(lines) - 1)]:
+        input_text, target_text, _ = line.split('\t')
+        # sentence start marker: \t, sentence end marker: \n
+        # We use "tab" as the "start sequence" character
+        # for the targets, and "\n" as "end sequence" character.
+        target_text = '\t' + target_text + '\n'
+        input_texts.append(input_text)
+        target_texts.append(target_text)
+        for char in input_text:
+            if char not in input_characters:
+                input_characters.add(char)
+        for char in target_text:
+            if char not in target_characters:
+                target_characters.add(char)
+
+    # sort the character vocabularies
+    input_characters = sorted(list(input_characters))
+    target_characters = sorted(list(target_characters))
+    # encoder input vocabulary size: length of the input character set
+    # decoder output vocabulary size: length of the target character set
+    num_encoder_tokens = len(input_characters)
+    num_decoder_tokens = len(target_characters)
+    # maximum encoder input length: length of the longest input sentence
+    # maximum decoder input length: length of the longest target sentence
+    max_encoder_seq_length = max([len(txt) for txt in input_texts])
+    max_decoder_seq_length = max([len(txt) for txt in target_texts])
+
+    print('Number of samples:', len(input_texts))
+    print('Number of unique input tokens:', num_encoder_tokens)
+    print('Number of unique output tokens:', num_decoder_tokens)
+    print('Max sequence length for inputs:', max_encoder_seq_length)
+    print('Max sequence length for outputs:', max_decoder_seq_length)
+
+    # build a character -> index dictionary for each vocabulary
+    input_token_index = dict(
+        [(char, i) for i, char in enumerate(input_characters)])
+    target_token_index = dict(
+        [(char, i) for i, char in enumerate(target_characters)])
+
+    # initialize the encoder input tensor:
+    # dim 0: number of sentences (i.e. number of RNN sequences)
+    # dim 1: maximum encoder input length
+    # dim 2: encoder input vocabulary size
+    encoder_input_data = np.zeros(
+        (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
+        dtype='float32')
+
+    # initialize the decoder input tensor (the teacher-forcing decoder input):
+    # dim 0: number of sentences (i.e. number of RNN sequences)
+    # dim 1: maximum decoder input length
+    # dim 2: decoder input vocabulary size
+    decoder_input_data = np.zeros(
+        (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
+        dtype='float32')
+
+    # initialize the decoder target tensor:
+    # dim 0: number of sentences (i.e. number of RNN sequences)
+    # dim 1: maximum decoder input length
+    # dim 2: decoder output vocabulary size
+    decoder_target_data = np.zeros(
+        (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
+        dtype='float32')
+
+    # pair up each input with its target, e.g. [input, target]
+    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
+
+        for t, char in enumerate(input_text):
+            encoder_input_data[i, t, input_token_index[char]] = 1.
+        encoder_input_data[i, t + 1:, input_token_index[' ']] = 1.
+
+        for t, char in enumerate(target_text):
+            # decoder_target_data is ahead of decoder_input_data by one timestep
+            decoder_input_data[i, t, target_token_index[char]] = 1.
+            if t > 0:
+                # decoder_target_data will be ahead by one timestep
+                # and will not include the start character.
+                decoder_target_data[i, t - 1, target_token_index[char]] = 1.
+        decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.
+        decoder_target_data[i, t:, target_token_index[' ']] = 1.
+
+    # Define an input sequence and process it.
+    encoder_inputs = Input(shape=(None, num_encoder_tokens))
+    encoder = LSTM(latent_dim, return_state=True)
+    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
+    # We discard `encoder_outputs` and only keep the states.
+    encoder_states = [state_h, state_c]
+
+    # Set up the decoder, using `encoder_states` as initial state.
+    decoder_inputs = Input(shape=(None, num_decoder_tokens))
+    # We set up our decoder to return full output sequences,
+    # and to return internal states as well. We don't use the
+    # return states in the training model, but we will use them in inference.
+    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
+    decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
+                                         initial_state=encoder_states)
+    decoder_dense = Dense(num_decoder_tokens, activation='softmax')
+    decoder_outputs = decoder_dense(decoder_outputs)
+
+    # Define the model that will turn
+    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
+    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
+
+    # Run training
+    model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
+                  metrics=['accuracy'])
+    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
+              batch_size=batch_size,
+              epochs=epochs,
+              validation_split=0.2)
+    # Save model
+    model.save('s2s.h5')
+
+    # Next: inference mode (sampling).
+    # Here's the drill:
+    # 1) encode input and retrieve initial decoder state
+    # 2) run one step of decoder with this initial state
+    # and a "start of sequence" token as target.
+    # Output will be the next target token
+    # 3) Repeat with the current target token and current states
+
+    # Define sampling models
+    encoder_model = Model(encoder_inputs, encoder_states)
+
+    decoder_state_input_h = Input(shape=(latent_dim,))
+    decoder_state_input_c = Input(shape=(latent_dim,))
+    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
+    decoder_outputs, state_h, state_c = decoder_lstm(
+        decoder_inputs, initial_state=decoder_states_inputs)
+    decoder_states = [state_h, state_c]
+    decoder_outputs = decoder_dense(decoder_outputs)
+    decoder_model = Model(
+        [decoder_inputs] + decoder_states_inputs,
+        [decoder_outputs] + decoder_states)
+
+    # Reverse-lookup token index to decode sequences back to
+    # something readable.
+    reverse_input_char_index = dict(
+        (i, char) for char, i in input_token_index.items())
+    reverse_target_char_index = dict(
+        (i, char) for char, i in target_token_index.items())
+
+    def decode_sequence(input_seq):
+        # Encode the input as state vectors.
+        states_value = encoder_model.predict(input_seq)
+
+        # Generate empty target sequence of length 1.
+        target_seq = np.zeros((1, 1, num_decoder_tokens))
+        # Populate the first character of target sequence with the start character.
+        target_seq[0, 0, target_token_index['\t']] = 1.
+
+        # Sampling loop for a batch of sequences
+        # (to simplify, here we assume a batch of size 1).
+        stop_condition = False
+        decoded_sentence = ''
+        while not stop_condition:
+            output_tokens, h, c = decoder_model.predict(
+                [target_seq] + states_value)
+
+            # Sample a token
+            sampled_token_index = np.argmax(output_tokens[0, -1, :])
+            sampled_char = reverse_target_char_index[sampled_token_index]
+            decoded_sentence += sampled_char
+
+            # Exit condition: either hit max length
+            # or find stop character.
+            if (sampled_char == '\n' or
+                    len(decoded_sentence) > max_decoder_seq_length):
+                stop_condition = True
+
+            # Update the target sequence (of length 1).
+            target_seq = np.zeros((1, 1, num_decoder_tokens))
+            target_seq[0, 0, sampled_token_index] = 1.
+
+            # Update states
+            states_value = [h, c]
+
+        return decoded_sentence
+
+
+    for seq_index in range(100):
+        # Take one sequence (part of the training set)
+        # for trying out decoding.
+        input_seq = encoder_input_data[seq_index: seq_index + 1]
+        decoded_sentence = decode_sequence(input_seq)
+        print('-')
+        print('Input sentence:', input_texts[seq_index])
+        print('Decoded sentence:', decoded_sentence)
+
+
+def getBiLSTM_Dropout():
+    '''
+    @summary: build the classifier: two BiLSTM-with-dropout branches over the left and right context windows, average-pooled, concatenated and fed to a 5-class softmax
+    '''
+    input_shape = (2, 35, 128)
+    # input_shape = (1, 70, 128)
+    output_shape = [5]
+
+    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
+    R_input = layers.Input(shape=input_shape[1:], dtype="float32")
+    lstm_0 = layers.Bidirectional(layers.LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=True))(L_input)
+    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
+    lstm_2 = layers.Bidirectional(layers.LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=True))(R_input)
+    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
+    concat = layers.merge([avg_0, avg_2], mode="concat")
+    output = layers.Dense(output_shape[0], activation="softmax")(concat)
+
+    model = models.Model(inputs=[L_input, R_input], outputs=output)
+    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
+    return model
+
+
+def getBiRNN_Dropout():
+    '''
+    @summary: build the classifier: two bidirectional SimpleRNN-with-dropout branches over the left and right context windows, average-pooled, concatenated and fed to a 5-class softmax
+    '''
+    input_shape = (2, 10, 128)
+    output_shape = [5]
+
+    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
+    R_input = layers.Input(shape=input_shape[1:], dtype="float32")
+    lstm_0 = layers.Bidirectional(layers.SimpleRNN(32, dropout=0.65, recurrent_dropout=0.65, return_sequences=True))(L_input)
+    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
+    lstm_2 = layers.Bidirectional(layers.SimpleRNN(32, dropout=0.65, recurrent_dropout=0.65, return_sequences=True))(R_input)
+    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
+    concat = layers.merge([avg_0, avg_2], mode="concat")
+    output = layers.Dense(output_shape[0], activation="softmax")(concat)
+
+    model = models.Model(inputs=[L_input, R_input], outputs=output)
+    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
+    return model
+
+
+def getBiGRU_Dropout():
+    '''
+    @summary: build the classifier: two BiGRU-with-dropout branches over the left and right context windows, average-pooled, concatenated and fed to a 5-class softmax
+    '''
+    input_shape = (2, 35, 128)
+    # input_shape = (1, 70, 128)
+    output_shape = [5]
+
+    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
+    R_input = layers.Input(shape=input_shape[1:], dtype="float32")
+    lstm_0 = layers.Bidirectional(layers.GRU(32, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))(L_input)
+    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
+    lstm_2 = layers.Bidirectional(layers.GRU(32, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))(R_input)
+    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
+    concat = layers.merge([avg_0, avg_2], mode="concat")
+    output = layers.Dense(output_shape[0], activation="softmax")(concat)
+
+    model = models.Model(inputs=[L_input, R_input], outputs=output)
+    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
+    return model
+
+
+def getLSTM_Dropout():
+    '''
+    @summary: build the classifier: a single LSTM-with-dropout branch, average-pooled and fed to a 5-class softmax
+    '''
+    input_shape = (2, 10, 128)
+    output_shape = [5]
+
+    input = layers.Input(shape=input_shape[1:], dtype="float32")
+    lstm = layers.LSTM(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(input)
+    avg = layers.GlobalAveragePooling1D()(lstm)
+    output = layers.Dense(output_shape[0], activation="softmax")(avg)
+
+    model = models.Model(inputs=input, outputs=output)
+    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
+    return model
+
+
+def getGRUModel_Dropout():
+    '''
+    @summary: build the classifier: a single GRU-with-dropout branch, average-pooled and fed to a 5-class softmax
+    '''
+    # input_shape = (2, 10, 128)
+    input_shape = (1, 70, 128)
+    output_shape = [5]
+
+    input = layers.Input(shape=input_shape[1:], dtype="float32")
+    gru = layers.GRU(32, dropout=0.15, recurrent_dropout=0.15, return_sequences=True)(input)
+    avg = layers.GlobalAveragePooling1D()(gru)
+    output = layers.Dense(output_shape[0], activation="softmax")(avg)
+
+    model = models.Model(inputs=input, outputs=output)
+    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
+    return model
+
+
+def getRNNModel_Dropout():
+    '''
+    @summary: build the classifier: a single SimpleRNN-with-dropout branch, average-pooled and fed to a 5-class softmax
+    '''
+    input_shape = (2, 10, 128)
+    output_shape = [5]
+
+    input = layers.Input(shape=input_shape[1:], dtype="float32")
+    rnn = layers.SimpleRNN(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)(input)
+    avg = layers.GlobalAveragePooling1D()(rnn)
+    output = layers.Dense(output_shape[0], activation="softmax")(avg)
+
+    model = models.Model(inputs=input, outputs=output)
+    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
+    return model
+
+
+def getGCNModel():
+    return
+
+
+def getData3(isTrain = True):
+
+    '''
+    :return: word embeddings for the training or test data, split into the sentences before and after the entity, not including the entity token itself
+    '''
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
+    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000_new.csv")
+
+    test_data_len = df.shape[0] * 0.2
+    if isTrain:
+        test_data_len = 0
+    else:
+        test_data_len = 3700
+        df = df1
+
+    df = df.reset_index()
+
+    input_shape = (2, 35, 128)
+    output_shape = [5]
+    allLimit = 250000
+    all = 0
+    data_x = []
+    data_y = []
+    data_context = []
+    for index, row in df.iterrows():
+        if isTrain:
+            if index < test_data_len:
+                continue
+        else:
+            if index >= test_data_len:
+                break
+        if all >= allLimit:
+            break
+
+        tokens_list_front = []
+        tokens_list_behind = []
+        tokens_list_all = []
+        sss = row["Sentence"].split("||")
+        front = sss[0]
+        behind = sss[2]
+
+        ss_front = front.split(" ")
+        ss_behind = behind.split(" ")
+        for s in ss_front:
+            tokens_list_front.append(s)
+        for s in ss_behind:
+            tokens_list_behind.append(s)
+        tokens_list_all.append(tokens_list_front)
+        tokens_list_all.append(tokens_list_behind)
+
+        # print(np.array(tokens_list_all).shape)
+        item_x = embedding(tokens_list_all, shape=input_shape)
+        item_y = np.zeros(output_shape)
+        item_y[row[3]] = 1
+        all += 1
+        data_x.append(item_x)
+        data_y.append(item_y)
+
+    data_x1, data_y1 = getDataFromPG((2, 35, 128), [5])
+    data_x = data_x + data_x1
+    data_y = data_y + data_y1
+    print(np.array(data_x).shape, np.array(data_y).shape)
+    return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context
+
+
+def getDataFromPG(input_shape, output_shape):
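+    # Load hand-labelled person entities from the Postgres training tables and
+    # embed a left/right token window around each entity as extra training samples.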
+    conn = psycopg2.connect(dbname="BiddingKG", user="postgres", password="postgres",
+                            host="192.168.2.101")
+    cursor = conn.cursor()
+    sql = "select B.tokens,A.begin_index,A.end_index,C.label,A.entity_id " \
+          "from train_entity_copy A,train_sentences_copy B,hand_label_person C " \
+          "where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index " \
+          "and A.entity_type='person' and A.entity_id=C.entity_id and C.label!=0 " \
+          "and C.label!=3;"
+    cursor.execute(sql)
+    print(sql)
+
+    data_x = []
+    data_y = []
+    rows = cursor.fetchmany(1000)
+    allLimit = 250000
+    all = 0
+    i = 0
+    while(rows):
+        for row in rows:
+            if all >= allLimit:
+                break
+            item_x = embedding(spanWindow(tokens=row[0], begin_index=row[1], end_index=row[2],
+                                          size=input_shape[1]), shape=input_shape)
+            # item_x = encodeInput(spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=10), word_len=50, word_flag=True,userFool=False)
+
+            # _span = spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=10,word_flag=False)
+            # item_x = encodeInput(_span, word_len=10, word_flag=False,userFool=False)
+            item_y = np.zeros(output_shape)
+            item_y[row[3]] = 1
+            all += 1
+            data_x.append(item_x)
+            data_y.append(item_y)
+            i += 1
+        # fetch the next batch only after the current batch has been processed
+        rows = cursor.fetchmany(1000)
+    return data_x, data_y
+
+
+def getData2(isTrain = True):
+    '''
+    :return: word embeddings for the training or test data, with the context before and after joined into one sentence that includes the entity token
+    '''
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
+    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
+
+    test_data_len = df.shape[0] * 0.2
+    if isTrain:
+        test_data_len = 0
+    else:
+        test_data_len = 3700
+        df = df1
+
+    df = df.reset_index()
+
+    input_shape = (1, 70, 128)
+    output_shape = [5]
+    allLimit = 250000
+    all = 0
+    data_x = []
+    data_y = []
+    data_context = []
+    for index, row in df.iterrows():
+        if isTrain:
+            if index < test_data_len:
+                continue
+        else:
+            if index >= test_data_len:
+                break
+        if all >= allLimit:
+            break
+
+        tokens_list = []
+        tokens_list_all = []
+        ss = row["Sentence"].split(" ")
+        for s in ss:
+            tokens_list.append(s)
+        tokens_list_all.append(tokens_list)
+
+        item_x = embedding(tokens_list_all, shape=input_shape)
+        item_y = np.zeros(output_shape)
+        item_y[row[3]] = 1
+        all += 1
+        data_x.append(item_x)
+        data_y.append(item_y)
+
+    print(np.array(data_x).shape, np.array(data_y).shape)
+    return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context
+
+
+def getData(isTrain = True):
+    '''
+
+    :return: word embeddings for the training or test data
+    '''
+    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
+    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
+
+    test_data_len = df.shape[0] * 0.2
+    if isTrain:
+        test_data_len = 0
+    else:
+        test_data_len = 3700
+        df = df1
+
+    df = df.reset_index()
+
+    input_shape = (2, 35, 128)
+    output_shape = [5]
+    allLimit = 250000
+    all = 0
+    data_x = []
+    data_y = []
+    data_context = []
+    for index, row in df.iterrows():
+        if isTrain:
+            if index < test_data_len:
+                continue
+        else:
+            if index >= test_data_len:
+                break
+        if all >= allLimit:
+            break
+
+        print(np.array(spanWindow(tokens=row["Sentence"], begin_index=row["begin_index"], end_index=row["end_index"], size=input_shape[1])).shape)
+
+        item_x = embedding(spanWindow(tokens=row["Sentence"], begin_index=row["begin_index"], end_index=row["end_index"], size=input_shape[1]), shape=input_shape)
+        item_y = np.zeros(output_shape)
+        item_y[row[3]] = 1
+        all += 1
+        data_x.append(item_x)
+        data_y.append(item_y)
+
+    print(np.array(data_x).shape, np.array(data_y).shape)
+    # print(data_x, data_y, data_context)
+    return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context
+
+
+def train():
+    '''
+    @summary: train the model
+    '''
+    model = getBiGRU_Dropout()
+    model.summary()
+    train_x, train_y, _ = getData3(isTrain=True)
+    test_x, test_y, test_context = getData3(isTrain=False)
+
+    # ModelCheckpoint callback: keep only the model with the lowest validation loss
+    checkpoint = ModelCheckpoint(model_file, monitor="val_loss", verbose=1, save_best_only=True, mode='min')
+    history_model = model.fit(x=[train_x[0], train_x[1]], class_weight='auto',
+                              y=train_y, validation_data=([test_x[0], test_x[1]], test_y),
+                              epochs=25, batch_size=256, shuffle=True, callbacks=[checkpoint])
+    # history_model = model.fit(x=[train_x[0], train_x[0]], y=train_y, validation_data=([test_x[0], test_x[0]], test_y), class_weight='auto', epochs=100, batch_size=256, shuffle=True, callbacks=[checkpoint])
+    # history_model = model.fit(x=[train_x[0], train_x[0]], y=train_y, validation_split=0.2, class_weight='auto', epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
+
+    # single-input model variants
+    # history_model = model.fit(x=train_x[0], y=train_y, validation_data=([test_x[0], test_y]), class_weight='auto', epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
+    # history_model = model.fit(x=train_x[0], y=train_y, validation_split=0.2, class_weight='auto', epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
+
+    # history_model = model.fit(x=[train_x[0], train_x[1]], y=train_y, validation_split=0.2, epochs=100, class_weight='auto', batch_size=256, shuffle=True, callbacks=[checkpoint])
+    # history_model = model.fit(x=[train_x[0], train_x[1]], y=train_y, validation_split=0.2, epochs=250, batch_size=256, shuffle=True, callbacks=[checkpoint])
+    plotTrainTestLoss(history_model)
+
+
+def predict():
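+    # Evaluate the saved model on the held-out data and print a per-class
+    # classification report for the five person labels.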
+    model = models.load_model(model_file, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
+    test_x, test_y, test_context = getData3(isTrain=False)
+    predict_y = model.predict([test_x[0], test_x[1]])
+    # predict_y = model.predict([test_x[0], test_x[0]])
+    # predict_y = model.predict([test_x[0]])
+    targets_name = ['人名', '招标联系人', '代理联系人', '联系人', '评审专家']
+    print(classification_report(np.argmax(test_y, axis=1), np.argmax(predict_y, axis=1), target_names=targets_name))
+    return predict_y
+
+
+def predict2Csv():
+    df = pd.DataFrame(np.argmax(predict(), axis=1))
+    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000_new.csv")
+    # df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
+
+    df1 = df1[0:3700]
+
+    df1["predict_Label"] = df
+
+    df1.to_csv("C:\\Users\\admin\\Desktop\\result3.csv")
+
+
+def plotTrainTestLoss(history_model):
+    pyplot.plot(history_model.history['loss'])
+    pyplot.plot(history_model.history['val_loss'])
+    pyplot.title('model train vs validation loss')
+    pyplot.ylabel('loss')
+    pyplot.xlabel('epoch')
+    pyplot.legend(['train', 'validation'], loc='upper right')
+    pyplot.show()
+
+
+def hdf52savemodel():
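+    # Convert the trained Keras .hdf5 model into a TensorFlow SavedModel with
+    # named inputs ("input0", "input1") and output ("outputs") for serving.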
+    filepath = 'model_person_classify_fjs.model.hdf5'
+    with tf.Graph().as_default() as graph:
+        time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
+        with tf.Session() as sess:
+            sess.run(tf.global_variables_initializer())
+            h5_to_graph(sess, graph, filepath)
+            tf.saved_model.simple_save(sess,
+                                       "person_savedmodel_new/",
+                                       inputs={"input0":time_model.input[0],
+                                               "input1":time_model.input[1]},
+                                       outputs={"outputs":time_model.output})
+
+
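+# A minimal sketch of how the SavedModel exported above could be loaded back for
+# inference with the TF1 loader API; it assumes the signature keys "input0",
+# "input1" and "outputs" defined in hdf52savemodel().
+def load_person_savedmodel(x0, x1):
+    with tf.Session(graph=tf.Graph()) as sess:
+        meta = tf.saved_model.loader.load(
+            sess, [tf.saved_model.tag_constants.SERVING], "person_savedmodel_new/")
+        sig = meta.signature_def[
+            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+        feed = {sig.inputs["input0"].name: x0, sig.inputs["input1"].name: x1}
+        return sess.run(sig.outputs["outputs"].name, feed_dict=feed)
+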
+if __name__ == "__main__":
+    # getData()
+    # train()
+    # predict()
+    # predict2Csv()
+    hdf52savemodel()
+
+    # getData3()
+    # x, y = getDataFromPG((2, 35, 128), [5])
+    # print(x)
+    # print(y)

+ 93 - 0
BiddingKG/dl_dev/test/testocr.py

@@ -0,0 +1,93 @@
+# coding=utf-8
+
+import sys
+import json
+import requests
+import base64
+
+# keep the script compatible with both Python 2 and Python 3
+IS_PY3 = sys.version_info.major == 3
+if IS_PY3:
+    from urllib.request import urlopen
+    from urllib.request import Request
+    from urllib.error import URLError
+    from urllib.parse import urlencode
+    from urllib.parse import quote_plus
+else:
+    import urllib2
+    from urllib import quote_plus
+    from urllib2 import urlopen
+    from urllib2 import Request
+    from urllib2 import URLError
+    from urllib import urlencode
+
+# disable HTTPS certificate verification to avoid validation errors
+import ssl
+ssl._create_default_https_context = ssl._create_unverified_context
+
+API_KEY = 'ssfyC49bEbp7QdGnG96GKdt2'
+
+SECRET_KEY = 'YCQtiQVt1GvldNZzyfOpbVtNugj7S1Uw'
+
+
+OCR_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
+
+
+"""  TOKEN start """
+TOKEN_URL = 'https://aip.baidubce.com/oauth/2.0/token'
+
+
+"""
+    Fetch an OAuth access token from the Baidu AI platform
+"""
+def fetch_token():
+    params = {'grant_type': 'client_credentials',
+              'client_id': API_KEY,
+              'client_secret': SECRET_KEY}
+    post_data = urlencode(params)
+    if (IS_PY3):
+        post_data = post_data.encode('utf-8')
+    req = Request(TOKEN_URL, post_data)
+    try:
+        f = urlopen(req, timeout=5)
+        result_str = f.read()
+    except URLError as err:
+        print(err)
+    if (IS_PY3):
+        result_str = result_str.decode()
+
+
+    result = json.loads(result_str)
+
+    if ('access_token' in result.keys() and 'scope' in result.keys()):
+        if not 'brain_all_scope' in result['scope'].split(' '):
+            print ('please ensure has check the  ability')
+            exit()
+        return result['access_token']
+    else:
+        print ('please overwrite the correct API_KEY and SECRET_KEY')
+        exit()
+
+
+
+def ocr():
+    '''
+    General-purpose OCR (high-accuracy version) via the Baidu OCR REST API
+    '''
+
+    request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
+    # open the image file in binary mode
+    f = open('20201111181124.png', 'rb')
+    img = base64.b64encode(f.read())
+
+    params = {"image":img}
+    access_token = '24.f7a57ed7b887ac523c606fa6b09dac88.2592000.1607681571.282335-22744183'
+    request_url = request_url + "?access_token=" + access_token
+    headers = {'content-type': 'application/x-www-form-urlencoded'}
+    response = requests.post(request_url, data=params, headers=headers)
+    if response:
+        print (response.json())
+
+if __name__=="__main__":
+    # print(fetch_token())
+    ocr()

+ 3 - 0
BiddingKG/dl_dev/test/testp.py

@@ -0,0 +1,3 @@
+
+def a(x):
+    return (1,1)

+ 121 - 0
BiddingKG/dl_dev/test/val_fromiepy.py

@@ -0,0 +1,121 @@
+import psycopg2
+
+from DBUtils.PooledDB import PooledDB
+
+from BiddingKG.dl.common.Utils import save,load,getUnifyMoney
+from BiddingKG.dl_dev.test.test4 import predict
+pool = None
+
+def getConnection():
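+    # Lazily create (and reuse) a pooled connection to the iepy Postgres database.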
+    global pool
+    if pool is None:
+        pool = PooledDB(psycopg2, 10,dbname="iepy", host="192.168.2.101",user="postgres",password="postgres",port="5432")
+    return pool.connection()
+
+def getTendererMoney_fromIEPY():
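+    # Collect the hand-annotated rel_tendererMoney relations from the brat
+    # annotations of each labelled document and pickle them per document id.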
+    conn = getConnection()
+    cursor = conn.cursor()
+    sql = 'select "user",begin_time,end_time from corpus_payroll'
+    cursor.execute(sql)
+    rows_user = cursor.fetchall()
+    dict_docid_role_money = dict()
+    for row_user in rows_user:
+        _user = row_user[0]
+        begin_time = row_user[1]
+        end_time = row_user[2]
+        sql = " select human_identifier,sourcetext from corpus_iedocument where edituser='%s' and edittime>=to_date('%s','yyyy-mm-dd') and edittime<=to_date('%s','yyyy-mm-dd')"%(_user,begin_time,end_time)
+        cursor.execute(sql)
+        rows_docId = cursor.fetchall()
+        for row_docId in rows_docId:
+            docId = row_docId[0]
+            sourceText = row_docId[1]
+            sql = " select value from brat_bratannotation where document_id='%s'"%docId
+            cursor.execute(sql)
+            rows_anno = cursor.fetchall()
+            dict_money_entity_pack = dict()
+            list_value = []
+            for row_anno in rows_anno:
+                value = row_anno[0]
+                list_value.append(value)
+            dict_anno = dict()
+            list_rel_anno = []
+            dict_docid_role_money[docId] = {}
+            dict_docid_role_money[docId]["sourceHtml"] = sourceText
+            dict_docid_role_money[docId]["roleMoney"] = []
+            for _value1 in list_value:
+                print(_value1)
+                if _value1[0]=="T":
+                    ID,VL,ENTITY = _value1.split("\t")
+                    dict_anno[ID] = {"type":VL.split()[0],"text":ENTITY}
+                if _value1[0]=="R":
+                    ID,VL = _value1.split("\t")
+                    if VL.split()[0]=="rel_tendererMoney":
+                        list_rel_anno.append([VL.split()[1].split(":")[1],VL.split()[2].split(":")[1]])
+            for item in list_rel_anno:
+                dict_docid_role_money[docId]["roleMoney"].append([dict_anno[item[0]]["type"],dict_anno[item[0]]["text"],dict_anno[item[1]]["text"]])
+
+    save(dict_docid_role_money,"dict_docid_role_money.pk")
+
+def test4(id,content):
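+    # Run the local extraction on one document and flatten each (role, money)
+    # pair into a "docid$entity$money" string for set comparison.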
+    import json
+    _result = json.loads(predict(id,content))
+    set_docid_role_money = set()
+    for _p in _result["prem"].keys():
+        for item in _result["prem"][_p]["roleList"]:
+            if len(item[0].split("_"))==2:
+                set_docid_role_money.add("%s$%s$%s"%(id,item[1],str(float(getUnifyMoney(str(item[2]))))))
+    return set_docid_role_money
+
+def getTendererMoney_extract_percent1():
+    '''
+    mongo:98/1101,0.089010
+    15:extract:526/all:1000=0.525995
+    extract:710/all:946=0.750521
+    :return:
+    '''
+    dict_docid_role_money = load("dict_docid_role_money.pk")
+    set_docid_role_money_iepy = set()
+    set_docid_role_money_interface = set()
+    _index = 0
+    for _key in dict_docid_role_money.keys():
+        _index += 1
+        print("%s===%d"%(_key,_index))
+        _size = 0
+        if len(dict_docid_role_money[_key]["roleMoney"])>0:
+            sourceHtml = dict_docid_role_money[_key]["sourceHtml"]
+            len_ipey = len(set_docid_role_money_iepy)
+            len_interface = len(set_docid_role_money_interface)
+            for item in dict_docid_role_money[_key]["roleMoney"]:
+                _v = "%s$%s$%s"%(_key,item[1],str(float(getUnifyMoney(item[2]))))
+                set_docid_role_money_iepy.add(_v)
+            _interface = test4(_key,sourceHtml)
+            if _interface is not None:
+                set_docid_role_money_interface = set_docid_role_money_interface | _interface
+            if len(set_docid_role_money_interface)-len_interface!=len(set_docid_role_money_iepy)-len_ipey:
+                print("diff-%s-%d-%d"%(_key,len(set_docid_role_money_interface)-len_interface,len(set_docid_role_money_iepy)-len_ipey))
+            print("extract:%d/all:%d=%f"%(len(set_docid_role_money_iepy&set_docid_role_money_interface),len(set_docid_role_money_iepy),len(set_docid_role_money_iepy&set_docid_role_money_interface)/(len(set_docid_role_money_iepy)+0.01)))
+    print("extract:%d/all:%d=%f"%(len(set_docid_role_money_iepy&set_docid_role_money_interface),len(set_docid_role_money_iepy),len(set_docid_role_money_iepy&set_docid_role_money_interface)/(len(set_docid_role_money_iepy)+0.01)))
+
+if __name__=="__main__":
+    # getTendererMoney_fromIEPY()
+    getTendererMoney_extract_percent1()
+    # dict_docid_role_money = load("dict_docid_role_money.pk")
+    # _set = set()
+    # for item in dict_docid_role_money["95001733"]["roleMoney"]:
+    #     _v = "%s$%s$%s"%("95004646",item[1],str(float(getUnifyMoney(item[2]))))
+    #     _set.add(_v)
+    # print(_set)
+    #     _v = "%s$%s$%s"%("95005274",item[1],str(float(getUnifyMoney(item[2]))))
+    # print(dict_docid_role_money["100001185"])
+    # print(getUnifyMoney(dict_docid_role_money["100001185"]["roleMoney"][0][2]))
+    # print(len(dict_docid_role_money.keys()))
+    # _count = 0
+    # for _key in dict_docid_role_money.keys():
+    #     if len(dict_docid_role_money[_key])>0:
+    #         _count+=1
+    # print(_count)
+
+
+
+
+

+ 67 - 0
BiddingKG/dl_dev/test/val_multi.py

@@ -0,0 +1,67 @@
+'''
+Created on 2019-11-11
+
+@author: User
+'''
+from BiddingKG.dl.common.Utils import *
+import json
+#from test4 import predict
+
+def val():
+    data = load("article_label_1000_muti.pk")
+    count = 0
+    for item in data:
+        count += 1
+        print("====",count)
+        item["predict"] = predict("12",item["content"])
+    save(data,"article_label_1000_muti_val.pk")
+    
+def compare():
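+    # Compare labelled vs. predicted (package, role) and (package, tendereeMoney)
+    # tuples and print precision/recall for both.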
+    data = load("article_label_1000_muti_val.pk")
+    same_tenderee_money = 0
+    notsame_tenderee_money_label = 0
+    notsame_tenderee_money_predict = 0
+    same_role = 0
+    notsame_role_label = 0
+    notsame_role_predict = 0
+    _index = 0
+    for item in data:
+        _index += 1
+        set_pack_role_label = set()
+        set_pack_role_predict = set()
+        set_pack_money_label = set()
+        set_pack_money_predict = set()
+        _label = json.loads(item["label"])
+        _predict = json.loads(item["predict"])
+        for _pack in _label["prem"].keys():
+            tenderee_money = getUnifyMoney(str(_label["prem"][_pack]["tendereeMoney"]))
+            if tenderee_money>0:
+                set_pack_money_label.add((_pack,tenderee_money))
+            for _role in _label["prem"][_pack]["roleList"]:
+                set_pack_role_label.add((_pack,_role[0],_role[1]))
+        for _pack in _predict["prem"].keys():
+            tenderee_money = getUnifyMoney(str(_predict["prem"][_pack]["tendereeMoney"]))
+            if tenderee_money>0:
+                set_pack_money_predict.add((_pack,tenderee_money))
+            for _role in _predict["prem"][_pack]["roleList"]:
+                set_pack_role_predict.add((_pack,_role[0],_role[1]))
+        same_tenderee_money += len(set_pack_money_label&set_pack_money_predict)
+        notsame_tenderee_money_label += len(set_pack_money_label)-len(set_pack_money_label&set_pack_money_predict)
+        notsame_tenderee_money_predict += len(set_pack_money_predict)-len(set_pack_money_label&set_pack_money_predict)
+        
+        same_role += len(set_pack_role_label&set_pack_role_predict)
+        notsame_role_label += len(set_pack_role_label)-len(set_pack_role_label&set_pack_role_predict)
+        notsame_role_predict += len(set_pack_role_predict)-len(set_pack_role_label&set_pack_role_predict)
+        if len(set_pack_money_label)-len(set_pack_money_label&set_pack_money_predict)>0:
+            print(item["filename"],_index,set_pack_money_label,"--",set_pack_money_predict)
+            print(item["label"])
+    print("pack-role precision:",same_role/(same_role+notsame_role_predict)," recall:",same_role/(same_role+notsame_role_label))
+    print("pack-tendereemoney precision:",same_tenderee_money/(same_tenderee_money+notsame_tenderee_money_predict)," recall:",same_tenderee_money/(same_tenderee_money+notsame_tenderee_money_label))
+        
+
+from tensorflow.examples.tutorials.mnist import input_data
+mnist = input_data.read_data_sets("data/", one_hot=True)
+if __name__=="__main__":
+    #val()
+    #compare()
+    print({"image":mnist.test.images[0],"keep_prob":1.0})

+ 362 - 0
BiddingKG/dl_dev/test/validation.py

@@ -0,0 +1,362 @@
+'''
+Created on 2019-05-15
+
+@author: User
+'''
+
+
+from bs4 import BeautifulSoup, Comment
+import copy
+import re
+import sys
+import os
+import codecs
+import requests
+import json
+sys.path.append(os.path.abspath("../.."))
+import fool
+from BiddingKG.dl.interface.Connection import *
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.interface.Connection import getConnection
+import BiddingKG.dl.interface.predictor as predictor
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+import BiddingKG.dl.interface.getAttributes as getAttributes
+
+
+def run_predict():
+    '''
+    data = load("val.pk")
+    print(data[0])
+    '''
+    data = load("label_0_1197.pk")
+    codeNamePredict = predictor.CodeNamePredict()
+    premPredict = predictor.PREMPredict()
+    epcPredict = predictor.EPCPredict()
+    roleRulePredict = predictor.RoleRulePredictor()
+    count = 0
+    not_find_count = 0
+    list_filename_index_notfound = []
+    for item in data:
+        count += 1
+        print(count,not_find_count,len(data))
+        list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[item["filename"],item["content"],"","",""]],useselffool=True)
+        codeName = codeNamePredict.predict(list_articles)
+        premPredict.predict(list_sentences,list_entitys)
+        roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+        
+        epcPredict.predict(list_sentences,list_entitys)
+        prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
+
+        print(prem)
+
+        label_prem = re.sub("[\r\s\n]","",item["prem"])
+        label_prem = re.sub("(","(",label_prem)
+        label_prem = re.sub(")",")",label_prem)
+        set_label_entitys = set()
+        for prem1 in label_prem.split(";"):
+            if len(prem1.split("-"))>2:
+                set_label_entitys.add(prem1.split("-")[2])
+        set_extract_entitys = set()
+        for list_entity in list_entitys:
+            for entity in list_entity:
+                if entity.entity_type in ["org","company"]:
+                    set_extract_entitys.add(entity.entity_text)
+        not_find_count += len(set_label_entitys-(set_label_entitys&set_extract_entitys))
+        print(item["filename"],set_label_entitys-(set_label_entitys&set_extract_entitys))
+        
+        list_filename_index_notfound.append([item["filename"],count,set_label_entitys-(set_label_entitys&set_extract_entitys)])
+        if len(codeName)>0:
+            item["predict_code"] = codeName[0][1]["code"]
+            item["predict_name"] = codeName[0][1]["name"]
+        else:
+            item["predict_code"] = ""
+            item["predict_name"] = ""
+        if len(prem)>0:
+            item["predict_prem"] = prem[0][1]["prem"]
+        else:
+            item["predict_prem"] = ""
+    for item in list_filename_index_notfound:
+        print(item)
+    save(data,"val_selffool7.pk")
+    
+def run_predict_interface():
+    '''
+    data = load("val.pk")
+    print(data[0])
+    '''
+    data = load("label_0_1197.pk")
+    # codeNamePredict = predictor.CodeNamePredict()
+    # premPredict = predictor.PREMPredict()
+    # epcPredict = predictor.EPCPredict()
+    count = 0
+    not_find_count = 0
+    
+    myheaders = {'Content-Type': 'application/json'}
+    guardian_base2 = 'http://192.168.2.101:15011'
+    
+    for item in data:
+        count += 1
+        
+        user = {
+        "content": item["content"],
+        "id":item["filename"]
+        }
+        _resp = requests.post(guardian_base2 + '/article_extract', json=user, headers=myheaders, verify=True)
+        resp_json = _resp.content.decode("utf-8")
+        obj_json = json.loads(resp_json)
+        prem = obj_json["prem"]
+        
+        print(count,not_find_count,len(data))
+        
+        item["predict_code"] = obj_json["code"]
+        item["predict_name"] = obj_json["name"]
+        item["predict_prem"] = prem
+    save(data,"val_interface.pk")
+    
+def getAccRecall():
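+    # Compare the interface predictions against the hand labels and print
+    # precision/recall per element (code, name, roles 0-4, money, person).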
+    # data = load("val_selffool7.pk")
+    data = load("val_interface.pk")
+    data_label = load("label_0_1197.pk")
+    roles = [0,1,2,3,4,"code","name","money","person"]
+    models = ["","role"]
+    type = ["coincidence","label","predict"]
+    count_dict = {}
+    
+    list_not_true = []
+    index = 0
+    for t in type:
+        for m in models:
+            for role in roles:
+                count_dict[t+"_"+str(m)+"_"+str(role)] = 0
+    list_filename_not_label_predict = []
+    for item,item_label in zip(data,data_label):
+        index += 1
+        label_code = re.sub("[\r\n\s]","",item_label["code"])
+        predict_code = ";".join(item["predict_code"])
+        label_name = re.sub("[\r\n\s]","",item_label["name"])
+        predict_name = item["predict_name"]
+        label_prem = re.sub("[\r\n\s]","",item_label["prem"])
+        label_prem = re.sub("(","(",label_prem)
+        label_prem = re.sub(")",")",label_prem)
+        predict_prem = item["predict_prem"]
+
+        # print("===",item)
+
+        count_not_true = 0
+        #count code
+        set_label_code = set([a for a in re.split("[;;]",label_code) if a!='' and a!='1'])
+        set_predict_code = set([a for a in re.split("[;;]",predict_code) if a!=''])
+        count_dict["coincidence__code"] += len(set_label_code&set_predict_code)
+        count_dict["label__code"] += len(set_label_code-(set_label_code&set_predict_code))
+        count_dict["predict__code"]+= len(set_predict_code-(set_label_code&set_predict_code))        
+                 
+
+        # new count name
+        set_label_name = set([a for a in re.split("[;;]",label_name) if a!='' and a!='1'])
+        #set_predict_name = set([a for a in re.split("[;;]",predict_name) if a!=''])  # single project name
+        set_predict_name = set(predict_name)  # multiple project names
+        if len(set_label_name&set_predict_name) > 0:
+            count_dict["coincidence__name"] += 1
+        elif len(set_predict_name-(set_label_name&set_predict_name)) >0:
+            count_dict["predict__name"]+= 1
+        elif len(set_label_name-(set_label_name&set_predict_name)) > 0:
+            count_dict["label__name"] += 1
+        
+        #count role、money、person
+        role_id = {0:"tenderee",
+                   1:"agency",
+                   2:"win_tenderer",
+                   3:"second_tenderer",
+                   4:"third_tenderer"}
+        
+        filename_not_label_predict = [item["filename"],set(),set()]
+
+        #get not true roles of each article
+        not_true_roles = [item["filename"],set(),set()]
+        predict_roles = set()
+        label_roles = set()
+        # for _pack in predict_prem.keys():
+        #     for prem1 in predict_prem[_pack]["roleList"]:
+        #         predict_roles.add(prem1[0]+prem1[1])
+
+        for item2 in predict_prem:
+            predict_roles.add(item2[2]+item2[3])
+
+        for item1 in label_prem.split(";"):
+            prem1 = item1.split("-")
+            if len(prem1)>1:
+                label_roles.add(role_id[int(prem1[1])]+prem1[2])
+        not_true_roles[1] = label_roles-(label_roles&predict_roles)
+        not_true_roles[2] = predict_roles-(predict_roles&label_roles)
+        if len(not_true_roles[1])>0 or len(not_true_roles[2])>0:
+            print(not_true_roles)
+
+        for role in [0,1,2,3,4]:
+            temp_set = set()
+            temp_set2 = set()
+            same_package_count = 0
+            package_set = set()
+
+            # for _pack in predict_prem.keys():
+            #     for prem1 in predict_prem[_pack]["roleList"]:
+            #         if prem1[0]==role_id[role]:
+            #             temp_set.add((prem1[1]))
+
+            for prem1 in predict_prem:
+                if prem1[2]==role_id[role]:
+                    packageName = prem1[0]
+                    #temp_set.add((packageName,prem1[3]))
+                    temp_set.add((prem1[3]))
+            for item1 in label_prem.split(";"):
+                prem1 = item1.split("-")
+                if len(prem1)>1 and str(prem1[1]).strip()==str(role):
+                    #print(prem1)
+                    packageName = "Project" if prem1[0]=="" else prem1[0]
+                    if packageName in package_set:
+                        same_package_count += 1
+                    package_set.add(packageName)
+                    #temp_set2.add((packageName,prem1[2]))
+                    temp_set2.add((prem1[2]))
+                    
+            _coincidence = temp_set&temp_set2
+            _label = temp_set2-(temp_set&temp_set2)
+            _predict = temp_set-(temp_set&temp_set2)
+            
+            for item1 in list(_label):
+                filename_not_label_predict[1].add((role,item1))
+            for item1 in list(_predict):
+                filename_not_label_predict[2].add((role,item1))
+            
+            count_dict["coincidence_role_"+str(role)] += len(temp_set&temp_set2)
+            count_dict["label_role_"+str(role)] += len(temp_set2-(temp_set&temp_set2))-same_package_count
+            count_dict["predict_role_"+str(role)] += len(temp_set-(temp_set&temp_set2))
+            
+            count_not_true += len(temp_set2-(temp_set&temp_set2))
+        #count package_role_entity_money_people
+        #list_not_true.append([item["filename"],count_not_true,index,label_prem,predict_prem])
+        
+        list_filename_not_label_predict.append(filename_not_label_predict)
+        #count money
+        temp_set = set()
+        temp_set2 = set()
+        same_entity_money = 0
+        # for _pack in predict_prem.keys():
+        #     for prem1 in predict_prem[_pack]["roleList"]:
+        #         money = prem1[2]
+        #         if str(money)!="0":
+        #             temp_set.add((prem1[1],getUnifyMoney(money)))
+        for prem1 in predict_prem:
+            money = prem1[4]
+            if str(money)!="0":
+                temp_set.add((prem1[3],getUnifyMoney(str(money))))
+                # temp_set.add((getUnifyMoney(str(money))))
+        for item1 in label_prem.split(";"):
+            prem1 = item1.split("-")
+            if len(prem1)>1:
+                for m in prem1[3].split("、"):
+                    if m!="":
+                        same_entity_money += 1
+                        temp_set2.add((prem1[2],getUnifyMoney(m)))
+                        # temp_set2.add((getUnifyMoney(m)))
+        if same_entity_money>0:
+            same_entity_money -= 1
+        count_dict["coincidence__money"] += len(temp_set&temp_set2)
+        count_dict["label__money"] += len(temp_set2-(temp_set&temp_set2))-same_entity_money
+        count_dict["predict__money"] += len(temp_set-(temp_set&temp_set2))
+
+        print("money_notfound",item["filename"],temp_set2-(temp_set&temp_set2))
+        print("money_foundError",item["filename"],temp_set-(temp_set&temp_set2))
+        
+        #count person
+        temp_set = set()
+        temp_set2 = set()
+        # for _pack in predict_prem.keys():
+        #     for prem1 in predict_prem[_pack]["roleList"]:
+        #         person = prem1[3]
+        #         for p in person:
+        #             temp_set.add((prem1[1],p[0]))
+        for prem1 in predict_prem:
+            person = prem1[5]
+            for p in person:
+                temp_set.add((prem1[3],p[0]))
+        for item1 in label_prem.split(";"):
+            prem1 = item1.split("-")
+            if len(prem1)>4:
+                person = prem1[4]
+                for p in person.split("|"):
+                    if p.strip()!="/" and p.strip()!="":
+                        temp_set2.add((prem1[2],p.split("/")[0]))
+        count_dict["coincidence__person"] += len(temp_set&temp_set2)
+        count_dict["label__person"] += len(temp_set2-(temp_set&temp_set2))
+        count_dict["predict__person"] += len(temp_set-(temp_set&temp_set2))
+        #count_not_true = len(temp_set2-(temp_set&temp_set2))
+        #list_not_true.append([item["filename"],count_not_true,index,label_prem,predict_prem])
+        
+        
+        
+    list_not_true.sort(key=lambda x:x[1],reverse=True)
+    for item in list_not_true:
+        if item[1]>0:
+            print(item)
+    count_list = []
+    for key in count_dict.keys():    
+        if count_dict[key]>0:
+            count_list.append([key,count_dict[key]])
+    count_list.sort(key = lambda x:x[0])
+    for item in count_list:
+        print(item) 
+    count_m = ["role","code","name","money","person"]
+    count_roles = ["",0,1,2,3,4]
+    def get_value(_list,find_list):
+        count = 0
+        for item in _list:
+            find_flag = True
+            for key in find_list:
+                if str(item[0]).find(key)<0:
+                    find_flag = False
+            if find_flag:
+                count += item[1]
+        return count
+                    
+    for m in count_m:
+        for roles in count_roles:
+            concidence = get_value(count_list,["coincidence",str(m),str(roles)])
+            label = get_value(count_list,["label",str(m),str(roles)])
+            predict = get_value(count_list,["predict",str(m),str(roles)])
+            if 0 not in [predict+concidence,label+concidence]:
+                print(m,roles,concidence,label,predict,"acc",concidence/(predict+concidence),"recall",concidence/(label+concidence))
+    save(list_filename_not_label_predict,"list_filename_not_label_predict_3.pk")
+    
+def compare():
+    data = load("list_filename_not_label_predict_3.pk")
+    data1 = load("list_filename_not_label_predict_2.pk")
+    for item,item_1 in zip(data,data1):
+        print("==",item)
+        print("--",item_1)
+        label_compare = item[1]-item_1[1]
+        predict_compare = item[2]-item_1[2]
+        if len(label_compare)>0 or len(predict_compare)>0:
+            print(item[0],label_compare,predict_compare)
+            
+def findmultipack():
+    '''
+    @summary: find the labelled documents that contain more than one bid section (package)
+    '''
+    data = load("label_0_1197.pk")
+    for item_label in data:
+        label_prem = re.sub("[\r\n\s]","",item_label["prem"])
+        label_prem = re.sub("(","(",label_prem)
+        label_prem = re.sub(")",")",label_prem)
+        set_pack = set()
+        for item1 in label_prem.split(";"):
+            prem1 = item1.split("-")
+            set_pack.add(prem1[0])
+        if len(set_pack)>1:
+            print(item_label["filename"])
+    
+if __name__=="__main__":
+    # run_predict()
+    run_predict_interface()
+    getAccRecall()
+    # compare()
+    #findmultipack()

BIN
BiddingKG/dl_dev/test/vocab_word.pk


+ 304 - 0
BiddingKG/dl_dev/test/测试所有提取信息.py

@@ -0,0 +1,304 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/11 0011 13:52 
+
+'''
+Created on 2019-01-04
+
+@author: User
+'''
+
+from bs4 import BeautifulSoup, Comment
+import copy
+import re
+import sys
+import os
+import codecs
+import requests
+import time
+
+_time1 = time.time()
+sys.path.append(os.path.abspath("../.."))
+import fool
+from BiddingKG.dl.interface.Connection import *
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.interface.Connection import getConnection
+import BiddingKG.dl.interface.predictor as predictor
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+import BiddingKG.dl.interface.getAttributes as getAttributes
+import BiddingKG.dl.entityLink.entityLink as entityLink
+import BiddingKG.dl.complaint.punish_predictor as punish_predictor
+# import BiddingKG.dl.complaint.punish_rule as punish_predictor
+import BiddingKG.dl.channel.channel_predictor as channel_predictor
+import json
+
+'''
+doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
+
+conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
+
+cursor = conn.cursor()
+
+cursor.execute(" select content from articles where id='"+doc_id+"' ")
+
+row = cursor.fetchall()[0]
+
+
+#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
+
+#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
+'''
+
+''''''
+codeNamePredict = predictor.CodeNamePredict()
+premPredict = predictor.PREMPredict()
+epcPredict = predictor.EPCPredict()
+# roleRulePredict = predictor.RoleRulePredictor()
+timePredict = predictor.TimePredictor()
+# punish = punish_rule.Punish_Extract()
+punish = punish_predictor.Punish_Extract()
+productPredict = predictor.ProductPredictor()
+channelPredict = channel_predictor.DocChannel()
+
+# custom JSON encoder for numpy arrays, numpy floats and bytes
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32,
+                              np.float64)):
+            return float(obj)
+        elif isinstance(obj, str):
+            return obj
+        return json.JSONEncoder.default(self, obj)
+
+
+def predict(doc_id, text, title=""):
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", title]],
+                                                                                    useselffool=True)
+    for articles in list_articles:
+        print(articles.content)
+
+    ''''''
+
+    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
+    print(codeName)
+    premPredict.predict(list_sentences, list_entitys)
+    # roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    print("epcPredict")
+    epcPredict.predict(list_sentences, list_entitys)
+    print("entityLink")
+    timePredict.predict(list_sentences, list_entitys)
+    print("timePredict")
+    entityLink.link_entitys(list_entitys)
+    print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
+    print("getPREMs")
+    list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
+    product = productPredict.predict(list_sentences,list_entitys)
+    channel = channelPredict.predict(title, list_sentences[0])
+
+    total_tendereeMoney_list = []
+    for entity in list_entitys[0]:
+        if entity.notes == '总投资':
+            total_tendereeMoney_list.append(entity.entity_text)
+    total_tendereeMoney = max(total_tendereeMoney_list) if len(total_tendereeMoney_list) >= 1 else 0
+
+    for entitys in list_entitys:
+        for entity in entitys:
+            print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
+                  entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end,entity.sentence_index)
+    # print(prem)
+    # return json.dumps(Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product)[0],
+    #                   cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
+    # return json.dumps(Preprocessing.union_result(
+    #     Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product), channel)[0],
+    #                   cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
+    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(
+        Preprocessing.union_result(
+            Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic), product), [{'total_tendereeMoney':total_tendereeMoney}]
+    ),
+        channel),
+            cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
+
+
+def predict_back(doc_id, html):
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, html, "", "", ""]],
+                                                                                    useselffool=True)
+    for articles in list_articles:
+        print(articles.content)
+
+    ''''''
+
+    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)  # predict project code and name
+    print(codeName)
+    premPredict.predict(list_sentences, list_entitys)  # role / money model
+    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)  # role rules
+    print("epcPredict")
+    epcPredict.predict(list_sentences, list_entitys)  # contact-person model
+    print("entityLink")
+    timePredict.predict(list_sentences, list_entitys)  # time-category model
+    print("timePredict")
+    entityLink.link_entitys(list_entitys)
+    print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)  # find bid sections and link them to the other elements
+    print("getPREMs")
+    # punish_dic = punish.get_punish_extracts(list_sentences, list_entitys, title=title, text=list_articles[0].content)
+    list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
+    # punish_dic = punish.get_punish_extracts(list_articles,list_sentences, list_entitys)
+    # print(punish_dic)
+    # prem[0][1]['punish'] = punish_dic
+
+    # bidway = []  # bidding method
+    # moneySource = []  # source of funds
+    # servicetime = []  # service period
+    # time_release = []  # release time
+    # time_bidopen = []  # bid opening time
+    # time_bidclose = []  # bid closing time
+    # for entity in list_entitys[0]:
+    #     if entity.entity_type == 'bidway':
+    #         bidway.append(entity.entity_text)
+    #     elif entity.entity_type == 'moneySource':
+    #         moneySource.append(entity.entity_text)
+    #     elif entity.entity_type == 'servicetime':
+    #         servicetime.append(entity.entity_text)
+    #     elif entity.entity_type == 'time' and entity.label == 1:
+    #         time_release.append(entity.entity_text)
+    #     elif entity.entity_type == 'time' and entity.label == 2:
+    #         time_bidopen.append(entity.entity_text)
+    #     elif entity.entity_type == 'time' and entity.label == 3:
+    #         time_bidclose.append(entity.entity_text)
+    #
+    # prem[0][1]['bidway'] = ';'.join(set(bidway))
+    # prem[0][1]['moneySource'] = ';'.join(set(moneySource))
+    # prem[0][1]['servicetime'] = ';'.join(set(servicetime))
+    # prem[0][1]['time_release'] = ';'.join(set(time_release))
+    # prem[0][1]['time_bidopen'] = ';'.join(set(time_bidopen))
+    # prem[0][1]['time_bidclose'] = ';'.join(set(time_bidclose))
+    #
+    # ''''''
+    #
+    # for entitys in list_entitys:
+    #     for entity in entitys:
+    #         print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
+    #               entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end)
+    #
+    # print(prem)
+    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic)[0],
+               cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
+
+    # return json.dumps(Preprocessing.union_result(codeName, prem)[0][1], cls=MyEncoder, sort_keys=True, indent=4,
+    #                   ensure_ascii=False)
+
+
+def test(name, content):
+    user = {
+        "content": content,
+        "id": name
+    }
+    myheaders = {'Content-Type': 'application/json'}
+    _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
+    resp_json = _resp.content.decode("utf-8")
+    print(resp_json)
+    return resp_json
+
+
+if __name__ == "__main__":
+    # from tablestore import *
+    # endpoint = 'https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com'
+    # access_key_id = 'LTAI4GJxbioV1y2WM3XcZTmP'
+    # access_key_secret = 'B3VITMoCnKtTQE6eAkDkat6UNFHped'
+    # instance_name = 'bxkc-ots'
+    # ots_client = OTSClient(endpoint, access_key_id, access_key_secret, instance_name)
+    #
+    # def get_data(query, max_rows, table_name='document',
+    #              index_name='document_index',
+    #              column_names=['docid', 'dochtmlcon','doctitle', 'info_type', 'page_time'],
+    #              sorters=[FieldSort("page_time", SortOrder.DESC), FieldSort("docid", SortOrder.DESC)]):
+    #     '''
+    #     从阿里云ots查询数据
+    #     :param query: 查询命令
+    #     :param max_rows: 最大返回多少数据
+    #     :param table_name: 表名
+    #     :param index_name: 表索引名
+    #     :param column_names: 返回字段名
+    #     :param sorters: 排序规则列表
+    #     :return: 处理后的数据列表
+    #     '''
+    #     next_token = None
+    #     data = []
+    #     all_rows = []
+    #     rows, next_token, total_count, is_all_succeed = \
+    #         ots_client.search(table_name,
+    #                           index_name,
+    #                           SearchQuery(query,
+    #                                       next_token=next_token,
+    #                                       sort=Sort(sorters=sorters),  # ASC升序
+    #                                       limit=100,
+    #                                       get_total_count=True),
+    #                           ColumnsToGet(column_names=column_names,
+    #                                        return_type=ColumnReturnType.SPECIFIED))
+    #     all_rows.extend(rows)
+    #     while next_token:
+    #         rows, next_token, total_count, is_all_succeed = \
+    #             ots_client.search(table_name,
+    #                               index_name,
+    #                               SearchQuery(query,
+    #                                           next_token=next_token,
+    #                                           sort=None,
+    #                                           limit=100,
+    #                                           get_total_count=True),
+    #                               ColumnsToGet(column_names=column_names,
+    #                                            return_type=ColumnReturnType.SPECIFIED))
+    #         all_rows.extend(rows)
+    #         if len(all_rows) > max_rows:
+    #             print('已获取%d条数据' % len(all_rows))
+    #             break
+    #
+    #     if all_rows:
+    #         for row in all_rows:
+    #             tmp = []
+    #             tmp.append(row[0][1][1])
+    #             for tup in row[1]:
+    #                 tmp.append(tup[1])
+    #             data.append(tmp)
+    #     return data
+    #
+    #
+    # bool_query = TermQuery('docid','124113339')
+    # # bool_query = BoolQuery(
+    # #     must_queries=[TermsQuery(field_name='info_type', column_values=['办公设备', '计算机设备']),
+    # #                   RangeQuery('page_time', range_from='2020-11-01', range_to='2020-11-31')]
+    # # )
+    #
+    # data = get_data(bool_query, 1)
+    # print(data)
+    # docid = str(data[0][0])
+    # html = data[0][1]
+    # title = data[0][2]
+    # text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
+    # 投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
+    # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
+    # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
+    docid = ""
+    # title = '招标公告'
+    # html = '招标人:广州市人民医院。代理人:广州医疗代理服务公司。招标金额:3000元,总投资:5万元。中标人:比地科技有限公司,中标金额:1万元。'
+    html = """, [ 正在 公告 ] 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) , 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) 采购 结果 公告 , 项目 名称 , 公司 2020 - 2021 年度 打印 制作 服务 项目 编号 , 20200803030110070001 采购 组织 人 , 中 节能 建筑 节能 有限公司 河南 分公司 采购 方式 , 谈判 采购 成交 信息 , 序号 , 标段 ( 包 ) 编号 , 标段 ( 包 ) 名称 , 成交 供应商 , 成交 金额 20200803030110070001001 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) 郑州市 上街区 永达 文印部 null 元 公告 起 止 时间 2021年 04月 14日 - 2021年 04月 17日 ,
+"""
+    title = """[ 正在 公告 ] 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) ,
+"""
+    html = html.replace(' ', '')
+    title = title.replace(' ', '')
+    # html = '首都医科大学附属北京地坛医院1.5T核磁共振、16排CT和血管造影机维保服务医疗设备维修和保养服务采购项目政府采购中标候选人公示,中标人:广州比地科技有限公司,中标金额:6000万元'
+    # html = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+
+    a = time.time()
+    print("start")
+    # print(predict('12',text))
+    print(predict(docid, html,title=""))
+    # test("12",text)
+    print("takes", time.time() - a)
+    pass

+ 477 - 0
BiddingKG/dl_dev/test/测试整个要素提取流程.py

@@ -0,0 +1,477 @@
+'''
+Created on 2019-01-04
+
+@author: User
+'''
+
+from bs4 import BeautifulSoup, Comment
+import copy
+import re
+import sys
+import os
+import codecs
+import requests
+import time
+
+_time1 = time.time()
+sys.path.append(os.path.abspath("../.."))
+sys.path.append(os.path.abspath('../../'))
+print('当前路径为:',os.getcwd())
+print('sys.path',sys.path)
+import fool
+from BiddingKG.dl.interface.Connection import *
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.interface.Connection import getConnection
+import BiddingKG.dl.interface.predictor as predictor
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+import BiddingKG.dl.interface.getAttributes as getAttributes
+import BiddingKG.dl.entityLink.entityLink as entityLink
+import json
+
+
+'''
+doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
+
+conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
+
+cursor = conn.cursor()
+
+cursor.execute(" select content from articles where id='"+doc_id+"' ")
+
+row = cursor.fetchall()[0]
+
+
+#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
+
+#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
+'''
+
+'''''' 
+codeNamePredict = predictor.CodeNamePredict()
+premPredict = predictor.PREMPredict()
+epcPredict = predictor.EPCPredict()
+roleRulePredict = predictor.RoleRulePredictor()
+timePredictor = predictor.TimePredictor()
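+# The predictors above are instantiated once at module load so repeated calls to predict()
+# below reuse the same loaded models (project code/name, role & money, contact person,
+# rule-based roles and time classification).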
+
+# custom JSON encoder
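+# numpy arrays/floats and bytes objects in the extraction results are not JSON serializable
+# by default, so json.dumps below is called with cls=MyEncoder.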
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32, 
+        np.float64)):
+            return float(obj)
+        elif isinstance(obj,str):
+            return obj
+        return json.JSONEncoder.default(self, obj)
+
+
+def predict(doc_id,text):
+    list_articles,list_sentences,list_entitys,list_outlines,_ = Preprocessing.get_preprocessed([[doc_id,text,"","","","",""]],useselffool=True)
+    for articles in list_articles:
+        print('preprocessed article content:')
+        print(articles.content)
+    # for sentences in list_sentences:
+    #     for sentence in sentences:
+    #         # print(sentence.sentence_index,sentence.tokens)
+    #         print(sentence.sentence_index,sentence.in_attachment,sentence.tokens)
+    print("location:",[ent.entity_text for ent in list_entitys[0] if ent.entity_type=='location'])
+
+
+    ''''''
+        
+    codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
+    print('codeName',codeName)
+    premPredict.predict(list_sentences,list_entitys)
+    # for entitys in list_entitys:
+    #     for entity in entitys:
+    #         if entity.entity_type in ['org','company']:
+    #             print("公司->联系人2:", end=' ')
+    #             print(entity.entity_text, entity.pointer_person,entity.label,entity.values)
+    #             pass
+    roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_sentences,list_entitys, codeName)
+    predictor.getPredictor("tendereeRuleRecall").predict(list_articles,list_sentences,list_entitys, codeName)
+    # print("epcPredict")
+    epcPredict.predict(list_sentences,list_entitys)
+
+    timePredictor.predict(list_sentences,list_entitys)
+    # print("entityLink")
+    entityLink.link_entitys(list_entitys)
+    # print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines)
+    # print("getPREMs")
+    print("公司——联系人:", end=' ')
+    print(prem[0])
+    # print(prem[0]['prem']['Project']['roleList'])
+
+    
+    ''''''
+    
+    entitys_all = [[[entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index] for entity in entitys] for entitys in list_entitys]
+    # for entitys in entitys_all:
+        # print(entitys)
+        # en_types = set([it[1] for it in entitys])
+        # print([(it[0],it[1], it[2],it[3][it[2]],it[4],it[5],it[6]) for it in entitys if it[1] in ('org', 'company', 'person')])
+        # print([it for it in entitys if it[1] in ('org','company','person')])
+        # for en_type in en_types:
+        #     print('***************************************')
+        #     print(en_type)
+        #     print([(it[0],it[2],it[3]) for it in entitys if it[1]==en_type])
+    for entitys in list_entitys:
+        entitys = sorted(entitys, key=lambda x: (x.sentence_index, x.wordOffset_begin))
+        for entity in entitys:
+            # print('********** entity info ****************')
+            if entity.entity_type=='person':
+                print("contact - phone:", end=' ')
+                print(entity.entity_text,[i.entity_text for i in entity.person_phone] if entity.person_phone else [],entity.label,entity.values)
+                if entity.pointer_email:
+                    print("contact - email:", entity.entity_text, entity.pointer_email.entity_text)
+                # print(entity.begin_index, entity.end_index)
+                print(entity.sentence_index)
+                pass
+            # elif entity.entity_type=="time":
+            #     print("time:",end=" ")
+            #     print(entity.entity_text, entity.label, entity.values)
+            # elif entity.entity_type=="email":
+            #     print("email:",end=" ")
+            #     print(entity.entity_text, entity.begin_index, entity.end_index)
+            elif entity.entity_type in ['org','company']:
+                _sentence = list_sentences[0][entity.sentence_index]
+                print(entity.entity_type)
+                if entity.pointer_person:
+                    print("公司->联系人1:",end=' ')
+                    print(entity.entity_text,[i.entity_text for i in entity.pointer_person],entity.label,entity.values)
+                    # print(_sentence.tokens[entity.begin_index:entity.end_index+3])
+                    # print(entity.entity_text,entity.label,entity.values)
+                    # print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
+                else:
+                    print("公司->联系人2:", end=' ')
+                    print(entity.entity_text, entity.pointer_person,entity.label,entity.values)
+                    print(_sentence.tokens[entity.begin_index:entity.end_index+3])
+                    # print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
+                    pass
+                if entity.label in [2,3,4]:
+                    if entity.pointer_money:
+                        print("公司->中投标金额:", end=' ')
+                        print(entity.entity_text, entity.pointer_money.entity_text)
+                    if entity.pointer_serviceTime:
+                        print("公司->工期:", end=' ')
+                        print(entity.entity_text, entity.pointer_serviceTime.entity_text)
+                    if entity.pointer_ratio:
+                        print("公司->费率:", end=' ')
+                        print(entity.entity_text, entity.pointer_ratio.entity_text)
+                # print(entity.pointer_pack)
+            # elif entity.entity_type =='serviceTime':
+            #     print(entity.entity_text)
+            #     if entity.pointer_pack:
+            #         print('pointer_pack_name:',entity.pointer_pack.entity_text)
+            # elif entity.entity_type =='money':
+            #     print('money',entity.entity_text,entity.label,entity.money_unit,entity.notes)
+            # elif entity.entity_type =='phone':
+            #     print('phone',entity.entity_text)
+            # elif entity.entity_type =='name':
+            #     print('pj_name',entity.entity_text,entity.sentence_index,entity.begin_index)
+            # elif entity.entity_type in ['package']:
+            #     print('pack_entity:',entity.entity_text)
+            # print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)
+
+    #print(prem)
+    # return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+    return json.dumps(prem[0],cls=MyEncoder,sort_keys=True,indent=1,ensure_ascii=False)
+
+         
+# def test(name,content):
+#     user = {
+#             "content": content,
+#             "id":name
+#             }
+#     myheaders = {'Content-Type': 'application/json'}
+#     _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
+#     resp_json = _resp.content.decode("utf-8")
+#     print(resp_json)
+#     return resp_json
+def get_result_online(docid):
+    import psycopg2
+    conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
+    cursor = conn.cursor()
+    sql = """select human_identifier,sourcetext from corpus_iedocument where human_identifier in ('{0}');""".format(docid)
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    user = {
+            "content": rows[0][1],
+            "id":docid
+            }
+    myheaders = {'Content-Type': 'application/json'}
+    _resp = requests.post("http://192.168.2.101:15030" + '/article_extract', json=user, headers=myheaders, verify=True)  # 15015  #最新模型15030
+    resp_json = _resp.content.decode("utf-8")
+    return json.loads(resp_json)
+
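+# get_result runs the extraction pipeline locally through predict(), whereas get_result_online
+# above posts the same document to the deployed service, so the two outputs can be compared.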
+def get_result(docid):
+    import psycopg2
+    conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
+    cursor = conn.cursor()
+    sql = """select human_identifier,sourcetext from corpus_iedocument where human_identifier in ('{0}');""".format(docid)
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    return json.loads(predict(docid, rows[0][1]))
+
+def analys_person_phone():
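+    """Evaluate role-contact and contact-phone linking against the hand-labeled Excel file
+    loaded below: for each doc_id the pairs returned by get_result() are compared with the
+    labeled rel_person / rel_phone relations, recall/precision are printed, and the collected
+    error cases are returned for inspection."""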
+    import pandas as pd
+    import time
+    t1 = time.time()
+    df = pd.read_excel(r'E:\workspace\BiddingKG\BiddingKG\dl\person\实习生标注信息角色联系人电话.xlsx', encoding='utf-8')
+    lab_num = pos_num = pre_num = 0
+    lab_num2 = pos_num2 = pre_num2 = 0
+    lab_person = pos_person = pre_person = 0
+    lab_role = pos_role = pre_role = 0
+    person_errors = []
+    phone_errors = []
+    join_errors = []
+    person_name_errors =[]
+    role_name_errors =[]
+    for docid in set(df['doc_id']):
+        print('processing: ', docid)
+        df_tmp = df[df.loc[:, 'doc_id'] == docid]
+        values = list(df_tmp['value'])
+        a = [it.split() for it in values]
+        rel_person = [it for it in a if it[1] == 'rel_person']
+        rel_phone = [it for it in a if it[1] == 'rel_phone']
+        r1 = get_result(str(docid))
+        # r1 = get_result_online(str(docid))
+        label_role_person = []  # labeled role + contact pairs
+        for rel in rel_person:
+            role = [it for it in a if it[0] == rel[2].split(':')[-1]]
+            person = [it for it in a if it[0] == rel[3].split(':')[-1]]
+            if person != [] and role != []:
+                label_role_person.append(role[0][-1] +'+'+ person[0][-1])
+        label_person_phone = []  # labeled contact + phone pairs
+        for rel in rel_phone:
+            person = [it for it in a if it[0] == rel[2].split(':')[-1]]
+            phone = [it for it in a if it[0] == rel[3].split(':')[-1]]
+            if person != [] and phone != []:
+                label_person_phone.append(person[0][-1] +'+'+ phone[0][-1])
+        role_person = []
+        person_phone = []
+        if r1.get('success','')==False:
+            print(docid, 'extraction API returned failure')
+        else:
+            for v in r1['prem'].values():
+                roleList = v['roleList']
+                for role in roleList:
+                    for it in role[3]:
+                        role_person.append(role[1] +'+'+ it[0])
+                for role in roleList:
+                    for it in role[3]:
+                        person_phone.append(it[0] +'+'+ it[1])
+                    # print(set(label_person_phone))
+            # print(set(person_phone))
+        pos_num += len(set(role_person) & set(label_role_person))
+        lab_num += len(set(label_role_person))
+        pre_num += len(set(role_person))
+        if set(role_person)&set(label_role_person) != set(label_role_person):
+            person_errors.append([docid, set(label_role_person), set(role_person)])
+            # Logic for checking role-contact correctness: 1) check whether every predicted role appears in the labeled roles, 2) check whether the predicted contacts appear in the labeled contacts.
+            # print(set(role_person))
+            # print(set(label_role_person))
+        if set(label_person_phone) & set(person_phone)!=set(label_person_phone):
+            phone_errors.append([docid, set(label_person_phone), set(person_phone)])
+        pos_num2 += len(set(label_person_phone) & set(person_phone))
+        lab_num2 += len(set(label_person_phone))
+        pre_num2 += len(set(person_phone))
+
+        lab_person += len(set([it.split('+')[1] for it in label_role_person]))
+        pos_person += len(set([it.split('+')[1] for it in label_role_person])&set([it.split('+')[1] for it in role_person]))
+        pre_person += len(set([it.split('+')[1] for it in role_person]))
+
+        lab_role += len(set([it.split('+')[0] for it in label_role_person]))
+        pos_role += len(set([it.split('+')[0] for it in label_role_person])&set([it.split('+')[0] for it in role_person]))
+        pre_role += len(set([it.split('+')[0] for it in role_person]))
+
+        if set([it.split('+')[0] for it in label_role_person]) != set([it.split('+')[0] for it in role_person]):
+            if set([it.split('+')[1] for it in label_role_person]) != set([it.split('+')[1] for it in role_person]):
+                person_name_errors.append([docid,set(label_role_person), set(role_person)])
+            else:
+                role_name_errors.append([docid, set(label_role_person), set(role_person)])
+        else:
+            if set([it.split('+')[1] for it in label_role_person]) != set([it.split('+')[1] for it in role_person]):
+                person_name_errors.append([docid, set(label_role_person), set(role_person)])
+            elif set(label_role_person)!= set(role_person):
+                print(docid,set(label_role_person), set(role_person))
+                join_errors.append([docid,set(label_role_person), set(role_person)])
+    print('role-only recall: %.4f, precision: %.4f' % (pos_role/lab_role, pos_role/pre_role))
+    print('contact-only recall: %.4f, precision: %.4f' % (pos_person/lab_person, pos_person/pre_person))
+    print('role-contact link recall: %.4f, precision: %.4f' % (pos_num / lab_num, pos_num / pre_num))
+    print('contact-phone link recall: %.4f, precision: %.4f' % (pos_num2 / lab_num2, pos_num2 / pre_num2))
+    print('total time:', time.time()-t1)
+    return person_errors, phone_errors, join_errors, role_name_errors, person_name_errors
+
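+# predict_fromdb fetches a document from the corpus database by id and runs the main pipeline
+# steps (code/name, role-money, role rules, contacts, entity linking, prem), returning the
+# intermediate lists so they can be inspected.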
+def predict_fromdb(docid, dbname="sys_document_23"):
+    # import pymysql
+    # conn = pymysql.Connect(host='rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com', port=3306, db='bxkc', user='bxkc_read', passwd='bxkc_20RE18AD') #新账号密码
+    # cursor = conn.cursor()
+    # sql = "SELECT  docid as id, dochtmlcon as content  from {1} WHERE DOCID='{0}';".format(docid, dbname)
+    import psycopg2
+    conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
+    cursor = conn.cursor()
+    sql = """select human_identifier as id,sourcetext as content from corpus_iedocument where human_identifier in ('{0}');""".format(docid)
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    doc_id = rows[0][0]
+    text = rows[0][1]
+    # text = '竟然很明显的表达没识别为代理,代理机构名称:国信国采(北京)招标咨询有限责任公司,代理机构地址:北京市海淀区首体南路22号国兴大厦11层,  1.采购人信息名 称:北京市植物园。'
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],useselffool=True)
+    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
+    print('codeName',codeName)
+    premPredict.predict(list_sentences, list_entitys)
+    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
+    # print("epcPredict")
+    epcPredict.predict(list_sentences, list_entitys)
+    # print("entityLink")
+    entityLink.link_entitys(list_entitys)
+    # print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
+    return list_articles, list_sentences, list_entitys, codeName, prem
+
+if __name__=="__main__":
+    # import pandas as pd
+    # import math
+    # import pymysql
+    # conn = pymysql.Connect(host='rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com', port=3306, db='bxkc', user='bxkc_read', passwd='bxkc_20RE18AD') #新账号密码
+    # cursor = conn.cursor()
+    # df = pd.read_excel('G:/大网站规则识别/1027统计入库top100编号.xlsx')
+    # docs_list = []
+    # for i in range(100):
+    #     web_no = df.loc[i, '编号']
+    #     # num = math.ceil(int(df.loc[i, '1019-1023入库公告数量']) * 0.01)
+    #     num = 10
+    #     sql = "SELECT DOCID,DOCCHANNEL,DOCHTMLCON,WEB_SOURCE_NO from sys_document_23 where WEB_SOURCE_NO='{0}' and DOCCHANNEL='101' and DOCID%9=1 limit {1}".format(
+    #         web_no, num)
+    #     #  rows = cursor.execute(sql) 此处代码错误 rows 需要用 cursor.fetchall方法获取
+    #     cursor.execute(sql)
+    #     rows = cursor.fetchall()
+    #     docs_list.extend(list(rows))
+    # df_doc = pd.DataFrame(docs_list, columns=['docid', 'channel', 'html', 'web_no'])
+    # codenames = []
+    # prems = []
+    # for docid,text in zip(df_doc['docid'], df_doc['html']):
+    #     list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[docid, text, "", "", ""]],
+    #                                                                                     useselffool=True)
+    #     codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
+    #     # print(codeName)
+    #     premPredict.predict(list_sentences, list_entitys)
+    #     roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
+    #     # print("epcPredict")
+    #     epcPredict.predict(list_sentences, list_entitys)
+    #     # print("entityLink")
+    #     entityLink.link_entitys(list_entitys)
+    #     # print("getPREMs")
+    #     prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
+    #     if codeName:
+    #         codenames.append(codeName[0][1])
+    #     else:
+    #         codenames.append(" ")
+    #     if prem:
+    #         prems.append(prem[0][1])
+    #     else:
+    #         prems.append(" ")
+    # df_doc['codename'] = pd.Series(codenames)
+    # df_doc['prem'] = pd.Series(prems)
+    # df_doc.to_excel('G:/大网站规则识别/大网站规则调整后预测结果20201124.xlsx', columns=['docid', 'channel', 'html', 'prem', 'codename', 'web_no'])
+
+
+    # list_articles, list_sentences, list_entitys, codeName, prem = predict_fromdb('82862794',dbname="sys_document_25")  #sys_document_23
+    # print(prem)
+    # print(codeName)
+    # entitys_all = [[[entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index] for entity in entitys] for entitys in list_entitys]
+    # for entitys in entitys_all:
+    #     # print(entitys)
+    #     # en_types = set([it[1] for it in entitys])
+    #     print([(it[0],it[1], it[2],it[3][it[2]],it[4],it[5],it[6]) for it in entitys if it[1] in ('org', 'company', 'person')])
+    # print(list_articles[0].content)
+
+    # print(get_result('100000203'))
+
+    # person_errors, phone_errors, join_errors, role_name_errors, person_name_errors = analys_person_phone()
+    # import pickle
+    # with open('phone_errors.pkl','wb') as f:
+    #     pickle.dump(phone_errors, f)
+
+    # filename = "比地_52_79929693.html"
+    # #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
+    # # text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
+    # # text = codecs.open('F:/工作文档/实体识别实体对其/20190320/比地_101_58511386.html', encoding='utf-8').read()
+    # docid = '100000203'
+    # r1 = get_result(docid)
+    # r2 = get_result_online(docid)
+    # rolperson = []
+    # person_phone = []
+    # for v in r1['prem'].values():
+    #     roleList = v['roleList']
+    #     for role in roleList:
+    #         for it in role[3]:
+    #             rolperson.append(role[1] + it[0])
+    #     for role in roleList:
+    #         for it in role[3]:
+    #             person_phone.append(it[0]+it[1])
+    # print(r1['prem'])
+    # print(r2['prem'])
+    #
+    # import psycopg2
+    # conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
+    # cursor = conn.cursor()
+    # sql = """select human_identifier,sourcetext from corpus_iedocument where human_identifier in ('95008163');"""
+    # cursor.execute(sql)
+    # rows = cursor.fetchall()
+    # # print(len(rows), rows)
+    # content = rows[0][1]
+    # content = str(BeautifulSoup(text).find("div",id="pcontent"))
+    # # content = text
+    # # print('content: ',content)
+    # #text = codecs.open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8").read()
+    # text = '''
+    # 招标人:常州市金坛区直溪镇迪庄村村民委员会,地址:直溪镇迪庄村,电话:18798916198电话:17798916098,
+    # '''
+    text = '''
+招标人:湖南财信金融控股集团有限公司,招标代理机构:湖南省招标有限责任公司。联系人:曾君联系人:夏女士,电话:0731-84556896电话:0731-85196661
+    '''
+    # text=''',东莞市粤隆招标有限公司受东莞市道滘镇教育管理中心的委托,于2020年05月20日就2020年道滘镇公办中小学增加多媒体教学平台
+    # 采购项目(441900-10-202003-1606000137-0006)采用公开招标进行采购。现就本次采购的中标(成交)结果公告如下:
+    # 一、采购项目编号:441900-10-202003-1606000137-0006二、采购项目名称:2020年道滘镇公办中小学增加多媒体教学平台采购项目三、
+    # 采购项目预算金额(元):3,505,265.00四、采购方式:公开招标五、中标供应商,1:A包中标供应商名称东莞市靖恩智能科技有限公司法人代表王地址
+    # 东莞市高埗镇高埗大道93号301房,六、报价明细,备注。主要中标、成交标的名称:2020年道滘镇公办中小学增加多媒体教学平台采购项目,
+    # 中标供应商:东莞市靖恩智能科技有限公司,包编号:A,规格型号:\,数量:1,预算(元):3,505,265.00,服务要求:详见招标文件,
+    # 中标、成交金额(元):3,478,120.00。报价明细附件七、评审日期:2020-05-20评审地点:广东省东莞市道滘镇道厚路创新岛2号四楼,评审委员会
+    # (谈判小组、询价小组、磋商小组或单一来源采购小组):负责人:廖云峰,成员:叶润宁、莫立基、李松涛和吴树根,八、本项目代理收费标准:国家发展
+    # 和改革委员会“发改价格[2011]534号文”及国家发展计划委员会“计价格[2002]1980号文”相关规定,收费金额:人民币肆万贰仟贰佰伍拾玖元整(¥42,259.00元)九、
+    # 评审意见(非标采购方式或竞争性磋商采购方式采用书面推荐供应商参加采购活动的,还应当公告采购人和评审专家的推荐意见),经评标委员会一致决定,
+    # 推荐2020年道滘镇公办中小学增加多媒体教学平台采购项目为第一中标候选人,十、本公告期限1个工作日。十一、联系事项:
+    # (一),采购项目联系人(代理机构):翟先生,联系电话:0769-88839111,采购项目联系人(采购人):李先生,联系电话:0769-81332303。
+    # (二),采购代理机构:东莞市粤隆招标有限公司,地址:东莞市南城街道体育路2号鸿禧中心B902号,联系人:翟先生,联系电话:0769-88839111,
+    # 传真:0769-81216222,邮编:523000。(三),采购人:东莞市道滘镇教育管理中心,地址:广东省东莞市道滘镇花园街1号,联系人:李先生,联系电话:0769-81332303,传真:/,邮编:523000,各有关当事人对中标、成交结果有异议的,可以在中标、成交公告发布之日起7个工作日内以书面形式向(政府采购代理机构)(或采购人)提出质疑,逾期将依法不予受理,'''
+    text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt", "r", encoding="utf8").read()
+    content = str(BeautifulSoup(text).find("div", id="pcontent"))
+    # print("tableToText:",tableToText(BeautifulSoup(re.sub("<html>|</html>|<body>|</body>","",content),"lxml")))
+#     text = '''
+# 采购代理机构:山东立行建设项目管理有限公司地址:山东省临沂市兰山县(区)柳青广州路与蒙河路交汇大官苑社区西沿街A区三楼南侧号,联系方式:17862288900,
+# '''
+#     text = '''
+# (二),采购人信息,名称:四川省绵阳市平武县卫生健康局,地址:东莞市南城街道体育路2号鸿禧中心B902号,(三),采购代理机构信息,名称:四川省蜀汉硕创工程项目管理有限公司,地址:体育路2号鸿禧中心B,联系人:翟先生,联系电话:0769-88839111,
+# 采购项目,联系电话:0769-81332303/0769-81332903,联系人姓名和电话:李先生/廖云峰\叶润宁,
+# '''
+    a = time.time()
+    print("start")
+    # print(predict("12",content))
+    print(predict("12",text))
+    # result = predict("12",text)
+    # result = predict("12",content)
+    # print(json.loads(result))
+    #test("12",text)
+    print("takes",time.time()-a)
+    _time2 = time.time()
+    # print(predict("12",content))
+    _time3 = time.time()
+    print("init takes:%d"%((_time2-_time1)-(_time3-_time2)))
+    pass