luojiehua 2 vuotta sitten
vanhempi
commit
28145f4370

+ 0 - 15
.idea/codeStyles/Project.xml

@@ -1,15 +0,0 @@
-<component name="ProjectCodeStyleConfiguration">
-  <code_scheme name="Project" version="173">
-    <codeStyleSettings language="JAVA">
-      <indentOptions>
-        <option name="TAB_SIZE" value="2" />
-      </indentOptions>
-    </codeStyleSettings>
-    <codeStyleSettings language="Python">
-      <indentOptions>
-        <option name="TAB_SIZE" value="2" />
-        <option name="SMART_TABS" value="true" />
-      </indentOptions>
-    </codeStyleSettings>
-  </code_scheme>
-</component>

+ 45 - 11
.idea/dataSources.xml

@@ -7,30 +7,64 @@
       <jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
       <jdbc-url>jdbc:mysql://rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com:3306/bxkc</jdbc-url>
     </data-source>
-    <data-source source="LOCAL" name="bxkc@47.98.60.3" uuid="55d18506-80f5-4270-88d3-bac5db11eb93">
-      <driver-ref>mongo</driver-ref>
-      <synchronize>true</synchronize>
-      <jdbc-driver>com.dbschema.MongoJdbcDriver</jdbc-driver>
-      <jdbc-url>mongodb://47.98.60.3:17017/bxkc</jdbc-url>
-    </data-source>
     <data-source source="LOCAL" name="oracle 生产库" uuid="d47b5be6-14c1-4fd3-b108-75141d1efa10">
-      <driver-ref>oracle</driver-ref>
+      <driver-ref>oracle.19</driver-ref>
       <synchronize>true</synchronize>
       <auto-commit>false</auto-commit>
       <jdbc-driver>oracle.jdbc.OracleDriver</jdbc-driver>
       <jdbc-url>jdbc:oracle:thin:@121.46.18.113:10522:yanphone</jdbc-url>
     </data-source>
-    <data-source source="LOCAL" name="@192.168.2.101" uuid="8ea2c81f-5933-4c9e-b7d4-c10c033d46a2">
+    <data-source source="LOCAL" name="mysql_测试" uuid="19d8fc36-28e0-4de8-bed4-c356c7fd53cb">
+      <driver-ref>mysql.8</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
+      <jdbc-url>jdbc:mysql://192.168.2.170:3306/exportDB?useSSL=no</jdbc-url>
+    </data-source>
+    <data-source source="LOCAL" name="@192.168.2.42" uuid="af0b13f8-830b-4f12-a8e5-48c41c3512ab">
+      <driver-ref>oracle.19</driver-ref>
+      <synchronize>true</synchronize>
+      <auto-commit>false</auto-commit>
+      <jdbc-driver>oracle.jdbc.OracleDriver</jdbc-driver>
+      <jdbc-url>jdbc:oracle:thin:@192.168.2.42:1521:orcl</jdbc-url>
+    </data-source>
+    <data-source source="LOCAL" name="线上阿里云读写d" uuid="4c612c50-dae3-4d00-b623-3146b4a15a26">
+      <driver-ref>mysql.8</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
+      <jdbc-url>jdbc:mysql://rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com:3306/bxkc</jdbc-url>
+    </data-source>
+    <data-source source="LOCAL" name="postgres@192.168.2.103" uuid="e7c7ef97-e360-48d6-a6f9-006b9c13ff95">
       <driver-ref>postgresql</driver-ref>
       <synchronize>true</synchronize>
       <jdbc-driver>org.postgresql.Driver</jdbc-driver>
-      <jdbc-url>jdbc:postgresql://192.168.2.101:5432/</jdbc-url>
+      <jdbc-url>jdbc:postgresql://192.168.2.103:5432/postgres</jdbc-url>
     </data-source>
-    <data-source source="LOCAL" name="mysql_测试" uuid="19d8fc36-28e0-4de8-bed4-c356c7fd53cb">
+    <data-source source="LOCAL" name="附件表@121.46.18.113" check-outdated="false" uuid="990bfd71-c638-4a74-ac81-47140f47999e">
+      <driver-ref>postgresql</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>org.postgresql.Driver</jdbc-driver>
+      <jdbc-url>jdbc:postgresql://121.46.18.113:5432/postgres</jdbc-url>
+    </data-source>
+    <data-source source="LOCAL" name="@192.168.2.53" uuid="add07915-872f-47e2-8b86-2de756bb3c84">
+      <driver-ref>oracle.19</driver-ref>
+      <synchronize>true</synchronize>
+      <auto-commit>false</auto-commit>
+      <jdbc-driver>oracle.jdbc.OracleDriver</jdbc-driver>
+      <jdbc-url>jdbc:oracle:thin:@192.168.2.53:1521:orcl</jdbc-url>
+    </data-source>
+    <data-source source="LOCAL" name="procurement@116.62.141.83" uuid="8ccde941-d655-416c-9e8b-baaa3aa98c66">
       <driver-ref>mysql.8</driver-ref>
       <synchronize>true</synchronize>
       <jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
-      <jdbc-url>jdbc:mysql://192.168.2.170:3306</jdbc-url>
+      <jdbc-url>jdbc:mysql://116.62.141.83:3306/procurement</jdbc-url>
+      <working-dir>$ProjectFileDir$</working-dir>
+    </data-source>
+    <data-source source="LOCAL" name="postgres@116.62.141.83" uuid="bfe4378e-557f-4d2e-a255-1076b461c52b">
+      <driver-ref>postgresql</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>org.postgresql.Driver</jdbc-driver>
+      <jdbc-url>jdbc:postgresql://116.62.141.83:5432/postgres</jdbc-url>
+      <working-dir>$ProjectFileDir$</working-dir>
     </data-source>
   </component>
 </project>

+ 4 - 0
.idea/encodings.xml

@@ -10,6 +10,10 @@
     <file url="file://$PROJECT_DIR$/data/exportFind_tenderee.csv" charset="GBK" />
     <file url="file://$PROJECT_DIR$/data/exportFind_tenderee1.csv" charset="GBK" />
     <file url="file://$PROJECT_DIR$/data/服务型客户.txt" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/dataSource/ossUtils.py" charset="UTF-8" />
+    <file url="file://$PROJECT_DIR$/test/industry_keyword_expand.py" charset="UTF-8" />
+    <file url="file://$PROJECT_DIR$/test/拓展关键词.xlsx" charset="UTF-8" />
+    <file url="file://$PROJECT_DIR$/utils/ERNIE_utils.py" charset="UTF-8" />
     <file url="PROJECT" charset="GBK" />
   </component>
 </project>

+ 2 - 0
.idea/sqldialects.xml

@@ -1,6 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="SqlDialectMappings">
+    <file url="file://$PROJECT_DIR$/ddl" dialect="MySQL" />
+    <file url="file://$PROJECT_DIR$/ddl_postgres" dialect="PostgreSQL" />
     <file url="file://G:/要素提取标注备份/iepy_public_auth_group.sql" dialect="PostgreSQL" />
   </component>
 </project>

+ 1 - 1
DataMining.iml

@@ -3,7 +3,7 @@
   <component name="NewModuleRootManager" inherit-compiler-output="true">
     <exclude-output />
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="inheritedJdk" />
+    <orderEntry type="jdk" jdkName="Python 3.5 (dl_nlp)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>

+ 68 - 0
dataSource/ossUtils.py

@@ -0,0 +1,68 @@
+
+import oss2
+import traceback
+from utils.Utils import log
+import time
+import os
+
+def does_bucket_exist(bucket):
+    try:
+        bucket.get_bucket_info()
+    except oss2.exceptions.NoSuchBucket:
+        return False
+    except:
+        raise
+    return True
+
+import hashlib
+def getMDFFromFile(path):
+    _length = 0
+    try:
+        _md5 = hashlib.md5()
+        with open(path,"rb") as f:
+            while True:
+                data = f.read(4096)
+                if not data:
+                    break
+                _length += len(data)
+                _md5.update(data)
+        return _md5.hexdigest(),_length
+    except Exception as e:
+        traceback.print_exc()
+        return None,_length
+
+def uploadFileByPath(bucket,filepath,uploadpath,headers=None):
+    try:
+        start_time = time.time()
+        log("uploading file of %s"%filepath)
+        with open(filepath,"rb") as f:
+            bucket.put_object(uploadpath,f,headers=headers)
+        log("upload file of %s takes %ds"%(filepath,time.time()-start_time))
+        return True
+    except Exception as e:
+        traceback.print_exc()
+        log("upload object failed of %s"%(filepath))
+        return False
+
+def deleteObject(bucket,objectName):
+    try:
+        bucket.delete_object(objectName)
+    except Exception as e:
+        log("delete object failed of %s"%objectName)
+
+def downloadFile(bucket,objectPath,localPath):
+    try:
+        # bucket.get_object_to_file(objectPath, localPath)
+        oss2.resumable_download(bucket, objectPath, localPath,
+                                store=oss2.ResumableDownloadStore(root=os.path.join(os.path.dirname(__file__),"/../tmp")),
+                                multiget_threshold=200*1024,
+                                part_size=200*1024,
+                                num_threads=5)
+        return True
+    except Exception as e:
+        log("download object failed of %s"%str(objectPath))
+        return False
+
+
+if __name__=="__main__":
+    print(getMDFFromFile('1578298842064.doc'))

+ 15 - 2
dataSource/setttings.py

@@ -12,7 +12,9 @@ solr_collections = {"document":"http://47.97.221.63:8983/solr/",#文档
 mysql_host = "rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com"
 mysql_port = 3306
 mysql_user = "bxkc_read"
-mysql_pass = "bxkc_20RE18AD"
+mysql_pass = "UAokyI#^KaaYh0%p"
+# mysql_user = "bxkc"
+# mysql_pass = "cX98ODck&j!E$fNU"
 mysql_db = "bxkc"
 
 test_mysql_host = "192.168.2.170"
@@ -29,6 +31,9 @@ mongo_pass = "BidiReadOnly2017"
 
 elasticSearch_url = "http://47.97.210.202:9200/_search"
 
+ES_HOST = "https://es-cn-tl32rqgql0002lvcu.kibana.elasticsearch.aliyuncs.com:9200/"
+ES_USER = "elastic"
+ES_PASSWORD = "WWBu9#1HWHo$$gJm"
 # neo4j_host = "47.98.60.3"
 neo4j_host = "118.31.10.60"
 neo4j_port = 7687
@@ -38,5 +43,13 @@ neo4j_pass = "bxkc_web"
 oracle_host = "121.46.18.113"
 oracle_port = 10522
 oracle_user = "bxkc_data_readonly"
-oracle_pass = "P7WUrgcz0@#j8pjg"
+oracle_pass = "GG&B$Sa>*=B_YhF8"
+# oracle_user = "bxkc_data"
+# oracle_pass = "Z0rTLHo@nIu5Zk1Z"
 oracle_db = "yanphone"
+
+# ots_AccessKeyId = 'LTAI4G2bwraGDYQ4S5hhCxht'
+# ots_AccessKeySecret = 'k6Llfa0S1KuvYyU2cWchExdQjPGJOY'
+
+ots_AccessKeyId = 'LTAI5tHoEUDSy6FnZjMKsNiZ'
+ots_AccessKeySecret = '1S9VJYRhCjDZt3bLiR0y2ZKbsF5wxP'

+ 80 - 6
dataSource/source.py

@@ -7,6 +7,8 @@ import pymysql
 import pymongo
 from py2neo import Graph,NodeMatcher
 import tablestore
+import psycopg2
+from elasticsearch import Elasticsearch
 
 
 
@@ -40,6 +42,10 @@ def getConnection_testmysql(db=None):
     connect = pymysql.Connect(host=test_mysql_host, port=test_mysql_port, db=db, user=test_mysql_user, passwd=test_mysql_pass)
     return connect
 
+def getConnection_postgres(db):
+    conn = psycopg2.connect(dbname=db,user="postgres",password="postgres",host="192.168.2.103")
+    return conn
+
 def getConnection_oracle():
     import cx_Oracle
     connect = cx_Oracle.connect(oracle_user,oracle_pass,'%s:%s/%s'%(oracle_host,oracle_port,oracle_db), encoding = "UTF-8", nencoding = "UTF-8")
@@ -52,8 +58,9 @@ def getConnect_mongodb():
     db.authenticate(mongo_user,mongo_pass)
     return db
 
-def make_elasticSearch(query):
-    resp = requests.post(elasticSearch_url,json=query)
+def make_elasticSearch(elasticSearch_url,query,auth):
+    resp = requests.post(elasticSearch_url,json=query,auth=auth)
+    print(resp.status_code)
     if resp.status_code==200:
         return json.loads(resp.content.decode())
     return None
@@ -65,10 +72,47 @@ def getConnect_neo4j():
     # print(json.loads(json.dumps(finded.data())))
     # print(finded)
 
+import platform
+
+import os
+def check_net(testserver):
+    try:
+        response = os.system("ping -c 1 " + testserver)
+        if response == 0:
+            return True
+    except:
+        return False
+    return False
+
+import platform
+print(platform.system())
+if platform.system()=="Windows":
+    OTS_URL = "https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com"
+    OTS_URL = "https://bxkc-ots.cn-hangzhou.vpc.tablestore.aliyuncs.com"
+else:
+    OTS_URL = "https://bxkc-ots.cn-hangzhou.vpc.tablestore.aliyuncs.com"
+    check_url = "oss-cn-hangzhou-internal.aliyuncs.com"
+    is_internal = True
+    if not check_net(check_url):
+        is_internal = False
+        OTS_URL = "https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com"
+
 def getConnect_ots():
-    ots_client = tablestore.client.OTSClient('https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com', 'LTAI4FyUT7ZcQFZPjVtw5y9b', '2zscfFTvy3JWavtCeCOthLxF8bDNH3',
+    # ots_client = tablestore.client.OTSClient(OTS_URL, 'LTAI4FyUT7ZcQFZPjVtw5y9b', '2zscfFTvy3JWavtCeCOthLxF8bDNH3',
+    #                                          'bxkc-ots', logger_name = 'table_store.log',
+    #                                          retry_policy = tablestore.WriteRetryPolicy(),socket_timeout=3000)
+    ots_client = tablestore.client.OTSClient(OTS_URL, ots_AccessKeyId, ots_AccessKeySecret,
                                              'bxkc-ots', logger_name = 'table_store.log',
-                                             retry_policy = tablestore.WriteRetryPolicy())
+                                             retry_policy = tablestore.WriteRetryPolicy(),socket_timeout=3000)
+    return ots_client
+
+def getConnect_capacity():
+    # ots_client = tablestore.client.OTSClient(OTS_URL, 'LTAI4FyUT7ZcQFZPjVtw5y9b', '2zscfFTvy3JWavtCeCOthLxF8bDNH3',
+    #                                          'bxkc-ots', logger_name = 'table_store.log',
+    #                                          retry_policy = tablestore.WriteRetryPolicy(),socket_timeout=3000)
+    ots_client = tablestore.client.OTSClient(OTS_URL, ots_AccessKeyId, ots_AccessKeySecret,
+                                             'bxkc-capacity', logger_name = 'table_store.log',
+                                             retry_policy = tablestore.WriteRetryPolicy(),socket_timeout=3000)
     return ots_client
 
 def getConnect_gdb():
@@ -79,7 +123,20 @@ def getConnect_gdb():
         for item in result:
             print(item.id)
     return client
+import oss2
+def getAuth():
+    auth = oss2.Auth(ots_AccessKeyId, ots_AccessKeySecret)
+    return auth
 
+import os
+def check_net(testserver):
+    try:
+        response = os.system("ping -c 1 " + testserver)
+        if response == 0:
+            return True
+    except:
+        return False
+    return False
 
 if __name__=="__main__":
     # solrQuery("document",{"q":"*:*"})
@@ -89,6 +146,23 @@ if __name__=="__main__":
     # data = make_elasticSearch({"query":{"bool":{"must":[{"wildcard":{"nicknames.keyword":"*服装*"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}})
     # print(data)
     # getConnect_neo4j()
-    conn = getConnection_oracle()
+    # conn = getConnection_oracle()
+
     # cursor = conn.cursor()
-    # getConnect_gdb()
+    # getConnect_gdb()
+    es = Elasticsearch(hosts=ES_HOST,http_auth=(ES_USER, ES_PASSWORD))
+    _q = '''{
+          "track_total_hits": true,
+          "size":1,
+          "query": {
+              "bool": {
+                  "must":[
+                      {"terms":{"contact_phone_type":[1,2]}},
+                      {"match_phrase":{"city":"深圳"}}
+                  ]
+              }
+          }
+          }'''
+    print(es)
+    print(es.info())
+    # print(es.search(body=_q,index="enterprise"))

BIN
export/docchannel.pk


+ 148 - 0
export/exportAttachment.py

@@ -0,0 +1,148 @@
+#encoding:GBK
+import sys
+import os
+sys.path.append("../")
+
+import pandas as pd
+from dataSource.source import *
+import json
+from utils.multiThread import MultiThreadHandler
+import queue
+from utils.Utils import *
+from dataSource.pool import ConnectorPool
+import re
+from tablestore import *
+import traceback
+from utils.hashUtil import aesCipher
+from export.exportEnterprise import getDictEnterprise,getOneContact
+
+
+
+def exportAttachment():
+    ots_client = getConnect_ots()
+    columns = ["path","swfUrls","crtime","link_status","size"]
+    bool_query = BoolQuery(must_queries=[
+        RangeQuery('crtime','2022-09-18 00:00:00')
+        # TermQuery("filetype","swf"),
+        # TermQuery("status",10),
+        # BoolQuery(must_not_queries=[RangeQuery("link_status",1)])
+                                         ]
+                           )
+    rows,next_token,total_count,is_all_succeed = ots_client.search("attachment","attachment_index",
+                                                                   SearchQuery(bool_query,sort=Sort([FieldSort("crtime",SortOrder.ASC)]),limit=100,get_total_count=True),
+                                                                   columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
+
+    df_data = {"filemd5":[],"path":[],"crtime":[],"size":[]}
+    set_columns = set()
+    list_dict = getRow_ots(rows)
+
+    for _dict in list_dict:
+        for k,v in df_data.items():
+            v.append(_dict.get(k))
+
+    _count = len(list_dict)
+    while True:
+        if not next_token:
+            break
+        rows,next_token,total_count,is_all_succeed = ots_client.search("attachment","attachment_index",
+                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                       columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
+        list_dict = getRow_ots(rows)
+        _count += len(list_dict)
+        print("%d/%d"%(_count,total_count))
+        for _dict in list_dict:
+            for k,v in df_data.items():
+                v.append(_dict.get(k))
+        # if _count>=10000:
+        #     break
+
+    log("================%d"%(len(df_data["path"])))
+    for i in range(len(df_data["path"])):
+        for k in df_data.keys():
+            print("%s %s"%(df_data["filemd5"][i],df_data["path"][i]))
+
+
+
+    df = pd.DataFrame(df_data)
+    df.to_excel("../data/%s_attach_export.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
+
+import oss2
+common_bucket = None
+def getBucket():
+    global common_bucket
+    if common_bucket is None:
+        auth = getAuth()
+        check_url = "oss-cn-hangzhou-internal.aliyuncs.com"
+        if check_net(check_url):
+            bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
+        else:
+            bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
+        attachment_hub_url = "https://attachment-hub.oss-cn-hangzhou.aliyuncs.com/"
+        log("bucket_url:%s"%(bucket_url))
+        attachment_bucket_name = "attachment-hub"
+        common_bucket = oss2.Bucket(auth,bucket_url,attachment_bucket_name)
+    return common_bucket
+
+from dataSource.ossUtils import *
+def downloadAtta():
+
+    filename = "½ðÈÚ»ú¹¹Êý¾Ýµ¼³ö.xlsx"
+    df = pd.read_excel(filename)
+    list_docid = list(set(df["docid"]))
+    ots_client = getConnect_ots()
+
+    bucket = getBucket()
+    task_queue = queue.Queue()
+    for _docid in list_docid:
+        task_queue.put(_docid)
+
+    def _handle(item,result_queue,ots_client):
+        consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",int(item%500+1)),("docid",int(item))],["page_attachments"])
+        _dict = getRow_ots_primary(return_row)
+        _page_attachments = json.loads(_dict.get("page_attachments","[]"))
+        _index = 0
+        for _pa in _page_attachments:
+            _index = 1
+            _filemd5 = _pa.get("fileMd5")
+
+            consumed, return_row, next_token = ots_client.get_row("attachment",[("filemd5",_filemd5)],["path","filetype"])
+            _dict = getRow_ots_primary(return_row)
+            _path = _dict.get("path")
+            _filetype = _dict.get("filetype")
+            localpath = "attach/%d_%d.%s"%(item,_index,_filetype)
+            if os.path.exists(localpath):
+                continue
+
+            downloadFile(bucket,_path,localpath)
+    mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=ots_client)
+    mt.run()
+
+
+
+def analisysFile():
+    filename = "ap1.log"
+    dict_minute = {}
+    _count = 0
+    with open(filename,"r") as f:
+        while True:
+            _line = f.readline()
+            if not _line:
+                break
+            _count += 1
+            if re.search("swf of docid",_line) is not None:
+                _minute = _line[:16]
+                if _minute not in dict_minute:
+                    dict_minute[_minute] = 0
+                dict_minute[_minute] += 1
+    log("all line counts:%d"%_count)
+    keys = list(dict_minute.keys())
+    keys.sort(key=lambda x:x)
+    for k in keys:
+        print(k,dict_minute[k])
+
+
+
+if __name__=="__main__":
+    exportAttachment()
+    # downloadAtta()
+    # analisysFile()

+ 338 - 0
export/exportData.py

@@ -0,0 +1,338 @@
+#coding:utf8
+from dataSource.source import getConnection_testmysql,getConnection_postgres
+from utils.Utils import save,getCurrent_date
+
+
+def exportMergeTrainData():
+    conn = getConnection_testmysql()
+
+    cursor = conn.cursor()
+
+    for _type in ["is null","is not null"]:
+        for i in range(20):
+
+            _limit = 1000000
+            _offset = i*_limit
+
+            sql = "select docid_less,docid_greater,json_matrix,prob from merge_probability_pairs_featurematrix_train where prob %s limit %d offset %d"%(_type,_limit,_offset)
+
+            cursor.execute(sql)
+
+            row_name = cursor.description
+            print(row_name)
+            rows = cursor.fetchall()
+
+            list_data = []
+            for row in rows:
+                _dict = dict()
+                for _n,_v in zip(row_name,row):
+                    _dict[_n[0]] = _v
+                list_data.append(_dict)
+            save(list_data,"../data/%s-mergeTrain_%s_part%d.pk"%(getCurrent_date("%Y-%m-%d"),_type.replace(" ",""),i))
+
+
+import pandas as pd
+import re
+
+def labelTime():
+    filename = "../data/延期数据.xlsx"
+    df = pd.read_excel(filename)
+    columns = ["docid","entity_text","label","sentence_left","sentence_right","context_left","context_right","new_label"]
+    df_data = {}
+    for _c in columns:
+        df_data[_c] = df[_c]
+    append_columns = ["reg_count"]
+    columns.extend(append_columns)
+    for _c in append_columns:
+        df_data[_c] = []
+    for _left,_new_label,_text in zip(df["sentence_left"],df["new_label"],df["entity_text"]):
+        _line = _left[-7:]
+        if str(_new_label)!='nan' and _new_label is not None and _new_label!="":
+            df_data["reg_count"].append("-1")
+        else:
+            # if re.search(".*月.*日",_text) is not None or re.search("^\d+\-\d+\-",_text) is not None:
+            #     df_data["reg_count"].append("1")
+            # else:
+            #     df_data["reg_count"].append("0")
+            if re.search("截止(时间|日期):$",_line) is not None:
+                df_data["reg_count"].append("1")
+            else:
+                df_data["reg_count"].append("0")
+    new_df = pd.DataFrame(df_data)
+    new_df.to_excel(filename,columns=columns)
+
+import time
+def getAttachmentProcessTime():
+
+    list_line = []
+    with open("flow_attachment.log","r",encoding="utf") as f:
+        list_line = f.readlines()
+    # _regrex = "filemd5:(?P<filemd5>[a-z0-9]+) of type:(?P<filetype>[a-z0-9]+) with size:(?P<filesize>\d+(\.\d+)?)M download:(?P<dowload>\d+(\.\d+)?)s,recognize takes (?P<recognize>\d+(\.\d+)?)s,ret_size:(?P<ret_size>\d+(\.\d+)?)"
+    # _regrex = "process filemd5:(?P<filemd5>[a-z0-9]+) of type:(?P<filetype>[a-z0-9]+) with size:(?P<filesize>\d+(\.\d+)?)M download:(?P<dowload>\d+(\.\d+)?)s recognize takes (?P<recognize>\d+(\.\d+)?)s,ret_size:(?P<ret_size>\d+(\.\d+)?)"
+    # # _regrex = "filemd5:(?P<filemd5>[a-z0-9]+) of type:(?P<filetype>[a-z0-9]+) download:(?P<dowload>\d+(\.\d+)?)s recognize:(?P<recognize>\d+(\.\d+)?)s result:True rec_size:(?P<ret_size>\d+(\.\d+)?)"
+    # keys = ["filemd5","filetype","filesize","dowload","recognize","ret_size"]
+
+    _regrex = "(?P<process_time>.+),\d+ - root - INFO - thread status alive:(?P<alive_count>\d+) restart:0 total:40"
+
+    keys = ["process_time","alive_count"]
+    df_data = {}
+    for k in keys:
+        df_data[k] = []
+    for _line in list_line:
+        _search = re.search(_regrex,_line)
+        if _search is not None:
+            gd = _search.groupdict()
+            for k in keys:
+                df_data[k].append(gd.get(k,""))
+
+    _sum1 = 0
+    _sum2 = 0
+    last_time = None
+    for _time,_count in zip(df_data["process_time"],df_data["alive_count"]):
+        if last_time is None:
+            last_time = time.mktime(time.strptime(_time,"%Y-%m-%d %H:%M:%S"))
+
+        current_time = time.mktime(time.strptime(_time,"%Y-%m-%d %H:%M:%S"))
+        if int(_count)!=40:
+            _sum1 += (40-int(_count))*(current_time-last_time)
+        else:
+            _sum2 += 40*(current_time-last_time)
+
+        last_time = current_time
+    print("sum1",_sum1)
+    print("sum2",_sum2)
+    df = pd.DataFrame(df_data)
+    df.to_excel("attachmentProcessTime3.xlsx",columns=keys)
+
+_exportID = 0
+def getExportID():
+    global _exportID
+    _exportID += 1
+    return _exportID
+
+def getTypeName():
+    _typestr = '''
+    code | 项目编号
+    name | 项目名称
+    org | 组织
+    company | 公司
+    job | 职业
+    person | 人名
+    time | 时间
+    location | 地址
+    package | 包号
+    phone | 电话
+    money | 金额
+    money_tendereeMoney | 招标金额
+    money_tendererMoney | 中投标金额
+    
+    org_tenderee | 招标人
+    org_agency | 代理人
+    org_tenderer | 中标人
+    org_secondTenderer | 第二候选人
+    org_thirdTenderer | 第三候选人
+    company_tenderee | 招标人
+    company_agency | 代理人
+    company_tenderer | 中标人
+    company_secondTenderer | 第二候选人
+    company_thirdTenderer | 第三候选人
+    
+    person_tendereePerson | 招标联系人
+    person_agencyPerson | 代理联系人
+    person_person | 联系人
+    
+    rel_tendererMoney | 中投标金额
+    rel_tendereeMoney | 招标金额
+    rel_person | 联系人
+    rel_pack | 所属包
+    rel_address | 地址
+    rel_phone | 联系电话
+    rel_pack_code | 包件编号
+    rel_pack_name | 包件名称
+    
+    person_review | 评审专家
+    time_release | 发布时间
+    time_bidopen | 开标时间
+    time_bidclose | 截标时间
+    moneysource | 资金来源
+    bidway | 招标方式
+    serviceTime | 服务期限
+    product | 产品
+    abandon_reason | 失败原因
+    '''
+    dict_type_name = {}
+    for _s_n in _typestr.split("\n"):
+        _s_n = _s_n.strip()
+        if _s_n=="":
+            continue
+        _s_t = _s_n.split("|")
+        if len(_s_t) ==2:
+            _type = _s_t[0].strip()
+            _name = _s_t[1].strip()
+            dict_type_name[_type] = _name
+    return dict_type_name
+
+dict_type_name = getTypeName()
+import json
+def toJson(list_anno,content):
+    json_dict = {}
+    json_dict["id"] = getExportID()
+    json_dict["text"] = content
+    dict_anno = {}
+    for _anno in list_anno:
+        value = _anno["value"]
+        _split = value.split("\t")
+        if _split[0][0]=="T":
+            _type,_begin,_end = _split[1].split(" ")
+            dict_anno[_split[0]] = {"id":_split[0],"type":_type,"text":_split[2],"begin":int(_begin),"end":int(_end)}
+        elif _split[0][0]=="R":
+            _type,arg1,arg2 = _split[1].split(" ")
+            dict_anno[_split[0]] = {"id":_split[0],"type":_type,"arg1":arg1.split(":")[1],"arg2":arg2.split(":")[1]}
+    for k,v in dict_anno.items():
+        if v["id"][0]=="T":
+            v["new_id"] = getExportID()
+            v["label"] = dict_type_name[v["type"]]
+            v["start_offset"] = v["begin"]
+            v["end_offset"] = v["end"]
+    for k,v in dict_anno.items():
+        if v["id"][0]=="R":
+            v["new_id"] = getExportID()
+            v["type"] = dict_type_name[v["type"]]
+            v["from_id"] = dict_anno[v["arg1"]]["new_id"]
+            v["to_id"] = dict_anno[v["arg2"]]["new_id"]
+    list_entitys = []
+    list_relations = []
+    for k,v in dict_anno.items():
+        if v["id"][0]=="T":
+            _dict = {"id":v["new_id"],
+                     "label":v["label"],
+                     "start_offset":v["start_offset"],
+                     "end_offset":v["end_offset"]}
+            list_entitys.append(_dict)
+        if v["id"][0]=="R":
+            _dict = {"id":v["new_id"],
+                     "type":v["type"],
+                     "from_id":v["from_id"],
+                     "to_id":v["to_id"]}
+            list_relations.append(_dict)
+    json_dict["entities"] = list_entitys
+    json_dict["relations"] = list_relations
+    return json.dumps(json_dict,ensure_ascii=False)
+
+
+
+def exportIepyLabel():
+    conn = getConnection_postgres("iepy")
+    cursor = conn.cursor()
+    sql = ' select begin_time,end_time,"user",doc_count from corpus_payroll where end_time<=\'2021-07-25\' order by end_time desc limit 20'
+    cursor.execute(sql)
+    list_diff = []
+    rows_payroll = cursor.fetchall()
+    list_json = []
+    for _payroll in rows_payroll:
+        _begin_time = _payroll[0]
+        _end_time = _payroll[1]
+        _user = _payroll[2]
+        doc_count = _payroll[3]
+        print(_user,_begin_time,_end_time,doc_count)
+        _sql = "select document_id,value from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' limit 100)  order by document_id"%(_user,_begin_time,_end_time)
+        cursor.execute(_sql)
+        rows = cursor.fetchall()
+        if len(rows)>0:
+            current_docid = rows[0][0]
+            _index = -1
+            list_values = []
+            while _index<len(rows)-1:
+                _index += 1
+                row = rows[_index]
+                document_id = row[0]
+                value = row[1]
+                if document_id!=current_docid:
+                    print(current_docid)
+                    sql = "select text from corpus_iedocument where human_identifier='%s'"%(str(current_docid))
+                    cursor.execute(sql)
+                    content = cursor.fetchall()[0][0]
+                    _json = toJson(list_values,content)
+                    list_json.append(_json)
+                    _index -= 1
+                    current_docid = document_id
+                    list_values = []
+                else:
+                    list_values.append({"document_id":document_id,"value":value})
+    print("length:",len(list_json))
+    _count = 0
+    with open("iepy_export.json","w",encoding="utf8") as f:
+        for _json in list_json:
+            if len(_json)<600:
+                f.write(_json)
+                f.write("\n")
+                _count += 1
+                if _count>=100:
+                    break
+    print(_count)
+
+def exportMergeWrongData():
+    from dataSource.source import getConnection_mysql
+    from dataSource.pool import ConnectorPool
+    from utils.multiThread import MultiThreadHandler
+    from queue import Queue
+    import datetime
+
+    min_id = 1850000
+    # max_id = 2000000
+    max_id = 328426202+1
+    thread_count = 30
+
+    every_count = (max_id-min_id)//30+1
+
+    list_dis = []
+    for _i in range(thread_count):
+        dis = [min_id+_i*every_count,min_id+(_i+1)*every_count]
+        list_dis.append(dis)
+
+    task_queue = Queue()
+    result_queue = Queue()
+    for _dis in list_dis:
+        task_queue.put(_dis)
+
+    def _handle(item,result_queue):
+        conn = getConnection_mysql()
+        cursor = conn.cursor()
+        start,end = item
+        set_docid = set()
+        for _id in range(start,end):
+            sql = "select rule,docid,operate_time from bxkc.bxkc_delete_document_log where id=%d"%(_id)
+            cursor.execute(sql)
+
+            rows = cursor.fetchall()
+            if len(rows)>0:
+                rule,docid,operate_time = rows[0]
+                if docid in set_docid:
+                    continue
+
+                if str(rule)[:4]=="项目合并":
+                    set_docid.add(docid)
+                    print(len(set_docid),_id,end,end-_id)
+            if operate_time.strftime("%Y-%m-%d")<"2022-10-14":
+                break
+        result_queue.put(list(set_docid))
+    mt = MultiThreadHandler(task_queue,_handle,result_queue,30)
+    mt.run()
+    list_data = []
+    while 1:
+        try:
+            list_docid = result_queue.get(True,1)
+            list_data.extend(list_docid)
+        except Exception as e:
+            break
+    list_data = list(set(list_data))
+    df = pd.DataFrame({"docid":list_data})
+    df.to_excel("mergeWrong.xlsx")
+
+
+if __name__=="__main__":
+    # exportMergeTrainData()
+    # labelTime()
+    # getAttachmentProcessTime()
+    # exportIepyLabel()
+    exportMergeWrongData()

+ 233 - 0
export/exportDesigned.py

@@ -0,0 +1,233 @@
+#encoding:GBK
+import sys
+import os
+sys.path.append("../")
+
+import pandas as pd
+from dataSource.source import *
+import json
+from utils.multiThread import MultiThreadHandler
+import queue
+from utils.Utils import *
+from dataSource.pool import ConnectorPool
+import re
+from tablestore import *
+import traceback
+from utils.hashUtil import aesCipher
+from export.exportEnterprise import getDictEnterprise,getOneContact
+
+
# Module-level registry of export columns: the set gives O(1) "already
# seen" checks, the list preserves first-seen order for the Excel writer.
set_columns = set()
list_df_columns = []
+
def set_dict_item(_dict, name, v):
    """Store the sanitized value *v* under column *name* in *_dict*.

    On the first occurrence of *name*, also register the column in the
    module-level ``set_columns`` / ``list_df_columns`` registries so the
    final DataFrame keeps columns in first-seen order.
    """
    _dict[name] = getLegal_str(v)
    if name in set_columns:
        return
    set_columns.add(name)
    list_df_columns.append(getLegal_str(name))
+
def set_dict_item_columns(set_columns1, list_df_columns1, _dict, name, v):
    """Like set_dict_item, but with caller-supplied column registries
    (*set_columns1*, *list_df_columns1*) instead of the module globals,
    so several exports can keep independent column orders.
    """
    _dict[name] = getLegal_str(v)
    first_time = name not in set_columns1
    if first_time:
        set_columns1.add(name)
        list_df_columns1.append(getLegal_str(name))
+
def getTenderee(contacts):
    """Pick the owner-unit ("业主单位") contact from a JSON contact list.

    Two passes over the owner-unit entries: prefer a contact whose last
    whitespace-separated cellphone token looks like a mainland mobile
    number (``1`` followed by 10 digits); otherwise fall back to the
    first owner unit's landline ("phone") field.

    Returns a (company_name, contact_name, number) tuple, or three empty
    strings when no owner unit is present.
    """
    parsed = json.loads(contacts)
    owners = [c for c in parsed if c.get("type") == "业主单位"]
    # Pass 1: owner with a plausible mobile number.
    for c in owners:
        cell = c.get("cellphone")
        if cell and re.search(r"^1\d{10}$", cell.split(" ")[-1]) is not None:
            return c.get("company_name"), c.get("contact_name"), cell
    # Pass 2: any owner unit, landline fallback.
    for c in owners:
        return c.get("company_name"), c.get("contact_name"), c.get("phone")
    return "", "", ""
+
def exportDocument_by_pagetime():
    """Export "designed_project" records from Tablestore (OTS) to Excel.

    Builds a keyword should-query plus a page_time range, paginates the
    OTS search index with next_token, flattens each row through getData,
    and writes one timestamped .xlsx per entry of list_province.

    NOTE(review): many filter lists below (tenderee, docchannel, bidway,
    area, must_not) are built but NOT wired into bool_query — they look
    like manually toggled variants of the query; confirm before reuse.
    """
    # filename = "../data/重复公告.xlsx"
    # df = pd.read_excel(filename)
    ots_client = getConnect_ots()


    # set_enter: whitelist of enterprise names parsed from the literal
    # below. Currently empty and unused (the filter against it is
    # commented out inside getData).
    set_enter = set()
    str_enter = '''
 
    '''
    for a in re.split("\s+",str_enter):
        if a.strip()!="":
            set_enter.add(a.strip())

    # First assignment is dead: immediately overwritten by the
    # designed_project column set on the next line.
    columns = ["doctitle","doctextcon","docchannel","product","province","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose"]
    columns = ["page_time","contacts","covered_area","floor_space","progress","project_address","project_description","project_investment","project_name"]
    def getData(df_data,rows,set_line,list_keyword):
        # Flatten one page of OTS rows into the df_data column dict.
        # Column labels are registered via the module-level set_dict_item.
        list_data = getRow_ots(rows)
        for row in list_data:
            item = {}
            _dict = row
            # set_dict_item(item,"",_dict.get("docid",""))
            tenderee,contact_name,contact_phone = getTenderee(_dict.get("contacts","[]"))
            set_dict_item(item,"业主",tenderee)
            set_dict_item(item,"业主联系人",contact_name)
            set_dict_item(item,"业主联系电话",contact_phone)
            set_dict_item(item,"发布时间",_dict.get("page_time",""))
            set_dict_item(item,"建筑面积",_dict.get("covered_area",""))
            set_dict_item(item,"层数",_dict.get("floor_space",""))
            set_dict_item(item,"阶段",_dict.get("progress",""))
            set_dict_item(item,"项目地址",_dict.get("project_address",""))
            set_dict_item(item,"简介",_dict.get("project_description",""))
            set_dict_item(item,"项目投资",_dict.get("project_investment",""))
            # assumes "id" is always present and an int — TODO confirm;
            # a missing id would make the %d format raise.
            set_dict_item(item,"地址","http://www.bidizhaobiao.com/nzjxm-%d.html"%_dict.get("id",""))
            # # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
            # set_dict_item(item,"项目名称",_dict.get("project_name",""))
            # set_dict_item(item,"区县",_dict.get("district",""))
            # set_dict_item(item,"发布时间",_dict.get("page_time",""))
            # set_dict_item(item,"创建时间",_dict.get("crtime",""))
            #
            # set_dict_item(item,"行业一级分类",_dict.get("industry",""))
            # set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
            #
            # set_dict_item(item,"uuid",_dict.get("uuid"))
            #
            # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '',  _dict.get("doctitle","")))
            #
            # set_dict_item(item,"项目编号",_dict.get("project_code",""))
            # set_dict_item(item,"招标单位",_dict.get("tenderee",""))
            # set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
            # set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
            # set_dict_item(item,"代理单位",_dict.get("agency",""))
            # set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
            # set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
            # set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
            #
            # set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
            # sub_docs_json = _dict.get("sub_docs_json")
            # if sub_docs_json is not None:
            #     for _doc in json.loads(sub_docs_json):
            #         if "win_tenderer" in _doc:
            #             set_dict_item(item,"中标单位",_doc["win_tenderer"])
            #         if "win_tenderee_manager" in _doc:
            #             set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
            #         if "win_tenderee_phone" in _doc:
            #             set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
            #         if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
            #             set_dict_item(item,"中标金额",_doc["win_bid_price"])
            #         if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
            #             set_dict_item(item,"招标金额",_doc["bidding_budget"])
            # if "招标金额" not in item:
            #     set_dict_item(item,"招标金额","")
            # if "中标金额" not in item:
            #     set_dict_item(item,"中标金额","")
            # if "中标单位" not in item:
            #     set_dict_item(item,"中标单位","")
            #
            # if "中标单位联系人" not in item:
            #     set_dict_item(item,"中标单位联系人","")
            # if "中标单位联系电话" not in item:
            #     set_dict_item(item,"中标单位联系电话","")
            #
            # # if item["中标单位"] not in set_enter:
            # #     continue
            #
            # _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
            # if _line in set_line:
            #     continue
            # if item["招标金额"]=="":
            #     continue

            # set_line.add(_line)
            # Append the flattened item column-by-column, creating column
            # lists lazily on first sight.
            for k,v in item.items():
                if k not in df_data:
                    df_data[k] = []
                df_data[k].append(v)

    # list_province = ["江西","湖南","四川","安徽"]
    list_province = ["全国"]
    for _province in list_province:
        df_data = {}

        # Keywords (tab/comma/slash separated) become MatchPhrase
        # should-clauses against full_text.
        str_keywords = '''
        医院	养老院	疗养院	老人院
            '''
        list_keyword = []
        list_should_keyword = []
        for _p in re.split("\s|、|,|,|/",str_keywords):
            if _p.strip()=="":
                continue
            list_keyword.append(_p)
            print(_p)
            list_should_keyword.append(MatchPhraseQuery('full_text', '%s'%_p.strip()))
            # list_should_keyword.append(Ma('attachmenttextcon','%s'%_p.strip()))

        # --- The filters below are built but NOT used in bool_query ---
        s_tenderee = '酒店、地产'
        list_should_ten = []
        for _p in re.split("、",s_tenderee):
            if _p.strip()=="":
                continue
            list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))

        list_should_chan = []
        list_should_chan.append(TermQuery("docchannel",101))
        # list_should_chan.append(TermQuery("docchannel",101))
        # list_should_chan.append(TermQuery("docchannel",102))

        list_should_bidway = []
        s_bidway = "公开招标、邀请招标、竞争性谈判、竞争性磋商、询价采购、单一来源采购"
        for _b in re.split("、",s_bidway):
            if _b.strip()=="":
                continue
            list_should_bidway.append(MatchPhraseQuery("doctextcon",_b.strip()))


        str_area = '北京、天津'
        list_should_area = []
        for _p in str_area.split("、"):
            list_should_area.append(TermQuery("province",_p))


        must_not_q = []
        not_str = '校园电视台 虚拟演播室'
        for _s in not_str.split(" "):
            must_not_q.append(MatchPhraseQuery("doctextcon",_s))
        # --- end of unused filters ---

        # Active query: page_time in [2020-01-01, 2022-01-01] AND at
        # least one keyword phrase.
        should_q_keywrod = BoolQuery(should_queries=list_should_keyword)
        bool_query = BoolQuery(must_queries=[

            RangeQuery("page_time","2020-01-01","2022-01-01",True,True)
            ,should_q_keywrod
        ]
        )




        # First page, sorted by id ascending; subsequent pages are
        # fetched via next_token until exhausted.
        rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
                                                                          SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))


        set_line = set()
        _count = len(rows)
        print(list_keyword)
        print("total_count:%d"%total_count)
        getData(df_data,rows,set_line,list_keyword)
        while next_token:
            print("%d/%d"%(_count,total_count))
            rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
                                                                              SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
                                                                              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            getData(df_data,rows,set_line,list_keyword)
            _count += len(rows)
            # if len(df_data[list(df_data.keys())[0]])>=100:
            #     break



        # list_df_columns.append('信用代码')
        # list_df_columns.append('原网地址')
        # Column order comes from the module-level list_df_columns filled
        # by set_dict_item during getData.
        df1 = pd.DataFrame(df_data)
        df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
+
+
+if __name__=="__main__":
+    exportDocument_by_pagetime()

Tiedoston diff-näkymää rajattu, sillä se on liian suuri
+ 385 - 281
export/exportDocument.py


+ 0 - 619
export/exportEnterprice.py

@@ -1,619 +0,0 @@
-#coding:UTF8
-
-import sys
-import os
-sys.path.append("../")
-
-import pandas as pd
-from dataSource.source import *
-import json
-from utils.multiThread import MultiThreadHandler
-import queue
-from utils.Utils import *
-from dataSource.pool import ConnectorPool
-import re
-from tablestore import *
-import traceback
-
-
-data_path = "../data/"
-
-def getCompanys():
-    list_company = []
-    keywords = ["环境","生态","再生","回收","环保"]
-    provinces = ["广东"]
-    for _name in keywords:
-        for _prov in provinces:
-            data = make_elasticSearch({
-                "query": {
-                    "bool": {
-                        "must": [
-                            {
-                                "wildcard": {
-                                    "name.keyword": "*%s*"%_name
-                                }
-                            }
-                            # ,
-                            # {
-                            #     "term": {
-                            #         "province.keyword": "%s"%_prov
-                            #     }
-                            # }
-                            # ,
-                            # {
-                            #     "range": {
-                            #         "zhongBiaoNumber": {
-                            #             "gt": "0"
-                            #         }
-                            #     }
-                            # }
-                        ],
-                        "must_not": [ ],
-                        "should": [ ]
-                    }
-                },
-                "from": 0,
-                "size": 1000000,
-                "sort": [ ],
-                "aggs": { }
-            })
-            print("--",data["hits"]["total"])
-            for item in data["hits"]["hits"]:
-                _company = {"enterprise_name":"","regCapital":"","legal_person":"","phone":"","industry":"","province":""}
-                _company["enterprise_name"] = item["_source"].get("name","")
-                _company["regCapital"] = item["_source"].get("regCapital","")
-                _company["zhongBiaoNumber"] = item["_source"].get("zhongBiaoNumber","0")
-                list_company.append(_company)
-    # data = make_elasticSearch({
-    #     "query": {
-    #         "bool": {
-    #             "must": [
-    #                 {
-    #                     "wildcard": {
-    #                         "name.keyword": "*电商*"
-    #                     }
-    #                 }
-    #                 ,
-    #                 {
-    #                     "term": {
-    #                         "province.keyword": "北京"
-    #                     }
-    #                 }
-    #                 ,
-    #                 {
-    #                     "range": {
-    #                         "zhongBiaoNumber": {
-    #                             "gt": "0"
-    #                         }
-    #                     }
-    #                 }
-    #             ],
-    #             "must_not": [ ],
-    #             "should": [ ]
-    #         }
-    #     },
-    #     "from": 0,
-    #     "size": 10000,
-    #     "sort": [ ],
-    #     "aggs": { }
-    # })
-    #
-    # for item in data["hits"]["hits"]:
-    #     _company = {"enterprise_name":"","regCapital":"","legal_person":"","phone":"","industry":"","province":""}
-    #     _company["enterprise_name"] = item["_source"].get("name","")
-    #     _company["regCapital"] = item["_source"].get("regCapital","")
-    #     list_company.append(_company)
-    print(len(list_company))
-    return list_company
-
-def exportFactory():
-    def _handle(item,result_queue,pool_mongo,pool_neo4j):
-        company_name = item["enterprise_name"]
-        mongo = pool_mongo.getConnector()
-        coll_zb = mongo.enterprise_profile
-        rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1, "actualCapital":1,"estiblishTime":1,"legal_person":1,"phone":1 })
-        _flag = False
-        for row in rows:
-            actualCapital = row.get("actualCapital","0")
-            estiblishTime = row.get("estiblishTime","2020-01-01")
-            _captial = re.match("\d+[亿万]+",actualCapital)
-            # if _captial is not None:
-            # if getUnifyMoney(_captial.group())>getUnifyMoney("5000万"):
-            # if estiblishTime<="2015-10-09":
-            item["legal_person"] = row.get("legal_person","")
-            item["phone"] = row.get("phone","")
-            item["actualCapital"] = actualCapital
-            item["estiblishTime"] = row.get("estiblishTime","")
-            _flag = True
-            break
-        if _flag:
-            result_queue.put(item)
-        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name)
-        graph = pool_neo4j.getConnector()
-        finded = graph.run(cql)
-        data = json.loads(json.dumps(finded.data()))
-        _count = data[0]["_c"]
-        # list_project = []
-        # for _data in data:
-        #     if _count<=3:
-        #         if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
-        #             if _data["project_name"] is not None:
-        #                 list_project.append(_data["project_name"])
-        #     _count += 1
-        item["count"] = _count
-        pool_mongo.putConnector(mongo)
-        pool_neo4j.putConnector(graph)
-    # list_company = getCompanys()
-    list_company = []
-    filename = "../data/天眼查1(1).xlsx"
-    df1 = pd.read_excel(filename)
-    for item in df1["公司名称"]:
-        list_company.append({"enterprise_name":item,"regCapital":"","legal_person":"","phone":"","industry":"","province":""})
-    task_queue = queue.Queue()
-    result_queue = queue.Queue()
-    for item in list_company:
-        task_queue.put(item)
-    pool_mongo = ConnectorPool(init_num=10,max_num=50,method_init=getConnect_mongodb)
-    pool_neo4j = ConnectorPool(init_num=10,max_num=50,method_init=getConnect_neo4j)
-    _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=70,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j)
-    _mult.run()
-    list_name = []
-    list_actualCapital = []
-    list_estiblishTime = []
-    list_legal_person = []
-    list_phone = []
-    list_zb = []
-    while(True):
-        try:
-            item = result_queue.get(False)
-            list_name.append(item["enterprise_name"])
-            list_actualCapital.append(item["actualCapital"])
-            list_estiblishTime.append(item["estiblishTime"])
-            list_legal_person.append(item["legal_person"])
-            list_phone.append(item["phone"])
-            list_zb.append(item["count"])
-        except:
-            break
-    df = pd.DataFrame({"公司":list_name,"实缴":list_actualCapital,
-                       "注册时间":list_estiblishTime,"联系人":list_legal_person,"联系电话":list_phone,
-                       "中标次数":list_zb})
-    df.to_excel("%s"%filename+"_export.xlsx",columns=["公司","实缴","注册时间","联系人","联系电话","中标次数"])
-
-def deal():
-    def _handle(item,result_queue):
-        graph = getConnect_neo4j()
-        company_name = item["enterprise_name"]
-        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhong_biao_page_time as zhong_biao_page_time,p.project_name as project_name order by p.zhong_biao_page_time desc limit 3"%(company_name)
-        finded = graph.run(cql)
-        data = json.loads(json.dumps(finded.data()))
-        _count = 1
-        list_project = []
-        for _data in data:
-            if _count<=3:
-                if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
-                    list_project.append(_data["project_name"])
-            _count += 1
-        item["project"] = str(list_project)
-        result_queue.put(item)
-    file = "../data/北京行业_export.xls"
-    df = pd.read_excel(file)
-    list_company = []
-    for _company,rep,industry,project,count,person,phone in zip(df["公司名字"],df["注册资金"],df["行业"],df["中标项目"],df["中标次数"],df["联系人"],df["联系电话"]):
-        list_company.append({"enterprise_name":_company,"regCapital":rep,"legal_person":person,"phone":phone,"industry":industry,"province":"","count":count})
-    task_queue = queue.Queue()
-    result_queue = queue.Queue()
-    for item in list_company:
-        task_queue.put(item)
-    _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
-    _mult.run()
-    list_name = []
-    list_regCapital = []
-    list_industry = []
-    list_count = []
-    list_person = []
-    list_phone = []
-    list_project = []
-    while(True):
-
-        try:
-            _result = result_queue.get(False)
-            list_name.append(_result["enterprise_name"])
-            list_regCapital.append(_result["regCapital"])
-            list_industry.append(_result["industry"])
-            list_count.append(_result["count"])
-            list_person.append(_result["legal_person"])
-            list_phone.append(_result["phone"])
-            list_project.append(_result["project"])
-        except Exception as e:
-            print(e)
-            break
-    df1 = pd.DataFrame({"公司名字":list_name,"注册资金":list_regCapital,"行业":list_industry,"中标项目":list_project,"中标次数":list_count,"联系人":list_person,"联系电话":list_phone})
-    df1.to_excel("%s_export1.xls"%("北京行业"),columns=["公司名字","注册资金","行业","中标项目","中标次数","联系人","联系电话"])
-
-def deal1():
-    def _handle(item,result_queue):
-        graph = getConnect_neo4j()
-        company_name = item["enterprise_name"]
-        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhong_biao_page_time as zhong_biao_page_time,p.project_name as project_name order by p.zhong_biao_page_time desc "%(company_name)
-        finded = graph.run(cql)
-        data = json.loads(json.dumps(finded.data()))
-        _count = 0
-        list_project = []
-        for _data in data:
-            if _count<=2:
-                if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
-                    list_project.append(_data["project_name"])
-            _count += 1
-        item["count"] = _count
-        item["project"] = str(list_project)
-        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company_name)
-        print(cql)
-        finded = graph.run(cql)
-        finded_money = json.loads(json.dumps(finded.data()))
-        whole_money = 0
-        for _item in finded_money:
-            if _item["r.price"] is not None:
-                whole_money += getUnifyMoney(_item["r.price"])
-        item["whole_money"] = str(whole_money)
-        result_queue.put(item)
-    # filename = "数据导出需求9.11(1)(1).xlsx"
-    filename = "../data/新建 XLSX 工作表(1).xlsx"
-    df = pd.read_excel(filename)
-    list_company = []
-    for _key in df.keys():
-        print(_key,len(df[_key]))
-    for _company in df["公司名称"]:
-        list_company.append({"enterprise_name":_company,"regCapital":"","legal_person":"","phone":"","industry":"","province":"","count":0})
-    task_queue = queue.Queue()
-    result_queue = queue.Queue()
-    for item in list_company:
-        task_queue.put(item)
-    _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
-    _mult.run()
-    _dict_item = {}
-    while(True):
-        try:
-            item = result_queue.get(False)
-            if item["enterprise_name"]!="":
-                _dict_item[item["enterprise_name"]] = item
-        except Exception as e:
-            print(str(e))
-            break
-    list_count = []
-    list_project = []
-    list_money = []
-    list_zb = []
-    for _company in df["公司名称"]:
-        if _company in _dict_item:
-            list_count.append(_dict_item[_company]["count"])
-            list_project.append(_dict_item[_company]["project"])
-            list_money.append(_dict_item[_company]["whole_money"])
-            list_zb.append("是" if _dict_item[_company]["count"]>0 else "否")
-        else:
-            print(_company)
-            list_count.append(0)
-            list_project.append("")
-            list_money.append("0")
-            list_zb.append("否")
-
-    print(len(list_count),len(list_project),len(list_money),len(list_zb))
-    df2 = pd.DataFrame({"公司名称":df["公司名称"],"次数":list_count})
-    df2.to_excel("%s_export.xls"%filename)
-    # df1 = pd.DataFrame({"月份":df["月份"],"电话":df["电话"],"公司名字":df["公司名字"],"开通时间":df["开通时间"],
-    #                     "到期时间":df["到期时间"],"客户公司注册时间":df["客户公司注册时间"],"客户公司注册资金":df["客户公司注册资金"],
-    #                     "实际缴费资金":df["实际缴费资金"],"天眼查行业分类":df["天眼查行业分类"],"是否中标":list_zb,
-    #                     "中标次数":list_count,"中标项目|3个":list_project,"中标金额":list_money,"客户设置关键词":df["客户设置关键词"],"客户搜索词":df["客户搜索词"].xls})
-    # df1.to_excel("%s_补充.xls"%filename,columns=["月份","电话","公司名字",	"开通时间"	,"到期时间"	,"客户公司注册时间"	,"客户公司注册资金"	,"实际缴费资金"	,"天眼查行业分类"	,"是否中标"	,"中标次数"	,"中标项目|3个"	,"中标金额"	,"客户设置关键词"	,"客户搜索词"])
-
-def deal3():
-    filename = "../data/导出工厂.xlsx"
-    df = pd.DataFrame(filename)
-    count = 0
-    for item in df["实缴"]:
-        if getUnifyMoney(item)>getUnifyMoney("5000万"):
-            count += 1
-            print(count)
-
-def exportEnterpriseByName():
-    df = pd.read_csv("../data/中标家具公司.csv",encoding="GBK")
-
-    def _handle(item,result_queue,pool_ots):
-        ots_client = pool_ots.getConnector()
-
-        primary_key = [('name',str(item["name"]))]
-
-        columns_to_get = ["reg_capital","actual_capital","contacts","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]
-
-        consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)
-
-        print(return_row)
-
-        for _item in return_row.attribute_columns:
-            if _item[0]=="contacts":
-                a = json.loads(_item[1])
-                for i in a:
-                    if i.get("mobile_no","")==item["phone"] or i.get("phone_no","")==item["phone"]:
-                        item["contact_person"] = i.get("contact_person","")
-            else:
-                item[_item[0]] = _item[1]
-
-    list_dict = []
-    for name,phone in zip(df["name"],df["phone"]):
-        list_dict.append({"name":name,"phone":phone})
-
-    task_queue = queue.Queue()
-    for item in list_dict:
-        task_queue.put(item)
-
-    result_queue = queue.Queue()
-    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
-    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=70,pool_ots=pool_ots)
-    mt.run()
-
-    columns = ["name","contact_person","phone","reg_capital","actual_capital","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]
-    df_data = {}
-    for _c in columns:
-        df_data[_c] = []
-    for item in list_dict:
-        for _key in columns:
-            df_data[_key].append(item.get(_key,""))
-    df1 = pd.DataFrame(df_data)
-    df1.to_csv("中标家具公司1.csv")
-
-def getCompanys():
-    conn = getConnection_mysql()
-    cursor = conn.cursor()
-    sql = '''select C.login_id as 登陆名,B.company ,B.contactname as 联系人,B.phone as 联系电话 ,(select MLEVELNAME from sys_memberlevel where id =A.memberlevelid) as 会员等级,( select name from b2c_mall_staff_basic_info where userid=B.aftermarket) as 售后客服   from bxkc.bxkc_member_term A,bxkc.b2c_mall_staff_basic_info B,bxkc.b2c_user_login_info C
-where A.USERID=B.USERID and B.USERID=C.USERID and B.innerOrg like '广州%'
-and A.memberlevelid!=81 and A.status='01' and str_to_date('2020-11-20','%Y-%m-%d') between  A.stime and A.etiem ;
-'''
-    cursor.execute(sql)
-    vol = cursor.description
-    list_company = []
-    rows = cursor.fetchall()
-    for row in rows:
-        _company = {}
-        for _vol,_value in zip(vol,row):
-            _name = _vol[0]
-            _company[_name] = _value
-        list_company.append(_company)
-    return list_company
-
-def exportEnterprise_byindustry(page_time,
-                                columns = ["name","address","business_scope","province","city","district","reg_capital","phone","estiblish_time"],
-                                keywords = ["钢材","水泥","五金","水电","暖通","暖气","电缆"]):
-
-    list_should_q = []
-    for _key in keywords:
-        list_should_q.append(WildcardQuery("industry","*%s*"%_key))
-        list_should_q.append(WildcardQuery("nicknames","*%s*"%_key))
-    key_query = BoolQuery(should_queries=list_should_q)
-
-    #WildcardQuery("industry","*建筑*")
-    ots_client = getConnect_ots()
-    bool_query = BoolQuery(must_queries=[RangeQuery("bidi_id",0,include_lower=True),
-                                         key_query,
-                                         RangeQuery("estiblish_time",range_to="2017-01-01")])
-
-    rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
-                                                                      SearchQuery(bool_query, limit=100, get_total_count=True),
-                                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
-    all_rows = 0
-    df_data = {}
-    for key in columns:
-        df_data[key] = []
-    for row in rows:
-        _dict = dict()
-        for part in row:
-            for item in part:
-                _dict[item[0]] = item[1]
-        for key in columns:
-            df_data[key].append(_dict.get(key,""))
-        # if "reg_capital" in _dict:
-        #     _money = re.match("\d+[万亿千百十]",_dict["reg_capital"])
-        #     if _money is not None:
-        #         if getUnifyMoney(_money.group())>2000000:
-        #             for key in columns:
-        #                 df_data[key].append(_dict.get(key,""))
-    all_rows += len(rows)
-
-    # print(next_token)
-    while(next_token):
-        rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
-                                                                          SearchQuery(bool_query, next_token=next_token,limit=100, get_total_count=True),
-                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
-        for row in rows:
-            _dict = dict()
-            for part in row:
-                for item in part:
-                    _dict[item[0]] = item[1]
-            for key in columns:
-                df_data[key].append(_dict.get(key,""))
-        # if "reg_capital" in _dict:
-        #     _money = re.match("\d+[万亿千百十]",_dict["reg_capital"])
-        #     if _money is not None:
-        #         if getUnifyMoney(_money.group())>2000000:
-        #             for key in columns:
-        #                 df_data[key].append(_dict.get(key,""))
-        all_rows += len(rows)
-        print(all_rows,total_count,len(df_data[columns[0]]))
-    df = pd.DataFrame(df_data)
-    df.to_csv("../data/enterprise_2017_a.csv",columns=columns)
-
-
-def getTyc_company():
-    root_path = ["G:/文档/tyc国企","G:/文档/tyc机构"]
-    list_files = []
-    for _path in root_path:
-        for file in os.listdir(_path):
-            list_files.append(os.path.join(_path,file))
-
-    list_files = ["G:/文档/tyc机构\\高级搜索导出数据结果—自定义条件—天眼查(W20011656561610789770227).xlsx"]
-
-    pool_mysql = ConnectorPool(method_init=getConnection_testmysql,init_num=10,max_num=30)
-    task_queue = queue.Queue()
-    result_queue = queue.Queue()
-    for _file in list_files:
-        task_queue.put(_file)
-
-    def _handle(_file,task_queue,pool_mysql):
-        print("handle",_file)
-        conn = pool_mysql.getConnector()
-        cursor = conn.cursor()
-        df = pd.read_excel(_file,header=2)
-        for name,social_credit,identification,regist_num,organization_code in zip(df["公司名称"],df["统一社会信用代码"],df["纳税人识别号"],df["注册号"],df["组织机构代码"]):
-            try:
-                sql = " insert into Enterprise(name,social_credit,identification,regist_num,organization_code) values ('%s','%s','%s','%s','%s')"%(name,social_credit,identification,regist_num,organization_code)
-                cursor.execute(sql)
-            except Exception as e:
-                print("error")
-        conn.commit()
-        pool_mysql.putConnector(conn)
-
-    mt = MultiThreadHandler(task_queue,_handle,result_queue,20,pool_mysql=pool_mysql)
-    mt.run()
-
-def exportEnterprise_by_bidNum():
-
-    ots_client = getConnect_ots()
-    bool_query = BoolQuery(must_queries=[RangeQuery("tyc_id",1,include_lower=True),
-                                           RangeQuery("bid_number",4,include_lower=True)
-                                           ])
-
-    columns = ["name"]
-    rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
-                                                                      SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("tyc_id",SortOrder.ASC)]), limit=100, get_total_count=True),
-                                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
-    df_data = {}
-    for _key in columns:
-        df_data[_key] = []
-
-    def getData(df_data,rows):
-        list_dict = getRow_ots(rows)
-        for _dict in list_dict:
-            for _key in columns:
-                _v = _dict.get(_key,"")
-                if len(_v)>4:
-                    df_data[_key].append(_v)
-    getData(df_data,rows)
-    _count = len(rows)
-    while(next_token):
-        print("%d/%d"%(_count,total_count))
-        rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
-                                                                          SearchQuery(bool_query, next_token=next_token,limit=100, get_total_count=True),
-                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
-        getData(df_data,rows)
-        _count += len(rows)
-
-    df = pd.DataFrame(df_data)
-    df.to_csv("../data/enterprise_bidinum.csv",columns=columns)
-
-def make_Legal_enterprise():
-    import codecs
-    def format(_e):
-        if _e is None:
-            return None
-        if not isinstance(_e,str):
-            return None
-        if re.search("^[a-zA-Z0-9]+$",_e) is not None:
-            return None
-        if re.search("[<《]>-。\-\.\?]",_e) is not None:
-            return None
-        _e1 = re.sub("\s+","",_e.replace("(","(").replace(")",")"))
-        if re.search("[省市区县乡镇]$",_e) is not None:
-            return None
-        if len(_e1)>=4:
-            return _e1
-        return None
-    set_enterprise = set()
-    df = pd.read_csv("../data/enterprise_bidinum.csv",encoding="GBK")
-
-    _count = 0
-    for _e in df["name"]:
-        _count += 1
-        if _count%10000==0:
-            print(_count)
-        _e1 = format(_e)
-        if _e1 is not None:
-            set_enterprise.add(_e1)
-
-    conn = getConnection_testmysql()
-    cursor = conn.cursor()
-    sql = " select name from Enterprise "
-    cursor.execute(sql)
-    rows = cursor.fetchmany(10000)
-    while rows:
-        for row in rows:
-            _count += 1
-            if _count%10000==0:
-                print(_count)
-            _e = row[0]
-            _e1 = format(_e)
-            if _e1 is not None:
-                set_enterprise.add(_e1)
-        rows = cursor.fetchmany(10000)
-
-    with codecs.open("../data/LEGAL_ENTERPRISE.txt","w",encoding="UTF8") as f:
-        for _e in list(set_enterprise):
-            f.write(_e+"\n")
-
-
-def getDictEnterprise(list_enterprise,columns_to_get = ["reg_capital","actual_capital","contacts","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]):
-    task_queue = queue.Queue()
-    result_queue= queue.Queue()
-
-    for _enterprise in list_enterprise:
-        task_queue.put(_enterprise)
-    def _handle(item,result_queue,pool_ots):
-        ots_client = pool_ots.getConnector()
-        primary_key = [("name",item)]
-        consumed,return_row,next_token = ots_client.get_row("enterprise",primary_key,columns_to_get,None,1)
-        dict_data = getRow_ots_primary(return_row)
-        if dict_data is not None:
-            result_queue.put({item:dict_data})
-
-        pool_ots.putConnector(ots_client)
-
-    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
-    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
-    mt.run()
-
-    dict_enterprise = {}
-    while True:
-        try:
-            _dict = result_queue.get(False)
-            for k,v in _dict.items():
-                dict_enterprise[k] = v
-        except Exception as e:
-            break
-    return dict_enterprise
-
-
-
-def getOneContact(contacts,tojson=True,mobile_first=True):
-    if tojson:
-        list_contacts = json.loads(contacts)
-    else:
-        list_contacts = contacts
-    mobile_person = ""
-    mobile_no = ''
-    phone_person = ""
-    phone_no = ''
-    for _contact in list_contacts:
-        if _contact.get("mobile_no","")!="":
-            mobile_person = _contact.get("contact_person","")
-            mobile_no = _contact.get("mobile_no","")
-        if _contact.get("phone_no","")!="":
-            phone_person = _contact.get("phone_no","")
-            phone_no = _contact.get("phone_no","")
-    if mobile_first:
-        return mobile_person,mobile_no
-    return phone_person,phone_no
-
-
-
-
-if __name__=="__main__":
-    # getTyc_company()
-    exportEnterprise_by_bidNum()
-    make_Legal_enterprise()

+ 1207 - 0
export/exportEnterprise.py

@@ -0,0 +1,1207 @@
+#coding:UTF8
+
+import sys
+import os
+sys.path.append("../")
+
+import pandas as pd
+from dataSource.source import *
+import json
+from utils.multiThread import MultiThreadHandler
+import queue
+from utils.Utils import *
+from dataSource.pool import ConnectorPool
+import re
+from tablestore import *
+import traceback
+
+
+from export.exportUtils import generateBoolShouldQuery,splitIntoList
+
+data_path = "../data/"
+
def getCompanys():
    """Collect companies from elasticsearch whose name contains one of a few
    environmental-industry keywords.

    Returns:
        list[dict]: one dict per hit, with enterprise_name / regCapital /
        zhongBiaoNumber filled in (the remaining template keys stay "").

    NOTE(review): a second ``getCompanys`` defined later in this module
    shadows this definition at import time — confirm which one is intended.
    """
    list_company = []
    keywords = ["环境","生态","再生","回收","环保"]
    provinces = ["广东"]
    for _name in keywords:
        for _prov in provinces:
            # Wildcard match on the raw keyword field; the province and
            # zhongBiaoNumber filters are commented out below, so _prov
            # currently has no effect on the query.
            data = make_elasticSearch({
                "query": {
                    "bool": {
                        "must": [
                            {
                                "wildcard": {
                                    "name.keyword": "*%s*"%_name
                                }
                            }
                            # ,
                            # {
                            #     "term": {
                            #         "province.keyword": "%s"%_prov
                            #     }
                            # }
                            # ,
                            # {
                            #     "range": {
                            #         "zhongBiaoNumber": {
                            #             "gt": "0"
                            #         }
                            #     }
                            # }
                        ],
                        "must_not": [ ],
                        "should": [ ]
                    }
                },
                "from": 0,
                "size": 1000000,
                "sort": [ ],
                "aggs": { }
            })
            print("--",data["hits"]["total"])
            for item in data["hits"]["hits"]:
                _company = {"enterprise_name":"","regCapital":"","legal_person":"","phone":"","industry":"","province":""}
                _company["enterprise_name"] = item["_source"].get("name","")
                _company["regCapital"] = item["_source"].get("regCapital","")
                _company["zhongBiaoNumber"] = item["_source"].get("zhongBiaoNumber","0")
                list_company.append(_company)
    # data = make_elasticSearch({
    #     "query": {
    #         "bool": {
    #             "must": [
    #                 {
    #                     "wildcard": {
    #                         "name.keyword": "*电商*"
    #                     }
    #                 }
    #                 ,
    #                 {
    #                     "term": {
    #                         "province.keyword": "北京"
    #                     }
    #                 }
    #                 ,
    #                 {
    #                     "range": {
    #                         "zhongBiaoNumber": {
    #                             "gt": "0"
    #                         }
    #                     }
    #                 }
    #             ],
    #             "must_not": [ ],
    #             "should": [ ]
    #         }
    #     },
    #     "from": 0,
    #     "size": 10000,
    #     "sort": [ ],
    #     "aggs": { }
    # })
    #
    # for item in data["hits"]["hits"]:
    #     _company = {"enterprise_name":"","regCapital":"","legal_person":"","phone":"","industry":"","province":""}
    #     _company["enterprise_name"] = item["_source"].get("name","")
    #     _company["regCapital"] = item["_source"].get("regCapital","")
    #     list_company.append(_company)
    print(len(list_company))
    return list_company
+
def exportFactory():
    """Enrich a spreadsheet of company names with MongoDB profile fields and a
    neo4j win-bid count, then write the merged result to an xlsx file.

    Reads 公司名称 from ../data/天眼查1(1).xlsx; for each company pulls
    legal_person/phone/actualCapital/estiblishTime from the
    enterprise_profile Mongo collection and counts ZhongBiaoRelation edges in
    neo4j. Only companies found in Mongo are written to the output.
    """
    def _handle(item,result_queue,pool_mongo,pool_neo4j):
        # Worker for MultiThreadHandler: enriches `item` in place and puts it
        # on result_queue only when a Mongo profile row exists (_flag).
        company_name = item["enterprise_name"]
        mongo = pool_mongo.getConnector()
        coll_zb = mongo.enterprise_profile
        rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1, "actualCapital":1,"estiblishTime":1,"legal_person":1,"phone":1 })
        _flag = False
        for row in rows:
            actualCapital = row.get("actualCapital","0")
            estiblishTime = row.get("estiblishTime","2020-01-01")
            _captial = re.match("\d+[亿万]+",actualCapital)
            # if _captial is not None:
            # if getUnifyMoney(_captial.group())>getUnifyMoney("5000万"):
            # if estiblishTime<="2015-10-09":
            item["legal_person"] = row.get("legal_person","")
            item["phone"] = row.get("phone","")
            item["actualCapital"] = actualCapital
            item["estiblishTime"] = row.get("estiblishTime","")
            _flag = True
            break
        if _flag:
            result_queue.put(item)
        # NOTE(review): company_name is interpolated into the cypher string;
        # a name containing a quote would break the query — confirm inputs.
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name)
        graph = pool_neo4j.getConnector()
        finded = graph.run(cql)
        data = json.loads(json.dumps(finded.data()))
        _count = data[0]["_c"]
        # list_project = []
        # for _data in data:
        #     if _count<=3:
        #         if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
        #             if _data["project_name"] is not None:
        #                 list_project.append(_data["project_name"])
        #     _count += 1
        item["count"] = _count
        pool_mongo.putConnector(mongo)
        pool_neo4j.putConnector(graph)
    # list_company = getCompanys()
    list_company = []
    filename = "../data/天眼查1(1).xlsx"
    df1 = pd.read_excel(filename)
    for item in df1["公司名称"]:
        list_company.append({"enterprise_name":item,"regCapital":"","legal_person":"","phone":"","industry":"","province":""})
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for item in list_company:
        task_queue.put(item)
    pool_mongo = ConnectorPool(init_num=10,max_num=50,method_init=getConnect_mongodb)
    pool_neo4j = ConnectorPool(init_num=10,max_num=50,method_init=getConnect_neo4j)
    _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=70,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j)
    _mult.run()
    # Drain result_queue into parallel column lists for the output frame.
    list_name = []
    list_actualCapital = []
    list_estiblishTime = []
    list_legal_person = []
    list_phone = []
    list_zb = []
    while(True):
        try:
            item = result_queue.get(False)
            list_name.append(item["enterprise_name"])
            list_actualCapital.append(item["actualCapital"])
            list_estiblishTime.append(item["estiblishTime"])
            list_legal_person.append(item["legal_person"])
            list_phone.append(item["phone"])
            list_zb.append(item["count"])
        except:
            break
    df = pd.DataFrame({"公司":list_name,"实缴":list_actualCapital,
                       "注册时间":list_estiblishTime,"联系人":list_legal_person,"联系电话":list_phone,
                       "中标次数":list_zb})
    df.to_excel("%s"%filename+"_export.xlsx",columns=["公司","实缴","注册时间","联系人","联系电话","中标次数"])
+
def deal():
    """Append up to 3 recent winning projects (since 2019) from neo4j to each
    company row of ../data/北京行业_export.xls and write 北京行业_export1.xls.
    """
    def _handle(item,result_queue):
        # Worker: fetch the 3 most recent win-bid projects for one company.
        # NOTE(review): a new neo4j connection is opened per task (no pool).
        graph = getConnect_neo4j()
        company_name = item["enterprise_name"]
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhong_biao_page_time as zhong_biao_page_time,p.project_name as project_name order by p.zhong_biao_page_time desc limit 3"%(company_name)
        finded = graph.run(cql)
        data = json.loads(json.dumps(finded.data()))
        _count = 1
        list_project = []
        for _data in data:
            if _count<=3:
                if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
                    list_project.append(_data["project_name"])
            _count += 1
        item["project"] = str(list_project)
        result_queue.put(item)
    file = "../data/北京行业_export.xls"
    df = pd.read_excel(file)
    list_company = []
    for _company,rep,industry,project,count,person,phone in zip(df["公司名字"],df["注册资金"],df["行业"],df["中标项目"],df["中标次数"],df["联系人"],df["联系电话"]):
        list_company.append({"enterprise_name":_company,"regCapital":rep,"legal_person":person,"phone":phone,"industry":industry,"province":"","count":count})
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for item in list_company:
        task_queue.put(item)
    _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
    _mult.run()
    # Drain results into parallel lists; row order follows queue order, not
    # the original spreadsheet order.
    list_name = []
    list_regCapital = []
    list_industry = []
    list_count = []
    list_person = []
    list_phone = []
    list_project = []
    while(True):

        try:
            _result = result_queue.get(False)
            list_name.append(_result["enterprise_name"])
            list_regCapital.append(_result["regCapital"])
            list_industry.append(_result["industry"])
            list_count.append(_result["count"])
            list_person.append(_result["legal_person"])
            list_phone.append(_result["phone"])
            list_project.append(_result["project"])
        except Exception as e:
            print(e)
            break
    df1 = pd.DataFrame({"公司名字":list_name,"注册资金":list_regCapital,"行业":list_industry,"中标项目":list_project,"中标次数":list_count,"联系人":list_person,"联系电话":list_phone})
    df1.to_excel("%s_export1.xls"%("北京行业"),columns=["公司名字","注册资金","行业","中标项目","中标次数","联系人","联系电话"])
+
def deal1():
    """For each company in an input spreadsheet, count its win-bid projects in
    neo4j, collect up to 3 recent project names (since 2019) and the summed
    win amount, then export 公司名称 + 次数 to <filename>_export.xls.
    """
    def _handle(item,result_queue):
        # Worker: two cypher queries per company — one for projects, one for
        # the prices on the win-bid relations.
        graph = getConnect_neo4j()
        company_name = item["enterprise_name"]
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhong_biao_page_time as zhong_biao_page_time,p.project_name as project_name order by p.zhong_biao_page_time desc "%(company_name)
        finded = graph.run(cql)
        data = json.loads(json.dumps(finded.data()))
        _count = 0
        list_project = []
        for _data in data:
            if _count<=2:
                if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
                    list_project.append(_data["project_name"])
            _count += 1
        item["count"] = _count
        item["project"] = str(list_project)
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company_name)
        print(cql)
        finded = graph.run(cql)
        finded_money = json.loads(json.dumps(finded.data()))
        whole_money = 0
        for _item in finded_money:
            if _item["r.price"] is not None:
                whole_money += getUnifyMoney(_item["r.price"])
        item["whole_money"] = str(whole_money)
        result_queue.put(item)
    # filename = "数据导出需求9.11(1)(1).xlsx"
    filename = "../data/新建 XLSX 工作表(1).xlsx"
    df = pd.read_excel(filename)
    list_company = []
    for _key in df.keys():
        print(_key,len(df[_key]))
    for _company in df["公司名称"]:
        list_company.append({"enterprise_name":_company,"regCapital":"","legal_person":"","phone":"","industry":"","province":"","count":0})
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for item in list_company:
        task_queue.put(item)
    _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
    _mult.run()
    # Index results by company name so output rows can follow the original
    # spreadsheet order.
    _dict_item = {}
    while(True):
        try:
            item = result_queue.get(False)
            if item["enterprise_name"]!="":
                _dict_item[item["enterprise_name"]] = item
        except Exception as e:
            print(str(e))
            break
    list_count = []
    list_project = []
    list_money = []
    list_zb = []
    for _company in df["公司名称"]:
        if _company in _dict_item:
            list_count.append(_dict_item[_company]["count"])
            list_project.append(_dict_item[_company]["project"])
            list_money.append(_dict_item[_company]["whole_money"])
            list_zb.append("是" if _dict_item[_company]["count"]>0 else "否")
        else:
            print(_company)
            list_count.append(0)
            list_project.append("")
            list_money.append("0")
            list_zb.append("否")

    print(len(list_count),len(list_project),len(list_money),len(list_zb))
    df2 = pd.DataFrame({"公司名称":df["公司名称"],"次数":list_count})
    df2.to_excel("%s_export.xls"%filename)
    # df1 = pd.DataFrame({"月份":df["月份"],"电话":df["电话"],"公司名字":df["公司名字"],"开通时间":df["开通时间"],
    #                     "到期时间":df["到期时间"],"客户公司注册时间":df["客户公司注册时间"],"客户公司注册资金":df["客户公司注册资金"],
    #                     "实际缴费资金":df["实际缴费资金"],"天眼查行业分类":df["天眼查行业分类"],"是否中标":list_zb,
    #                     "中标次数":list_count,"中标项目|3个":list_project,"中标金额":list_money,"客户设置关键词":df["客户设置关键词"],"客户搜索词":df["客户搜索词"].xls})
    # df1.to_excel("%s_补充.xls"%filename,columns=["月份","电话","公司名字",	"开通时间"	,"到期时间"	,"客户公司注册时间"	,"客户公司注册资金"	,"实际缴费资金"	,"天眼查行业分类"	,"是否中标"	,"中标次数"	,"中标项目|3个"	,"中标金额"	,"客户设置关键词"	,"客户搜索词"])
+
def deal3():
    """Count companies in ../data/导出工厂.xlsx whose paid-in capital (实缴)
    exceeds 5000万, printing a running count.
    """
    filename = "../data/导出工厂.xlsx"
    # Bug fix: the original called pd.DataFrame(filename), which builds a
    # one-cell frame from the path string (then KeyError on "实缴"); the
    # workbook must be read with read_excel.
    df = pd.read_excel(filename)
    # Hoist the constant threshold out of the loop.
    threshold = getUnifyMoney("5000万")
    count = 0
    for item in df["实缴"]:
        if getUnifyMoney(item)>threshold:
            count += 1
            print(count)
+
def exportEnterpriseByName():
    """Enrich (name, phone) pairs from ../data/中标家具公司.csv with enterprise
    attributes fetched from OTS by primary key, then write 中标家具公司1.csv.

    Each worker enriches its item dict in place; the contact_person column is
    filled only when a contact's mobile/phone number matches the input phone.
    """
    df = pd.read_csv("../data/中标家具公司.csv",encoding="GBK")

    def _handle(item,result_queue,pool_ots):
        ots_client = pool_ots.getConnector()
        try:
            primary_key = [('name',str(item["name"]))]

            columns_to_get = ["reg_capital","actual_capital","contacts","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]

            consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)

            print(return_row)

            for _item in return_row.attribute_columns:
                if _item[0]=="contacts":
                    # contacts is a JSON array; pick the person whose number
                    # matches the phone from the input csv.
                    a = json.loads(_item[1])
                    for i in a:
                        if i.get("mobile_no","")==item["phone"] or i.get("phone_no","")==item["phone"]:
                            item["contact_person"] = i.get("contact_person","")
                else:
                    item[_item[0]] = _item[1]
        finally:
            # Bug fix: the connector was never returned to the pool, leaking
            # one connection per task.
            pool_ots.putConnector(ots_client)

    list_dict = []
    for name,phone in zip(df["name"],df["phone"]):
        list_dict.append({"name":name,"phone":phone})

    task_queue = queue.Queue()
    for item in list_dict:
        task_queue.put(item)

    result_queue = queue.Queue()
    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=70,pool_ots=pool_ots)
    mt.run()

    # The workers mutated list_dict items in place, so the output columns are
    # read straight from those dicts (missing keys become "").
    columns = ["name","contact_person","phone","reg_capital","actual_capital","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]
    df_data = {}
    for _c in columns:
        df_data[_c] = []
    for item in list_dict:
        for _key in columns:
            df_data[_key].append(item.get(_key,""))
    df1 = pd.DataFrame(df_data)
    df1.to_csv("中标家具公司1.csv")
+
def getCompanys():
    """Fetch active Guangzhou member companies (login, contact, member level,
    after-sales rep) from the bxkc MySQL schema.

    Returns:
        list[dict]: one dict per row, keyed by the SQL column aliases.

    NOTE(review): this definition shadows the earlier getCompanys() in this
    module — confirm which one callers expect. The column name `A.etiem` in
    the SQL looks like a typo for `etime` but presumably matches the actual
    schema — verify against the database.
    """
    conn = getConnection_mysql()
    cursor = conn.cursor()
    sql = '''select C.login_id as 登陆名,B.company ,B.contactname as 联系人,B.phone as 联系电话 ,(select MLEVELNAME from sys_memberlevel where id =A.memberlevelid) as 会员等级,( select name from b2c_mall_staff_basic_info where userid=B.aftermarket) as 售后客服   from bxkc.bxkc_member_term A,bxkc.b2c_mall_staff_basic_info B,bxkc.b2c_user_login_info C
where A.USERID=B.USERID and B.USERID=C.USERID and B.innerOrg like '广州%'
and A.memberlevelid!=81 and A.status='01' and str_to_date('2020-11-20','%Y-%m-%d') between  A.stime and A.etiem ;
'''
    cursor.execute(sql)
    # cursor.description carries the column aliases used as dict keys below.
    vol = cursor.description
    list_company = []
    rows = cursor.fetchall()
    for row in rows:
        _company = {}
        for _vol,_value in zip(vol,row):
            _name = _vol[0]
            _company[_name] = _value
        list_company.append(_company)
    return list_company
+
def exportEnterprise_byindustry(page_time,
                                columns = ["name","address","business_scope","province","city","district","reg_capital","phone","estiblish_time"],
                                keywords = ["钢材","水泥","五金","水电","暖通","暖气","电缆"]):
    """Export enterprises whose industry/nicknames match any keyword and that
    were established before 2017-01-01, paging through the OTS search index
    and writing ../data/enterprise_2017_a.csv.

    Args:
        page_time: unused in the current body (kept for interface stability).
        columns: attribute columns to fetch and export.
        keywords: wildcard terms matched against industry and nicknames.
    """
    list_should_q = []
    for _key in keywords:
        list_should_q.append(WildcardQuery("industry","*%s*"%_key))
        list_should_q.append(WildcardQuery("nicknames","*%s*"%_key))
    key_query = BoolQuery(should_queries=list_should_q)

    #WildcardQuery("industry","*建筑*")
    ots_client = getConnect_ots()
    bool_query = BoolQuery(must_queries=[RangeQuery("bidi_id",0,include_lower=True),
                                         key_query,
                                         RangeQuery("estiblish_time",range_to="2017-01-01")])

    rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
                                                                      SearchQuery(bool_query, limit=100, get_total_count=True),
                                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
    all_rows = 0
    df_data = {}
    for key in columns:
        df_data[key] = []
    # First page: flatten each row's (primary key, attribute) column pairs
    # into a flat dict, then append per-column values.
    for row in rows:
        _dict = dict()
        for part in row:
            for item in part:
                _dict[item[0]] = item[1]
        for key in columns:
            df_data[key].append(_dict.get(key,""))
        # if "reg_capital" in _dict:
        #     _money = re.match("\d+[万亿千百十]",_dict["reg_capital"])
        #     if _money is not None:
        #         if getUnifyMoney(_money.group())>2000000:
        #             for key in columns:
        #                 df_data[key].append(_dict.get(key,""))
    all_rows += len(rows)

    # print(next_token)
    # Remaining pages: follow next_token until the index is exhausted.
    while(next_token):
        rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
                                                                          SearchQuery(bool_query, next_token=next_token,limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        for row in rows:
            _dict = dict()
            for part in row:
                for item in part:
                    _dict[item[0]] = item[1]
            for key in columns:
                df_data[key].append(_dict.get(key,""))
        # if "reg_capital" in _dict:
        #     _money = re.match("\d+[万亿千百十]",_dict["reg_capital"])
        #     if _money is not None:
        #         if getUnifyMoney(_money.group())>2000000:
        #             for key in columns:
        #                 df_data[key].append(_dict.get(key,""))
        all_rows += len(rows)
        print(all_rows,total_count,len(df_data[columns[0]]))
    df = pd.DataFrame(df_data)
    df.to_csv("../data/enterprise_2017_a.csv",columns=columns)
+
+
def getTyc_company():
    """Import Tianyancha company spreadsheets into the MySQL Enterprise table.

    Reads each xlsx in list_files (header on the third row) and inserts one
    row per company; per-row insert failures (e.g. duplicate keys) are logged
    and skipped so one bad row does not abort the file.
    """
    root_path = ["G:/文档/tyc国企","G:/文档/tyc机构"]
    list_files = []
    for _path in root_path:
        for file in os.listdir(_path):
            list_files.append(os.path.join(_path,file))

    # Currently narrowed to a single file; the directory scan above is kept
    # for when the full import is re-enabled.
    list_files = ["G:/文档/tyc机构\\高级搜索导出数据结果—自定义条件—天眼查(W20011656561610789770227).xlsx"]

    pool_mysql = ConnectorPool(method_init=getConnection_testmysql,init_num=10,max_num=30)
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for _file in list_files:
        task_queue.put(_file)

    def _handle(_file,task_queue,pool_mysql):
        # One task == one spreadsheet. The second positional parameter is the
        # queue argument supplied by MultiThreadHandler (unused here).
        print("handle",_file)
        conn = pool_mysql.getConnector()
        try:
            cursor = conn.cursor()
            df = pd.read_excel(_file,header=2)
            # Security/robustness fix: use a parameterized insert instead of
            # %-interpolating values into the SQL string, so names containing
            # quotes no longer break (or inject into) the statement.
            sql = " insert into Enterprise(name,social_credit,identification,regist_num,organization_code) values (%s,%s,%s,%s,%s)"
            for name,social_credit,identification,regist_num,organization_code in zip(df["公司名称"],df["统一社会信用代码"],df["纳税人识别号"],df["注册号"],df["组织机构代码"]):
                try:
                    cursor.execute(sql,(name,social_credit,identification,regist_num,organization_code))
                except Exception as e:
                    print("error")
            conn.commit()
        finally:
            # Always return the connection to the pool, even on read errors.
            pool_mysql.putConnector(conn)

    mt = MultiThreadHandler(task_queue,_handle,result_queue,20,pool_mysql=pool_mysql)
    mt.run()
+
# Module-level registry of output column names, populated by set_dict_item in
# first-seen order (the set gives O(1) membership, the list keeps ordering).
set_columns = set()
list_df_columns = []

def set_dict_item(_dict,name,v):
    """Store the legalized value under `name` in `_dict`, registering `name`
    as an output column the first time it is seen (module-level side effect).
    """
    _dict[name] = getLegal_str(v)
    if name not in set_columns:
        set_columns.add(name)
        list_df_columns.append(getLegal_str(name))
+
def exportEnterprise_by_bidNum():
    """Export enterprise name/region/mobile contacts for selected provinces.

    For each province, searches the OTS "enterprise" index for companies whose
    nicknames mention 地产 or 酒店 and that have a mobile contact, flattens one
    output row per mobile number, and writes a dated xlsx under ../data/.

    Side effects: set_dict_item registers column names in the module-level
    set_columns/list_df_columns, which define the output column order.
    """
    columns = ["name","contacts","province","city","address","reg_location"]

    list_data = []
    # Bug fix: df_data must exist before the loop — the original only
    # assigned it after the loop, so the first getData(df_data, rows) call
    # raised NameError. getData ignores the parameter (it appends to
    # list_data), but the name must still resolve.
    df_data = {}
    ots_client = getConnect_ots()
    for _prov in ["北京","天津"]:
        bool_query = BoolQuery(must_queries=[BoolQuery(should_queries=[TermQuery("province",_prov)]),
                                             BoolQuery(should_queries=[MatchPhraseQuery("nicknames","地产"),MatchPhraseQuery("nicknames","酒店")]),
                                             NestedQuery("contacts",WildcardQuery("contacts.mobile_no","1*"))])

        rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
                                                                          SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("tyc_id",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))

        def getData(df_data,rows):
            # Flatten every mobile contact of every returned row into one
            # output record appended to the enclosing list_data.
            list_dict = getRow_ots(rows)
            for _dict in list_dict:
                print(_dict)

                for mobile_person,mobile_no in getMobiles(_dict.get("contacts","[]")):
                    _d = {}
                    set_dict_item(_d,"名称",_dict.get("name",""))
                    set_dict_item(_d,"省份",_dict.get("province",""))
                    set_dict_item(_d,"城市",_dict.get("city",""))
                    set_dict_item(_d,"联系人",mobile_person)
                    set_dict_item(_d,"手机",mobile_no)
                    list_data.append(_d)

        getData(df_data,rows)
        _count = len(rows)
        while(next_token):
            print("%d/%d"%(_count,total_count))
            rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
                                                                              SearchQuery(bool_query, next_token=next_token,limit=100, get_total_count=True),
                                                                              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            getData(df_data,rows)
            _count += len(rows)
            if _count>=300:
                # Cap the export at ~300 source rows per province.
                break

    # Build the output frame in the column order recorded by set_dict_item.
    for item in list_data:
        for k in list_df_columns:
            if k not in df_data:
                df_data[k] = []
            df_data[k].append(item.get(k))
    df = pd.DataFrame(df_data)
    df.to_excel("../data/%s_enterprise_bidinum.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=list_df_columns)
+
def make_Legal_enterprise():
    """Build a deduplicated whitelist of plausible enterprise names from the
    exported CSV and the MySQL Enterprise table, writing one name per line to
    ../data/other/LEGAL_ENTERPRISE.txt (UTF-8).
    """
    import codecs
    def format(_e):
        # Normalize one candidate name; return None to reject it.
        if _e is None:
            return None
        if not isinstance(_e,str):
            return None
        # Reject pure ASCII alphanumerics (codes, not names).
        if re.search("^[a-zA-Z0-9]+$",_e) is not None:
            return None
        # NOTE(review): this character class looks malformed — the ']' after
        # '《' closes it early, making the rest a literal ">-。-.?]" sequence;
        # possibly intended "[<《>》\-。\.\?]". Confirm before changing, since
        # fixing it alters which names are rejected.
        if re.search("[<《]>-。\-\.\?]",_e) is not None:
            return None
        # Normalize to full-width parentheses and strip all whitespace.
        _e1 = re.sub("\s+","",_e.replace("(","(").replace(")",")"))
        # Reject names ending in an administrative-region suffix.
        if re.search("[省市区县乡镇]$",_e) is not None:
            return None
        if len(_e1)>=4:
            return _e1
        return None
    set_enterprise = set()
    df = pd.read_csv("../data/other/enterprise_bidinum.csv", encoding="GBK")

    _count = 0
    for _e in df["name"]:
        _count += 1
        if _count%10000==0:
            print(_count)
        _e1 = format(_e)
        if _e1 is not None:
            set_enterprise.add(_e1)

    # Second source: stream names from MySQL in 10k batches.
    conn = getConnection_testmysql()
    cursor = conn.cursor()
    sql = " select name from Enterprise "
    cursor.execute(sql)
    rows = cursor.fetchmany(10000)
    while rows:
        for row in rows:
            _count += 1
            if _count%10000==0:
                print(_count)
            _e = row[0]
            _e1 = format(_e)
            if _e1 is not None:
                set_enterprise.add(_e1)
        rows = cursor.fetchmany(10000)

    with codecs.open("../data/other/LEGAL_ENTERPRISE.txt", "w", encoding="UTF8") as f:
        for _e in list(set_enterprise):
            f.write(_e+"\n")
+
+
def getDictEnterprise(list_enterprise,columns_to_get = ["reg_capital","actual_capital","industry","estiblish_time","social_staff_num","zhong_biao_number","tou_biao_number","credit_code"]):
    """Fetch selected attribute columns for each enterprise name from OTS,
    using a 50-thread worker pool over a connector pool.

    Args:
        list_enterprise: iterable of enterprise names (coerced to str).
        columns_to_get: attribute columns to request per row.

    Returns:
        dict: name -> attribute dict; names whose lookup failed or returned
        nothing are absent from the result.
    """
    task_queue = queue.Queue()
    result_queue= queue.Queue()

    for _enterprise in list_enterprise:
        task_queue.put(str(_enterprise))
    def _handle(item,result_queue,pool_ots):
        # Worker: single get_row by primary key; errors are printed and the
        # item skipped so one bad name does not stop the batch.
        ots_client = pool_ots.getConnector()
        try:
            primary_key = [("name",item)]
            consumed,return_row,next_token = ots_client.get_row("enterprise",primary_key,columns_to_get,None,1)
            dict_data = getRow_ots_primary(return_row)
            if dict_data is not None:
                result_queue.put({item:dict_data})
        except Exception as e:
            traceback.print_exc()

        pool_ots.putConnector(ots_client)

    pool_ots = ConnectorPool(init_num=10,max_num=50,method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=50,pool_ots=pool_ots)
    mt.run()

    # Drain the result queue (non-blocking get; Empty terminates the loop).
    dict_enterprise = {}
    while True:
        try:
            _dict = result_queue.get(False)
            for k,v in _dict.items():
                dict_enterprise[k] = v
        except Exception as e:
            break
    return dict_enterprise
+
+
+
def getOneContact(contacts,tojson=True,mobile_first=True,mobile_only=True):
    """Pick a single (person, number) pair out of a contact list.

    The last entry carrying a mobile number and the last entry carrying a
    landline are remembered separately. With ``mobile_first`` the mobile pair
    wins when present; with ``mobile_only`` as well, the (possibly empty)
    mobile pair is returned even when no mobile exists. Bad input (None /
    unparseable JSON) degrades to whatever landline data was accumulated.

    :param contacts: JSON string or already-parsed list of contact dicts
    :param tojson: parse ``contacts`` with json.loads when True
    :return: (contact_person, number) — possibly ("", "")
    """
    mobile = ("", "")
    landline = ("", "")
    if contacts is None:
        return "", ""
    try:
        records = json.loads(contacts) if tojson else contacts

        for record in records:
            if record.get("mobile_no", "") != "":
                mobile = (record.get("contact_person", ""), record.get("mobile_no", ""))
            if record.get("phone_no", "") != "":
                landline = (record.get("contact_person", ""), record.get("phone_no", ""))

        # mobile wins when requested; mobile_only forces the mobile slot
        # even when it is empty
        if mobile_first and (mobile[1] != "" or mobile_only):
            return mobile
    except Exception:
        pass
    return landline
+
def getMobiles(contacts,to_json=True):
    """Return [person, mobile_no] pairs for every contact with a mobile number.

    :param contacts: JSON string or already-parsed list of contact dicts
    :param to_json: parse ``contacts`` with json.loads when True
    :return: list of two-element lists, in input order
    """
    records = json.loads(contacts) if to_json else contacts
    return [
        [entry.get("contact_person", ""), entry.get("mobile_no")]
        for entry in records
        if entry.get("mobile_no", "") != ""
    ]
+
def getEnterpriseData(list_enterprise,df_data):
    """Flatten enterprise dicts into df_data's per-column lists (in place).

    Bug fix: the original wrapped this loop in an identically named nested
    function that was never called, so calling getEnterpriseData() did
    nothing at all.

    :param list_enterprise: iterable of enterprise dicts (getRow_ots output)
    :param df_data: dict of column-name -> list, extended in place
    """
    for _e in list_enterprise:
        _dict = {}
        set_dict_item(_dict,"公司名称",_e.get("name"))
        set_dict_item(_dict,"省份",_e.get("province"))
        set_dict_item(_dict,"城市",_e.get("city"))
        set_dict_item(_dict,"法人",_e.get("legal_person"))
        set_dict_item(_dict,"法人电话",_e.get("phone"))
        # a Chinese mobile number starts with 1 followed by 10 digits
        _match = re.search(r"^1\d{10}",_e.get("phone",""))
        set_dict_item(_dict,"是否手机","是" if _match is not None else "否")
        for k,v in _dict.items():
            if k not in df_data:
                df_data[k] = []
            df_data[k].append(v)
+
def exportEnterprise():
    """Export a sample of enterprises (province filter, non-closed status) to Excel.

    Pages through the OTS "enterprise" index (capped at ~200 rows), then for
    each enterprise pulls up to 5 scored contacts from "enterprise_contact"
    and writes one spreadsheet row per enterprise.
    """
    def getEnterpriseData(list_enterprise,df_data):
        # Flatten each enterprise dict into df_data's column lists; uses the
        # enclosing scope's ots_client for the per-enterprise contact lookup.
        for _e in list_enterprise:
            _dict = {}
            set_dict_item(_dict,"公司名称",_e.get("name"))

            bool_query = BoolQuery(must_queries=[
                TermQuery("enterprise_name",_e.get("name"))
            ])

            # top-5 contacts by descending score for this enterprise
            rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
                                                                           SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("score",SortOrder.DESC)]),limit=5,get_total_count=False),
                                                                           ColumnsToGet(["contact_person","position","phone_no"],ColumnReturnType.SPECIFIED))
            list_row = getRow_ots(rows)
            # always emit 5 contact columns, padding with "" when fewer exist
            for _i in range(1,6):
                if _i-1<len(list_row):
                    set_dict_item(_dict,"企业联系人%d"%_i,"%s(%s)\n%s"%(list_row[_i-1].get("contact_person",""),list_row[_i-1].get("position",""),list_row[_i-1].get("phone_no","")))
                else:
                    set_dict_item(_dict,"企业联系人%d"%_i,"")

# exported columns: status / industry / org type / reg capital / founded / reg region / address

            set_dict_item(_dict,"企业状态",_e.get("reg_status"))
            set_dict_item(_dict,"行业",_e.get("industry"))
            set_dict_item(_dict,"机构类型","公司")
            set_dict_item(_dict,"注册资本",_e.get("reg_capital"))
            set_dict_item(_dict,"成立时间",_e.get("found_date"))
            set_dict_item(_dict,"企业注册地","%s-%s"%(_e.get("province",""),_e.get("city","")))
            set_dict_item(_dict,"企业地址",_e.get("reg_location"))

            # (kept) earlier variants of the exported columns:
            # _match = re.search("^1\d{10}",_e.get("phone",""))
            # set_dict_item(_dict,"是否手机","是" if _match is not None else "否")
            # set_dict_item(_dict,"企业属性",v.get("business_scope",""))
            # set_dict_item(_dict,"行业",v.get("industry",""))
            # contact_person,mobile_no = getOneContact(v.get("contacts",'[]'))
            # set_dict_item(_dict,"所有联系方式",v.get("contacts"))
            # set_dict_item(_dict,"联系人",contact_person)
            # set_dict_item(_dict,"手机号",mobile_no)
            # set_dict_item(_dict,"注册时间",v.get("estiblish_time",""))
            # set_dict_item(_dict,"注册资金",v.get("reg_capital",""))
            # set_dict_item(_dict,"bid_number",v.get("bid_number",0))
            # set_dict_item(_dict,"招标次数",v.get("zhao_biao_number",0))
            # set_dict_item(_dict,"投标次数",v.get("tou_biao_number",0))
            # set_dict_item(_dict,"中标次数",v.get("zhong_biao_number",0))
            # set_dict_item(_dict,"主营产品",v.get("products",""))
            for k,v in _dict.items():
                if k not in df_data:
                    df_data[k] = []
                df_data[k].append(v)


    a = '''

    '''

    sys_keys = splitIntoList(a,"\s")  # NOTE(review): unused, and `a` is blank here

    # data = pd.read_excel("../data/用户投标情况导出.xlsx")
    _name_c = "公司名称"
    list_enterprise = []
    columns = ["province","city","legal_person","phone","reg_status","industry","reg_capital","found_date","reg_location"]
    ots_client = getConnect_ots()
    bool_query = BoolQuery(must_queries=[
        # RangeQuery("zhong_biao_number",1000)
        # TermQuery("qualifications_number",0),
        # MatchPhraseQuery("nicknames","公司"),
        generateBoolShouldQuery(["province"],["四川"],WildcardQuery),
        # NOTE(review): the single-argument call below looks incomplete
        # compared to the three-argument calls around it -- confirm intent
        generateBoolShouldQuery([""])
        # generateBoolShouldQuery(["province"],["上海","江苏","浙江","安徽","福建","江西","山东"],WildcardQuery),
        # generateBoolShouldQuery(["nicknames"],["工程","建筑","建设"],MatchPhraseQuery)
    ],
        must_not_queries=[RangeQuery("status",401,451)])
    rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("zhong_biao_number",SortOrder.ASC)]),limit=100,get_total_count=True),
                                                                   columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
    print("total_count",total_count)
    list_data = getRow_ots(rows)
    list_enterprise.extend(list_data)
    # keep paging until OTS stops returning a continuation token,
    # but cap the sample at 200 enterprises
    while next_token:
        rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                       columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
        print("%d/%d"%(len(list_enterprise),total_count))
        list_data = getRow_ots(rows)
        list_enterprise.extend(list_data)
        if len(list_enterprise)>=200:
            break


    # dict_enterprise = getDictEnterprise(data[_name_c][:1050000],
    df_data = {}
    getEnterpriseData(list_enterprise,df_data)
    df = pd.DataFrame(df_data)
    # list_df_columns is presumably a module-level column-order list -- TODO confirm
    df.to_excel("../data/%s企业导出.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=list_df_columns)
+
+
+import numpy as np
def exportEnterprise_by_phone():
    """Count how many company names from a user spreadsheet exist in OTS.

    Reads "公司名称" from the Excel export, point-reads each name from the
    OTS "enterprise" table and prints running hit counts.

    Bug fix: the index increment used to sit inside the try-block with a
    silent ``except: pass``, so any exception (e.g. a bad row index) left
    ``_begin`` unchanged and the loop spun forever on the same record. The
    increment now happens in ``finally`` and failures are logged.
    """
    ots_client = getConnect_ots()

    filename = "C:\\Users\\Administrator\\Desktop\\用户数据0910.xlsx"

    df = pd.read_excel(filename)
    all_count = 0
    _begin = 0
    int_count = 0
    # 5582 is the known row count of this particular spreadsheet;
    # min() guards against a shorter file
    last_row = min(5582, len(df))
    while _begin < last_row:
        try:
            str_enter = str(df["公司名称"][_begin])
            consumed, return_row, next_token = ots_client.get_row("enterprise", [('name', str_enter)], ["nicknames"], None, 1)
            rows = getRow_ots_primary(return_row)
            total_count = len(rows)
            int_count += 1

            if total_count > 0:
                all_count += total_count
                print("===", str_enter, int_count, all_count)
        except Exception:
            traceback.print_exc()
        finally:
            _begin += 1
    print("===", int_count, all_count)
+
def attachColumn():
    """Append legal_person/phone columns to the 中标单位 (winner) spreadsheet.

    Every winner name is looked up via getDictEnterprise and the result is
    written to "<filename>.attach.xlsx". Names that have no OTS row produce
    None cells.

    Bug fix: the original called ``d_e.get(_name).get(_c)``, which raised
    AttributeError as soon as one name was missing from the lookup result;
    a default empty dict now makes missing names harmless (same pattern as
    attachColumn1).
    """
    filename = "../data/中标单位.xlsx"
    df1 = pd.read_excel(filename)
    list_enterprise = list(df1["中标单位"])
    d_e = getDictEnterprise(list_enterprise,["legal_person","phone"])
    columns = ["name","legal_person","phone"]
    df_data = {c: [] for c in columns}
    for _name in list_enterprise:
        _row = d_e.get(_name, {})  # tolerate names absent from OTS
        for _c in columns:
            df_data[_c].append(_row.get(_c))
    df = pd.DataFrame(df_data)
    df.to_excel("%s.attach.xlsx"%(filename))
+
def transform_enterprise():
    """Copy enterprise rows from the test MySQL into Oracle BXKC.COMPANY_NAME_INFO.

    Rows are read in batches and inserted with Oracle ``INSERT ALL``. When a
    batch insert fails it is retried row by row so a single bad row cannot
    discard the whole batch.

    Bug fix: the original fetched the NEXT batch unconditionally at the end
    of each iteration, so a failed batch was silently dropped and the
    row-by-row retry ran against the wrong rows. The cursor now only
    advances after the current batch has been committed.

    NOTE(security): values are spliced into the SQL with %-formatting, so a
    name containing a quote breaks the statement (and this is injectable);
    bind variables / executemany would be safer.
    """
    conn_source = getConnection_testmysql()
    conn_target = getConnection_oracle()
    cursor_source = conn_source.cursor()
    cursor_target = conn_target.cursor()

    sql = "  select name,province,city,credit_code,org_number,tax_number from enterprise_build "
    cursor_source.execute(sql)
    rows_source = cursor_source.fetchmany(10)
    excepted = False
    _index = 0

    def _row_values(_r):
        # the target column order (TAX_NUM, ORG_NUM, CREDIT_CODE) differs
        # from the select order
        return (_r[0], _r[1], _r[2], _r[5], _r[4], _r[3])

    while True:
        try:
            if excepted:
                print("==")
                # retry the batch that just failed, one row at a time;
                # a row that still fails is logged and skipped
                for _r in rows_source:
                    try:
                        _sql = " insert into BXKC.COMPANY_NAME_INFO(COMPANY_NAME,PROVINCE,CITY,TAX_NUM,ORG_NUM,CREDIT_CODE) values ('%s','%s','%s','%s','%s','%s')" % _row_values(_r)
                        _sql = _sql.replace("None","")
                        cursor_target.execute(_sql)
                    except Exception:
                        traceback.print_exc()
                conn_target.commit()
                excepted = False
            else:
                _sql = " INSERT ALL"
                for _r in rows_source:
                    _sql += " into BXKC.COMPANY_NAME_INFO(COMPANY_NAME,PROVINCE,CITY,TAX_NUM,ORG_NUM,CREDIT_CODE) values ('%s','%s','%s','%s','%s','%s') " % _row_values(_r)
                _sql = _sql + " select 1 from dual "
                _sql = _sql.replace("None","")
                cursor_target.execute(_sql)
                conn_target.commit()

        except Exception:
            excepted = True
            traceback.print_exc()
        if not excepted:
            # only advance once the current batch is committed
            rows_source = cursor_source.fetchmany(1000)
        _index += 1
        print(_index,excepted)
        if not rows_source or len(rows_source)==0:
            break
+
+
def exportEnterprise_GMV():
    """Export trailing 3/6/12/24-month GMV (sum of win_bid_price) per enterprise.

    Collects up to ~10000 enterprises with 20-100 wins from the OTS
    "enterprise" index, then for each one sums the win_bid_price of its
    project2 records (page_time 2020-01-01..2021-12-31) into four trailing
    windows and writes the result to 蚂蚁测试数据.xlsx.

    Bug fix: the per-enterprise pagination loop assigned its result to a
    typo variable ("ows") and never passed next_token back to the search,
    so it re-fetched page 1 forever — both an infinite loop and a
    double-count. The window accumulation is also factored into a helper
    instead of being duplicated.
    """
    task_queue = queue.Queue()

    ots_client = getConnect_ots()

    bool_query = BoolQuery(must_queries=[
        RangeQuery("zhong_biao_number",20,100)
    ])
    rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("zhong_biao_number")]),limit=100,get_total_count=True),
                                                                   ColumnsToGet(["zhao_biao_number"],ColumnReturnType.SPECIFIED))
    for _dict in getRow_ots(rows):
        task_queue.put(_dict)
    # page through the enterprise index, capped at ~10000 tasks
    while next_token:
        rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                       ColumnsToGet(["zhao_biao_number"],ColumnReturnType.SPECIFIED))
        for _dict in getRow_ots(rows):
            task_queue.put(_dict)
        if task_queue.qsize()>=10000:
            break

    def _accumulate(_dict,list_rows):
        # fold one page of project rows into the trailing-window sums;
        # windows are cumulative relative to 2021-12-31
        for _row in list_rows:
            page_time = _row.get("page_time")
            win_bid_price = _row.get("win_bid_price",0)
            _dict["c24"] += win_bid_price
            if page_time>="2021-01-01":
                _dict["c12"] += win_bid_price
            if page_time>="2021-07-01":
                _dict["c6"] += win_bid_price
            if page_time>="2021-10-01":
                _dict["c3"] += win_bid_price

    def _handle(_dict,result_queue,ots_client):
        # worker: sum all wins of one enterprise into its window counters
        name = _dict.get("name")

        bool_query = BoolQuery(must_queries=[
            RangeQuery("page_time","2020-01-01","2021-12-31",True,True),
            TermQuery("win_tenderer",name)
        ])
        rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=100,get_total_count=True),
                                                                       ColumnsToGet(["page_time","win_bid_price"],ColumnReturnType.SPECIFIED))
        _dict["c3"] = 0
        _dict["c6"] = 0
        _dict["c12"] = 0
        _dict["c24"] = 0
        _accumulate(_dict,getRow_ots(rows))
        while next_token:
            rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                           ColumnsToGet(["page_time","win_bid_price"],ColumnReturnType.SPECIFIED))
            _accumulate(_dict,getRow_ots(rows))
        result_queue.put(_dict)

    result_queue = queue.Queue()
    mt = MultiThreadHandler(task_queue,_handle,result_queue,30,ots_client=ots_client)
    mt.run()
    list_item = []
    while True:
        try:
            list_item.append(result_queue.get(False))
        except queue.Empty:
            break
    df_data = {"公司名称":[],
               "近3个月营收":[],
               "近6个月营收":[],
               "近12个月营收":[],
               "近24个月营收":[]}
    for _dict in list_item:
        df_data["公司名称"].append(_dict.get("name"))
        df_data["近3个月营收"].append(_dict.get("c3"))
        df_data["近6个月营收"].append(_dict.get("c6"))
        df_data["近12个月营收"].append(_dict.get("c12"))
        df_data["近24个月营收"].append(_dict.get("c24"))
    df = pd.DataFrame(df_data)
    df.to_excel("蚂蚁测试数据.xlsx",columns=["公司名称","近3个月营收","近6个月营收","近12个月营收","近24个月营收"])
+
+
+
def attachColumn1():
    """Add a 是否中标 (has-ever-won) yes/no column to the spreadsheet.

    Looks up each distinct company's zhong_biao_number via
    getDictEnterprise and writes the augmented frame to a new file.
    """
    filename = "全国剩下数据16570-1(2).xlsx"
    df = pd.read_excel(filename)
    dict_en = getDictEnterprise(list(set(df["公司名"])))
    flags = []
    for company in df["公司名"]:
        has_won = dict_en.get(company, {}).get("zhong_biao_number", 0) > 0
        flags.append("是" if has_won else "否")
    df["是否中标"] = flags
    df.to_excel("全国剩下数据16570-1(2)11.xlsx")
+
def exportContact():
    """Attach a registered address and one qualified contact to enterprise names.

    Reads enterprise names from the "_id" column of an Excel export, looks
    each one up in OTS ("enterprise" for reg_location; "enterprise_contact"
    for a contact that is the legal person and/or a mobile number), and
    writes the rows that obtained a phone number to a timestamped Excel file.
    """
    filename = "../data/2023-03-06_190109_to_excel.xlsx"
    df = pd.read_excel(filename)
    list_ename = df["_id"]

    # keep only non-empty string names
    list_dict = []
    for _en in list_ename:
        if isinstance(_en,(str)) and _en!="":
            _dict = {"enterprise_name":_en}
            list_dict.append(_dict)
    task_queue = queue.Queue()
    for _d in list_dict:
        task_queue.put(_d)
    ots_client = getConnect_ots()
    def _handle(_d,result_queue):
        # results are merged into _d in place (list_dict shares the dicts,
        # so no result_queue is needed)
        _name = _d["enterprise_name"]
        bool_query = BoolQuery(must_queries=[TermQuery("name",_name)])

        rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
                                                                       SearchQuery(bool_query,limit=1),
                                                                       columns_to_get=ColumnsToGet(["reg_location"],return_type=ColumnReturnType.SPECIFIED))
        l_data = getRow_ots(rows)
        if len(l_data)>0:
            _d.update(l_data[0])

        # contact must belong to the enterprise and be the legal person
        # and/or a mobile number; only the first of up to 5 hits is kept
        bool_query = BoolQuery(must_queries=[TermQuery("enterprise_name",_name),
                                             BoolQuery(should_queries=[TermQuery("is_legal_person",1),
                                                                       TermQuery("is_mobile",1)])])

        rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
                                                                       SearchQuery(bool_query,limit=5),
                                                                       columns_to_get=ColumnsToGet(["enterprise_name","contact_person","phone_no","position"],return_type=ColumnReturnType.SPECIFIED))
        l_data = getRow_ots(rows)
        if len(l_data)>0:
            _d.update(l_data[0])

    mt = MultiThreadHandler(task_queue,_handle,None,60)
    mt.run()
    df_data= {}
    columns = ["name","contact_person","phone_no","reg_location"]
    for _d in list_dict:
        if "phone_no" in _d:  # only export rows that actually got a contact
            for c in columns:
                if c not in df_data:
                    df_data[c] = []
                df_data[c].append(getLegal_str(_d.get(c,"")))


    df = pd.DataFrame(df_data)
    df.to_excel("../data/%s_export_enterprise.xlsx"%(getCurrent_date(format="%Y-%m-%d_%H%M%S")),encoding="utf",columns=columns)
+
def getTycCompany():
    """Enrich a company-address spreadsheet with OTS data.

    For every company name: legal person and phone (enterprise table),
    count of published announcements mentioning it (document table), and up
    to 10 scored contacts flattened into one cell (enterprise_contact
    table). The result is written to "<filename>.xlsx".
    """
    filename = "公司地址(1).xlsx"
    df = pd.read_excel(filename)
    list_name = df["name"]
    task_queue = queue.Queue()

    list_data = []
    for _i in range(len(list_name)):
        _name = list_name[_i]

        _d = {"企业名称":_name,
              "地址":df["address"][_i],
              "注册地址":df["reg_location"][_i]}
        task_queue.put(_d)
        list_data.append(_d)

    ots_client = getConnect_ots()
    columns = ["legal_person","phone_number"]
    def _handle(item,result_queue):
        # mutates item in place; exceptions are logged so one bad company
        # does not kill a worker thread
        try:
            # 1) base enterprise record (legal person / phone)
            bool_query = BoolQuery(must_queries=[TermQuery("name",item.get("企业名称"))])
            rows, next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
                                                                SearchQuery(bool_query,limit=1),columns_to_get=ColumnsToGet(column_names=columns,return_type=ColumnReturnType.SPECIFIED))
            item["count"] = len(getRow_ots(rows))
            if item["count"]==1:
                _d = getRow_ots(rows)[0]
                item["法人"] = _d.get("legal_person")
                item["法人电话"] = _d.get("phone_number")
                # item["简称"] = _d.get("alias")
                # NOTE(review): reg_status is not in the requested `columns`
                # above, so this likely ends up None -- confirm
                item["营业状态"] = _d.get("reg_status")

            # 2) how many published documents (status 201-301) mention it
            bool_query = BoolQuery(must_queries=[
                RangeQuery("status",201,301),
                generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[item.get("企业名称")],MatchPhraseQuery)
            ])
            rows, next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                           SearchQuery(bool_query,limit=1,get_total_count=True),columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))

            item["公告数量"] = total_count

            # 3) top-10 contacts by score, flattened into one text cell
            bool_query = BoolQuery(must_queries=[
                TermQuery("status",1),
                TermQuery("enterprise_name",item.get("企业名称")),

            ])
            rows, next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
                                                                            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("score",SortOrder.DESC)]),limit=10,get_total_count=True),
                                                                            columns_to_get=ColumnsToGet(["contact_person","phone_no"],return_type=ColumnReturnType.SPECIFIED))
            list_concat = getRow_ots(rows)
            concat = ""
            for data_i in range(len(list_concat)):
                data = list_concat[data_i]
                concat += "联系人%d%s(%s)\n"%(data_i+1,data.get("contact_person",""),data.get("phone_no",""))
            item["联系人"] = concat
        except Exception:
            traceback.print_exc()
    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()

    columns = ["企业名称","法人","法人电话","地址","注册地址","公告数量","联系人"]
    df_data = {}
    for data in list_data:
        for c in columns:
            if c not in df_data:
                df_data[c] = []
            df_data[c].append(data.get(c))
    df = pd.DataFrame(df_data)
    df.to_excel("%s.xlsx"%filename,columns=columns)
+
+
+
+
+
+if __name__=="__main__":
+    # getTyc_company()
+    getTycCompany()
+    # exportEnterprise_by_bidNum()
+    # print(getDictEnterprise(["南宁宏基建筑工程有限责任公司"],["phone"]))
+    # exportEnterprise_by_phone()
+    # make_Legal_enterprise()
+    # transform_enterprise()
+    # exportEnterprise()
+    # exportContact()
+    # attachColumn()
+    # attachColumn()
+
+    # ots_client = getConnect_ots()
+    # bool_query = BoolQuery(must_queries=[RangeQuery("tyc_id",1,include_lower=True),
+    #                                      RangeQuery("bid_number",4,include_lower=True)
+    #                                      ])
+    # bool_query = BoolQuery(must_queries=[TermQuery("bid_number",0)],
+    #                        must_not_queries=[ExistsQuery("tyc_id"),NestedQuery("contacts",ExistsQuery("contacts"))])
+    #
+    #
+    # columns = ["name","contacts","province","city","address","reg_location"]
+    # rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
+    #                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("tyc_id",SortOrder.ASC)]), limit=100, get_total_count=True),
+    #                                                                   ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+    # print(total_count)
+    # exportEnterprise_GMV()
+

+ 649 - 0
export/exportEs.py

@@ -0,0 +1,649 @@
#coding:utf8

import elasticsearch
from elasticsearch import Elasticsearch

from utils.Utils import getLegal_str

# intranet endpoint: http://es-cn-lbj3cjmy3000djxak.elasticsearch.aliyuncs.com:9200
# internet endpoint: http://es-cn-lbj3cjmy3000djxak.public.elasticsearch.aliyuncs.com:9200

# NOTE(security): credentials are hard-coded and the public endpoint is used;
# they should come from configuration / environment variables instead.
es = Elasticsearch(["http://es-cn-lbj3cjmy3000djxak.public.elasticsearch.aliyuncs.com:9200"],
                   http_auth=('elastic','WWBu9#1HWHo$$gJm'),
                   port=9200,timeout=60
                   )
print(es.info())
+
+keywords = "无人机、游艇、风电、碳塑、改性塑料、高分子、新材料、自行车、复合材料、碳纤维"
+# keywords = "无人机、游艇、风电、碳塑、改性塑料"
+keywords = "高分子、新材料、自行车、复合材料、碳纤维"
+list_should = []
+for keyword in keywords.split("、"):
+    list_should.extend([
+        # {'match_phrase':
+        #      {'nicknames':keyword
+        #       # {
+        #       # "query": "医院",  # >= 大于等于
+        #       # # # "lt": 1650038400000  # < 小于
+        #       # }
+        #       },
+        #  },
+        {'match_phrase':
+             {'business_scope':keyword
+              # {
+              # "query": "医院",  # >= 大于等于
+              # # # "lt": 1650038400000  # < 小于
+              # }
+              },
+         },
+        # {'match_phrase':
+        #      {'description':keyword
+        #       # {
+        #       # "query": "医院",  # >= 大于等于
+        #       # # # "lt": 1650038400000  # < 小于
+        #       # }
+        #       },
+        #  }
+    ])
+# print(es.indices.get_mapping())
# First draft of the ES query: parent "enterprise" docs that have a child
# "contacts" doc, filtered to hospital nicknames with a non-legal-person
# contact that has a mobile number.
# NOTE(review): `body` is reassigned further down in this file before it is
# used, so this version is effectively dead -- kept for reference. Also note
# the duplicate "_source" key: the first value ("_name") is overwritten by
# the last one in the dict literal.
body = {
    "_source": "_name",
    'query': {  # query clause
        "bool": {
            "must":[
                {"has_child":{
    "type":"contacts",
    "query":{
        "bool":{
            'must': [
                {'match_phrase':
                     {'nicknames':'医院'
                      # {
                      # "query": "医院",  # >= 大于等于
                      # # # "lt": 1650038400000  # < 小于
                      # }
                      },
                 },
                {'term':
                     {'contacts_is_legal_person':0
                      # {
                      # "query": "医院",  # >= 大于等于
                      # # # "lt": 1650038400000  # < 小于
                      # }
                      },
                 },{'term':
                        {'contacts_is_mobile':1
                         # {
                         # "query": "医院",  # >= 大于等于
                         # # # "lt": 1650038400000  # < 小于
                         # }
                         },
                    }
            ]
        }
    }
}},
                # {"bool":{
                #     "should":[
                #         {"term":{
                #             "district":"鹤山"
                #         }
                #         }
                #     ]
                # }},
                # {"bool":{
                #     "should":list_should
                # }}

            ]
            # 'must': [
            #
            #
            #     # {"range":{"tyc_id":{"gt":1111}}}
            #     # {'match_phrase':
            #     #     {'contacts_is_legal_person':1
            #     #         # {
            #     #         # "query": "医院",  # >= 大于等于
            #     #         # # # "lt": 1650038400000  # < 小于
            #     #         # }
            #     #     },
            #     # },{'match_phrase':
            #     #                        {'contacts_is_mobile':1
            #     #                         # {
            #     #                         # "query": "医院",  # >= 大于等于
            #     #                         # # # "lt": 1650038400000  # < 小于
            #     #                         # }
            #     #                         },
            #     #    }
            # ]

        },

    },
    # "sort": [
    #         {"id": "desc"}
    #     ]
    "_source":"reg_location,address,location"
}
+
+str_district = '''
+梅河口
+公主岭
+定州
+辛集
+晋州
+新乐
+滦州
+遵化
+迁安
+塔城
+乌苏
+武安
+阿勒泰
+南宫
+沙河
+安国
+高碑店
+涿州
+伊宁市
+奎屯
+霍尔果斯
+平泉
+阿图什
+泊头
+黄骅
+河间
+任丘
+库尔勒
+霸州
+三河
+深州
+博乐
+阿拉山口
+昌吉
+阜康
+和田市
+喀什
+阿克苏
+库车
+古交
+高平
+怀仁
+介休
+河津
+永济
+原平
+侯马
+霍州
+青铜峡
+孝义
+汾阳
+灵武
+德令哈
+格尔木
+霍林郭勒
+玉树
+满洲里
+牙克石
+扎兰屯
+额尔古纳
+根河
+丰镇
+乌兰浩特
+阿尔山
+锡林浩特
+二连浩特
+同仁
+合作
+江阴
+宜兴
+临夏市
+邳州
+新沂
+溧阳
+张家港
+昆山
+太仓
+常熟
+海安
+如皋
+启东
+华亭
+东台
+仪征
+高邮
+丹阳
+扬中
+句容
+玉门
+敦煌
+兴化
+靖江
+泰兴
+建德
+余姚
+慈溪
+瑞安
+乐清
+诸暨
+嵊州
+海宁
+平湖
+桐乡
+兰溪
+东阳
+永康
+义乌
+神木
+江山
+子长
+临海
+温岭
+玉环
+龙泉
+韩城
+华阴
+兴平
+彬州
+延吉大安
+图们
+敦化
+和龙
+珲春
+龙井
+巢湖
+扶余
+无为
+洮南
+
+临江
+集安
+双辽
+磐石
+桦甸
+蛟河
+舒兰
+榆树
+德惠
+桐城
+潜山
+兴城
+北票
+凌源
+界首
+调兵山
+开原
+天长
+明光
+灯塔
+大石桥
+盖州
+广德
+宁国
+凌海
+北镇
+东港
+凤城
+海城
+瓦房店
+庄河
+新民
+漠河
+安达
+肇东
+海伦
+嫩江
+北安
+五大连池
+绥芬河
+海林
+宁安
+穆棱
+东宁
+福清
+龙海
+同江
+富锦
+抚远
+石狮
+晋江
+南安
+铁力
+永安
+邵武
+武夷山
+建瓯
+漳平
+虎林
+密山
+福安
+福鼎
+讷河
+尚志
+五常
+瑞昌
+共青城
+庐山
+德兴
+高安
+丰城
+樟树
+景洪
+井冈山
+文山
+龙南
+瑞金
+乐平
+蒙自
+个旧
+开远
+弥勒
+楚雄
+大理
+贵溪
+香格里拉
+泸水
+胶州
+平度
+莱西
+芒市
+瑞丽
+滕州
+龙口
+莱阳
+莱州
+招远
+栖霞
+海阳
+诸城
+青州
+寿光
+安丘
+昌邑
+高密
+邹城
+曲阜
+新泰
+肥城
+荣成
+乳山
+腾冲
+水富
+澄江
+宣威
+安宁
+都匀
+福泉
+乐陵
+禹城
+凯里
+临清
+兴义
+兴仁
+赤水
+仁怀
+盘州
+新郑
+巩义
+登封
+荥阳
+新密
+清镇
+偃师
+舞钢
+汝州
+林州
+华蓥
+卫辉
+辉县
+长垣
+西昌
+沁阳
+孟州
+康定
+马尔康
+禹州
+长葛
+万源
+灵宝
+义马
+阆中
+永城
+项城
+峨眉山
+隆昌
+射洪
+邓州
+广汉
+什邡
+绵竹
+江油
+都江堰
+彭州
+崇州
+邛崃
+简阳
+大冶
+丹江口
+宜都
+当阳
+枝江
+凭祥
+枣阳
+宜城
+老河口
+合山
+靖西
+平果
+钟祥
+京山
+北流
+汉川
+应城
+安陆
+桂平
+东兴
+松滋
+石首
+监利
+洪湖
+岑溪
+麻城
+武穴
+荔浦
+赤壁
+广水
+恩施
+罗定
+普宁
+宁乡
+浏阳
+英德
+连州
+醴陵
+韶山
+湘乡
+阳春
+耒阳
+常宁
+邵东
+武冈
+陆丰
+汨罗
+临湘
+津市
+沅江
+涟源
+冷水江
+兴宁
+资兴
+高州
+化州
+信宜
+台山
+鹤山
+开平
+恩平
+洪江
+吉首
+四会
+廉江
+吴川
+雷州
+乐昌
+南雄
+龙港
+邹平
+利川
+茫崖
+'''
# Normalize the district blob into a clean list: trim whitespace per line
# and drop blank lines.
list_district = [name for name in (part.strip() for part in str_district.split("\n")) if name != ""]
+
# Contact lookup query (this reassignment of `body` is the one actually
# used): child "contacts" docs that are mobile numbers, are not the legal
# person, and were refreshed since 2023-01-01, whose parent "enterprise"
# is a hospital in 鹤山 district — excluding beauty/animal/plant hospitals.
body = {
    "query":{
        "bool":{
            "must":[
                {"term": {"my_join_field": "contacts"}},
                {"term":{"contacts_is_mobile":1}},
                {"term":{"contacts_is_legal_person":0}},
                {"range":{"contacts_update_time":{"gte":"2023-01-01 00:00:00"}}},

                # {"nested":{
                #     "path":"my_join_field",
                #     "query":{
                #         "bool":{
                #             "must":[
                #                 {"match_phrase":{"my_join_field.parent":"医院"}}
                #             ]
                #         }
                #     }
                # }}
                # {"match_phrase":{"contacts_enterprise_name":"广州"}}
                {
                    "has_parent":{
                        "parent_type": "enterprise",
                        "query":{
                            "bool":{
                                "must":[
                                    {"term":{"district":"鹤山"}},
                                    {"match_phrase":{"nicknames":"医院"}},
                                ],
                                "must_not":[
                                    {"match_phrase":{"nicknames":"美容"}},
                                    {"match_phrase":{"nicknames":"动物"}},
                                    {"match_phrase":{"nicknames":"植物"}},
                                ]
                            }
                        }
                    }
                }
            ],

        }
    },
    "_source":["contacts_enterprise_name","contacts_person_name","contacts_phone_no"]
}
+
+
def search_data(es,body,nums,get_total=False):
    """Run an Elasticsearch scroll search and collect raw hits.

    Args:
        es: Elasticsearch client exposing ``search``/``scroll``.
        body: query body passed straight to ``es.search``.
        nums: soft cap on the number of hits to fetch (pages are whole,
            so the result may overshoot by up to one page).
        get_total: when True, ignore ``nums`` and fetch every match.

    Returns:
        list of raw hit dicts from ``hits.hits``.
    """
    page_size = min(nums, 1000)

    collected = []
    resp = es.search(body, rest_total_hits_as_int=True, scroll='5m', size=page_size)
    total_count = resp["hits"]["total"]
    scroll_id = resp["_scroll_id"]
    print(resp["hits"]["hits"])
    collected.extend(resp["hits"]["hits"])

    print("es total_count:%d"%(total_count))
    target = total_count if get_total else min(nums, total_count)
    # One scroll round-trip per remaining page, stopping early once enough
    # hits have been gathered.
    for _ in range(target // page_size + 1):
        if len(collected) >= target:
            break
        resp = es.scroll(scroll_id=scroll_id, scroll='5m')
        scroll_id = resp["_scroll_id"]
        collected.extend(resp["hits"]["hits"])
        print("%d/%d"%(len(collected),total_count))
    return collected
+
def data_to_excel(list_result,columns):
    """Flatten raw ES hits (``_source`` payloads) into a timestamped Excel file.

    Args:
        list_result: raw hit dicts as returned by ``search_data``.
        columns: ``_source`` field names to export, in output order.
    """
    import pandas as pd
    from utils.Utils import getCurrent_date

    table = {col: [] for col in columns}
    for hit in list_result:
        source = hit.get("_source", {})
        for col in columns:
            # getLegal_str sanitises the value for Excel; printed for progress.
            print(col, getLegal_str(source.get(col)))
            table[col].append(getLegal_str(source.get(col)))
    frame = pd.DataFrame(table)
    out_path = "../data/%s_to_excel.xlsx" % (getCurrent_date(format="%Y-%m-%d_%H%M%S"))
    frame.to_excel(out_path, columns=columns)
+
if __name__ == '__main__':
    # One should-clause per whitelisted district: the parent enterprise may be
    # in any of them.
    list_district_should = [{"term": {"district": district}} for district in list_district]

    # Same contacts query as above, but restricted to active contacts
    # (contacts_status=1) and the full district whitelist instead of a single
    # district, with 宠物 added to the nickname blacklist.
    body = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"contacts_status": 1}},
                    {"term": {"my_join_field": "contacts"}},
                    {"term": {"contacts_is_mobile": 1}},
                    {"term": {"contacts_is_legal_person": 0}},
                    {"range": {"contacts_update_time": {"gte": "2023-01-01 00:00:00"}}},
                    {
                        "has_parent": {
                            "parent_type": "enterprise",
                            "query": {
                                "bool": {
                                    "must": [
                                        {"bool": {"should": list_district_should}},
                                        {"match_phrase": {"nicknames": "医院"}},
                                    ],
                                    "must_not": [
                                        {"match_phrase": {"nicknames": "美容"}},
                                        {"match_phrase": {"nicknames": "动物"}},
                                        {"match_phrase": {"nicknames": "宠物"}},
                                        {"match_phrase": {"nicknames": "植物"}},
                                    ],
                                }
                            }
                        }
                    },
                ],
            }
        },
        "_source": ["contacts_enterprise_name", "contacts_person_name", "contacts_phone_no"],
    }

    list_result = []
    # 10,000,000 acts as "no practical cap"; get_total stays False.
    list_result.extend(search_data(es, body, 10000000, False))
    data_to_excel(list_result, ["contacts_enterprise_name", "contacts_person_name", "contacts_phone_no"])

+ 451 - 0
export/exportProject.py

@@ -0,0 +1,451 @@
+#encoding:GBK
+import sys
+import os
+sys.path.append("../")
+
+import pandas as pd
+from dataSource.source import *
+import json
+from utils.multiThread import MultiThreadHandler
+import queue
+from utils.Utils import *
+from dataSource.pool import ConnectorPool
+import re
+from tablestore import *
+import traceback
+from utils.hashUtil import aesCipher
+from export.exportEnterprise import getDictEnterprise,getOneContact
+from export.exportUtils import generateBoolShouldQuery
+
+
# Output directory used by the exporters below.
data_path = "../data/"

# Shared column-registration state for set_dict_item: set_columns for O(1)
# membership, list_df_columns to preserve first-seen column order for Excel.
set_columns = set()
list_df_columns = []
+
def set_dict_item(_dict,name,v):
    """Store the sanitised value under ``name`` and register the column.

    First use of a column name appends it to the module-level
    ``list_df_columns`` so the Excel output keeps insertion order.
    """
    _dict[name] = getLegal_str(v)
    if name in set_columns:
        return
    set_columns.add(name)
    list_df_columns.append(getLegal_str(name))
+
def getDict_docchannel():
    """Load the channel_id -> channel-name mapping from MySQL.

    Returns:
        dict: ``{channel_id: chnlname}`` for every row of ``sys_channel``.
    """
    conn = getConnection_mysql()
    try:
        cursor = conn.cursor()
        sql  = "select channel_id,chnlname from sys_channel "
        cursor.execute(sql)
        rows = cursor.fetchall()
        return {channel_id: name for channel_id, name in rows}
    finally:
        # Original version leaked the connection; always release it.
        conn.close()
+
def exportProject_by_pagetime():
    """Export merged projects (OTS table ``project2``) matching inline filters
    to a timestamped Excel file under ``../data/``.

    All query knobs -- product keywords, province list, tenderee name patterns,
    channels -- are edited in place below before running.  Results are paged
    via OTS next_token, flattened through ``set_dict_item`` (which records
    first-seen column order in ``list_df_columns``) and written with pandas.
    """
    ots_client = getConnect_ots()

    # Optional whitelist of winning bidders; currently unused (the filter that
    # consumed it inside getData has been disabled).
    set_enter = set()
    str_enter = '''
    成都四方伟业软件股份有限公司
    北京数字冰雹信息技术有限公司
    北京睿呈时代信息科技有限公司
    北京五一视界数字孪生科技股份有限公司
    易达云图(深圳)科技有限公司
    北京优锘科技有限公司
    深圳市鸿普森科技股份有限公司
    厦门图扑软件科技有限公司
    四川相数科技有限公司
    '''
    for a in re.split("\s+",str_enter):
        if a.strip()!="":
            set_enter.add(a.strip())

    # The second assignment deliberately narrows the fetched columns; the full
    # list is kept so it can be swapped back in quickly.
    columns = ["docids","doctitle","docchannel","bidway","province","city","district","info_type","page_time","crtime","project_code","tenderee","project_name","agency","sub_docs_json","tenderee_contact","tenderee_phone","doctextcon","product","moneysource","win_bid_price","win_tenderer","bidding_budget"]
    columns = ["page_time","province","city","win_tenderer"]
    dict_channel = getDict_docchannel()

    def getData(df_data,rows,set_line):
        # Flatten one page of OTS rows into df_data (column name -> values).
        # Fields absent from `columns` simply come through as "".
        list_data = getRow_ots(rows)
        for row in list_data:
            item = {}
            _dict = row
            set_dict_item(item,"docids",_dict.get("docids",""))
            set_dict_item(item,"项目名称",_dict.get("project_name",""))
            set_dict_item(item,"项目编号",_dict.get("project_code",""))
            set_dict_item(item,"省份",_dict.get("province",""))
            set_dict_item(item,"城市",_dict.get("city",""))
            set_dict_item(item,"发布时间",_dict.get("page_time",""))

            # Strip announcement boiler-plate words to get a cleaner title.
            set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '',  _dict.get("doctitle","")))

            set_dict_item(item,"招标单位",_dict.get("tenderee",""))
            set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
            set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
            set_dict_item(item,"代理单位",_dict.get("agency",""))
            set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
            set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))

            set_dict_item(item,"招标金额",_dict.get("bidding_budget",""))
            set_dict_item(item,"中标金额",_dict.get("win_bid_price",""))
            set_dict_item(item,"中标单位",_dict.get("win_tenderer",""))

            # sub_docs_json carries per-sub-project values that override the
            # top-level ones when present (amounts only when positive).
            sub_docs_json = _dict.get("sub_docs_json")
            if sub_docs_json is not None:
                for _doc in json.loads(sub_docs_json):
                    if "win_tenderer" in _doc:
                        set_dict_item(item,"中标单位",_doc["win_tenderer"])
                    if "win_tenderee_manager" in _doc:
                        set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
                    if "win_tenderee_phone" in _doc:
                        set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
                    if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
                        set_dict_item(item,"中标金额",_doc["win_bid_price"])
                    if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
                        set_dict_item(item,"招标金额",_doc["bidding_budget"])

            # Ensure every row carries the full column set so the per-column
            # lists in df_data stay the same length.
            if "招标金额" not in item:
                set_dict_item(item,"招标金额","")
            if "中标金额" not in item:
                set_dict_item(item,"中标金额","")
            if "中标单位" not in item:
                set_dict_item(item,"中标单位","")
            if "中标单位联系人" not in item:
                set_dict_item(item,"中标单位联系人","")
            if "中标单位联系电话" not in item:
                set_dict_item(item,"中标单位联系电话","")

            # Dedup key; the dedup via set_line is currently disabled.
            _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))

            for k,v in item.items():
                if k not in df_data:
                    df_data[k] = []
                df_data[k].append(v)

    list_province = ["全国"]
    for _province in list_province:
        df_data = {}

        # Product keywords, separated by whitespace or 、.
        str_p = '''
        家具

            '''
        list_prov = re.split("\s|、",str_p)
        list_mu = []
        for _p in list_prov:
            if _p.strip()=="":
                continue
            print(_p)
            list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))

        # Tenderee name patterns, separated by 、.
        s_tenderee = '教育局、中学、小学'
        list_should_ten = []
        for _p in re.split("、",s_tenderee):
            # FIX: was `_p.split()==""`, which compares a list to a string and
            # is never true -- empty fragments slipped through and produced a
            # match-everything "*" wildcard clause.
            if _p.strip()=="":
                continue
            list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))

        list_should_chan = []
        list_should_chan.append(TermQuery("docchannel",101))

        # Pre-built sub-queries kept around for quickly swapping into
        # bool_query below; not all are wired in for this run.
        should_q1 = BoolQuery(should_queries=list_mu)
        should_q2 = BoolQuery(should_queries=list_should_ten)
        should_q3 = BoolQuery(should_queries=list_should_chan)
        bool_query = BoolQuery(must_queries=[
            generateBoolShouldQuery(["doctextcon"],["家具"],MatchPhraseQuery),
            generateBoolShouldQuery(["province"],["广东","安徽","江苏","浙江","四川","北京"],TermQuery),
            WildcardQuery("win_tenderer","*"),  # only projects with a winner
            ])

        table_name = "project2"

        rows, next_token, total_count, is_all_succeed = ots_client.search(table_name, "%s_index"%table_name,
                                                                          SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("page_time",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))

        print(total_count)
        set_line = set()
        _count = len(rows)
        getData(df_data,rows,set_line)
        # Page through the remaining results via next_token.
        while next_token:
            print("%d/%d"%(_count,total_count))
            rows, next_token, total_count, is_all_succeed = ots_client.search(table_name, "%s_index"%table_name,
                                                                              SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
                                                                              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            getData(df_data,rows,set_line)
            _count += len(rows)

        # Collect every company name seen; kept for the (disabled) contact
        # enrichment step that used getDictEnterprise/getOneContact.
        set_enterprise = set()
        for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
            set_enterprise.add(_tenderee)
            set_enterprise.add(_agency)
            set_enterprise.add(_win_tenderer)
        if "" in set_enterprise:
            set_enterprise.remove("")
        if None in set_enterprise:
            set_enterprise.remove(None)

        df1 = pd.DataFrame(df_data)
        df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
+
def exportProjectWithOneDocid():
    """Find projects (page_time 2021-05-28) built from a single document and,
    for each, search ``document`` for unmerged candidates sharing the project
    name; dump docids/candidates/total_count to a timestamped Excel file.
    """
    ots_client = getConnect_ots()

    list_data = []
    bool_query = BoolQuery(must_queries=[TermQuery("page_time","2021-05-28")])
    columns = ["docids","project_name"]

    rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.ASC)]),get_total_count=True,limit=100),
                                                                   columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
    list_dict = getRow_ots(rows)
    for _dict in list_dict:
        # Keep only projects merged from exactly one document.
        if len(_dict["docids"].split(","))==1:
            list_data.append(_dict)
    _count = len(list_dict)
    while next_token:
        rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                       columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
        list_dict = getRow_ots(rows)
        _count += len(list_dict)
        print("%d/%d"%(_count,total_count))
        for _dict in list_dict:
            if len(_dict["docids"].split(","))==1:
                list_data.append(_dict)

    task_queue = queue.Queue()
    for _dict in list_data:
        task_queue.put(_dict)

    def _handle(_dict,result_queue):
        # Look up merge candidates for one project; mutates _dict in place.
        docid = _dict["docids"]
        project_name = _dict["project_name"]
        _dict["candidate"] = []
        _dict["total_count"] = 0
        if len(project_name)>0:
            doc_query = BoolQuery(must_queries=[MatchPhraseQuery("doctextcon",project_name)
                                                ,RangeQuery("status",201,300,True,True)],
                                  must_not_queries=[TermQuery("docid",docid)])
            rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                           SearchQuery(doc_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=10,get_total_count=True),
                                                                           columns_to_get=ColumnsToGet(["doctitle"],ColumnReturnType.SPECIFIED))
            l_d = getRow_ots(rows)
            for _d in l_d:
                _dict["candidate"].append(_d["docid"])
            # FIX: moved inside the if-block. total_count is a local only bound
            # by the search above; the old placement raised UnboundLocalError
            # whenever project_name was empty.
            _dict["total_count"] = total_count

    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()

    df_data = {}
    for _d in list_data:
        for k,v in _d.items():
            if k not in df_data:
                df_data[k] = []
            df_data[k].append(v)
    df = pd.DataFrame(df_data)
    df.to_excel("../data/%s_未合并.xlsx"%(getCurrent_date("%Y-%m-%d %H%M%S")))
+
def getPayStaffName():
    """Load paying-member staff records from MySQL.

    Returns:
        dict: ``{company: {"userid","phone","contactname","aftermarket"}}``
        for staff rows whose member level is set and is not 81 (trial tier,
        judging by the query -- TODO confirm).
    """
    conn = getConnection_mysql()
    try:
        cursor = conn.cursor()
        sql = " select company,userid,phone,contactname,aftermarket from bxkc.b2c_mall_staff_basic_info where MEMBERLEVELID is not null and MEMBERLEVELID <> 81"
        cursor.execute(sql)
        rows = cursor.fetchall()
    finally:
        # Original version leaked the connection; always release it.
        conn.close()
    dict_staff = {}
    for row in rows:
        company,userid,phone,contactname,aftermarket = row
        if company is not None:
            dict_staff[company] = {"userid":userid,"phone":phone,"contactname":contactname,"aftermarket":aftermarket}
    return dict_staff
+
def exportCompanyByCycleProduct():
    """For each (tenderee, product) cycle prediction in 周期项目识别.csv, look up
    the historical winners/runners-up of its documents, enrich each with
    enterprise contacts and paid-member info, and write everything to
    ``../data/<timestamp>_周期项目.xlsx``.
    """
    filename = "../data/周期项目识别.csv"
    df = pd.read_csv(filename,encoding='gbk')
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    # MySQL pool shared by the 30 worker threads started below.
    pool_conn = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_mysql)
    _count = 0
    for tenderee,product,last_time,avg_period,min_period,max_period,json_docid in zip(df["tenderee"],df["product"],df["last_time"],df["avg_period"],df["min_period"],df["max_period"],df["json_docid"]):
        _dict = {"tenderee":tenderee,"product":product,"last_time":last_time,"avg_period":avg_period,"min_period":min_period,
                 "max_period":max_period,"json_docid":json_docid}
        task_queue.put(_dict)
        _count += 1

    sstr_staff = getPayStaffName()  # company -> paid-staff record
    ots_client = getConnect_ots()

    def _comsumer(_dict,result_queue,ots_client,sstr_staff,pool_conn):
        # Worker: expand one CSV row into one output record per candidate
        # company found among the historical winners.
        new_dict = {"招标人":_dict["tenderee"],"产品":_dict["product"],"上次招标":_dict["last_time"],
                    "预计招标范围":"%s-%s"%(timeAdd(_dict["last_time"],_dict["min_period"]),timeAdd(_dict["last_time"],_dict["max_period"])),
                    "周期":_dict["avg_period"],"历史招标":_dict["json_docid"]}
        aint_docid = json.loads(_dict["json_docid"])
        aobj_should_q_docid = []

        # Tenderee's own best contact from the enterprise table.
        consumed, return_row, next_token = ots_client.get_row("enterprise",[("name",_dict["tenderee"])], ["contacts"], None, 1)
        dict_tmp = getRow_ots_primary(return_row)
        contacts = dict_tmp.get("contacts")
        phone_person,phone_no =  getOneContact(contacts)
        new_dict["招标人联系人"] = phone_person
        new_dict["招标人联系电话"] = phone_no

        # Fetch the winner columns of every historical project document.
        for int_docid in aint_docid:
            aobj_should_q_docid.append(TermQuery("docids",int_docid))
        bool_query = BoolQuery(should_queries=aobj_should_q_docid)
        columns = ['win_tenderer','second_tenderer','third_tenderer']
        rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
                                                                       SearchQuery(bool_query,limit=100,get_total_count=True),
                                                                       ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
        adict_rows = getRow_ots(rows)
        for dict_row in adict_rows:
            for _k,_company in dict_row.items():
                if _k in columns and _company is not None and _company!="":
                    # _succeed is cleared if the MySQL enrichment below fails,
                    # so partially-filled records are dropped.
                    _succeed = True
                    new_dict1 = {}
                    for k,v in new_dict.items():
                        new_dict1[k] = v
                    new_dict1["潜在客户"] = _company
                    consumed, return_row, next_token = ots_client.get_row("enterprise",[("name",_company)], ["contacts"], None, 1)
                    dict_tmp = getRow_ots_primary(return_row)
                    contacts = dict_tmp.get("contacts")
                    phone_person,phone_no =  getOneContact(contacts)
                    new_dict1["潜在客户联系人"] = phone_person
                    new_dict1["潜在客户联系电话"] = phone_no

                    if _company in sstr_staff:
                        # Candidate is already a paying customer: pull its
                        # account manager and membership expiry from MySQL.
                        company_info = sstr_staff[_company]
                        new_dict1["付费客户"] = "是"
                        conn = pool_conn.getConnector()
                        try:
                            cursor = conn.cursor()
                            # NOTE(review): SQL built by string interpolation;
                            # values come from our own staff table, but
                            # parameterize if they can ever be user-controlled.
                            sql = " select name from bxkc.b2c_mall_staff_basic_info where userid='%s'"%(company_info.get("aftermarket",""))
                            cursor.execute(sql)
                            rows = cursor.fetchall()
                            if len(rows)>0:
                                new_dict1["归属客服"] = rows[0][0]
                            else:
                                new_dict1["归属客服"] = ""
                            new_dict1["付费客户联系人"] = company_info.get("contactname","")
                            new_dict1["付费客户电话"] = company_info.get("phone","")
                            sql = " select date_FORMAT(etiem,\'%Y-%m-%d\') from bxkc.bxkc_member_term where userid='"+company_info.get("userid","")+"' and memberlevelid<>81 order by etiem desc limit 1"
                            cursor.execute(sql)
                            rows = cursor.fetchall()
                            if len(rows)>0:
                                etime = rows[0][0]
                                new_dict1["付费客户到期日"] = etime
                                # Expired when the latest end date is in the past.
                                if time.mktime(time.strptime(etime,"%Y-%m-%d"))>time.mktime(time.localtime()):
                                    new_dict1["付费客户到期"] = "否"
                                else:
                                    new_dict1["付费客户到期"] = "是"
                            else:
                                new_dict1["付费客户到期日"] = ""
                                new_dict1["付费客户到期"] = ""

                        except Exception as e:
                            traceback.print_exc()
                            _succeed = False
                        finally:
                            pool_conn.putConnector(conn)

                    else:
                        new_dict1["付费客户"] = "否"
                        new_dict1["归属客服"] = ""
                        new_dict1["付费客户联系人"] = ""
                        new_dict1["付费客户电话"] = ""
                        new_dict1["付费客户到期日"] = ""
                        new_dict1["付费客户到期"] = ""
                    if _succeed:
                        result_queue.put(new_dict1)

    mt = MultiThreadHandler(task_queue,_comsumer,result_queue,ots_client=ots_client,sstr_staff=sstr_staff,pool_conn=pool_conn,thread_count=30)
    mt.run()

    # Drain the result queue, de-duplicating on (tenderee, product, candidate);
    # the 1s timeout doubles as the "queue is empty" exit condition.
    df_data = {}
    set_staff = set()
    while True:
        try:
            _dict = result_queue.get(timeout=1)
            tenderee = _dict.get("招标人","")
            product = _dict.get("产品","")
            staff = _dict.get("潜在客户","")
            _s = "%s-%s-%s"%(tenderee,product,staff)
            if _s in set_staff:
                continue
            set_staff.add(_s)
            for k,v in _dict.items():
                if k not in df_data:
                    df_data[k] = []
                df_data[k].append(v)
        except Exception as e:
            break


    df1 = pd.DataFrame(df_data)
    df1.to_excel("../data/%s_周期项目.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
+
+
def appendCellphones():
    """Unfinished stub -- TODO(review): body was never implemented."""

    file = "../data/"  # NOTE(review): assigned but never used


+
if __name__=="__main__":
    # Only one exporter runs at a time; toggle by (un)commenting.
    exportProject_by_pagetime()
    # exportProjectWithOneDocid()
    # exportCompanyByCycleProduct()

+ 479 - 0
export/exportUtils.py

@@ -0,0 +1,479 @@
+#coding:UTF8
+from utils.Utils import *
+from dataSource.source import getConnect_ots
+from utils.multiThread import MultiThreadHandler
+from queue import Queue
+import json
+from utils.hashUtil import aesCipher
+import inspect
+from tablestore import *
+
+set_columns = set()
+list_df_columns = []
+
+from Crypto.Cipher import PKCS1_v1_5 as Cipher_pksc1_v1_5
+from Crypto.PublicKey import RSA
+
def rsa_encrpt(string, public_key):
    """RSA-encrypt *string* with the given public key (PKCS#1 v1.5 padding).

    Mirrors the front-end JSEncrypt flow for credential strings.
    NOTE(review): despite the original comment, no Base64 step happens here —
    the raw ciphertext bytes are returned; confirm callers encode them.
    """
    rsakey = RSA.importKey(public_key)  # parse the PEM public key
    cipher = Cipher_pksc1_v1_5.new(rsakey)
    encrypt_text = cipher.encrypt(string.encode())  # encrypt the UTF-8 bytes
    return encrypt_text
+
def getOneContact(contacts,tojson=True,mobile_first=True,mobile_only=True,high_level_first=True):
    """Pick one (person, number) pair out of a contact list.

    *contacts* is a list of contact dicts or a JSON string of one.  Contacts
    are scanned highest ``level`` first (the list is sorted in place), with
    later (lower-level) entries overwriting earlier ones; level-40 contacts
    get a "(法人)" (legal person) suffix on the name.  A contact flagged
    ``is_mobile`` has its ``phone_no`` promoted to ``mobile_no`` (mutating
    the record).  With *mobile_first*, a complete mobile pair wins; if none
    and *mobile_only* is set, the (possibly empty) mobile pair is returned
    anyway; otherwise the landline pair is the fallback.  Any parse error
    falls back to whatever landline pair was accumulated so far.
    NOTE(review): *tojson* and *high_level_first* are accepted but unused.
    """
    best_mobile = ("", "")
    best_phone = ("", "")
    if contacts is None:
        return "", ""
    try:
        records = json.loads(contacts) if isinstance(contacts, str) else contacts
        # In-place sort, highest level first (mutates the caller's list).
        records.sort(key=lambda r: r.get("level", 0), reverse=True)
        for rec in records:
            level = rec.get("level")
            if rec.get("is_mobile", 0) == 1:
                # Landline field actually holds a mobile number — promote it.
                rec["mobile_no"] = rec.get("phone_no")
            if rec.get("mobile_no", "") != "":
                person = rec.get("contact_person", "")
                if level == 40 and person != "":
                    person += "(法人)"
                best_mobile = (person, rec.get("mobile_no", ""))
            if rec.get("phone_no", "") != "":
                person = rec.get("contact_person", "")
                if level == 40 and person != "":
                    person += "(法人)"
                best_phone = (person, rec.get("phone_no", ""))
        if mobile_first:
            person, number = best_mobile
            if (person != "" and number != "") or mobile_only:
                return person, number
    except Exception:
        pass
    return best_phone
+
+
def getMobiles(contacts,to_json=True):
    """Collect every [person, mobile_no] pair from a contact list.

    :param contacts: JSON string (when *to_json*) or list of contact dicts
    :param to_json: parse *contacts* with json.loads before scanning
    :return: list of [contact_person, mobile_no] for non-empty mobile numbers
    """
    records = json.loads(contacts) if to_json else contacts
    pairs = []
    for rec in records:
        if rec.get("mobile_no", "") != "":
            pairs.append([rec.get("contact_person", ""), rec.get("mobile_no")])
    return pairs
+
def set_dict_item(_dict,name,v):
    """Store the sanitized value under *name* and register the column.

    Registers *name* once in the module-level set_columns/list_df_columns so
    exported DataFrames keep a stable, first-seen column order.
    """
    _dict[name] = getLegal_str(v)
    if name in set_columns:
        return
    set_columns.add(name)
    list_df_columns.append(getLegal_str(name))
+
def set_dict_item_columns(set_columns1,list_df_columns1,_dict,name,v):
    """Variant of set_dict_item using caller-supplied column registries.

    Stores the sanitized value under *name* in *_dict* and appends *name*
    to *list_df_columns1* the first time it is seen in *set_columns1*.
    """
    _dict[name] = getLegal_str(v)
    if name in set_columns1:
        return
    set_columns1.add(name)
    list_df_columns1.append(getLegal_str(name))
+
def getRowData_document(df_data,rows,set_line,list_keyword,dict_channel):
    """Flatten OTS document rows into the column-oriented dict *df_data*.

    Each row becomes a record keyed by Chinese column labels, built through
    the module-level set_dict_item (which also records column order in
    list_df_columns); the record's values are then appended column-by-column
    to df_data.

    :param df_data: column name -> list of values; mutated in place
    :param rows: iterable of document dicts
    :param set_line: dedup key set (the dedup logic is currently commented out)
    :param list_keyword: keywords re-matched against title/body/attachment text
    :param dict_channel: docchannel id -> human-readable channel name
    """
    # list_data = getRow_ots(rows)
    for row in rows:
        item = {}
        _dict = row
        set_dict_item(item,"docid",_dict.get("docid",""))
        set_dict_item(item,"公告标题",_dict.get("doctitle",""))
        set_dict_item(item,"公告内容",_dict.get("doctextcon",""))
        set_dict_item(item,"附件内容",_dict.get("attachmenttextcon",""))
        set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
        # Re-find which keywords actually occur in title+body+attachment,
        # normalizing half-width parentheses to full-width on both sides and
        # stripping whitespace before matching.
        set_dict_item(item,"关键词",",".join(list(set(re.findall("|".join([re.escape(str(a).replace("(","(").replace(")",")")) for a in list_keyword]),re.sub("\s","",str(row.get("doctitle","")+row.get("doctextcon","")+row.get("attachmenttextcon","")).replace("(","(").replace(")",")")))))))
        set_dict_item(item,"产品",_dict.get("product",""))
        set_dict_item(item,"省份",_dict.get("province",""))
        # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
        set_dict_item(item,"城市",_dict.get("city",""))
        set_dict_item(item,"区县",_dict.get("district",""))
        set_dict_item(item,"发布时间",_dict.get("page_time",""))
        set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
        set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
        set_dict_item(item,"创建时间",_dict.get("crtime",""))
        set_dict_item(item,"招标方式",_dict.get("bidway",""))

        set_dict_item(item,"行业一级分类",_dict.get("industry",""))
        set_dict_item(item,"行业二级分类",_dict.get("info_type",""))

        set_dict_item(item,"uuid",_dict.get("uuid"))

        # Title with boilerplate procurement words stripped out.
        set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '',  _dict.get("doctitle","")))

        set_dict_item(item,"项目编号",_dict.get("project_code",""))
        set_dict_item(item,"招标单位",_dict.get("tenderee",""))
        set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
        set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
        set_dict_item(item,"代理单位",_dict.get("agency",""))
        set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
        set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
        # Detail-page URL carries an AES-encrypted {"docid": ...} payload.
        set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))

        set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
        set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
        sub_docs_json = _dict.get("sub_docs_json")
        set_tenderer = set()
        # Pull winner / runner-up / budget details out of the sub-docs JSON;
        # later sub-docs overwrite earlier ones for the scalar fields.
        if sub_docs_json is not None:
            for _doc in json.loads(sub_docs_json):
                if "win_tenderer" in _doc:
                    set_dict_item(item,"中标单位",_doc["win_tenderer"])
                if "second_tenderer" in _doc:
                    set_tenderer.add(_doc.get("second_tenderer"))
                if "third_tenderer" in _doc:
                    set_tenderer.add(_doc.get("third_tenderer"))
                if "win_tenderee_manager" in _doc:
                    set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
                if "win_tenderee_phone" in _doc:
                    set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
                if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
                    set_dict_item(item,"中标金额",_doc["win_bid_price"])
                if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
                    set_dict_item(item,"招标金额",_doc["bidding_budget"])
        set_dict_item(item,"入围供应商",",".join(list(set_tenderer)))
        # Guarantee optional columns exist so every record has the same keys.
        if "招标金额" not in item:
            set_dict_item(item,"招标金额","")
        if "中标金额" not in item:
            set_dict_item(item,"中标金额","")
        if "中标单位" not in item:
            set_dict_item(item,"中标单位","")

        if "中标单位联系人" not in item:
            set_dict_item(item,"中标单位联系人","")
        if "中标单位联系电话" not in item:
            set_dict_item(item,"中标单位联系电话","")

        # if item["中标单位"] not in set_enter:
        #     continue

        # Composite dedup key; the filters below are intentionally disabled.
        _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["中标单位"],str(item["中标金额"]))
        # if re.search("[大中小]学|幼儿园|医院|公司",item["招标单位"]) is not None:
        #     continue
        # if _line in set_line:
        #     continue
        # if _dict.get("docid","") in set_ig_docid:
        #     continue
        # if item["招标金额"]=="":
        #     continue

        # set_line.add(_line)
        for k,v in item.items():
            if k not in df_data:
                df_data[k] = []
            df_data[k].append(v)
+
def getDictEnterprise(list_enterprise,columns_to_get = ["reg_capital","actual_capital","industry","estiblish_time","social_staff_num","zhong_biao_number","tou_biao_number","credit_code"]):
    """Fetch enterprise base rows plus active contacts from OTS, in parallel.

    :param list_enterprise: enterprise names to look up
    :param columns_to_get: columns read from the "enterprise" table
                           (NOTE: mutable list default — never mutated here)
    :return: dict of enterprise name -> row dict, each with a "contacts" list
             of active (status=1) contact records added
    """
    task_queue = Queue()
    result_queue= Queue()

    for _enterprise in list_enterprise:
        task_queue.put(str(_enterprise))
    def _handle(item,result_queue,ots_client):
        # Worker: point-read the enterprise row, then search its active
        # contacts; any failure is printed and the enterprise is skipped.
        try:
            primary_key = [("name",item)]
            consumed,return_row,next_token = ots_client.get_row("enterprise",primary_key,columns_to_get,None,1)
            dict_data = getRow_ots_primary(return_row)


            if dict_data is not None:
                bool_q = BoolQuery(must_queries=[TermQuery("enterprise_name",item),
                                                 TermQuery("status",1)])
                rows,next_token,total_count,is_allowed = ots_client.search("enterprise_contact","enterprise_contact_index",
                                                                           SearchQuery(bool_q,limit=10),
                                                                           columns_to_get=ColumnsToGet(["contact_person","phone_no","is_mobile","level","is_legal_person","is_manager","is_shareholder"],ColumnReturnType.SPECIFIED))
                list_contact = getRow_ots(rows)
                dict_data["contacts"] = list_contact
                result_queue.put({item:dict_data})

        except Exception as e:
            traceback.print_exc()


    ots_client = getConnect_ots()
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=50,ots_client=ots_client)
    mt.run()

    # Drain the result queue into a single name -> data mapping.
    dict_enterprise = {}
    while True:
        try:
            _dict = result_queue.get(False)
            for k,v in _dict.items():
                dict_enterprise[k] = v
        except Exception as e:
            break
    return dict_enterprise
+
def splitIntoList(_str,_splitkeys):
    """Split *_str* on the regex *_splitkeys*, dropping blank fragments.

    :param _str: text to split
    :param _splitkeys: regular expression of separator characters
    :return: list of non-blank fragments, in order
    """
    return [fragment for fragment in re.split(_splitkeys, _str)
            if fragment.strip() != ""]
+
def fixContactPerson(df_data,list_df_columns,get_legal_person=False):
    """Backfill missing contact info and enterprise metadata in *df_data*.

    Looks up every tenderee / agency / winner name once via
    getDictEnterprise, then per exported row: fills empty tenderee/agency
    contact fields from the enterprise contact list, appends the
    信用代码 / 招标人采购系统 / 招标人类型 columns, and fills the winner's
    contact either from the registered legal person (*get_legal_person*,
    only when the registered phone looks like a mobile number: 11 digits
    starting with "1") or from the contact list.  Mutates *df_data* and
    *list_df_columns* in place.
    """
    set_enterprise = set()
    if len(df_data.keys())>0:
        for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
            set_enterprise.add(_tenderee)
            set_enterprise.add(_agency)
            set_enterprise.add(_win_tenderer)
        if "" in set_enterprise:
            set_enterprise.remove("")
        if None in set_enterprise:
            set_enterprise.remove(None)
        dict_enterprise = getDictEnterprise(list(set_enterprise),columns_to_get = ["procurement_system","company_org_type","reg_capital","actual_capital","contacts","estiblish_time","social_staff_num","zhong_biao_number","tou_biao_number","credit_code","legal_person_name","phone_number"])

        # conn = getConnection_oracle()
        # cursor = conn.cursor()
        if len(set_enterprise)>0:
            for _i in range(len(df_data["招标单位"])):
                # --- tenderee: fill only when the phone cell is empty ---
                _enterprise_name = df_data["招标单位"][_i]
                if df_data["招标联系人电话"][_i]=="":
                    contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
                    if contacts is not None:
                        _person,_phone = getOneContact(contacts)
                        df_data["招标联系人"][_i] = _person
                        df_data["招标联系人电话"][_i] = _phone

                # --- extra tenderee metadata columns (created on first row) ---
                if "信用代码" not in df_data:
                    df_data["信用代码"] = []
                df_data["信用代码"].append(dict_enterprise.get(_enterprise_name,{}).get("credit_code",""))

                if "招标人采购系统" not in df_data:
                    df_data["招标人采购系统"] = []
                df_data["招标人采购系统"].append(dict_enterprise.get(_enterprise_name,{}).get("procurement_system",""))

                if "招标人类型" not in df_data:
                    df_data["招标人类型"] = []
                df_data["招标人类型"].append(dict_enterprise.get(_enterprise_name,{}).get("company_org_type",""))

                # Disabled: original-site URL lookup against Oracle.
                # if "原网地址" not in df_data:
                #     df_data["原网地址"] = []
                # if df_data["公告类别"][_i]=="招标公告":
                #     table_name = "bxkc.T_ZHAO_BIAO_GONG_GAO"
                # else:
                #     table_name = "bxkc.T_ZHONG_BIAO_XIN_XI"
                # sql = "select detail_link from %s where id='%s' "%(table_name,df_data["uuid"][_i])
                # cursor.execute(sql)
                # rows = cursor.fetchall()
                # if len(rows)>0:
                #     df_data["原网地址"].append(rows[0][0])
                # else:
                #     df_data["原网地址"].append("")

                # --- agency: fill only when the phone cell is empty ---
                _enterprise_name = df_data["代理单位"][_i]
                if df_data["代理联系人电话"][_i]=="":
                    contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
                    if contacts is not None:
                        _person,_phone = getOneContact(contacts)
                        df_data["代理联系人"][_i] = _person
                        df_data["代理联系人电话"][_i] = _phone

                # --- winner: legal person (if requested) or contact list ---
                _enterprise_name = df_data["中标单位"][_i]
                if get_legal_person:
                    _person = dict_enterprise.get(_enterprise_name,{}).get("legal_person_name","")
                    _phone = dict_enterprise.get(_enterprise_name,{}).get("phone_number","")
                    if len(_phone)==11 and _phone[0]=="1":
                        df_data["中标单位联系人"][_i] = _person
                        df_data["中标单位联系电话"][_i] = _phone
                else:
                    if df_data["中标单位联系电话"][_i]=="":
                        contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
                        if contacts is not None:
                            _person,_phone = getOneContact(contacts,mobile_only=True)
                            df_data["中标单位联系人"][_i] = _person
                            df_data["中标单位联系电话"][_i] = _phone
    # NOTE(review): the column list is extended even when df_data was empty
    # and the three columns above were never actually appended.
    list_df_columns.extend(['信用代码','招标人采购系统','招标人类型'])
+
+
def generateBoolShouldQuery(list_field,list_should_words,cls):
    """Build a BoolQuery OR-ing one *cls* sub-query per (field, word) pair.

    :param list_field: field names to query against
    :param list_should_words: query values; for RangeQuery each element is
                              expected to be an iterable of range arguments
                              (start, end, include flags) to unpack
    :param cls: one of TermQuery, MatchPhraseQuery, WildcardQuery, RangeQuery
    :return: BoolQuery whose should_queries cover every field/word combination
    """
    list_should_q = []
    assert isinstance(list_field,(list))
    assert isinstance(list_should_words,(list))
    assert cls in (TermQuery,MatchPhraseQuery,WildcardQuery,RangeQuery)
    for word in list_should_words:
        for field in list_field:
            if cls in (RangeQuery,):
                # BUGFIX: the original unpacked *field* (the field-name
                # string) as the range bounds and, via a dangling else,
                # appended a second bogus TermQuery-style call as well.
                # Unpack the word's range arguments exactly once instead.
                list_should_q.append(cls(field,*word))
            elif cls in (WildcardQuery,):
                # Substring match via surrounding wildcards.
                list_should_q.append(cls(field,"*%s*"%word))
            else:
                list_should_q.append(cls(field,word))
    return BoolQuery(should_queries=list_should_q)
+
+# excel 数据处理库
+import openpyxl
+
+# excel 数据样式设置类
+from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
+from openpyxl.styles import Border, Side, colors
+import pandas as pd
+from openpyxl.utils import get_column_letter
+from openpyxl.styles import Font, Alignment
+
+import os
+
+
def my_border(t_border, b_border, l_border, r_border):
    """Build an openpyxl Border with the given style name on each edge.

    All four sides are drawn in black; each argument is an openpyxl border
    style string such as 'thin'.
    """
    edges = {
        "top": Side(border_style=t_border, color=colors.BLACK),
        "bottom": Side(border_style=b_border, color=colors.BLACK),
        "left": Side(border_style=l_border, color=colors.BLACK),
        "right": Side(border_style=r_border, color=colors.BLACK),
    }
    return Border(**edges)
+
# Initialize the given worksheet region with full (all-edge) borders.
def format_border(ws,s_column, s_index, e_column , e_index):
    """Apply thin black borders to every cell of the rectangular region
    ws[s_column+s_index : e_column+e_index] (e.g. "A1" to "D20")."""
    start_ref = s_column + str(s_index)
    end_ref = e_column + str(e_index)
    for row in tuple(ws[start_ref:end_ref]):
        for cell in row:
            cell.border = my_border('thin', 'thin', 'thin', 'thin')
+
def adjust_excel(source,target,columns=["A","B","C","D"]):
    """Re-style an exported workbook and save it to *target*.

    For every sheet: column widths are sized from the longest cell content
    (capped at 30), the cells in *columns* are centered with wrap, all-edge
    borders are applied, and row heights grow with content length.
    NOTE(review): *columns* is a mutable list default (never mutated here).
    """
    wb = openpyxl.load_workbook(source)
    for sheet in wb.sheetnames:
        ws = wb[sheet]
        df = pd.read_excel(source,sheet_name=sheet)
        # Copy the header texts to a final data row so the width/height pass
        # below measures them too.
        df.loc[len(df)]=list(df.columns)

        list_row_height = []  # NOTE(review): never used

        for col in df.columns:
            # column position
            index = list(df.columns).index(col)
            # spreadsheet column letter
            letter = get_column_letter(index+1)
            # widest cell of this column, measured in encoded bytes
            collen = df[col].apply(lambda x :len(str(x).encode())).max()
            # proportional width, capped at 30
            _width = min(30,collen*0.9)
            ws.column_dimensions[letter].width = _width

        for i in df.index:
            # Centered both ways, no rotation, wrap long text.
            alignment = Alignment(horizontal='center', vertical='center', text_rotation=0, wrap_text=True)

            format_border(ws,columns[0], 0, columns[-1], len(df)) # adjust to the real column count
            for j in columns:
                ws[j+str(i+1)].alignment = alignment

            list_width = []

            for col in df.columns:
                # column position
                index = list(df.columns).index(col)
                # spreadsheet column letter
                letter = get_column_letter(index+1)
                # byte length of this row's cell in that column
                list_width.append(len(str(df[col][i]).encode()))

            # NOTE(review): `index` here is the LAST column index left over
            # from the loop above, so every iteration resizes the same row —
            # `i+1` (the current row) was probably intended. Confirm.
            ws.row_dimensions[index].height = 20+max(list_width)//30*5

    wb.save(target)
+
def getDocument(list_query,columns,table_name="document",table_index="document_index",thread_count=30,sort_column="page_time"):
    """Run a batch of OTS searches in parallel and collect all result rows.

    Each element of *list_query* is a dict:
      {"query": BoolQuery, "limit": optional max row count,
       "keyword": optional tag copied onto every returned row}.
    Results are paged 100 rows at a time via next_token.

    :return: flat list of row dicts from all queries
    """
    task_queue = Queue()
    for _q in list_query:
        task_queue.put(_q)
    print("task_queue_size",task_queue.qsize())
    result_queue = Queue()
    list_row = []
    ots_client = getConnect_ots()
    def _handle(_dict,result_queue,ots_client):
        # Worker: execute one query and push its full row list as a batch.
        try:
            item = _dict.get("query")
            _limit = _dict.get("limit")
            keyword = _dict.get("keyword")
            l_rows = []
            # First page, sorted by *sort_column* descending.
            rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
                                                                           SearchQuery(item,sort=Sort(sorters=[FieldSort(sort_column,SortOrder.DESC)]),limit=100,get_total_count=True),
                                                                           ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            dict_row = getRow_ots(rows)
            if keyword is not None:
                for _row in dict_row:
                    _row["keyword"] = keyword
            l_rows.extend(dict_row)
            log("total count:%d"%total_count)
            _count = len(dict_row)
            # Follow-up pages (next_token continues the original sort order).
            while next_token:
                if _limit and len(l_rows)>=_limit:
                    break
                rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
                                                                               SearchQuery(item,next_token=next_token,limit=100,get_total_count=True),
                                                                               ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
                dict_row = getRow_ots(rows)
                if keyword is not None:
                    for _row in dict_row:
                        _row["keyword"] = keyword
                l_rows.extend(dict_row)

                _count += len(dict_row)
                print("%d/%d"%(_count,total_count))
            result_queue.put(l_rows)
        except Exception as e:
            traceback.print_exc()
    mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count,ots_client=ots_client)
    mt.run()
    # Drain all per-query batches into one flat list.
    while 1:
        try:
            dict_row = result_queue.get(False)
            list_row.extend(dict_row)
        except Exception as e:
            break
    return list_row
+
+
class ExportEntity():
    """Skeleton for a multi-threaded OTS table exporter (work in progress).

    Queues every query from *list_query*; _handle and export are stubs.
    NOTE(review): *columns* and *just_get_totol_count* (sic — "total") are
    accepted but never stored or used.
    """

    def __init__(self,table,table_index,list_query,columns,just_get_totol_count=False):

        # Target table/index plus a shared OTS connection.
        self.table = table
        self.table_index = table_index
        self.ots_client = getConnect_ots()

        self.task_queue = Queue()
        self.result_queue = Queue()

        for _q in list_query:
            self.task_queue.put(_q)

    def _handle(self,item,task_queue):
        # Stub worker: issues an empty search; the result is discarded.
        self.ots_client.search(self.table,self.table_index,
                               SearchQuery())

    def export(self):
        # Not implemented yet.
        pass
+
class A():
    """Empty placeholder class used by the __main__ smoke test below."""
+
def test(a,bc={"a":2}):
    # Stub used by the commented-out inspect.signature experiments below.
    # NOTE(review): mutable dict literal as a default argument is shared
    # across calls — harmless here since the body never touches it, but it
    # should use the None-sentinel idiom if this ever grows a body.
    pass
if __name__ == '__main__':
    # Disabled experiment: introspecting the default values of test()'s
    # parameters via inspect.signature.
    # print(dir(test))
    # _sign = inspect.signature(test)
    # print(_sign.parameters)
    # for _o in _sign.parameters:
    #     print(_sign.parameters[_o].default)

    # Smoke test: default repr of a plain instance.
    a = A()
    print(str(a))

BIN
jobs/docchannel.pk


+ 2555 - 16
jobs/exportJobs.py

@@ -4,39 +4,2578 @@ import sys
 import os
 sys.path.append(os.path.join(os.path.dirname(__file__),".."))
 
-from utils.Utils import sendEmail,getCurrent_date
+import requests
+
+from utils.Utils import sendEmail,getCurrent_date,log
 
 import datetime
 import time
-from export.exportDocument import exportDocument_medicine,list_df_columns
+from export.exportDocument import *
 import pandas as pd
 
 from apscheduler.schedulers.blocking import BlockingScheduler
 
+from export.exportUtils import rsa_encrpt
+
+import urllib.parse
+
+import base64
+
 def export_medicine_friday():
-    current_date = getCurrent_date("%Y-%m-%d")
-    start_time = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())-6*24*60*60))
+    current_date = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())-24*60*60))
+    start_time = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())-7*24*60*60))
     if current_date<="2022-04-25":
+        if datetime.datetime.now().weekday()==0:
+            for i in range(10):
+                try:
+                    df_data = exportDocument_medicine(start_time,current_date)
+                    df = pd.DataFrame(df_data)
+
+                    filename = os.path.dirname(__file__)+"/data/%s年%s至%s医疗数据导出.xlsx"%(start_time[:4],start_time,current_date)
+                    df.to_excel(filename,columns=list_df_columns)
+
+                    host = "smtp.exmail.qq.com"
+                    username = "vip@bidizhaobiao.com"
+                    password = "Biaoxun66-"
+                    receivers = ["1985262186@qq.com","1175730271@qq.com","1265797328@qq.com","1289358902@qq.com"]
+                    attachs = [filename]
+
+                    sendEmail(host,username,password,receivers,attachs=attachs)
+                    break
+                except Exception as e:
+                    traceback.print_exc()
+                    log(str(e))
+                    time.sleep(60)
+
def export2():
    """Weekly (Friday) export job: Hunan ICT/AV tender documents.

    Searches OTS for the past week's announcements matching the keyword set,
    flattens them, backfills contacts, drops rows from 企业采购系统, writes an
    Excel file and mails it.  Retries up to 10 times on failure; job expires
    after 2024-02-28.
    NOTE(review): SMTP credentials are hard-coded below; the retry loop has
    no back-off between attempts.
    """
    def removeData(df_data):
        # Drop every row whose 招标人采购系统 is 企业采购系统 (pop indices
        # in reverse so earlier removals don't shift later ones).
        list_remove_index = []
        list_c = df_data.get("招标人采购系统",[])
        for _c_i in range(len(list_c)):
            if list_c[_c_i]=="企业采购系统":
                list_remove_index.append(_c_i)
        list_remove_index.reverse()
        print(list_remove_index)
        for k,v in df_data.items():
            for _rc in list_remove_index:
                v.pop(_rc)
        # NOTE(review): this rebinds the loop variable only — it does NOT
        # truncate df_data's lists to 500; df_data[k] = v[:500] was likely meant.
        for k,v in df_data.items():
            v = v[:500]

    log("start export2:>>>>>>>>>>>>>>>")
    # Window: yesterday back 7 days.
    current_date = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())-24*60*60))
    start_time = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())-7*24*60*60))
    if current_date<="2024-02-28":
        if datetime.datetime.now().weekday()==4:
            for i in range(10):
                try:
                    # start_time='2022-07-22'
                    # current_date = '2022-07-28'
                    log("start exporting export2:=================")
                    # columns = ["doctitle","doctextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose"]
                    columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen"]
                    dict_channel = getDict_docchannel()

                    list_query = []

                    # Whitespace/comma separated keyword table (split below).
                    str_keyword = '''

机房建设 	多媒体教室建设 	数据中心机房 	班班通	电子设备
视频 	LED	监控	视频监控系统	信息安全
信息化建设	智能化	网络安全服务	网络安全系统	等级保护测评
信息系统	大数据中心	智慧教室建设	数据安全建设	可视化指挥联动平台
音频处理器	教学专用仪器	多媒体扩声系统	智慧校园	数字化
交换	路由	无线	城域网建设	触控一体机
教学设备				
		

 
                    '''
                    list_keyword = splitIntoList(str_keyword,"[\s\n、,,]")

                    str_not_keyword = '''
                    '''
                    list_not_key = splitIntoList(str_not_keyword,"[\s\n、,,]")

                    tenderee_keywrod = "医院、大学、高校、高中"
                    list_t_key = splitIntoList(tenderee_keywrod,"[\s\n、,,]")

                    log(str(list_keyword))
                    # Keywords anywhere in title/body/attachment, channel in
                    # {52,102,114}, last week's page_time, released status,
                    # province 湖南.
                    bool_query = BoolQuery(must_queries=[
                        generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_keyword,MatchPhraseQuery),
                        generateBoolShouldQuery(["docchannel"],[52,102,114],TermQuery),
                        RangeQuery("page_time",start_time,current_date,True,True),
                        RangeQuery("status",151,300,True,True),
                        # TermQuery("procurement_system","公安系统"),
                        generateBoolShouldQuery(["province"],["湖南"],TermQuery),
                        # generateBoolShouldQuery(["tenderee"],list_t_key,WildcardQuery)
                        # generateBoolShouldQuery(["docchannel"],[101,118,119],TermQuery),
                    ],
                        # must_not_queries=[
                        #     generateBoolShouldQuery(["doctitle"],list_not_key,MatchPhraseQuery),
                        # ]
                    )

                    list_query.append({"query":bool_query,"limit":700})
                    list_row = getDocument(list_query,columns)

                    log("get document %d rows"%len(list_row))

                    df_data = {}
                    set_line = set()
                    # list_row = filterRow(list_row)
                    getRowData(df_data,list_row,set_line,list_keyword,dict_channel,True)

                    fixContactPerson(df_data,list_df_columns)

                    removeData(df_data)
                    df = pd.DataFrame(df_data)


                    filename = os.path.dirname(__file__)+"/data/%s年%s至%s数据导出.xlsx"%(start_time[:4],start_time,current_date)
                    df.to_excel(filename,columns=list_df_columns)
                    log(str(filename))

                    # Mail the workbook to the recipients.
                    host = "smtp.exmail.qq.com"
                    username = "vip@bidizhaobiao.com"
                    password = "Biaoxun66-"
                    receivers = ["1175730271@qq.com","493894608@qq.com"]
                    # receivers = ["1175730271@qq.com"]
                    attachs = [filename]

                    sendEmail(host,username,password,receivers,attachs=attachs)
                    break
                except Exception as e:
                    traceback.print_exc()
+
+def export5():
+    '''
+    客户信息
+公司名:武汉市浩盛特种建材有限责任公司
+
+客户姓名:高诗琴
+
+会员号:15392839439
+
+会员等级:高级会员
+成交金额:5900元
+服务期:12个月
+截止推送项目时间:2024-02-17
+
+申请原因:2.16成交的新客户,客户反馈跟销售沟通是人工每天整理好项目信息推送,现在寻求技术协助支持。
+客户需求:
+地区:河北省、北京市、天津市、山东省、江苏省、上海市、浙江省、福建省、广东省、海南省
+业务关键词:膨胀剂,抗裂剂,防水剂,外加剂,防腐剂,阻锈剂,密实剂,耐久性,氧化镁,镁质,涂料
+
+(排除词,漆),需要招标信息
+工作日下午四点文档信息推送到邮箱365531448@qq.com,每天更新最新的项目信息10条(不足10条可以按照更新信息推送)
+    :return:
+    '''
+    def getRowData(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
+
+        dict_line = {}
+        # list_data = getRow_ots(rows)
+        _index = 0
+        rows.sort(key=lambda x:x.get("docid",0),reverse=True)
+        set_col = set()
+        df_columns = []
+        for row in rows[:10]:
+            _index += 1
+            item = {}
+            _dict = row
+
+            set_dict_item_columns(set_col,df_columns,item,"省份",_dict.get("province",""))
+            set_dict_item_columns(set_col,df_columns,item,"城市",_dict.get("city",""))
+            set_dict_item_columns(set_col,df_columns,item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
+            set_dict_item_columns(set_col,df_columns,item,"发布时间",_dict.get("page_time",""))
+            set_dict_item_columns(set_col,df_columns,item,"公告标题",_dict.get("doctitle",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标单位",_dict.get("tenderee",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人",_dict.get("tenderee_contact",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人电话",_dict.get("tenderee_phone",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理单位",_dict.get("agency",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理单位联系人",_dict.get("agency_contact",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理单位联系人电话",_dict.get("agency_phone",""))
+            set_dict_item_columns(set_col,df_columns,item,"链接","http://www.bidizhaobiao.com/info-%s.html"%str(_dict.get("docid","")))
+
+            for k,v in item.items():
+                if k not in df_data:
+                    df_data[k] = []
+                df_data[k].append(v)
+        return df_columns
+
+
+    log("start export5:>>>>>>>>>>>>>>>")
+    current_date = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())))
+    if current_date<="2024-02-17":
+        weekday = datetime.datetime.now().weekday()
+        if weekday>=0 and weekday<=4:
+            for i in range(10):
+                try:
+                    # start_time='2022-07-22'
+                    # current_date = '2022-07-28'
+                    log("start exporting export2:=================")
+                    # columns = ["doctitle","doctextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose"]
+                    columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen"]
+                    columns = ["doctitle","docchannel","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen"]
+                    dict_channel = getDict_docchannel()
+
+                    list_query = []
+
+                    str_province = '''
+                    河北、北京、天津、山东、江苏、上海、浙江、福建、广东、海南				
+                    '''
+
+                    str_keyword = '''
+
+膨胀剂	抗裂剂	防水剂	外加剂	镁质
+防腐剂	阻锈剂	密实剂	耐久性	氧化镁
+				
+                    '''
+                    list_province = splitIntoList(str_province,"[\s\n、,,]")
+
+                    list_keyword = splitIntoList(str_keyword,"[\s\n、,,]")
+
+                    str_not_keyword = '''
+                    漆,复合风管,检测,涂料工程,涂料多乐士
+                    '''
+                    list_not_key = splitIntoList(str_not_keyword,"[\s\n、,,]")
+
+                    tenderee_keywrod = "医院、大学、高校、高中"
+                    list_t_key = splitIntoList(tenderee_keywrod,"[\s\n、,,]")
+
+                    log(str(list_keyword))
+                    bool_query = BoolQuery(must_queries=[
+                        generateBoolShouldQuery(["doctitle"],list_keyword,MatchPhraseQuery),
+                        generateBoolShouldQuery(["docchannel"],[52,102,114],TermQuery),
+                        TermQuery("page_time",current_date),
+                        RangeQuery("status",151,300,True,True),
+                        # TermQuery("procurement_system","公安系统"),
+                        generateBoolShouldQuery(["province"],list_province,TermQuery),
+                        # generateBoolShouldQuery(["tenderee"],list_t_key,WildcardQuery)
+                        # generateBoolShouldQuery(["docchannel"],[101,118,119],TermQuery),
+                    ],
+                        must_not_queries=[
+                            generateBoolShouldQuery(["doctitle"],list_not_key,MatchPhraseQuery),
+                        ]
+                    )
+
+                    list_query.append({"query":bool_query,"limit":700})
+                    list_row = getDocument(list_query,columns)
+
+                    log("get document %d rows"%len(list_row))
+
+                    df_data = {}
+                    set_line = set()
+                    # list_row = filterRow(list_row)
+                    df_column = getRowData(df_data,list_row,set_line,list_keyword,dict_channel,True)
+
+                    df = pd.DataFrame(df_data)
+                    print(df_data)
+
+
+                    filename = os.path.dirname(__file__)+"/data/%s年%s数据导出.xlsx"%(current_date[:4],current_date)
+                    df.to_excel(filename,columns=df_column)
+                    log(str(filename))
+
+                    host = "smtp.exmail.qq.com"
+                    username = "vip@bidizhaobiao.com"
+                    password = "Biaoxun66-"
+                    receivers = ["1175730271@qq.com","365531448@qq.com"]
+                    # receivers = ["1175730271@qq.com"]
+                    attachs = [filename]
+
+                    sendEmail(host,username,password,receivers,attachs=attachs)
+                    break
+                except Exception as e:
+                    traceback.print_exc()
+
+
+
+def export_document_except():
+    def getRowData(df_data,rows,dict_channel,set_columns1,list_df_column1):
+
+        _index = 0
+        for row in rows:
+            _index += 1
+            item = {}
+            _dict = row
+            _extract = json.loads(_dict.get("extract_json","{}"))
+            product_attrs = _extract.get("product_attrs",{"data":[{}]}).get("data")
+            docid = _dict.get("docid","")
+            win_tenderer = ""
+            bidding_budget = ""
+            sub_docs_json = json.loads(_dict.get("sub_docs_json","[]"))
+            for _doc in sub_docs_json:
+                if win_tenderer=="":
+                    win_tenderer = _doc.get("win_tenderer","")
+                if bidding_budget=="":
+                    bidding_budget= _doc.get("bidding_budget","")
+            for _attrs in product_attrs:
+
+                set_dict_item_columns(set_columns1,list_df_column1,item,"公告链接",'=HYPERLINK("http://www.bidizhaobiao.com/info-%s.html","查看公告")'%(str(docid)))
+                set_dict_item_columns(set_columns1,list_df_column1,item,"公告时间",_dict.get("page_time"))
+                set_dict_item_columns(set_columns1,list_df_column1,item,"省份",_dict.get("province"))
+                set_dict_item_columns(set_columns1,list_df_column1,item,"城市",_dict.get("city"))
+                set_dict_item_columns(set_columns1,list_df_column1,item,"地区",_dict.get("district"))
+                set_dict_item_columns(set_columns1,list_df_column1,item,"项目标题",_dict.get("doctitle"))
+                set_dict_item_columns(set_columns1,list_df_column1,item,"采购单位",_dict.get("tenderee"))
+                set_dict_item_columns(set_columns1,list_df_column1,item,"中标单位",win_tenderer)
+                set_dict_item_columns(set_columns1,list_df_column1,item,"产品名称",_dict.get("product"))
+                set_dict_item_columns(set_columns1,list_df_column1,item,"规格型号",_attrs.get("specs",""))
+                set_dict_item_columns(set_columns1,list_df_column1,item,"品牌",_attrs.get("brand",""))
+                _quantity = ""
+                _uniPrice = ""
+                try:
+                    _quantity = int(_attrs.get("quantity","0"))
+                    if _quantity==0:
+                        _quantity = ""
+                except Exception as e:
+                    pass
+                try:
+                    _uniPrice = float(_attrs.get("_uniPrice","0"))
+                    if _uniPrice==0:
+                        _uniPrice = ""
+                except Exception as e:
+                    pass
+                set_dict_item_columns(set_columns1,list_df_column1,item,"数量",_quantity)
+                set_dict_item_columns(set_columns1,list_df_column1,item,"单价(元)",_uniPrice)
+                sumPrice = ""
+                if _quantity!="" and _uniPrice!="":
+                    sumPrice = _quantity*_uniPrice
+                set_dict_item_columns(set_columns1,list_df_column1,item,"总价(元)",sumPrice)
+                set_dict_item_columns(set_columns1,list_df_column1,item,"项目金额(元)",bidding_budget)
+
+
+
+                for k,v in item.items():
+                    if k not in df_data:
+                        df_data[k] = []
+                    df_data[k].append(v)
+
+    current_date = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())))
+    current_date = '2022-04-01'
+
+    columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
+    dict_channel = getDict_docchannel()
+
+
+    list_query = []
+
+    str_keyword = '''
+                    机房建设 	多媒体教室建设 	数据中心机房 	班班通	电子设备
+视频 	LED	监控	视频监控系统	信息安全
+信息化建设	网络安全	网络安全服务	网络安全系统	等级保护测评
+信息系统	大数据中心	智慧教室建设	数据安全建设	可视化指挥联动平台
+音频处理器	教学专用仪器	多媒体扩声系统	智慧校园	数字化
+交换	路由	无线	信息安全	触控一体机
+教学设备						
+                
+                    '''
+    list_keyword = splitIntoList(str_keyword,"[\s\n、,,]")
+
+    str_not_keyword = '''
+                    '''
+    list_not_key = splitIntoList(str_not_keyword,"[\s\n、,,]")
+
+    tenderee_keywrod = "医院、大学、高校、高中"
+    list_t_key = splitIntoList(tenderee_keywrod,"[\s\n、,,]")
+    s_q = BoolQuery(should_queries=[BoolQuery(must_queries=[TermQuery("docchannel",52)],
+                                              must_not_queries=[WildcardQuery("tenderee","*")]),
+                                    BoolQuery(must_queries=[generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery)],
+                                              must_not_queries=[NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*"))])])
+    log(str(list_keyword))
+    set_columns1 = set()
+    list_df_columns1 = []
+    bool_query = BoolQuery(must_queries=[
+        TermQuery("page_time",current_date),
+        RangeQuery("status",151,451,True,True),
+        s_q
+    ],
+        # must_not_queries=[
+        #     generateBoolShouldQuery(["doctitle"],list_not_key,MatchPhraseQuery),
+        # ]
+    )
+
+    list_query.append({"query":bool_query,"limit":500000})
+    list_row = getDocument(list_query,columns)
+
+    log("get document %d rows"%len(list_row))
+    df_data = {}
+    set_line = set()
+    # list_row = filterRow(list_row,list_not_key)
+    getRowData(df_data,list_row,set_line,list_keyword,dict_channel,True)
+
+    # fixContactPerson(df_data,list_df_columns)
+
+    df = pd.DataFrame(df_data)
+
+    filename = os.path.dirname(__file__)+"/data/%s异常数据导出.xlsx"%(current_date)
+    print(list_df_columns)
+    df.to_excel(filename,columns=list_df_columns)
+
+class Export3():
+
+    def trytimes(self):
+        for _ in range(3):
+            _succeed = self.export3()
+            if _succeed:
+                break
+
+    def export3(self,):
+        """Export yesterday's documents that match the "subscription 4/5"
+        product keywords for a fixed province list, flatten them to an
+        Excel file and e-mail the file.
+
+        Returns:
+            bool: True on success, False if any exception occurred.
+        """
+
+        def getRowData(df_data,rows,dict_channel,set_columns1,list_df_column1):
+            # Flatten each document row into df_data (column -> list of
+            # values); column order is recorded in list_df_column1.
+            _index = 0
+            for row in rows:
+
+
+                _index += 1
+                item = {}
+                _dict = row
+                _extract = json.loads(_dict.get("extract_json","{}"))
+                demand_info = _extract.get("demand_info",{"data":[]}).get("data")
+                docid = _dict.get("docid","")
+                win_tenderer = ""
+                bidding_budget = ""
+                win_bid_price = ""
+                sub_docs_json = json.loads(_dict.get("sub_docs_json","[]"))
+                # Take the first non-empty winner / budget / price across
+                # the sub-documents.
+                for _doc in sub_docs_json:
+                    if win_tenderer=="":
+                        win_tenderer = _doc.get("win_tenderer","")
+                    if bidding_budget=="":
+                        bidding_budget= _doc.get("bidding_budget","")
+                    if win_bid_price=="":
+                        win_bid_price = _doc.get("win_bid_price","")
+                if len(demand_info)==0:
+                    demand_info = [{}]
+                # NOTE(review): demand_info is unconditionally reset to [{}],
+                # so exactly one output row per document is emitted and the
+                # len() check above is dead code — confirm intended.
+                demand_info = [{}]
+                for _attrs in demand_info:
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"序号",_index)
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"项目编号",_dict.get("project_code"))
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"项目名称",_dict.get("project_name"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"省份",_dict.get("province"))
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"城市",_dict.get("city"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"信息类型",dict_channel.get(_dict.get("docchannel")))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"发布时间",_dict.get("crtime"))
+
+                    # Prefer budget, else winning price; converted to 万元 (10k CNY).
+                    b_or_w = ""
+                    if bidding_budget!="":
+                        b_or_w = float(bidding_budget)/10000
+                    elif win_bid_price!="":
+                        b_or_w = float(win_bid_price)/10000
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标金额/预算金额(万元)",b_or_w)
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"招标单位",_dict.get("tenderee"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"招标项目联系人","%s\n%s"%(_dict.get("tenderee_contact",""),_dict.get("tenderee_phone","")))
+
+
+# Columns: winning bidder / winner project contact / winner main contact / winner contacts / agency / agency project contact / agency contacts / bidizhaobiao announcement URL
+
+                    # The per-enterprise contact columns are left empty here
+                    # and filled in later by fixContactPerson().
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"招标单位联系人","")
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标单位",win_tenderer)
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标项目联系人","")
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标单位主要联系人","")
+
+
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标单位联系人","")
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"代理单位",_dict.get("agency",""))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"代理项目联系人","%s\n%s"%(_dict.get("agency_contact",""),_dict.get("agency_phone","")))
+
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"代理单位联系人","")
+
+
+                    # RSA public key (base64-encoded DER) used to build the
+                    # encrypted emailSecret token for the announcement URL.
+                    _key ='''MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC8uZYYV6ls+5KMzUfvsPvv3XRdwkcTBj/ppB03mijUPHYTGvYSE0cQTbQrnIbXFtUYJguakpKLmVyH+T/w6vhxbQNlaykfe8RXEh4i4IJk8s/Qb0E0xODsjKBEr8VdDYeqqduWrtJpttXAvv93SsTPvgZBditRzJAzk0XH56zL1wIDAQAB'''
+
+                    _key = base64.b64decode(_key)
+                    _url_map = {"userId":"001795335",
+                                "timestamp":"%d"%(time.time()*1000),
+                                "docid":str(_dict.get("docid",""))}
+                    _encrpt_text = base64.b64encode(rsa_encrpt(json.dumps(_url_map),_key))
+                    _encrpt_text = urllib.parse.quote(_encrpt_text)
+                    _url = "http://www.bidizhaobiao.com/info-%s.html?emailSecret=%s"%(str(_dict.get("docid")),_encrpt_text)
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"比地招标公告地址",_url)
+
+
+
+
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"项目编号",_dict.get("project_code"))
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"项目名称",_attrs.get("project_name"))
+                    #
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"需求",_attrs.get("demand"))
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"预算(元)",_attrs.get("budget"))
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"开始时间",_attrs.get("order_begin"))
+
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"中标单位",win_tenderer)
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"产品名称",_attrs.get("product"))
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"规格型号",_attrs.get("specs",""))
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"品牌",_attrs.get("brand",""))
+                    # _quantity = ""
+                    # _uniPrice = ""
+                    # try:
+                    #     _quantity = int(_attrs.get("quantity","0"))
+                    #     if _quantity==0:
+                    #         _quantity = ""
+                    # except Exception as e:
+                    #     pass
+                    # try:
+                    #     _uniPrice = float(_attrs.get("unitPrice","0"))
+                    #     if _uniPrice==0:
+                    #         _uniPrice = ""
+                    # except Exception as e:
+                    #     pass
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"数量",_quantity)
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"单价(元)",_uniPrice)
+                    # sumPrice = ""
+                    # if _quantity!="" and _uniPrice!="":
+                    #     sumPrice = _quantity*_uniPrice
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"总价(元)",sumPrice)
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"项目金额(元)",bidding_budget)
+
+
+
+                    for k,v in item.items():
+                        if k not in df_data:
+                            df_data[k] = []
+                        df_data[k].append(v)
+
+        def getRowData1(df_data,rows,dict_channel,set_columns1,list_df_column1):
+            # Alternative flattener with a procurement-oriented column layout.
+            # NOTE(review): not called anywhere inside export3 — appears unused.
+            _index = 0
+            for row in rows:
+
+
+                _index += 1
+                item = {}
+                _dict = row
+                _extract = json.loads(_dict.get("extract_json","{}"))
+                demand_info = _extract.get("demand_info",{"data":[]}).get("data")
+                docid = _dict.get("docid","")
+                win_tenderer = ""
+                bidding_budget = ""
+                win_bid_price = ""
+                sub_docs_json = json.loads(_dict.get("sub_docs_json","[]"))
+                for _doc in sub_docs_json:
+                    if win_tenderer=="":
+                        win_tenderer = _doc.get("win_tenderer","")
+                    if bidding_budget=="":
+                        bidding_budget= _doc.get("bidding_budget","")
+                    if win_bid_price=="":
+                        win_bid_price = _doc.get("win_bid_price","")
+                if len(demand_info)==0:
+                    demand_info = [{}]
+                demand_info = [{}]
+                for _attrs in demand_info:
+
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"序号",_index)
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"项目编号",_dict.get("project_code"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"地区",_dict.get("district"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"省份",_dict.get("province"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"项目名称",_dict.get("project_name"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"采购人",_dict.get("tenderee"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"招标时间",_dict.get("page_time"))
+
+
+                    # NOTE(review): b_or_w is computed but never written to a
+                    # column in this function.
+                    b_or_w = ""
+                    if bidding_budget!="":
+                        b_or_w = float(bidding_budget)/10000
+                    elif win_bid_price!="":
+                        b_or_w = float(win_bid_price)/10000
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"招标预算",bidding_budget)
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标时间",_dict.get("page_time"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标金额",win_bid_price)
+
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标人",win_tenderer)
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"采购模式",_dict.get("bidway"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"命中关键词",_dict.get("keyword"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"采购品牌",'')
+
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"采购数量",'')
+
+
+
+                    for k,v in item.items():
+                        if k not in df_data:
+                            df_data[k] = []
+                        df_data[k].append(v)
+
+        def getContacts(contacts):
+            # Format up to five contacts from a JSON string into one
+            # newline-joined display string; returns "" on any parse error.
+            try:
+                if contacts is None or contacts=="":
+                    return ""
+
+                _contacts = json.loads(contacts)
+
+                list_c = []
+                _count = 0
+                for _c in _contacts:
+                    _count += 1
+                    contact_person = _c.get("contact_person","")
+                    phone_no = _c.get("phone_no","")
+                    list_c.append("%s(企业联系人%d)\n%s"%(str(contact_person),_count,str(phone_no)))
+                    if _count>=5:
+                        break
+                return ",\n".join(list_c)
+            except Exception as e:
+                return ""
+
+
+        def fixContactPerson(df_data,list_df_columns,get_legal_person=False):
+            # Back-fill the empty "…联系人" columns by looking up each
+            # enterprise's contact list in bulk via getDictEnterprise.
+            # Assumes dict_enterprise maps name -> {"contacts": <json>} —
+            # TODO confirm against getDictEnterprise's return shape.
+            set_enterprise = set()
+            if len(df_data.keys())>0:
+                for _tenderee,_agency,_win in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
+                    set_enterprise.add(_tenderee)
+                    set_enterprise.add(_agency)
+                    set_enterprise.add(_win)
+                if "" in set_enterprise:
+                    set_enterprise.remove("")
+                if None in set_enterprise:
+                    set_enterprise.remove(None)
+                dict_enterprise = getDictEnterprise(list(set_enterprise),columns_to_get = ["contacts","procurement_system"])
+
+
+                # conn = getConnection_oracle()
+                # cursor = conn.cursor()
+                if len(set_enterprise)>0:
+                    for _i in range(len(df_data["招标单位"])):
+                        _enterprise_name = df_data["招标单位"][_i]
+
+                        df_data["招标单位联系人"][_i] = getContacts(dict_enterprise.get(_enterprise_name,{}).get("contacts"))
+
+                        _enterprise_name = df_data["代理单位"][_i]
+
+                        df_data["代理单位联系人"][_i] = getContacts(dict_enterprise.get(_enterprise_name,{}).get("contacts"))
+
+                        _enterprise_name = df_data["中标单位"][_i]
+
+                        df_data["中标单位联系人"][_i] = getContacts(dict_enterprise.get(_enterprise_name,{}).get("contacts"))
+
+                        # if "采购系统" not in df_data:
+                        #     df_data["采购系统"] = []
+                        # df_data["采购系统"].append(dict_enterprise.get(_enterprise_name,{}).get("procurement_system",""))
+
+                    # list_df_columns.extend(['采购系统'])
+        '''
+        export by customer's subscription
+        :return:
+        '''
+        try:
+            # NOTE(review): ots_client is created but not used directly below;
+            # getDocument presumably manages its own connection — verify.
+            ots_client = getConnect_ots()
+
+            subscription4 = '''
+            
+            触控一体机      交互平板    交互一体机   交互智能平板      交互大屏    教学一体机    智慧屏  智慧黑板     互动黑板    班班通    多媒体教室    多媒体设备   多媒体系统 智慧教室
+            '''
+            # provinces = "河北   山西  广东  海南  江苏  安徽  山东  河南  湖北  湖南  重庆   黑龙江  陕西    甘肃  青海   宁夏"
+            provinces = "海南  江苏  安徽 湖北  湖南  重庆   甘肃  青海   宁夏  河南 北京 天津"
+
+
+            dict_channel = getDict_docchannel()
+            list_province = splitIntoList(provinces,"\s|,|,|、")
+            list_subscription4 = splitIntoList(subscription4,"\s|,|,|、")
+
+            current_date = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())))
+            last_date = timeAdd(current_date,-1)
+
+            # last_date = "2023-12-08"
+            # current_date = "2023-12-10"
+
+            set_columns1 = set()
+            list_df_columns1 = []
+            # Match channels 102/114, status 201-300, keyword hit in title or
+            # body or attachment text, selected provinces, created within the
+            # last 24h window (21:00 yesterday .. 21:00 today).
+            bool_query = BoolQuery(must_queries=[
+                generateBoolShouldQuery(["docchannel"],[102,114],TermQuery),
+                RangeQuery("page_time",last_date),
+                RangeQuery("status",201,300,True,True),
+                generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_subscription4,MatchPhraseQuery),
+                generateBoolShouldQuery(["province"],list_province,WildcardQuery),
+                RangeQuery("crtime","%s 21:00:00"%last_date,"%s 21:00:00"%current_date)
+            ],
+                # must_not_queries=[
+                #     generateBoolShouldQuery(["doctitle"],list_not_key,MatchPhraseQuery),
+                # ]
+            )
+
+
+            columns = ["doctitle","docchannel","original_docchannel","product","crtime","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","extract_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
+            list_query = [{"query":bool_query,"limit":50000}]
+            list_row = getDocument(list_query,columns)
+
+            filename0 = os.path.dirname(__file__)+"/data/%s订阅4订阅5数据导出数量%d_%s.xlsx"%(current_date,len(list_row),getCurrent_date("%Y-%m-%d_%H%M%S"))
+            df_data = {}
+            if len(list_row)>0:
+                getRowData(df_data,list_row,dict_channel,set_columns1,list_df_columns1)
+
+                fixContactPerson(df_data,list_df_columns1)
+                df = pd.DataFrame(df_data)
+                df.to_excel(filename0,columns=list_df_columns1,index=False)
+            else:
+                # No matches: still produce (and mail) an empty workbook.
+                df = pd.DataFrame(df_data)
+                df.to_excel(filename0,index=False)
+
+            # SECURITY(review): hard-coded SMTP credentials in source control —
+            # move to configuration / a secret store.
+            host = "smtp.exmail.qq.com"
+            username = "vip1@bidizhaobiao.com"
+            password = "Biaoxun666+"
+            # receivers = ["1175730271@qq.com","995116318@qq.com","huangxiaofang@cvte.com"]
+            receivers = ["1175730271@qq.com","1208135584@qq.com","youyuer@cvte.com","chenyuxue@cvte.com"]
+
+            # receivers = ["1175730271@qq.com"]
+
+            attachs = [filename0]
+
+            sendEmail(host,username,password,receivers,attachs=attachs)
+
+            return True
+        except Exception as e:
+            traceback.print_exc()
+        return False
+
+
+    def export3_1(self,):
+
+        def getRowData(df_data,rows,dict_channel,set_columns1,list_df_column1):
+
+            _index = 0
+            for row in rows:
+
+
+                _index += 1
+                item = {}
+                _dict = row
+                _extract = json.loads(_dict.get("extract_json","{}"))
+                demand_info = _extract.get("demand_info",{"data":[]}).get("data")
+                docid = _dict.get("docid","")
+                win_tenderer = ""
+                bidding_budget = ""
+                win_bid_price = ""
+                sub_docs_json = json.loads(_dict.get("sub_docs_json","[]"))
+                for _doc in sub_docs_json:
+                    if win_tenderer=="":
+                        win_tenderer = _doc.get("win_tenderer","")
+                    if bidding_budget=="":
+                        bidding_budget= _doc.get("bidding_budget","")
+                    if win_bid_price=="":
+                        win_bid_price = _doc.get("win_bid_price","")
+                if len(demand_info)==0:
+                    demand_info = [{}]
+                demand_info = [{}]
+                for _attrs in demand_info:
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"序号",_index)
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"项目编号",_dict.get("project_code"))
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"项目名称",_dict.get("project_name"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"省份",_dict.get("province"))
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"城市",_dict.get("city"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"信息类型",dict_channel.get(_dict.get("docchannel")))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"发布时间",_dict.get("crtime"))
+
+                    b_or_w = ""
+                    if bidding_budget!="":
+                        b_or_w = float(bidding_budget)/10000
+                    elif win_bid_price!="":
+                        b_or_w = float(win_bid_price)/10000
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标金额/预算金额(万元)",b_or_w)
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"招标单位",_dict.get("tenderee"))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"招标项目联系人","%s\n%s"%(_dict.get("tenderee_contact",""),_dict.get("tenderee_phone","")))
+
+
+                    # 中标单位	中标项目联系人	中标单位主要联系人	中标单位联系人	代理单位	代理项目联系人	代理单位联系人	比地招标公告地址
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"招标单位联系人","")
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标单位",win_tenderer)
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标项目联系人","")
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标单位主要联系人","")
+
+
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标单位联系人","")
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"代理单位",_dict.get("agency",""))
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"代理项目联系人","%s\n%s"%(_dict.get("agency_contact",""),_dict.get("agency_phone","")))
+
+
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"代理单位联系人","")
+
+
+                    _key ='''MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC8uZYYV6ls+5KMzUfvsPvv3XRdwkcTBj/ppB03mijUPHYTGvYSE0cQTbQrnIbXFtUYJguakpKLmVyH+T/w6vhxbQNlaykfe8RXEh4i4IJk8s/Qb0E0xODsjKBEr8VdDYeqqduWrtJpttXAvv93SsTPvgZBditRzJAzk0XH56zL1wIDAQAB'''
+
+                    _key = base64.b64decode(_key)
+                    _url_map = {"userId":"001795335",
+                                "timestamp":"%d"%(time.time()*1000),
+                                "docid":str(_dict.get("docid",""))}
+                    _encrpt_text = base64.b64encode(rsa_encrpt(json.dumps(_url_map),_key))
+                    _encrpt_text = urllib.parse.quote(_encrpt_text)
+                    _url = "http://www.bidizhaobiao.com/info-%s.html?emailSecret=%s"%(str(_dict.get("docid")),_encrpt_text)
+                    set_dict_item_columns(set_columns1,list_df_column1,item,"比地招标公告地址",_url)
+
+
+
+
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"项目编号",_dict.get("project_code"))
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"项目名称",_attrs.get("project_name"))
+                    #
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"需求",_attrs.get("demand"))
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"预算(元)",_attrs.get("budget"))
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"开始时间",_attrs.get("order_begin"))
+
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"中标单位",win_tenderer)
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"产品名称",_attrs.get("product"))
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"规格型号",_attrs.get("specs",""))
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"品牌",_attrs.get("brand",""))
+                    # _quantity = ""
+                    # _uniPrice = ""
+                    # try:
+                    #     _quantity = int(_attrs.get("quantity","0"))
+                    #     if _quantity==0:
+                    #         _quantity = ""
+                    # except Exception as e:
+                    #     pass
+                    # try:
+                    #     _uniPrice = float(_attrs.get("unitPrice","0"))
+                    #     if _uniPrice==0:
+                    #         _uniPrice = ""
+                    # except Exception as e:
+                    #     pass
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"数量",_quantity)
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"单价(元)",_uniPrice)
+                    # sumPrice = ""
+                    # if _quantity!="" and _uniPrice!="":
+                    #     sumPrice = _quantity*_uniPrice
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"总价(元)",sumPrice)
+                    # set_dict_item_columns(set_columns1,list_df_column1,item,"项目金额(元)",bidding_budget)
+
+
+
+                    for k,v in item.items():
+                        if k not in df_data:
+                            df_data[k] = []
+                        df_data[k].append(v)
+
+
+        def getContacts(contacts):
+
+            try:
+                if contacts is None or contacts=="":
+                    return ""
+
+                if isinstance(contacts,str):
+                    _contacts = json.loads(contacts)
+                else:
+                    _contacts = contacts
+
+                list_c = []
+                _count = 0
+                for _c in _contacts:
+                    _count += 1
+                    contact_person = _c.get("contact_person","")
+                    phone_no = _c.get("phone_no","")
+                    list_c.append("%s(企业联系人%d)\n%s"%(str(contact_person),_count,str(phone_no)))
+                    if _count>=5:
+                        break
+                print("getContacts",",\n".join(list_c))
+                return ",\n".join(list_c)
+            except Exception as e:
+                traceback.print_exc()
+                print("getContacts","")
+                return ""
+
+
        def fixContactPerson(df_data,list_df_columns,get_legal_person=False):
            """Fill the 招标单位联系人/代理单位联系人/中标单位联系人 columns of df_data.

            Collects every distinct tenderee/agency/winner name, fetches their
            contact lists in one batched getDictEnterprise() call, then rewrites
            the three contact columns row by row via getContacts().
            NOTE(review): list_df_columns and get_legal_person are unused here;
            assumes the three contact columns already exist in df_data — confirm
            against the getRowData that built it.
            """
            set_enterprise = set()
            if len(df_data.keys())>0:
                # gather every distinct enterprise name across the three roles
                for _tenderee,_agency,_win in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
                    set_enterprise.add(_tenderee)
                    set_enterprise.add(_agency)
                    set_enterprise.add(_win)
                # drop placeholder names before querying
                if "" in set_enterprise:
                    set_enterprise.remove("")
                if None in set_enterprise:
                    set_enterprise.remove(None)
                # one batched lookup instead of one query per row
                dict_enterprise = getDictEnterprise(list(set_enterprise),columns_to_get = ["contacts","procurement_system"])


                # conn = getConnection_oracle()
                # cursor = conn.cursor()
                if len(set_enterprise)>0:
                    for _i in range(len(df_data["招标单位"])):
                        _enterprise_name = df_data["招标单位"][_i]

                        df_data["招标单位联系人"][_i] = getContacts(dict_enterprise.get(_enterprise_name,{}).get("contacts"))

                        _enterprise_name = df_data["代理单位"][_i]

                        df_data["代理单位联系人"][_i] = getContacts(dict_enterprise.get(_enterprise_name,{}).get("contacts"))

                        _enterprise_name = df_data["中标单位"][_i]

                        df_data["中标单位联系人"][_i] = getContacts(dict_enterprise.get(_enterprise_name,{}).get("contacts"))

                        # if "采购系统" not in df_data:
                        #     df_data["采购系统"] = []
                        # df_data["采购系统"].append(dict_enterprise.get(_enterprise_name,{}).get("procurement_system",""))

                    # list_df_columns.extend(['采购系统'])
        # NOTE(review): this triple-quoted string sits mid-body (after the nested
        # defs), so it is a no-op expression, not the method's docstring.
        '''
        export by customer's subscription
        :return:
        '''
        try:
            ots_client = getConnect_ots()

            # subscription keyword set: classroom-recording / smart-classroom terms
            subscription4 = '''
            
            录播、三个课堂、互动课堂、同步课堂、双师课堂、专递课堂、微格教室、远程课堂、督导巡课、推门听课、互动教学
            '''
            # provinces = "河北   山西  广东  海南  江苏  安徽  山东  河南  湖北  湖南  重庆   黑龙江  陕西    甘肃  青海   宁夏"
            provinces = "海南  江苏  安徽 湖北  湖南  重庆   甘肃  青海   宁夏  河南"


            dict_channel = getDict_docchannel()
            # NOTE(review): list_province is built but the province filter below
            # is commented out, so it is currently unused.
            list_province = splitIntoList(provinces,"\s|,|,|、")
            list_subscription4 = splitIntoList(subscription4,"\s|,|,|、")

            # export window: yesterday 21:00 through today 21:00 (see crtime range)
            current_date = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())))
            last_date = timeAdd(current_date,-1)

            # last_date = "2023-08-14"
            # current_date = "2023-08-20"

            set_columns1 = set()
            list_df_columns1 = []
            bool_query = BoolQuery(must_queries=[
                generateBoolShouldQuery(["docchannel"],[102,114],TermQuery),
                RangeQuery("page_time",last_date),
                RangeQuery("status",201,300,True,True),
                generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_subscription4,MatchPhraseQuery),
                # generateBoolShouldQuery(["province"],list_province,WildcardQuery),
                RangeQuery("crtime","%s 21:00:00"%last_date,"%s 21:00:00"%current_date)
            ],
                # must_not_queries=[
                #     generateBoolShouldQuery(["doctitle"],list_not_key,MatchPhraseQuery),
                # ]
            )


            columns = ["doctitle","docchannel","original_docchannel","product","crtime","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","extract_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
            list_query = [{"query":bool_query,"limit":50000}]
            list_row = getDocument(list_query,columns)

            filename0 = os.path.dirname(__file__)+"/data/%s订阅4订阅5数据导出数量%d_%s.xlsx"%(current_date,len(list_row),getCurrent_date("%Y-%m-%d_%H%M%S"))
            df_data = {}
            if len(list_row)>0:
                getRowData(df_data,list_row,dict_channel,set_columns1,list_df_columns1)

                fixContactPerson(df_data,list_df_columns1)
                df = pd.DataFrame(df_data)
                df.to_excel(filename0,columns=list_df_columns1,index=False)
            else:
                # no hits: still emit an (empty) workbook so the mail goes out
                df = pd.DataFrame(df_data)
                df.to_excel(filename0,index=False)

            host = "smtp.exmail.qq.com"
            # SECURITY(review): SMTP credentials are hardcoded in source; move to
            # configuration / secret storage.
            username = "vip1@bidizhaobiao.com"
            password = "Biaoxun666+"
            # receivers = ["1175730271@qq.com","995116318@qq.com","huangxiaofang@cvte.com"]
            receivers = ["1175730271@qq.com","1208135584@qq.com","wanghongyan@cvte.com"]

            # receivers = ["1175730271@qq.com"]

            attachs = [filename0]

            sendEmail(host,username,password,receivers,attachs=attachs)

            return True
        except Exception as e:
            # best-effort daily export: log the failure and report False
            traceback.print_exc()
        return False
+
    def export4(self):

        def getRowData(df_data,rows,dict_channel,set_columns1,list_df_column1):
            """Flatten OTS document rows into the column-oriented df_data dict.

            One output row is produced per extracted product attribute of each
            document; docids flagged in the manual cleaning workbook
            (getFixData) are dropped or get their amounts overridden.
            NOTE(review): dict_channel is unused in this function.
            """
            dict_filter,dict_update = getFixData()
            _index = 0
            for row in rows:


                _index += 1
                item = {}
                _dict = row
                keys = row["keyword"]
                log("================")
                log(keys)
                log(_dict.get("doctitle","")+_dict.get("doctextcon","")+_dict.get("attachmenttextcon",""))
                # keep only the subscription keywords that actually occur in
                # title / body / attachment text
                _keyword = str(list(set([ i for i in re.findall("|".join([re.escape(i) for i in keys if i!='']),_dict.get("doctitle","")+_dict.get("doctextcon","")+_dict.get("attachmenttextcon","")) if i!='' ])))
                log(_keyword)
                _extract = json.loads(_dict.get("extract_json","{}"))
                # product rows are only broken out for these hardware keywords
                if len(set(keys)&set(["AR一体机","一体机","短焦投影仪","教学一体机","云桌面","录播","电子书包","电子班牌","智慧黑板"]))>0:
                    product_attrs = _extract.get("product_attrs",{"data":[{}]}).get("data")
                else:
                    product_attrs = [{}]
                if len(product_attrs)==0:
                    product_attrs = [{}]
                docid = _dict.get("docid","")
                win_tenderer = ""
                bidding_budget = ""
                win_bid_price = ""
                # take the first non-empty value across the sub-documents
                sub_docs_json = json.loads(_dict.get("sub_docs_json","[]"))
                for _doc in sub_docs_json:
                    if win_tenderer=="":
                        win_tenderer = _doc.get("win_tenderer","")
                    if bidding_budget=="":
                        bidding_budget= _doc.get("bidding_budget","")
                    if win_bid_price=="":
                        win_bid_price = _doc.get("win_bid_price","")
                _index1 = 0
                # manual cleaning workbook: skip rows marked duplicate/cancelled...
                if _dict.get("docid") in dict_filter:
                    print("====filter")
                    continue
                # ...and override amounts for manually corrected rows
                if _dict.get("docid") in dict_update:
                    print("====update")
                    win_bid_price = dict_update[_dict.get("docid")].get("win_bid_price","")
                    bidding_budget= dict_update[_dict.get("docid")].get("bidding_budget","")
                for _attrs in product_attrs:
                    _index1 += 1
                    # "运费" (freight) items: blank the first one, skip the rest
                    if re.search("运费",_attrs.get("product","")) is not None:
                        if _index1==1:
                            _attrs = {}
                        else:
                            continue
                    set_dict_item_columns(set_columns1,list_df_column1,item,"序号",_index)
                    # set_dict_item_columns(set_columns1,list_df_column1,item,"正文",_dict.get("doctextcon",""))
                    # set_dict_item_columns(set_columns1,list_df_column1,item,"附件",_dict.get("attachmenttextcon",""))
                    # set_dict_item_columns(set_columns1,list_df_column1,item,"项目名称",_dict.get("project_name",""))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"docid",_dict.get("docid"))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"网站链接",'=HYPERLINK("http://www.bidizhaobiao.com/info-%s.html","查看公告")'%(str(docid)))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"省份",_dict.get("province"))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"城市",_dict.get("city"))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"地区",_dict.get("district"))

                    set_dict_item_columns(set_columns1,list_df_column1,item,"公告时间",_dict.get("page_time"))

                    # left empty here; backfilled later by fixZhaobiao_page_time
                    set_dict_item_columns(set_columns1,list_df_column1,item,"招标公告时间",'')

                    set_dict_item_columns(set_columns1,list_df_column1,item,"项目标题",_dict.get("doctitle"))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"采购单位",_dict.get("tenderee"))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标供应商",win_tenderer)
                    set_dict_item_columns(set_columns1,list_df_column1,item,"采购方式",_dict.get("bidway",""))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"招标采购方式","")

                    # left empty here; backfilled later by fixContactPerson
                    set_dict_item_columns(set_columns1,list_df_column1,item,"采购系统","")


                    set_dict_item_columns(set_columns1,list_df_column1,item,"产品名称",_attrs.get("product"))

                    set_dict_item_columns(set_columns1,list_df_column1,item,"品牌",_attrs.get("brand",""))

                    # set_dict_item_columns(set_columns1,list_df_column1,item,"规格型号",_attrs.get("specs",""))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"预算金额(元)",bidding_budget)
                    # set_dict_item_columns(set_columns1,list_df_column1,item,"招标预算金额(元)","")
                    _quantity = ""
                    _uniPrice = ""
                    try:
                        # quantity: digits only; zero or unparsable becomes ""
                        _quantity = int(re.sub("[^\d]","",_attrs.get("quantity","0")))
                        if _quantity==0:
                            _quantity = ""
                    except Exception as e:
                        pass
                    try:
                        # unit price: zero or unparsable becomes ""
                        _uniPrice = float(_attrs.get("unitPrice","0"))
                        if _uniPrice==0:
                            _uniPrice = ""
                    except Exception as e:
                        pass
                    set_dict_item_columns(set_columns1,list_df_column1,item,"数量",_quantity)
                    set_dict_item_columns(set_columns1,list_df_column1,item,"单价(元)",_uniPrice)
                    sumPrice = ""
                    if _quantity!="" and _uniPrice!="":
                        sumPrice = _quantity*_uniPrice
                    set_dict_item_columns(set_columns1,list_df_column1,item,"总价(元)",sumPrice)


                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标金额(元)",win_bid_price)

                    set_dict_item_columns(set_columns1,list_df_column1,item,"关键词",_keyword)




                    # append this flattened row into the column-oriented dict
                    for k,v in item.items():
                        if k not in df_data:
                            df_data[k] = []
                        df_data[k].append(v)
def filterRow(list_row,key="docid"):
    """Return list_row with duplicates removed, preserving order.

    Two rows are duplicates when they share the same value for *key*
    (rows missing the key compare as ""); the first occurrence wins.
    """
    seen = set()
    deduped = []
    for candidate in list_row:
        marker = candidate.get(key, "")
        if marker not in seen:
            seen.add(marker)
            deduped.append(candidate)
    return deduped
+
        def fixContactPerson(df_data,list_df_columns,get_legal_person=False):
            """Fill the 采购系统 column from each tenderee's procurement_system.

            All distinct 采购单位/中标供应商 names are fetched in one batched
            getDictEnterprise() call, then 采购系统 is rewritten per row.
            NOTE(review): list_df_columns and get_legal_person are unused; the
            contact-fixing logic this function is named after is commented out.
            """
            set_enterprise = set()
            if len(df_data.keys())>0:
                # collect every distinct tenderee / winner name
                for _tenderee,_win_tenderer in zip(df_data["采购单位"],df_data["中标供应商"]):
                    set_enterprise.add(_tenderee)
                    set_enterprise.add(_win_tenderer)
                # drop placeholder names before querying
                if "" in set_enterprise:
                    set_enterprise.remove("")
                if None in set_enterprise:
                    set_enterprise.remove(None)
                # one batched lookup instead of one query per row
                dict_enterprise = getDictEnterprise(list(set_enterprise),columns_to_get = ["procurement_system"])

                # conn = getConnection_oracle()
                # cursor = conn.cursor()
                if len(set_enterprise)>0:
                    for _i in range(len(df_data["采购单位"])):
                        _enterprise_name = df_data["采购单位"][_i]
                        # if df_data["招标联系人电话"][_i]=="":
                        #     contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
                        #     if contacts is not None:
                        #         _person,_phone = getOneContact(contacts)
                        #         df_data["招标联系人"][_i] = _person
                        #         df_data["招标联系人电话"][_i] = _phone


                        df_data["采购系统"][_i] = dict_enterprise.get(_enterprise_name,{}).get("procurement_system","")
+
        def fixZhaobiao_page_time(df_data):
            """Backfill 招标公告时间/招标采购方式/预算金额 from the project2 table.

            Issues one OTS search per row (keyed by docid against project2's
            docids field).  When the project record carries a budget and the
            row's 预算金额(元) is empty or zero, the project budget is adopted,
            but only if it is >= the row's 中标金额(元).
            NOTE(review): float(...) on the amount columns assumes they hold
            numeric strings — confirm upstream formatting; a per-row query may
            be slow for large exports.
            """
            ots_client = getConnect_ots()

            for i in range(len(df_data["docid"])):
                _docid = df_data["docid"][i]
                bool_query = BoolQuery(must_queries=[TermQuery("docids",_docid)])

                rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
                                                                               SearchQuery(bool_query,limit=1),
                                                                               ColumnsToGet(["zhao_biao_page_time","bidway","bidding_budget"],ColumnReturnType.SPECIFIED))
                list_data = getRow_ots(rows)
                if len(list_data)>0:
                    df_data["招标公告时间"][i] = list_data[0].get("zhao_biao_page_time","")
                    df_data["招标采购方式"][i] = list_data[0].get("bidway","")
                    new_bidding_budget = list_data[0].get("bidding_budget",0)
                    # adopt the project budget only for empty/zero rows, and only
                    # when it is not smaller than the winning price
                    if df_data["预算金额(元)"][i]=="" or float(df_data["预算金额(元)"][i])==0:
                        if df_data["中标金额(元)"][i]!="" and new_bidding_budget>=float(df_data["中标金额(元)"][i]):
                            df_data["预算金额(元)"][i] = new_bidding_budget
                    # df_data["招标预算金额(元)"][i] = list_data[0].get("bidding_budget","")
+
def popRows(df_data):
    """Drop non-education / price-less rows in place, then renumber 序号.

    A row survives only when its 采购系统 equals "教育系统" AND its
    中标金额(元) is non-empty.  序号 is rewritten afterwards so that rows
    sharing a docid share one running number.

    :return: number of distinct docids among the removed rows.
    """
    drop_indexes = []
    removed_docids = set()
    for pos, system in enumerate(df_data["采购系统"]):
        if system != "教育系统" or df_data["中标金额(元)"][pos] == "":
            drop_indexes.append(pos)
            removed_docids.add(df_data["docid"][pos])
    # delete from the tail so the earlier indexes stay valid
    for column in df_data.values():
        for pos in reversed(drop_indexes):
            column.pop(pos)
    # renumber: rows sharing a docid keep the same sequence number
    seq = 0
    seen_docids = set()
    for pos in range(len(df_data["序号"])):
        current = df_data["docid"][pos]
        if current not in seen_docids:
            seq += 1
            seen_docids.add(current)
        df_data["序号"][pos] = seq
    return len(removed_docids)
+
        # Load the keyword spec (sheet 2) of the subscription workbook.
        # NOTE(review): `sheetname=` was removed from modern pandas in favour of
        # `sheet_name=` — confirm the pinned pandas version still accepts it.
        df = pd.read_excel("20220927v1.4.xlsx",sheetname=1)

        # One search spec per workbook row: include/exclude keyword lists for
        # title and full text, split on whitespace / commas.
        list_search_dict = []
        for title_kw,content_kw,title_rm,content_rm in zip(df["标题采集关键词"],df["全文采集关键词"],df["标题排除词"],df["全文排除词"]):
            # NaN cells (non-str) are treated as empty keyword lists
            if not isinstance(title_kw,str):
                title_kw = ""
            if not isinstance(content_kw,str):
                content_kw = ""
            if not isinstance(title_rm,str):
                title_rm = ""
            if not isinstance(content_rm,str):
                content_rm = ""
            _dict = {"title_kw":re.split("\s|,|,",str(title_kw)),
                     "content_kw":re.split("\s|,|,",str(content_kw)),
                     "title_rm":re.split("\s|,|,",str(title_rm)),
                     "content_rm":re.split("\s|,|,",str(content_rm))}
            list_search_dict.append(_dict)


        columns = ["extract_json","doctextcon","attachmenttextcon","doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]

        current_date = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())))
        last_date = timeAdd(current_date,-1)

        # NOTE(review): hard-coded override of the date window; the names look
        # swapped (current_date < last_date) but match the RangeQuery order below.
        current_date = "2022-10-14"
        last_date = "2023-05-07"


        dict_channel = getDict_docchannel()
        # NOTE(review): list_mq is never used.
        list_mq = []
        list_query = []
        for _d in list_search_dict:
            print(_d)
            kw_should_q = []
            list_keys = []
            title_kw = _d.get("title_kw",[])
            if len(title_kw)>0:
                kw_should_q.append(generateBoolShouldQuery(["doctitle"],title_kw,MatchPhraseQuery))
                list_keys.extend(title_kw)
            content_kw = _d.get("content_kw",[])
            if len(content_kw)>0:
                kw_should_q.append(generateBoolShouldQuery(["doctextcon","attachmenttextcon"],content_kw,MatchPhraseQuery))
                list_keys.extend(content_kw)

            # exclusion keywords become must_not sub-queries
            rm_should_q = []
            title_rm = _d.get("title_rm",[])
            if len(title_rm)>0:
                rm_should_q.append(generateBoolShouldQuery(["doctitle"],title_rm,MatchPhraseQuery))
            content_rm = _d.get("content_rm",[])
            if len(content_rm)>0:
                rm_should_q.append(generateBoolShouldQuery(["doctextcon","attachmenttextcon"],content_rm,MatchPhraseQuery))

            _query = BoolQuery(must_queries=[
                generateBoolShouldQuery(["docchannel"],[101],TermQuery),
                                             #       RangeQuery("page_time",last_date,current_date),
                                             RangeQuery("page_time",current_date,last_date),
                                             #       RangeQuery("status",201,300,True,True),
                # TermQuery("docid",263568527),
                                                   # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_keys,MatchPhraseQuery),
                BoolQuery(should_queries=kw_should_q),
                                                   # generateBoolShouldQuery(["province"],list_province,WildcardQuery),
                                                   # RangeQuery("crtime","%s"%last_date,"%s"%current_date)
                                                   ],
                                     must_not_queries=rm_should_q)
            list_query.append({"query":_query,"limit":50000,"keyword":list_keys})
            # NOTE(review): this break limits the export to the FIRST keyword
            # row only — confirm whether that is intentional.
            break
+
def getFixData():
    """Load the manual data-cleaning workbook.

    Sheets 0 and 1 are both scanned.  Rows flagged as duplicate (是否重复)
    or cancelled (是否废标) go into dict_filter; every other row goes into
    dict_update with its corrected budget / winning price.

    :return: (dict_filter, dict_update), both keyed by docid.
    """
    filename = "C:\\Users\\Administrator\\Desktop\\视源数据清洗.xlsx"
    dict_filter = {}
    dict_update = {}
    # both sheets share the same layout, so process them in one loop
    for sheet_index in (0, 1):
        sheet = pd.read_excel(filename, sheet_index)
        for _docid, bidding_budget, win_bid_price, is_dup, is_fb in zip(
                sheet["docid"], sheet["检查招标金额"], sheet["中标金额(元)"],
                sheet["是否重复"], sheet["是否废标"]):
            if str(is_dup) in ("是", "1") or str(is_fb) in ("是", "1"):
                dict_filter[_docid] = 1
            else:
                dict_update[_docid] = {"docid": _docid,
                                       "bidding_budget": bidding_budget,
                                       "win_bid_price": win_bid_price}
    return dict_filter, dict_update
+
+
+
+        list_row = getDocument(list_query,columns)
+        list_row = filterRow(list_row)
+
+        print("list_row",len(list_row))
+
+        set_columns1 = set()
+        list_df_columns1 = []
+
+        df_data = {}
+        getRowData(df_data,list_row,dict_channel,set_columns1,list_df_columns1)
+
+        print("len df_data",len(df_data["docid"]))
+        fixContactPerson(df_data,list_df_columns1)
+
+        fixZhaobiao_page_time(df_data)
+        size_pop = popRows(df_data)
+        print("size_pop",size_pop)
+
+        df = pd.DataFrame(df_data)
+
+        filename = os.path.dirname(__file__)+"/data/%s关键词数据导出数量%d_%s.xlsx"%(current_date,len(list_row)-size_pop,getCurrent_date("%Y-%m-%d_%H%M%S"))
+        df.to_excel(filename,columns=list_df_columns1,index=False)
+
+        host = "smtp.exmail.qq.com"
+        username = "vip@bidizhaobiao.com"
+        password = "Biaoxun66-"
+        receivers = ["1175730271@qq.com","747012698@qq.com","995116318@qq.com"]
+        receivers = ["1175730271@qq.com"]
+        attachs = [filename]
+
+        sendEmail(host,username,password,receivers,attachs=attachs)
+
    def export4_by_project(self):

        def getRowData(df_data,rows,dict_channel,set_columns1,list_df_column1):
            """Flatten project-level OTS rows into the column-oriented df_data.

            Project rows carry aggregated fields (docids, doctitles,
            win_tenderer, bidding_budget, win_bid_price) directly, so no
            sub_docs_json / extract_json parsing happens here.
            NOTE(review): dict_channel and ots_client are unused (the per-doc
            extract lookup is commented out); docid_0 is computed but never
            used — the try/except only serves to skip rows whose docids value
            cannot be split.
            """
            # dict_filter,dict_update = getFixData()
            ots_client = getConnect_ots()
            _index = 0
            for row in rows:


                _index += 1
                item = {}
                _dict = row
                keys = row["keyword"]
                log("================")
                log(keys)
                docid = row.get("docids","")
                log(docid)
                try:
                    docid_0 = docid.split(",")[0]
                except Exception as e:
                    continue
                log(docid)
                log(_dict.get("doctitle","")+_dict.get("doctextcon","")+_dict.get("attachmenttextcon",""))
                # keep only the subscription keywords that actually occur in
                # title / body / attachment text
                _keyword = str(list(set([ i for i in re.findall("|".join([re.escape(i) for i in keys if i!='']),_dict.get("doctitle","")+_dict.get("doctextcon","")+_dict.get("attachmenttextcon","")) if i!='' ])))
                log(_keyword)
                product_attrs = [{}]
                # for doc in docid.split(","):
                #     _q = BoolQuery(must_queries=[TermQuery("docid",int(doc))])
                #     r,n,t,_ = ots_client.search("document","document_index",SearchQuery(_q),columns_to_get=ColumnsToGet(["extract_json",ColumnReturnType.SPECIFIED]))
                #     r = getRow_ots(r)
                #     if len(r)>0:
                #         _extract = json.loads(r[0].get("extract_json","{}"))
                #         if len(set(keys)&set(["AR一体机","一体机","短焦投影仪","教学一体机","云桌面","录播","电子书包","电子班牌","智慧黑板"]))>0:
                #             product_attrs = _extract.get("product_attrs",{"data":[{}]}).get("data")
                #         else:
                #             product_attrs = [{}]
                #         if len(product_attrs)==0:
                #             product_attrs = [{}]
                # win_tenderer = ""
                # bidding_budget = ""
                # win_bid_price = ""
                # sub_docs_json = json.loads(_dict.get("sub_docs_json","[]"))
                # for _doc in sub_docs_json:
                #     if win_tenderer=="":
                #         win_tenderer = _doc.get("win_tenderer","")
                #     if bidding_budget=="":
                #         bidding_budget= _doc.get("bidding_budget","")
                #     if win_bid_price=="":
                #         win_bid_price = _doc.get("win_bid_price","")
                _index1 = 0
                # if _dict.get("docid") in dict_filter:
                #     print("====filter")
                #     continue
                # if _dict.get("docid") in dict_update:
                #     print("====update")
                #     win_bid_price = dict_update[_dict.get("docid")].get("win_bid_price","")
                #     bidding_budget= dict_update[_dict.get("docid")].get("bidding_budget","")
                for _attrs in product_attrs:
                    _index1 += 1
                    # "运费" (freight) items: blank the first one, skip the rest
                    if re.search("运费",_attrs.get("product","")) is not None:
                        if _index1==1:
                            _attrs = {}
                        else:
                            continue
                    set_dict_item_columns(set_columns1,list_df_column1,item,"序号",_index)
                    # set_dict_item_columns(set_columns1,list_df_column1,item,"正文",_dict.get("doctextcon",""))
                    # set_dict_item_columns(set_columns1,list_df_column1,item,"附件",_dict.get("attachmenttextcon",""))
                    # set_dict_item_columns(set_columns1,list_df_column1,item,"项目名称",_dict.get("project_name",""))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"docid",docid)
                    set_dict_item_columns(set_columns1,list_df_column1,item,"网站链接",'=HYPERLINK("http://www.bidizhaobiao.com/info-%s.html","查看公告")'%(str(docid)))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"省份",_dict.get("province"))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"城市",_dict.get("city"))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"地区",_dict.get("district"))

                    set_dict_item_columns(set_columns1,list_df_column1,item,"公告时间",_dict.get("page_time"))

                    set_dict_item_columns(set_columns1,list_df_column1,item,"招标公告时间",_dict.get("zhao_biao_page_time"))

                    set_dict_item_columns(set_columns1,list_df_column1,item,"项目标题",_dict.get("doctitles"))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"采购单位",_dict.get("tenderee"))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标供应商",_dict.get("win_tenderer"))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"采购方式",_dict.get("bidway",""))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"招标采购方式","")

                    # left empty here; backfilled later by fixContactPerson
                    set_dict_item_columns(set_columns1,list_df_column1,item,"采购系统","")


                    set_dict_item_columns(set_columns1,list_df_column1,item,"产品名称",_attrs.get("product"))

                    set_dict_item_columns(set_columns1,list_df_column1,item,"品牌",_attrs.get("brand",""))

                    # set_dict_item_columns(set_columns1,list_df_column1,item,"规格型号",_attrs.get("specs",""))
                    set_dict_item_columns(set_columns1,list_df_column1,item,"预算金额(元)",_dict.get("bidding_budget"))
                    # set_dict_item_columns(set_columns1,list_df_column1,item,"招标预算金额(元)","")
                    _quantity = ""
                    _uniPrice = ""
                    try:
                        # quantity: digits only; zero or unparsable becomes ""
                        _quantity = int(re.sub("[^\d]","",_attrs.get("quantity","0")))
                        if _quantity==0:
                            _quantity = ""
                    except Exception as e:
                        pass
                    try:
                        # unit price: zero or unparsable becomes ""
                        _uniPrice = float(_attrs.get("unitPrice","0"))
                        if _uniPrice==0:
                            _uniPrice = ""
                    except Exception as e:
                        pass
                    set_dict_item_columns(set_columns1,list_df_column1,item,"数量",_quantity)
                    set_dict_item_columns(set_columns1,list_df_column1,item,"单价(元)",_uniPrice)
                    sumPrice = ""
                    if _quantity!="" and _uniPrice!="":
                        sumPrice = _quantity*_uniPrice
                    set_dict_item_columns(set_columns1,list_df_column1,item,"总价(元)",sumPrice)


                    set_dict_item_columns(set_columns1,list_df_column1,item,"中标金额(元)",_dict.get("win_bid_price"))

                    set_dict_item_columns(set_columns1,list_df_column1,item,"关键词",_keyword)




                    # append this flattened row into the column-oriented dict
                    for k,v in item.items():
                        if k not in df_data:
                            df_data[k] = []
                        df_data[k].append(v)
def filterRow(list_row,key="docid"):
    """Deduplicate list_row on *key*, keeping first occurrences in order.

    Rows without the key all share the sentinel "" and therefore collapse
    into the first such row.
    """
    seen_values = set()
    kept = []
    for entry in list_row:
        value = entry.get(key, "")
        if value in seen_values:
            continue
        seen_values.add(value)
        kept.append(entry)
    return kept
+
        def fixContactPerson(df_data,list_df_columns,get_legal_person=False):
            """Fill the 采购系统 column from each tenderee's procurement_system.

            Same shape as the helper of the same name in export4: one batched
            getDictEnterprise() lookup, then a per-row rewrite.
            NOTE(review): list_df_columns and get_legal_person are unused; the
            contact-fixing logic this function is named after is commented out.
            """
            set_enterprise = set()
            if len(df_data.keys())>0:
                # collect every distinct tenderee / winner name
                for _tenderee,_win_tenderer in zip(df_data["采购单位"],df_data["中标供应商"]):
                    set_enterprise.add(_tenderee)
                    set_enterprise.add(_win_tenderer)
                # drop placeholder names before querying
                if "" in set_enterprise:
                    set_enterprise.remove("")
                if None in set_enterprise:
                    set_enterprise.remove(None)
                # one batched lookup instead of one query per row
                dict_enterprise = getDictEnterprise(list(set_enterprise),columns_to_get = ["procurement_system"])

                # conn = getConnection_oracle()
                # cursor = conn.cursor()
                if len(set_enterprise)>0:
                    for _i in range(len(df_data["采购单位"])):
                        _enterprise_name = df_data["采购单位"][_i]
                        # if df_data["招标联系人电话"][_i]=="":
                        #     contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
                        #     if contacts is not None:
                        #         _person,_phone = getOneContact(contacts)
                        #         df_data["招标联系人"][_i] = _person
                        #         df_data["招标联系人电话"][_i] = _phone


                        df_data["采购系统"][_i] = dict_enterprise.get(_enterprise_name,{}).get("procurement_system","")
+
        def fixZhaobiao_page_time(df_data):
            """Backfill 招标公告时间/招标采购方式/预算金额 from the project2 table.

            One OTS search per row, keyed by the row's docid against project2's
            docids field.  The project budget replaces an empty/zero 预算金额(元)
            only when it is >= the row's 中标金额(元).
            NOTE(review): float(...) assumes the amount columns hold numeric
            strings — confirm upstream formatting; duplicate of the helper in
            export4 — consider sharing one implementation.
            """
            ots_client = getConnect_ots()

            for i in range(len(df_data["docid"])):
                _docid = df_data["docid"][i]
                bool_query = BoolQuery(must_queries=[TermQuery("docids",_docid)])

                rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
                                                                               SearchQuery(bool_query,limit=1),
                                                                               ColumnsToGet(["zhao_biao_page_time","bidway","bidding_budget"],ColumnReturnType.SPECIFIED))
                list_data = getRow_ots(rows)
                if len(list_data)>0:
                    df_data["招标公告时间"][i] = list_data[0].get("zhao_biao_page_time","")
                    df_data["招标采购方式"][i] = list_data[0].get("bidway","")
                    new_bidding_budget = list_data[0].get("bidding_budget",0)
                    # adopt the project budget only for empty/zero rows, and only
                    # when it is not smaller than the winning price
                    if df_data["预算金额(元)"][i]=="" or float(df_data["预算金额(元)"][i])==0:
                        if df_data["中标金额(元)"][i]!="" and new_bidding_budget>=float(df_data["中标金额(元)"][i]):
                            df_data["预算金额(元)"][i] = new_bidding_budget
                    # df_data["招标预算金额(元)"][i] = list_data[0].get("bidding_budget","")
+
def popRows(df_data):
    """Remove non-education / price-less rows in place and renumber 序号.

    A row is kept only when its 采购系统 is "教育系统" AND its 中标金额(元)
    is non-empty.  Afterwards 序号 is rewritten so rows with an equal docid
    share one running number.

    :return: count of distinct docids that were removed.
    """
    doomed = [i for i, sys_name in enumerate(df_data["采购系统"])
              if sys_name != "教育系统" or df_data["中标金额(元)"][i] == ""]
    dropped_ids = {df_data["docid"][i] for i in doomed}
    for values in df_data.values():
        # pop from the back so the remaining indexes stay correct
        for i in reversed(doomed):
            values.pop(i)
    counter = 0
    numbered = set()
    for i, current_id in enumerate(df_data["docid"]):
        if current_id not in numbered:
            counter += 1
            numbered.add(current_id)
        df_data["序号"][i] = counter
    return len(dropped_ids)
+
+        # --- Build OTS search queries from a keyword spreadsheet (sheet index 1) ---
+        # NOTE(review): the `sheetname` keyword was removed from pandas.read_excel
+        # (pandas >= 0.25 uses `sheet_name`) — confirm the pinned pandas version,
+        # otherwise this line raises TypeError.
+        df = pd.read_excel("20220927v1.4.xlsx",sheetname=1)
+
+        list_search_dict = []
+        for title_kw,content_kw,title_rm,content_rm in zip(df["标题采集关键词"],df["全文采集关键词"],df["标题排除词"],df["全文排除词"]):
+            # Empty spreadsheet cells come back as NaN (float); normalize to "".
+            if not isinstance(title_kw,str):
+                title_kw = ""
+            if not isinstance(content_kw,str):
+                content_kw = ""
+            if not isinstance(title_rm,str):
+                title_rm = ""
+            if not isinstance(content_rm,str):
+                content_rm = ""
+            # Keyword cells may be separated by whitespace, ASCII or full-width commas.
+            _dict = {"title_kw":re.split("\s|,|,",str(title_kw)),
+                     "content_kw":re.split("\s|,|,",str(content_kw)),
+                     "title_rm":re.split("\s|,|,",str(title_rm)),
+                     "content_rm":re.split("\s|,|,",str(content_rm))}
+            list_search_dict.append(_dict)
+
+
+        columns = ["docids","doctitles","product","province","bidway","city","district","zhao_biao_page_time","page_time","industry","info_type","tenderee","bidding_budget","project_code","project_name","win_tenderer","win_bid_price","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
+
+        current_date = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())))
+        last_date = timeAdd(current_date,-1)
+
+        # NOTE(review): hard-coded override of the dates computed above. Here
+        # current_date (2022-10-14) is EARLIER than last_date (2023-05-07), so the
+        # RangeQuery below spans current_date..last_date — the variable names are
+        # swapped relative to their usual meaning. Confirm this is intentional.
+        current_date = "2022-10-14"
+        last_date = "2023-05-07"
+
+
+        dict_channel = getDict_docchannel()
+        list_mq = []
+        list_query = []
+        # One query per spreadsheet row: keyword matches go into a should-group,
+        # exclusion words into must_not.
+        for _d in list_search_dict:
+            print(_d)
+            kw_should_q = []
+            list_keys = []
+            title_kw = _d.get("title_kw",[])
+            if len(title_kw)>0:
+                kw_should_q.append(generateBoolShouldQuery(["doctitles"],title_kw,MatchPhraseQuery))
+                list_keys.extend(title_kw)
+            content_kw = _d.get("content_kw",[])
+            if len(content_kw)>0:
+                kw_should_q.append(generateBoolShouldQuery(["doctextcon","attachmenttextcon"],content_kw,MatchPhraseQuery))
+                list_keys.extend(content_kw)
+
+            rm_should_q = []
+            title_rm = _d.get("title_rm",[])
+            if len(title_rm)>0:
+                rm_should_q.append(generateBoolShouldQuery(["doctitles"],title_rm,MatchPhraseQuery))
+            content_rm = _d.get("content_rm",[])
+            if len(content_rm)>0:
+                rm_should_q.append(generateBoolShouldQuery(["doctextcon","attachmenttextcon"],content_rm,MatchPhraseQuery))
+
+            _query = BoolQuery(must_queries=[
+                # generateBoolShouldQuery(["docchannel"],[101],TermQuery),
+                # RangeQuery("page_time",last_date,current_date),
+                RangeQuery("page_time",current_date,last_date),
+                RangeQuery("status",201,300,True,True),
+                # TermQuery("docid",263568527),
+                # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_keys,MatchPhraseQuery),
+                BoolQuery(should_queries=kw_should_q),
+                # # generateBoolShouldQuery(["province"],list_province,WildcardQuery),
+                # # RangeQuery("crtime","%s"%last_date,"%s"%current_date)
+            ],
+                must_not_queries=rm_should_q)
+            list_query.append({"query":_query,"limit":50000,"keyword":list_keys})
+        def getFixData():
+            # Read a manual data-cleaning spreadsheet (two sheets with identical
+            # layout) and split docids into:
+            #   dict_filter  - docids flagged duplicate ("是否重复") or void ("是否废标")
+            #   dict_update  - corrected budget / win-price values per docid
+            # Returns (dict_filter, dict_update).
+            # NOTE(review): Windows-specific absolute path — breaks on other hosts.
+            filename = "C:\\Users\\Administrator\\Desktop\\视源数据清洗.xlsx"
+            dict_filter = {}
+            dict_update = {}
+            df = pd.read_excel(filename,0)
+            for _docid,bidding_budget,win_bid_price,is_dup,is_fb in zip(df["docid"],df["检查招标金额"],df["中标金额(元)"],df["是否重复"],df["是否废标"]):
+                # Flags may be recorded either as "是" or as 1.
+                if str(is_dup) in ("是","1") or str(is_fb) in ("是","1"):
+                    dict_filter[_docid] = 1
+                else:
+                    dict_update[_docid] = {"docid":_docid,
+                                           "bidding_budget":bidding_budget,
+                                           "win_bid_price":win_bid_price}
+            # Second sheet: same layout, same handling.
+            df = pd.read_excel(filename,1)
+            for _docid,bidding_budget,win_bid_price,is_dup,is_fb in zip(df["docid"],df["检查招标金额"],df["中标金额(元)"],df["是否重复"],df["是否废标"]):
+                if str(is_dup) in ("是","1") or str(is_fb) in ("是","1"):
+                    dict_filter[_docid] = 1
+                else:
+                    dict_update[_docid] = {"docid":_docid,
+                                           "bidding_budget":bidding_budget,
+                                           "win_bid_price":win_bid_price}
+            return dict_filter,dict_update
+
+
+
+        # --- Fetch, dedupe, post-process, export to Excel and mail the result ---
+        list_row = getDocument(list_query,columns,table_name="project2",table_index="project2_index")
+        # Dedupe fetched rows on the project uuid.
+        list_row = filterRow(list_row,key="uuid")
+
+        print("list_row",len(list_row))
+
+        set_columns1 = set()
+        list_df_columns1 = []
+
+        df_data = {}
+        getRowData(df_data,list_row,dict_channel,set_columns1,list_df_columns1)
+
+        print("len df_data",len(df_data["docid"]))
+        fixContactPerson(df_data,list_df_columns1)
+
+        # Row filtering currently disabled; size_pop kept for the filename count.
+        # fixZhaobiao_page_time(df_data)
+        # size_pop = popRows(df_data)
+        size_pop = 0
+        print("size_pop",size_pop)
+
+        df = pd.DataFrame(df_data)
+
+        filename = os.path.dirname(__file__)+"/data/%s关键词数据导出数量%d_%s.xlsx"%(current_date,len(list_row)-size_pop,getCurrent_date("%Y-%m-%d_%H%M%S"))
+        df.to_excel(filename,columns=list_df_columns1,index=False)
+
+        # NOTE(review): SMTP credentials hard-coded in source — move to config/env.
+        host = "smtp.exmail.qq.com"
+        username = "vip@bidizhaobiao.com"
+        password = "Biaoxun66-"
+        # The second assignment narrows delivery to a single test address.
+        receivers = ["1175730271@qq.com","747012698@qq.com","995116318@qq.com"]
+        receivers = ["1175730271@qq.com"]
+        attachs = [filename]
+
+        sendEmail(host,username,password,receivers,attachs=attachs)
+
+    import json
+    def getUserInfo(self,code,userid):
+        # Query the unite authorization service for a user's info.
+        # Returns (status_code, parsed JSON dict) on HTTP 200, else (status_code, "").
+        # NOTE(review): long-lived Bearer token embedded in source — rotate and
+        # move to secure configuration.
+
+        data = {
+            "codeStr": code,
+            "userId": userid,
+        }
+
+        auth = {'Content-Type': 'application/json',"Authorization": "Bearer eyJhbGciOiJSUzI1NiJ9.eyJpc3MiOiJCWEtDX1VOSVRFX0FQSSIsInN1YiI6IjIwIiwiZGF0YSI6eyJkZXZpY2VUeXBlIjoiMjAiLCJyb2xlIjoiMjAiLCJpZCI6ImJ4a2Mtd29ya2JlbmNoIn0sImV4cCI6MTk2NjY5MjE4MSwiaWF0IjoxNjUxMDQ0MTgxfQ.lFurpoMOaVmCH3GKmJ_cqrseuSaEZJBndp8PE4QpjFY6R2mfXh5e96zn_Ma3qVkp-NKAlpA1nRYl1y08xkzj07KEx4HO_Nh6v3sfGwnDyRUz35SW3yr1fhvnbh5hEpnoCVJnFQfoMXFfn780VuOcKd01lNUjABFbSLvypDv8p-gJzkrE7z5YB53tZ_lGm_dbzKihTBjG1sBYEKJT3ekYz5Px_n-lw05IkUdbckE7n6Xj7PRPTaEjzoc3PF_pgESsdcTPSDhJlLR8x63YCqbmWy6ydhwnlEQE5rHIFA2Q5VhIxUcT7fXUBZNGmzRgaqxB4dIg2369IhpeBAJ2TQ63qg"}
+
+        r = requests.patch("https://unite.bidizhaobiao.com/api/v1/authorization/services",json=data,headers=auth)
+        if r.status_code==200:
+            return r.status_code,json.loads(r.content.decode("utf8"))
+        return r.status_code,""
+
+
+def export_15824381998():
+    # Scheduled export job for customer 15824381998: pulls tender / forecast /
+    # win-bid data matching configured keywords, writes Excel workbooks and
+    # mails them to the configured recipients.
+    def removeData(df_data):
+        # Remove rows whose 招标人采购系统 is "企业采购系统". Mutates df_data in place.
+        list_remove_index = []
+        list_c = df_data.get("招标人采购系统",[])
+        for _c_i in range(len(list_c)):
+            if list_c[_c_i]=="企业采购系统":
+                list_remove_index.append(_c_i)
+        # Pop from the end so earlier indices stay valid.
+        list_remove_index.reverse()
+        print(list_remove_index)
+        for k,v in df_data.items():
+            for _rc in list_remove_index:
+                v.pop(_rc)
+        # NOTE(review): this loop is a no-op — `v = v[:500]` rebinds the local
+        # name only and never truncates the lists stored in df_data.
+        for k,v in df_data.items():
+            v = v[:500]
+
+    def getRowData_zb(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
+        # Flatten tender-announcement rows into df_data (dict of column -> list)
+        # and return the ordered list of column names for the Excel sheet.
+        # NOTE(review): set_line, list_keyword and dumplicate are not used in the
+        # active code path — presumably kept for signature parity with siblings.
+
+        dict_line = {}
+        # list_data = getRow_ots(rows)
+        _index = 0
+        # Newest documents (largest docid) first.
+        rows.sort(key=lambda x:x.get("docid",0),reverse=True)
+        set_col = set()
+        df_columns = []
+        for row in rows:
+            _index += 1
+            item = {}
+            _dict = row
+            bidding_budget = 0
+            # Take the last positive budget found among the sub-documents.
+            sub_docs = json.loads(_dict.get("sub_docs_json","[]"))
+            for doc in sub_docs:
+                if doc.get("bidding_budget",0)>0:
+                    bidding_budget = doc.get("bidding_budget",0)
+            set_dict_item_columns(set_col,df_columns,item,"省份",_dict.get("province",""))
+            set_dict_item_columns(set_col,df_columns,item,"城市",_dict.get("city",""))
+            set_dict_item_columns(set_col,df_columns,item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
+            set_dict_item_columns(set_col,df_columns,item,"发布时间",_dict.get("page_time",""))
+            set_dict_item_columns(set_col,df_columns,item,"公告标题",_dict.get("doctitle",""))
+            set_dict_item_columns(set_col,df_columns,item,"采购内容",_dict.get("product",""))
+            set_dict_item_columns(set_col,df_columns,item,"采购方式",_dict.get("bidway",""))
+            set_dict_item_columns(set_col,df_columns,item,"项目金额",bidding_budget)
+            set_dict_item_columns(set_col,df_columns,item,"招标单位",_dict.get("tenderee",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人",_dict.get("tenderee_contact",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人电话",_dict.get("tenderee_phone",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理单位",_dict.get("agency",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人",_dict.get("agency_contact",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人电话",_dict.get("agency_phone",""))
+            set_dict_item_columns(set_col,df_columns,item,"获取文件截止时间",_dict.get("time_get_file_end",""))
+            set_dict_item_columns(set_col,df_columns,item,"开标时间",_dict.get("time_bidopen",""))
+            set_dict_item_columns(set_col,df_columns,item,"链接","http://www.bidizhaobiao.com/info-%s.html"%str(_dict.get("docid","")))
+
+            # Append this row's values into the per-column lists.
+            for k,v in item.items():
+                if k not in df_data:
+                    df_data[k] = []
+                df_data[k].append(v)
+        return df_columns
+
+    def getRowData_yc(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
+        # Flatten project-forecast (preproject) rows into df_data and return the
+        # ordered column list. Same shape/contract as getRowData_zb.
+
+        dict_line = {}
+        # list_data = getRow_ots(rows)
+        _index = 0
+        rows.sort(key=lambda x:x.get("docid",0),reverse=True)
+        set_col = set()
+        df_columns = []
+        for row in rows:
+            _index += 1
+            item = {}
+            _dict = row
+            # Map the numeric forecast type code to its display label.
+            _type = _dict.get("type",0)
+            if _type==0:
+                _type = "周期预测"
+            elif _type==1:
+                _type = "采购意向"
+            elif _type==2:
+                _type = "到期预测"
+            else:
+                _type = "废标重招"
+
+            set_dict_item_columns(set_col,df_columns,item,"省份",_dict.get("province",""))
+            set_dict_item_columns(set_col,df_columns,item,"城市",_dict.get("city",""))
+            set_dict_item_columns(set_col,df_columns,item,"预测类型",_type)
+            set_dict_item_columns(set_col,df_columns,item,"预计开始时间",_dict.get("may_begin",""))
+            set_dict_item_columns(set_col,df_columns,item,"预计结束时间",_dict.get("may_end",""))
+
+            set_dict_item_columns(set_col,df_columns,item,"采购内容",_dict.get("product",""))
+            set_dict_item_columns(set_col,df_columns,item,"上次采购时间",_dict.get("last_page_time",""))
+
+            set_dict_item_columns(set_col,df_columns,item,"招标单位",_dict.get("tenderee",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人",_dict.get("tenderee_contact",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人电话",_dict.get("tenderee_phone",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理单位",_dict.get("agency",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人",_dict.get("agency_contact",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人电话",_dict.get("agency_phone",""))
+
+            # Append this row's values into the per-column lists.
+            for k,v in item.items():
+                if k not in df_data:
+                    df_data[k] = []
+                df_data[k].append(v)
+        return df_columns
+
+    def getRowData_win(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
+        # Flatten win-bid rows into df_data and return the ordered column list.
+        # Winner / runner-up / third-place names and prices are joined into one
+        # multi-line cell each ("中标公司" / "中标金额").
+
+        dict_line = {}
+        # list_data = getRow_ots(rows)
+        _index = 0
+        rows.sort(key=lambda x:x.get("docid",0),reverse=True)
+        set_col = set()
+        df_columns = []
+        for row in rows:
+            _index += 1
+            item = {}
+            _dict = row
+            bidding_budget = 0
+            sub_docs = json.loads(_dict.get("sub_docs_json","[]"))
+            zbr = ""
+            zbje = ""
+            # Collect budget and the top-three tenderers across all sub-documents;
+            # entries are separated by CRLF inside a single cell.
+            for doc in sub_docs:
+                if doc.get("bidding_budget",0)>0:
+                    bidding_budget = doc.get("bidding_budget",0)
+                if doc.get("win_tenderer") is not None:
+                    zbr += str(doc.get("win_tenderer"))+"\r\n"
+                    zbje += str(doc.get("win_bid_price",0))+"\r\n"
+                if doc.get("second_tenderer") is not None:
+                    zbr += str(doc.get("second_tenderer"))+"\r\n"
+                    zbje += str(doc.get("second_bid_price",0))+"\r\n"
+                if doc.get("third_tenderer") is not None:
+                    zbr += str(doc.get("third_tenderer"))+"\r\n"
+                    zbje += str(doc.get("third_bid_price",0))+"\r\n"
+
+            set_dict_item_columns(set_col,df_columns,item,"省份",_dict.get("province",""))
+            set_dict_item_columns(set_col,df_columns,item,"城市",_dict.get("city",""))
+            set_dict_item_columns(set_col,df_columns,item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
+            set_dict_item_columns(set_col,df_columns,item,"发布时间",_dict.get("page_time",""))
+            set_dict_item_columns(set_col,df_columns,item,"公告标题",_dict.get("doctitle",""))
+            set_dict_item_columns(set_col,df_columns,item,"采购内容",_dict.get("product",""))
+            set_dict_item_columns(set_col,df_columns,item,"采购方式",_dict.get("bidway",""))
+            set_dict_item_columns(set_col,df_columns,item,"项目金额",bidding_budget)
+            set_dict_item_columns(set_col,df_columns,item,"招标单位",_dict.get("tenderee",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人",_dict.get("tenderee_contact",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人电话",_dict.get("tenderee_phone",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理单位",_dict.get("agency",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人",_dict.get("agency_contact",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人电话",_dict.get("agency_phone",""))
+            set_dict_item_columns(set_col,df_columns,item,"获取文件截止时间",_dict.get("time_get_file_end",""))
+            set_dict_item_columns(set_col,df_columns,item,"开标时间",_dict.get("time_bidopen",""))
+            set_dict_item_columns(set_col,df_columns,item,"中标公司",zbr)
+            set_dict_item_columns(set_col,df_columns,item,"中标金额",zbje)
+            set_dict_item_columns(set_col,df_columns,item,"链接","http://www.bidizhaobiao.com/info-%s.html"%str(_dict.get("docid","")))
+
+            # Append this row's values into the per-column lists.
+            for k,v in item.items():
+                if k not in df_data:
+                    df_data[k] = []
+                df_data[k].append(v)
+        return df_columns
+
+    def fixContactPerson(df_data,list_df_columns,get_legal_person=False):
+        # Fill missing tenderee / agency contact person and phone from the
+        # enterprise knowledge base. Mutates df_data in place.
+        # NOTE(review): get_legal_person and list_df_columns are only referenced
+        # in the commented-out win-tenderer branch below — currently unused.
+        set_enterprise = set()
+        if len(df_data.keys())>0:
+            # Collect every distinct tenderee/agency name for one batch lookup.
+            for _tenderee,_agency in zip(df_data["招标单位"],df_data["代理单位"]):
+                set_enterprise.add(_tenderee)
+                set_enterprise.add(_agency)
+            if "" in set_enterprise:
+                set_enterprise.remove("")
+            if None in set_enterprise:
+                set_enterprise.remove(None)
+            dict_enterprise = getDictEnterprise(list(set_enterprise),columns_to_get = ["procurement_system","company_org_type","reg_capital","actual_capital","contacts","estiblish_time","social_staff_num","zhong_biao_number","tou_biao_number","credit_code","legal_person_name","phone_number"])
+
+            # conn = getConnection_oracle()
+            # cursor = conn.cursor()
+            if len(set_enterprise)>0:
+                for _i in range(len(df_data["招标单位"])):
+                    # Backfill tenderee contact only when the phone cell is empty.
+                    _enterprise_name = df_data["招标单位"][_i]
+                    if df_data["招标联系人电话"][_i]=="":
+                        contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
+                        if contacts is not None:
+                            _person,_phone = getOneContact(contacts)
+                            df_data["招标联系人"][_i] = _person
+                            df_data["招标联系人电话"][_i] = _phone
+
+
+                    # Same backfill for the agency columns.
+                    _enterprise_name = df_data["代理单位"][_i]
+                    if df_data["代理联系人电话"][_i]=="":
+                        contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
+                        if contacts is not None:
+                            _person,_phone = getOneContact(contacts)
+                            df_data["代理联系人"][_i] = _person
+                            df_data["代理联系人电话"][_i] = _phone
+
+                    # _enterprise_name = df_data["中标单位"][_i]
+                    # if get_legal_person:
+                    #     _person = dict_enterprise.get(_enterprise_name,{}).get("legal_person_name","")
+                    #     _phone = dict_enterprise.get(_enterprise_name,{}).get("phone_number","")
+                    #     if len(_phone)==11 and _phone[0]=="1":
+                    #         df_data["中标单位联系人"][_i] = _person
+                    #         df_data["中标单位联系电话"][_i] = _phone
+                    # else:
+                    #     if df_data["中标单位联系电话"][_i]=="":
+                    #         contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
+                    #         if contacts is not None:
+                    #             _person,_phone = getOneContact(contacts,mobile_only=True)
+                    #             df_data["中标单位联系人"][_i] = _person
+                    #             df_data["中标单位联系电话"][_i] = _phone
+
+    log("start export 15824381998:>>>>>>>>>>>>>>>")
+    current_date = getCurrent_date(format="%Y-%m-%d")
+    last_date = timeAdd(current_date,-2)
+    start_time = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())-7*24*60*60))
+    # NOTE(review): hard-coded expiry gate — the export silently stops running
+    # after 2025-05-01. Confirm this cutoff is intentional.
+    if current_date<="2025-05-01":
+        # Up to 10 attempts; break after the first complete success,
+        # any exception is logged and the whole export is retried.
+        for i in range(10):
+            try:
+                # start_time='2022-07-22'
+                # current_date = '2022-07-28'
+                log("start exporting export2:=================")
+                # columns = ["doctitle","doctextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose"]
+
+                dict_channel = getDict_docchannel()
+
+                list_query = []
+
+                # Include / exclude keyword lists, split on whitespace and
+                # Chinese/ASCII separators.
+                str_keyword = '''
+
+光伏组件
+
+                '''
+                list_keyword = splitIntoList(str_keyword,"[\s\n、,,]")
+
+                str_not_keyword = '''
+                清洗机器人
+                '''
+                list_not_key = splitIntoList(str_not_keyword,"[\s\n、,,]")
+
+                # NOTE(review): tenderee keyword filter is built but the query
+                # line that uses it is commented out below.
+                tenderee_keywrod = "医院、大学、高校、高中"
+                list_t_key = splitIntoList(tenderee_keywrod,"[\s\n、,,]")
+
+                log(str(list_keyword))
+                # --- Query 1: tender announcements (docchannel 51,52,102..105,114) ---
+                columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_get_file_end","time_bidopen"]
+                bool_query = BoolQuery(must_queries=[
+                    generateBoolShouldQuery(["doctitle"],list_keyword,MatchPhraseQuery),
+                    generateBoolShouldQuery(["docchannel"],[51,52,102,103,104,105,114],TermQuery),
+                    RangeQuery("crtime",last_date+" 21:00:00",current_date+" 21:00:00",True,True),
+                    RangeQuery("status",201,300,True,True),
+                    # TermQuery("procurement_system","公安系统"),
+                    # generateBoolShouldQuery(["province"],["湖南"],TermQuery),
+                    # generateBoolShouldQuery(["tenderee"],list_t_key,WildcardQuery)
+                    # generateBoolShouldQuery(["docchannel"],[101,118,119],TermQuery),
+                ],
+                    must_not_queries=[
+                        generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_not_key,MatchPhraseQuery),
+                    ]
+                )
+
+                list_row = getDocument([{"query":bool_query,"limit":500}],columns,thread_count=1)
+
+                log("get document %d rows"%len(list_row))
+
+                df_data = {}
+                set_line = set()
+                # list_row = filterRow(list_row)
+                df_columns = getRowData_zb(df_data,list_row,set_line,list_keyword,dict_channel,True)
+                fixContactPerson(df_data,df_columns)
+                df = pd.DataFrame(df_data)
+                
+                # Subset: rows whose tenderee is one of the two named groups.
+                df_data_filter = {}
+                for c in df_columns:
+                    if c not in df_data_filter:
+                        df_data_filter[c] = []
+                for tenderee_i in range(len(df_data["招标单位"])):
+                    if df_data["招标单位"][tenderee_i] in ("国家电力投资集团有限公司","中国大唐集团有限公司"):
+                        for c in df_columns:
+                            df_data_filter[c].append(df_data[c][tenderee_i])
+                df_f = pd.DataFrame(df_data_filter)
+
+                # --- Query 2: project forecasts ending within the next 60 days ---
+                time_end = timeAdd(current_date,60)
+                columns = ["project_name","type","province","city","bidding_budget","city","demand","last_page_time"]
+                bool_query = BoolQuery(must_queries=[
+                    generateBoolShouldQuery(["demand","product"],list_keyword,MatchPhraseQuery),
+                    RangeQuery("may_end",range_from=current_date),
+                    RangeQuery("may_end",range_to=time_end),
+                    # TermQuery("procurement_system","公安系统"),
+                    # generateBoolShouldQuery(["province"],["湖南"],TermQuery),
+                    # generateBoolShouldQuery(["tenderee"],list_t_key,WildcardQuery)
+                    # generateBoolShouldQuery(["docchannel"],[101,118,119],TermQuery),
+                ],
+                    must_not_queries=[
+                        generateBoolShouldQuery(["demand","product"],list_not_key,MatchPhraseQuery),
+                    ]
+                )
+
+                list_row = getDocument([{"query":bool_query,"limit":500}],columns,table_name="preproject",table_index="preproject_index",thread_count=1,sort_column="may_end")
+
+                log("get document %d rows"%len(list_row))
+
+                df_data = {}
+                set_line = set()
+                # list_row = filterRow(list_row)
+                df_columns_yc = getRowData_yc(df_data,list_row,set_line,list_keyword,dict_channel,True)
+                fixContactPerson(df_data,df_columns_yc)
+                df1 = pd.DataFrame(df_data)
+
+                df_data_filter = {}
+                for c in df_columns_yc:
+                    if c not in df_data_filter:
+                        df_data_filter[c] = []
+                for tenderee_i in range(len(df_data["招标单位"])):
+                    if df_data["招标单位"][tenderee_i] in ("国家电力投资集团有限公司","中国大唐集团有限公司"):
+                        for c in df_columns_yc:
+                            if c not in df_data_filter:
+                                df_data_filter[c] = []
+                            df_data_filter[c].append(df_data[c][tenderee_i])
+                df1_f = pd.DataFrame(df_data_filter)
+
+
+                # --- Workbook 1: tender data + forecasts, mailed to main recipients ---
+                filename = os.path.dirname(__file__)+"/data/%s年%s招标数据导出%s.xlsx"%(start_time[:4],current_date,getCurrent_date(format="%Y-%m-%d_%H%M%S"))
+                with pd.ExcelWriter(filename) as writer:
+                    df.to_excel(writer,sheet_name="招标数据",columns=df_columns if not df.empty else None)
+
+                    df_f.to_excel(writer,sheet_name="特定甲方招标数据",columns=df_columns if not df_f.empty else None)
+                    df1.to_excel(writer,sheet_name="项目预测",columns=df_columns_yc[:-3] if not df1.empty else None)
+                    df1_f.to_excel(writer,sheet_name="特定甲方项目预测",columns=df_columns_yc[:-3] if not df1_f.empty else None)
+                log(str(filename))
+
+                # NOTE(review): SMTP credentials hard-coded — move to config/env.
+                host = "smtp.exmail.qq.com"
+                username = "vip@bidizhaobiao.com"
+                password = "Biaoxun66-"
+                receivers = ["1175730271@qq.com","565748324@qq.com","1396488964@qq.com"]
+                # receivers = ["1175730271@qq.com"]
+                attachs = [filename]
+
+                sendEmail(host,username,password,receivers,attachs=attachs)
+
+
+                # --- Workbook 2: a different tenderee subset for another recipient ---
+                df_data_filter = {}
+                for c in df_columns:
+                    if c not in df_data_filter:
+                        df_data_filter[c] = []
+                for tenderee_i in range(len(df_data["招标单位"])):
+                    if df_data["招标单位"][tenderee_i] in ("国家电力投资集团有限公司","国家能源投资集团有限责任公司","国家能源集团龙源阿里新能源(阿里)有限公司"):
+                        for c in df_columns:
+                            df_data_filter[c].append(df_data[c][tenderee_i])
+                df_f = pd.DataFrame(df_data_filter)
+
+                df_data_filter1 = {}
+                for c in df_columns_yc:
+                    if c not in df_data_filter1:
+                        df_data_filter1[c] = []
+                for tenderee_i in range(len(df_data["招标单位"])):
+                    if df_data["招标单位"][tenderee_i] in ("国家电力投资集团有限公司","国家能源投资集团有限责任公司","国家能源集团龙源阿里新能源(阿里)有限公司"):
+                        for c in df_columns_yc:
+                            if c not in df_data_filter1:
+                                df_data_filter1[c] = []
+                            df_data_filter1[c].append(df_data[c][tenderee_i])
+                df1_f1 = pd.DataFrame(df_data_filter1)
+
+                filename = os.path.dirname(__file__)+"/data/%s年%s招标数据导出%s.xlsx"%(start_time[:4],current_date,getCurrent_date(format="%Y-%m-%d_%H%M%S"))
+                with pd.ExcelWriter(filename) as writer:
+                    df_f.to_excel(writer,sheet_name="特定甲方招标数据",columns=df_columns if not df_f.empty else None)
+                    # NOTE(review): the emptiness check reads df1_f but the frame
+                    # being written is df1_f1 — likely should be `df1_f1.empty`.
+                    df1_f1.to_excel(writer,sheet_name="特定甲方项目预测",columns=df_columns_yc[:-3] if not df1_f.empty else None)
+                log(str(filename))
+
+                host = "smtp.exmail.qq.com"
+                username = "vip@bidizhaobiao.com"
+                password = "Biaoxun66-"
+                receivers = ["450604061@qq.com"]
+                # receivers = ["1175730271@qq.com"]
+                attachs = [filename]
+
+                sendEmail(host,username,password,receivers,attachs=attachs)
+
+
+                # --- Query 3: win-bid announcements (docchannel 101,118..122) ---
+                columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_get_file_end","time_bidopen"]
+                bool_query = BoolQuery(must_queries=[
+                    generateBoolShouldQuery(["doctitle"],list_keyword,MatchPhraseQuery),
+                    generateBoolShouldQuery(["docchannel"],[101,118,119,120,121,122],TermQuery),
+                    RangeQuery("crtime",last_date+" 21:00:00",current_date+" 21:00:00",True,True),
+                    RangeQuery("status",201,300,True,True),
+                    # TermQuery("procurement_system","公安系统"),
+                    # generateBoolShouldQuery(["province"],["湖南"],TermQuery),
+                    # generateBoolShouldQuery(["tenderee"],list_t_key,WildcardQuery)
+                    # generateBoolShouldQuery(["docchannel"],[101,118,119],TermQuery),
+                ],
+                    must_not_queries=[
+                        generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_not_key,MatchPhraseQuery),
+                    ]
+                )
+
+                list_row = getDocument([{"query":bool_query,"limit":500}],columns,thread_count=1)
+
+                log("get document %d rows"%len(list_row))
+
+                df_data = {}
+                set_line = set()
+                # list_row = filterRow(list_row)
+                df_columns = getRowData_win(df_data,list_row,set_line,list_keyword,dict_channel,True)
+                fixContactPerson(df_data,df_columns)
+                df = pd.DataFrame(df_data)
+                # Adjust row height
+                # for i in range(1, len(df) + 1):
+                #     df.style.set_properties(max_col=100, min_col=1, max_row=i, min_row=i, h=20, wrap_text=True)  # set row height to 20, wrap text
+                # Center cell contents and keep the CRLF-joined winner cells multi-line.
+                styled_df = (df.style
+                             .set_properties(**{'text-align': 'center', 'white-space': 'pre-line'})
+                             .set_table_styles([{'selector': 'td', 'props': [('text-align', 'center')]}]))
+
+                filename = os.path.dirname(__file__)+"/data/%s年%s中标数据导出%s.xlsx"%(start_time[:4],current_date,getCurrent_date(format="%Y-%m-%d_%H%M%S"))
+                with pd.ExcelWriter(filename,engine="openpyxl") as writer:
+                    styled_df.to_excel(writer,columns = df_columns,index=False)
+
+                adjust_excel(filename,filename,columns=["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R"])
+
+                host = "smtp.exmail.qq.com"
+                username = "vip@bidizhaobiao.com"
+                password = "Biaoxun66-"
+                receivers = ["1175730271@qq.com","565748324@qq.com","1396488964@qq.com","1141385052@qq.com","1713739820@qq.com"]
+                # receivers = ["1175730271@qq.com"]
+                attachs = [filename]
+
+                sendEmail(host,username,password,receivers,attachs=attachs)
+                break
+            except Exception as e:
+                # Log and fall through to the next retry attempt.
+                traceback.print_exc()
+
+def export_13510123669():
+    # Scheduled export job for customer 13510123669 (body continues below).
+    def removeData(df_data):
+        # Remove rows whose 招标人采购系统 is "企业采购系统". Mutates df_data in place.
+        list_remove_index = []
+        list_c = df_data.get("招标人采购系统",[])
+        for _c_i in range(len(list_c)):
+            if list_c[_c_i]=="企业采购系统":
+                list_remove_index.append(_c_i)
+        # Pop from the end so earlier indices stay valid.
+        list_remove_index.reverse()
+        print(list_remove_index)
+        for k,v in df_data.items():
+            for _rc in list_remove_index:
+                v.pop(_rc)
+        # NOTE(review): this loop is a no-op — `v = v[:500]` rebinds the local
+        # name only and never truncates the lists stored in df_data.
+        for k,v in df_data.items():
+            v = v[:500]
+
+    def getRowData_zb(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
+        """Flatten announcement rows into the column-oriented df_data dict.
+
+        df_data : dict of column name -> list of cell values; filled in place.
+        rows    : list of document dicts; sorted here by docid descending.
+        dict_channel : maps docchannel id -> channel display name.
+        set_line, list_keyword, dumplicate : accepted for signature parity with
+            sibling row builders; not used in this implementation.
+        Returns df_columns, the ordered list of column names produced.
+        """
+
+        dict_line = {}
+        # list_data = getRow_ots(rows)
+        _index = 0
+        rows.sort(key=lambda x:x.get("docid",0),reverse=True)
+        set_col = set()
+        df_columns = []
+        for row in rows:
+            _index += 1
+            item = {}
+            _dict = row
+            bidding_budget = 0
+            sub_docs = json.loads(_dict.get("sub_docs_json","[]"))
+            zbr = ""
+            zbje = 0
+            win_tenderer_manager = ""
+            win_tenderer_phone = ""
+            # Pull budget and winning-bid fields out of the sub documents;
+            # later sub_docs entries overwrite earlier ones.
+            for doc in sub_docs:
+                if doc.get("bidding_budget",0)>0:
+                    bidding_budget = doc.get("bidding_budget",0)
+                if doc.get("win_tenderer") is not None:
+                    zbr = str(doc.get("win_tenderer",""))
+                    zbje = doc.get("win_bid_price",0)
+                    win_tenderer_manager = str(doc.get("win_tenderer_manager",""))
+                    win_tenderer_phone = str(doc.get("win_tenderer_phone",""))
+
+            # Classify the announcement as 中标 (result) vs 招标 (tender) by the
+            # first two characters of the channel display name.
+            doc_type = ""
+            if dict_channel.get(_dict.get("docchannel",""),"")[:2] in ("中标","合同","候选","废标","开标","验收"):
+                doc_type = "中标"
+            else:
+                doc_type = "招标"
+
+            # Column order is fixed by the call order of set_dict_item_columns.
+            set_dict_item_columns(set_col,df_columns,item,"序号",_index)
+            set_dict_item_columns(set_col,df_columns,item,"省份",_dict.get("province","未知"))
+            set_dict_item_columns(set_col,df_columns,item,"城市",_dict.get("city","未知"))
+            set_dict_item_columns(set_col,df_columns,item,"公告类型",doc_type)
+            set_dict_item_columns(set_col,df_columns,item,"公告名称",_dict.get("doctitle",""))
+            set_dict_item_columns(set_col,df_columns,item,"项目类型",'')
+            set_dict_item_columns(set_col,df_columns,item,"项目编号",_dict.get("project_code"))
+            set_dict_item_columns(set_col,df_columns,item,"发布时间",_dict.get("page_time",""))
+            set_dict_item_columns(set_col,df_columns,item,"信息类型","%s-%s"%(dict_channel.get(_dict.get("docchannel",""),""),_dict.get("bidway","")))
+
+            set_dict_item_columns(set_col,df_columns,item,"报名开始时间",_dict.get("time_registration_start",""))
+            set_dict_item_columns(set_col,df_columns,item,"报名截止时间",_dict.get("time_registration_end",""))
+            set_dict_item_columns(set_col,df_columns,item,"投标开始时间",_dict.get("time_bidstart",""))
+            set_dict_item_columns(set_col,df_columns,item,"投标结束时间",_dict.get("time_bidopen",""))
+
+            # Amounts are converted from yuan to 万元 (ten-thousands).
+            set_dict_item_columns(set_col,df_columns,item,"预算金额(万元)",bidding_budget/10000)
+            set_dict_item_columns(set_col,df_columns,item,"成交金额(万元)",zbje/10000)
+
+
+
+            # Placeholder columns ("") are filled later by fixContactPerson1.
+            set_dict_item_columns(set_col,df_columns,item,"招标单位",_dict.get("tenderee",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标单位所在地","")
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人",_dict.get("tenderee_contact",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人电话",_dict.get("tenderee_phone",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人1","")
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人电话1","")
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人2","")
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人电话2","")
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人3","")
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人电话3","")
+            set_dict_item_columns(set_col,df_columns,item,"代理单位",_dict.get("agency",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理单位所在地","")
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人",_dict.get("agency_contact",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人电话",_dict.get("agency_phone",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人1","")
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人电话1","")
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人2","")
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人电话2","")
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人3","")
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人电话3","")
+
+            set_dict_item_columns(set_col,df_columns,item,"中标单位",zbr)
+            set_dict_item_columns(set_col,df_columns,item,"中标单位所在地","")
+
+            set_dict_item_columns(set_col,df_columns,item,"项目中标联系人",win_tenderer_manager)
+            set_dict_item_columns(set_col,df_columns,item,"项目中标联系电话",win_tenderer_phone)
+            set_dict_item_columns(set_col,df_columns,item,"中标法人","")
+            set_dict_item_columns(set_col,df_columns,item,"中标法人电话","")
+            set_dict_item_columns(set_col,df_columns,item,"中标股东","")
+            set_dict_item_columns(set_col,df_columns,item,"中标股东电话","")
+            set_dict_item_columns(set_col,df_columns,item,"中标高管","")
+            set_dict_item_columns(set_col,df_columns,item,"中标高管电话","")
+            set_dict_item_columns(set_col,df_columns,item,"中标联系人1","")
+            set_dict_item_columns(set_col,df_columns,item,"中标联系人电话1","")
+            set_dict_item_columns(set_col,df_columns,item,"中标联系人2","")
+            set_dict_item_columns(set_col,df_columns,item,"中标联系人电话2","")
+            set_dict_item_columns(set_col,df_columns,item,"中标联系人3","")
+            set_dict_item_columns(set_col,df_columns,item,"中标联系人电话3","")
+
+
+            set_dict_item_columns(set_col,df_columns,item,"招标/采购内容",_dict.get("product",""))
+            set_dict_item_columns(set_col,df_columns,item,"资质","")
+            set_dict_item_columns(set_col,df_columns,item,"公告详情链接","http://www.bidizhaobiao.com/info-%s.html"%str(_dict.get("docid","")))
+            set_dict_item_columns(set_col,df_columns,item,"招投标分析(合作商)","")
+            set_dict_item_columns(set_col,df_columns,item,"历史中标单位","")
+            set_dict_item_columns(set_col,df_columns,item,"应对策略(作为填)","")
+
+            # Append this row's cells into the column-oriented df_data.
+            for k,v in item.items():
+                if k not in df_data:
+                    df_data[k] = []
+                df_data[k].append(v)
+        return df_columns
+
+    def getRowData_yc(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
+        """Flatten project-forecast ("preproject") rows into df_data columns.
+
+        df_data : dict of column name -> list of cell values; filled in place.
+        rows    : list of forecast dicts; sorted here by docid descending.
+        set_line, list_keyword, dict_channel, dumplicate : accepted for
+            signature parity with getRowData_zb; not used here.
+        Returns df_columns, the ordered list of column names produced.
+        """
+
+        dict_line = {}
+        # list_data = getRow_ots(rows)
+        _index = 0
+        rows.sort(key=lambda x:x.get("docid",0),reverse=True)
+        set_col = set()
+        df_columns = []
+        for row in rows:
+            _index += 1
+            item = {}
+            _dict = row
+            # Map the numeric forecast type code to its display label.
+            _type = _dict.get("type",0)
+            if _type==0:
+                _type = "周期预测"
+            elif _type==1:
+                _type = "采购意向"
+            elif _type==2:
+                _type = "到期预测"
+            else:
+                _type = "废标重招"
+
+
+
+            # "类型" stays empty here; the caller fills it from 项目类型 later.
+            set_dict_item_columns(set_col,df_columns,item,"省份",_dict.get("province",""))
+            set_dict_item_columns(set_col,df_columns,item,"城市",_dict.get("city",""))
+            set_dict_item_columns(set_col,df_columns,item,"类型",'')
+            set_dict_item_columns(set_col,df_columns,item,"预测类型",_type)
+            set_dict_item_columns(set_col,df_columns,item,"项目类型","")
+            set_dict_item_columns(set_col,df_columns,item,"预计开始时间",_dict.get("may_begin",""))
+            set_dict_item_columns(set_col,df_columns,item,"预计结束时间",_dict.get("may_end",""))
+
+            set_dict_item_columns(set_col,df_columns,item,"采购内容",_dict.get("product",""))
+            set_dict_item_columns(set_col,df_columns,item,"上次采购时间",_dict.get("last_page_time",""))
+
+            set_dict_item_columns(set_col,df_columns,item,"招标单位",_dict.get("tenderee",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人",_dict.get("tenderee_contact",""))
+            set_dict_item_columns(set_col,df_columns,item,"招标联系人电话",_dict.get("tenderee_phone",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理单位",_dict.get("agency",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人",_dict.get("agency_contact",""))
+            set_dict_item_columns(set_col,df_columns,item,"代理联系人电话",_dict.get("agency_phone",""))
+
+            set_dict_item_columns(set_col,df_columns,item,"招投标分析(合作商)","")
+            set_dict_item_columns(set_col,df_columns,item,"历史中标单位","")
+            set_dict_item_columns(set_col,df_columns,item,"应对策略(作为填)","")
+
+            # Append this row's cells into the column-oriented df_data.
+            for k,v in item.items():
+                if k not in df_data:
+                    df_data[k] = []
+                df_data[k].append(v)
+        return df_columns
+
+    def fixContactPerson(df_data,list_df_columns,get_legal_person=False):
+        """Fill missing tenderee/agency contact info from the enterprise store.
+
+        Looks up every distinct 招标单位/代理单位 name via getDictEnterprise,
+        then, for rows with an empty phone column, fills 联系人/联系人电话 from
+        the enterprise "contacts" record and sets 项目类型 from
+        procurement_system.  Mutates df_data in place.
+        get_legal_person is currently unused (only referenced by the
+        commented-out winner-contact block below).
+        """
+        set_enterprise = set()
+        if len(df_data.keys())>0:
+            # Gather all enterprise names once so the lookup is a single batch call.
+            for _tenderee,_agency in zip(df_data["招标单位"],df_data["代理单位"]):
+                set_enterprise.add(_tenderee)
+                set_enterprise.add(_agency)
+            if "" in set_enterprise:
+                set_enterprise.remove("")
+            if None in set_enterprise:
+                set_enterprise.remove(None)
+            dict_enterprise = getDictEnterprise(list(set_enterprise),columns_to_get = ["procurement_system","company_org_type","reg_capital","actual_capital","contacts","estiblish_time","social_staff_num","zhong_biao_number","tou_biao_number","credit_code","legal_person_name","phone_number"])
+
+            # conn = getConnection_oracle()
+            # cursor = conn.cursor()
+            if len(set_enterprise)>0:
+                for _i in range(len(df_data["招标单位"])):
+                    _enterprise_name = df_data["招标单位"][_i]
+                    # Only fill when the phone cell is empty, so scraped data wins.
+                    if df_data["招标联系人电话"][_i]=="":
+                        contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
+                        if contacts is not None:
+                            _person,_phone = getOneContact(contacts)
+                            df_data["招标联系人"][_i] = _person
+                            df_data["招标联系人电话"][_i] = _phone
+                    df_data["项目类型"][_i] = dict_enterprise.get(_enterprise_name,{}).get("procurement_system")
+
+
+                    _enterprise_name = df_data["代理单位"][_i]
+                    if df_data["代理联系人电话"][_i]=="":
+                        contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
+                        if contacts is not None:
+                            _person,_phone = getOneContact(contacts)
+                            df_data["代理联系人"][_i] = _person
+                            df_data["代理联系人电话"][_i] = _phone
+
+                    # _enterprise_name = df_data["中标单位"][_i]
+                    # if get_legal_person:
+                    #     _person = dict_enterprise.get(_enterprise_name,{}).get("legal_person_name","")
+                    #     _phone = dict_enterprise.get(_enterprise_name,{}).get("phone_number","")
+                    #     if len(_phone)==11 and _phone[0]=="1":
+                    #         df_data["中标单位联系人"][_i] = _person
+                    #         df_data["中标单位联系电话"][_i] = _phone
+                    # else:
+                    #     if df_data["中标单位联系电话"][_i]=="":
+                    #         contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
+                    #         if contacts is not None:
+                    #             _person,_phone = getOneContact(contacts,mobile_only=True)
+                    #             df_data["中标单位联系人"][_i] = _person
+                    #             df_data["中标单位联系电话"][_i] = _phone
+
+    def fixContactPerson1(df_data,list_df_columns,get_legal_person=False):
+        """Enrich tenderee/agency/winner columns from the enterprise store.
+
+        For each row fills: up to three contact-person/phone pairs for 招标
+        and 代理, the 所在地 (province-city) columns, and for 中标单位 the
+        legal person (only when the phone looks like an 11-digit mobile),
+        manager/shareholder contacts and up to three generic contacts.
+        Mutates df_data in place.  get_legal_person is currently unused.
+        """
+        set_enterprise = set()
+        if len(df_data.keys())>0:
+            # Gather all enterprise names once for a single batch lookup.
+            for _tenderee,_agency,_win in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
+                set_enterprise.add(_tenderee)
+                set_enterprise.add(_agency)
+                set_enterprise.add(_win)
+            if "" in set_enterprise:
+                set_enterprise.remove("")
+            if None in set_enterprise:
+                set_enterprise.remove(None)
+            dict_enterprise = getDictEnterprise(list(set_enterprise),columns_to_get = ["procurement_system","company_org_type","reg_capital","actual_capital","contacts","estiblish_time","social_staff_num","zhong_biao_number","tou_biao_number","credit_code","legal_person_name","phone_number","reg_location","province","city"])
+
+            # print("dict_enterprise",dict_enterprise)
+            # conn = getConnection_oracle()
+            # cursor = conn.cursor()
+            if len(set_enterprise)>0:
+                for _i in range(len(df_data["招标单位"])):
+                    _enterprise_name = df_data["招标单位"][_i]
+                    contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
+                    if contacts is not None:
+                        # At most three numbered contact columns exist per role.
+                        for _index in range(min(len(contacts),3)):
+                            contact_person,phone_no = contacts[_index].get("contact_person"),contacts[_index].get("phone_no")
+                            is_legal_person,is_manager,is_shareholder = contacts[_index].get("is_legal_person",0),contacts[_index].get("is_manager",0),contacts[_index].get("is_shareholder",0)
+                            df_data["招标联系人%s"%str(_index+1)][_i] = contact_person
+                            df_data["招标联系人电话%s"%str(_index+1)][_i] = phone_no
+                    df_data["招标单位所在地"][_i] = "%s-%s"%(dict_enterprise.get(_enterprise_name,{}).get("province",""),dict_enterprise.get(_enterprise_name,{}).get("city",""))
+                    df_data["项目类型"][_i] = dict_enterprise.get(_enterprise_name,{}).get("procurement_system")
+
+
+                    _enterprise_name = df_data["代理单位"][_i]
+                    contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
+                    if contacts is not None:
+                        for _index in range(min(len(contacts),3)):
+                            contact_person,phone_no = contacts[_index].get("contact_person"),contacts[_index].get("phone_no")
+                            is_legal_person,is_manager,is_shareholder = contacts[_index].get("is_legal_person",0),contacts[_index].get("is_manager",0),contacts[_index].get("is_shareholder",0)
+                            df_data["代理联系人%s"%str(_index+1)][_i] = contact_person
+                            df_data["代理联系人电话%s"%str(_index+1)][_i] = phone_no
+                    df_data["代理单位所在地"][_i] = "%s-%s"%(dict_enterprise.get(_enterprise_name,{}).get("province",""),dict_enterprise.get(_enterprise_name,{}).get("city",""))
+
+                    _enterprise_name = df_data["中标单位"][_i]
+                    df_data["中标单位所在地"][_i] = "%s-%s"%(dict_enterprise.get(_enterprise_name,{}).get("province",""),dict_enterprise.get(_enterprise_name,{}).get("city",""))
+                    _person = dict_enterprise.get(_enterprise_name,{}).get("legal_person_name","")
+                    _phone = dict_enterprise.get(_enterprise_name,{}).get("phone_number","")
+                    # Only expose the legal person when the phone is an
+                    # 11-digit number starting with 1 (Chinese mobile format).
+                    if len(_phone)==11 and _phone[0]=="1":
+                        df_data["中标法人"][_i] = _person
+                        df_data["中标法人电话"][_i] = _phone
+
+                    contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
+                    if contacts is not None:
+                        for _index in range(min(len(contacts),3)):
+                            contact_person,phone_no = contacts[_index].get("contact_person"),contacts[_index].get("phone_no")
+                            is_legal_person,is_manager,is_shareholder = contacts[_index].get("is_legal_person",0),contacts[_index].get("is_manager",0),contacts[_index].get("is_shareholder",0)
+
+                            # Manager/shareholder flags overwrite earlier picks,
+                            # so the last flagged contact wins.
+                            if is_manager:
+                                df_data["中标高管"][_i] = contact_person
+                                df_data["中标高管电话"][_i] = phone_no
+                            if is_shareholder:
+                                df_data["中标股东"][_i] = contact_person
+                                df_data["中标股东电话"][_i] = phone_no
+                            df_data["中标联系人%s"%str(_index+1)][_i] = contact_person
+                            df_data["中标联系人电话%s"%str(_index+1)][_i] = phone_no
+
+    # ---- main body -------------------------------------------------------
+    # NOTE(review): the log text says 15824381998 but this function is
+    # export_13510123669 -- looks like a copy/paste leftover; confirm before
+    # relying on these log lines for monitoring.
+    log("start export 15824381998:>>>>>>>>>>>>>>>")
+    current_date = getCurrent_date(format="%Y-%m-%d")
+    start_time = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())-7*24*60*60))
+    # Only run on Tuesday/Friday (weekday 1/4) and only until 2025-02-04.
+    if current_date<="2025-02-04" and datetime.datetime.now().weekday() in (1,4):
+        # Retry the whole export up to 10 times; break out on first success.
+        for i in range(10):
+            try:
+                # Crawl window start: back to the previous run -- 4 days on
+                # Tuesday (covers the weekend), 3 days on Friday.
+                if datetime.datetime.now().weekday()==1:
+                    last_date = timeAdd(current_date,-4)
+                if datetime.datetime.now().weekday() in (4,):
+
+                    last_date = timeAdd(current_date,-3)
+                # start_time='2022-07-22'
+                # current_date = '2022-07-28'
+                page_time_start = timeAdd(current_date,-7)
+                log("start exporting export2:=================")
+                # columns = ["doctitle","doctextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose"]
+
+                dict_channel = getDict_docchannel()
+
+                list_query = []
+
+                # Elder-care / nursing-equipment keyword list for this client.
+                str_keyword = '''
+
+多功能位移机,失能群体安全转移,助行机器人,行走机器人,护理机器人,大小便护理,洗浴机,洗浴机器人,吃饭机器人,养老护理,老年评估,防摔,智能护理,智慧康养
+                '''
+                list_keyword = splitIntoList(str_keyword,"[\s\n、,,]")
+
+                str_not_keyword = '''
+                清洗机器人
+                '''
+                list_not_key = splitIntoList(str_not_keyword,"[\s\n、,,]")
+
+                tenderee_keywrod = "医院、大学、高校、高中"
+                list_t_key = splitIntoList(tenderee_keywrod,"[\s\n、,,]")
+
+                log(str(list_keyword))
+                columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_get_file_end","time_bidopen","time_bidstart","time_registration_start","time_registration_end"]
+                # Documents: title keyword hit, selected channels, crawled in
+                # the [last_date 10:00, today 10:00] window, status 201-300,
+                # published within the last 7 days; exclusion keywords dropped.
+                bool_query = BoolQuery(must_queries=[
+                    generateBoolShouldQuery(["doctitle"],list_keyword,MatchPhraseQuery),
+                    generateBoolShouldQuery(["docchannel"],[51,52,102,103,105,104,114,121,122,101,119,120,118],TermQuery),
+                    RangeQuery("crtime",last_date+" 10:00:00",current_date+" 10:00:00",True,True),
+                    RangeQuery("status",201,300,True,True),
+                    RangeQuery("page_time",page_time_start)
+                    # TermQuery("procurement_system","公安系统"),
+                    # generateBoolShouldQuery(["province"],["湖南"],TermQuery),
+                    # generateBoolShouldQuery(["tenderee"],list_t_key,WildcardQuery)
+                    # generateBoolShouldQuery(["docchannel"],[101,118,119],TermQuery),
+                ],
+                    must_not_queries=[
+                        generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_not_key,MatchPhraseQuery),
+                    ]
+                )
+
+                list_row = getDocument([{"query":bool_query,"limit":500}],columns,thread_count=1)
+
+                log("get document %d rows"%len(list_row))
+
+                df_data = {}
+                set_line = set()
+                # list_row = filterRow(list_row)
+                df_columns = getRowData_zb(df_data,list_row,set_line,list_keyword,dict_channel,True)
+                fixContactPerson1(df_data,df_columns)
+                df = pd.DataFrame(df_data)
+
+                # Split rows into two sheets by 项目类型: education/research
+                # systems go to filter_11 (教育), everything else to filter_12 (政企).
+                df_data_filter_11 = {}
+                for c in df_columns:
+                    if c not in df_data_filter_11:
+                        df_data_filter_11[c] = []
+                df_data_filter_12 = {}
+                for c in df_columns:
+                    if c not in df_data_filter_12:
+                        df_data_filter_12[c] = []
+
+                for _i in range(len(df_data["项目类型"])):
+
+                    if df_data["项目类型"][_i] in ("教育系统","科研系统"):
+                        for c in df_columns:
+                            if c not in df_data_filter_11:
+                                df_data_filter_11[c] = []
+                            df_data_filter_11[c].append(df_data[c][_i])
+
+                    else:
+                        for c in df_columns:
+                            if c not in df_data_filter_12:
+                                df_data_filter_12[c] = []
+                            df_data_filter_12[c].append(df_data[c][_i])
+
+
+                df_f11 = pd.DataFrame(df_data_filter_11)
+                df_f12 = pd.DataFrame(df_data_filter_12)
+
+
+
+                # Forecast ("preproject") query: demand/product keyword hit
+                # and may_end within the next 60 days.
+                time_end = timeAdd(current_date,60)
+                columns = ["project_name","type","province","city","bidding_budget","city","demand","last_page_time"]
+                bool_query = BoolQuery(must_queries=[
+                    generateBoolShouldQuery(["demand","product"],list_keyword,MatchPhraseQuery),
+                    RangeQuery("may_end",range_from=current_date),
+                    RangeQuery("may_end",range_to=time_end),
+                    # TermQuery("procurement_system","公安系统"),
+                    # generateBoolShouldQuery(["province"],["湖南"],TermQuery),
+                    # generateBoolShouldQuery(["tenderee"],list_t_key,WildcardQuery)
+                    # generateBoolShouldQuery(["docchannel"],[101,118,119],TermQuery),
+                ],
+                    must_not_queries=[
+                        generateBoolShouldQuery(["demand","product"],list_not_key,MatchPhraseQuery),
+                    ]
+                )
+
+                list_row = getDocument([{"query":bool_query,"limit":500}],columns,table_name="preproject",table_index="preproject_index",thread_count=1,sort_column="may_end")
+
+                log("get document %d rows"%len(list_row))
+
+                df_data = {}
+                set_line = set()
+                # list_row = filterRow(list_row)
+                df_columns_yc = getRowData_yc(df_data,list_row,set_line,list_keyword,dict_channel,True)
+                fixContactPerson(df_data,df_columns_yc)
+
+
+                # Derive the 类型 column from 项目类型 for the forecast sheet.
+                for _i in range(len(df_data["项目类型"])):
+                    if df_data["项目类型"][_i] in ("政府办公室系统","财政系统","民事系统","企业采购系统","卫生系统"):
+                        df_data["类型"][_i] = "政企"
+                    elif df_data["项目类型"][_i] in ("教育系统","科研系统"):
+                        df_data["类型"][_i] = "教育"
+                    else:
+                        df_data["类型"][_i] = "其他"
+                df1 = pd.DataFrame(df_data)
+
+
+
+                # Write the three sheets, then e-mail the workbook.
+                filename = os.path.dirname(__file__)+"/data/%s年%s项目数据导出.xlsx"%(start_time[:4],current_date)
+                with pd.ExcelWriter(filename) as writer:
+                    df_f11.to_excel(writer,sheet_name="教育",columns=df_columns if not df_f11.empty else None,index=False)
+                    df_f12.to_excel(writer,sheet_name="政企",columns=df_columns if not df_f12.empty else None,index=False)
+                    df1.to_excel(writer,sheet_name="项目预测",columns=df_columns_yc if not df1.empty else None)
+                log(str(filename))
+
+                # SECURITY NOTE(review): SMTP credentials are hard-coded in
+                # source; move them to configuration/secret storage.
+                host = "smtp.exmail.qq.com"
+                username = "vip@bidizhaobiao.com"
+                password = "Biaoxun66-"
+                # receivers = ["724949655@qq.com","1396488964@qq.com"]
+                receivers = ["md47@zuowei.com"]
+
+                # receivers = ["1175730271@qq.com"]
+                attachs = [filename]
+
+                sendEmail(host,username,password,receivers,attachs=attachs)
+
+
+                break
+            except Exception as e:
+                # Best-effort retry loop: log the failure and try again.
+                traceback.print_exc()
+
+def job_export():
+    # Register all recurring export jobs on a BlockingScheduler; start()
+    # blocks the calling thread, so this is meant to be the process main loop.
     _scheduler = BlockingScheduler()
-    _scheduler.add_job(export_medicine_friday,"cron",day_of_week='fri',hour=18)
+
+    e3 = Export3()
+    # _scheduler.add_job(export_medicine_friday,"cron",hour=8)
+    # _scheduler.add_job(export_medicine_friday,"cron",second="*/1")
+    _scheduler.add_job(export2,"cron",hour=9)
+    _scheduler.add_job(export5,"cron",hour=16)
+    # _scheduler.add_job(e3.export4,"cron",hour=7)
+    _scheduler.add_job(e3.trytimes,"cron",hour=21)
+    _scheduler.add_job(e3.export3_1,"cron",hour=21)
+    _scheduler.add_job(export_15824381998,"cron",hour=21)
+    _scheduler.add_job(export_13510123669,"cron",hour=10)
     _scheduler.start()
 
+if __name__=="__main__":
+    # Manual entry point: only export_13510123669 runs; the other calls are
+    # kept commented for ad-hoc one-off runs.
+    # job_export()
+    # export_medicine_friday()
+    # export2()
+    # export_document_except()
+    # e3 = Export3()
+    # e3.export4_by_project()
+    # e3.export4()
+    # e3.trytimes()
+    # e3.export3_1()
+    # export5()
+    # export_15824381998()
+    export_13510123669()
+
 
 
-if __name__=="__main__":
-    job_medicine_friday()

+ 9 - 0
jobs/runJobs.py

@@ -0,0 +1,9 @@
+
+# Entry script: put the project root on sys.path, then start the blocking
+# export scheduler defined in jobs/exportJobs.py.
+import sys,os
+sys.path.append(os.path.join(os.path.dirname(__file__),".."))
+
+from jobs.exportJobs import job_export
+
+
+if __name__ == '__main__':
+    job_export()

+ 67 - 0
utils/ERNIE_utils.py

@@ -0,0 +1,67 @@
+
+import requests
+import json
+
+def get_access_token():
+    """
+    Fetch a Baidu AI OAuth access_token using the client_credentials grant;
+    replace the example client_id (API Key) / client_secret (Secret Key)
+    with your own.
+
+    SECURITY NOTE(review): real-looking credentials are hard-coded in this
+    URL and committed to version control; they should be rotated and loaded
+    from configuration/environment instead.
+    """
+
+    url = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=gnwVXv96An9qMYqq9eWbeNqk&client_secret=mDsRQbCPsV4N7x28LbwkhTAaLmrrDnXk"
+
+    payload = json.dumps("")
+    headers = {
+        'Content-Type': 'application/json',
+        'Accept': 'application/json'
+    }
+
+    response = requests.request("POST", url, headers=headers, data=payload)
+    # Returns the token string, or None if the response had no access_token.
+    return response.json().get("access_token")
+
+def main():
+    """Ad-hoc demo: send one hard-coded prompt to the ERNIE chat endpoint
+    and print the raw response body.
+
+    NOTE(review): uses a hard-coded access token (tokens expire ~30 days);
+    switch to get_access_token() for unattended use.
+    """
+    # _token = get_access_token()
+    _token = "24.93c9d66ffc94ffaef6c6c9d35770a5f5.2592000.1701242081.282335-37357318"
+    url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions?access_token=" + _token
+
+    payload = json.dumps({
+        "messages": [
+            {
+                "role": "user",
+                "content": '''
+                假设分类是建筑建材-建筑涂料的相关产品词“面漆”
+                请拓展其相关行业产品词,列举30个
+                '''
+            }
+        ]
+    })
+    headers = {
+        'Content-Type': 'application/json'
+    }
+
+    response = requests.request("POST", url, headers=headers, data=payload)
+
+    print(response.text)
+
+def chat(message):
+    """Send `message` as a single user turn to the ERNIE chat endpoint and
+    return the raw requests.Response (caller parses .json()/.text).
+
+    NOTE(review): same hard-coded, expiring access token as main(); the
+    message is also wrapped in a triple-quoted template, so it reaches the
+    model surrounded by newlines and leading spaces.
+    """
+    _token = "24.93c9d66ffc94ffaef6c6c9d35770a5f5.2592000.1701242081.282335-37357318"
+    url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions?access_token=" + _token
+
+    payload = json.dumps({
+        "messages": [
+            {
+                "role": "user",
+                "content": '''
+                %s
+                '''%message
+            }
+        ]
+    })
+    headers = {
+        'Content-Type': 'application/json'
+    }
+
+    response = requests.request("POST", url, headers=headers, data=payload)
+    return response
+
+if __name__ == '__main__':
+    main()

+ 12 - 1
utils/Utils.py

@@ -12,7 +12,7 @@ import os
 
 from threading import RLock
 
-from pai_tf_predict_proto import tf_predict_pb2
+# from pai_tf_predict_proto import tf_predict_pb2
 import requests
 
 import time
@@ -544,6 +544,11 @@ def getModel_word():
 # getModel_w2v()
 # getModel_word()
 
+def formatArea(area):
+    """Strip Chinese admin-division characters (省/市/区/县) from an area name.
+
+    Only applied when the name has >= 3 characters; shorter names and None
+    are returned unchanged.  NOTE(review): re.sub removes these characters
+    anywhere in the string, not only as a trailing suffix -- confirm that is
+    intended for names that contain them mid-string.
+    """
+    if area is not None and len(area)>=3:
+        return re.sub("[省市区县]","",area)
+    return area
+
 def findAllIndex(substr,wholestr):
     '''
     @summary: 找到字符串的子串的所有begin_index
@@ -915,6 +920,12 @@ def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
     _time = time.strftime(format,time.localtime())
     return _time
 
+def timeAdd(_time,days):
+    """Shift a '%Y-%m-%d' date string by `days` days and return the result.
+
+    NOTE(review): implemented as mktime + days*86400 seconds, which can land
+    on the wrong day across DST transitions in DST-observing timezones;
+    fine for fixed-offset zones (e.g. CST/UTC+8).
+    """
+    a = time.mktime(time.strptime(_time,'%Y-%m-%d'))+86400*days
+
+    _time1 = time.strftime("%Y-%m-%d",time.localtime(a))
+    return _time1
+
 def getLocation(_str):
     list_names = list(dict_name_locations.keys())
     name_pattern = "(?P<locations>%s)"%"|".join(list_names)

+ 59 - 0
utils/killthreads.py

@@ -0,0 +1,59 @@
+
+import psutil
+
+import os
+import ctypes,inspect
+import traceback
+
+def _async_raise(tid, exctype):
+    """raises the exception, performs cleanup if needed"""
+    # Asynchronously raise `exctype` inside the CPython thread identified by
+    # `tid`, via the C API PyThreadState_SetAsyncExc.
+    tid = ctypes.c_long(tid)
+    if not inspect.isclass(exctype):
+        exctype = type(exctype)
+    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
+    if res == 0:
+        # No thread state matched this id.
+        raise ValueError("invalid thread id")
+    elif res != 1:
+        # res > 1 means more than one thread state was modified: undo the
+        # change by passing NULL (None) and report failure.
+        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
+        raise SystemError("PyThreadState_SetAsyncExc failed")
+
+def stop_thread(thread):
+
+    if isinstance(thread,(int)):
+        _ident = thread
+    else:
+        _ident = thread.ident
+    print(_ident)
+    _async_raise(_ident, SystemExit)
+
+# Ad-hoc debugging script body: for a hard-coded Windows process id, try to
+# asynchronously stop every thread and print thread/time statistics.
+# NOTE(review): psutil._psutil_windows is a private, Windows-only module, and
+# the ids it returns are OS thread ids -- PyThreadState_SetAsyncExc expects
+# CPython thread idents, so stop_thread on these ids likely raises
+# "invalid thread id" rather than stopping anything; confirm before reuse.
+for pid in [124564]:
+
+    try:
+
+        proc = psutil.Process(pid)
+
+        exeFile = os.path.basename(proc.exe())
+
+        threads = psutil._psutil_windows.proc_threads(pid)
+
+        times = 0
+
+        for thread in threads:
+
+            print(thread)
+            stop_thread(thread[0])
+
+            # Remaining tuple fields are per-thread CPU times; accumulate them.
+            for timeUsed in thread[1:]:
+
+                times += timeUsed
+
+        print('='*20)
+
+        print('Exe file:', os.path.basename(proc.exe()))
+
+        print('Number of threads:', len(threads))
+
+        print('Time used:', times)
+
+    except:
+        # Best-effort diagnostics: report and continue with the next pid.
+        traceback.print_exc()

+ 1 - 0
utils/multiThread.py

@@ -58,6 +58,7 @@ class MultiThreadHandler(object):
         self.args = args
         self.kwargs = kwargs
 
+
     def run(self):
         for i in range(self.thread_count):
             th = _taskHandler(self.task_queue,self.task_handler,self.result_queue,*self.args,**self.kwargs)

Kaikkia tiedostoja ei voida näyttää, sillä liian monta tiedostoa muuttui tässä diffissä