@@ -0,0 +1,2308 @@
+#encoding:GBK
+import sys
+import os
+sys.path.append("../")
+
+import pandas as pd
+from dataSource.source import *
+import json
+from utils.multiThread import MultiThreadHandler
+import queue
+from utils.Utils import *
+from dataSource.pool import ConnectorPool
+import re
+from tablestore import *
+import traceback
+# requests and time are used below (export_attachment, export_extract_check);
+# imported explicitly here in case the wildcard imports above do not re-export them.
+import requests
+import time
+from utils.hashUtil import aesCipher
+from export.exportEnterprice import getDictEnterprise,getOneContact
+
+
+data_path = "../data/"
+
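+# The exporters below all fan work out through MultiThreadHandler and then drain
+# result_queue with an inline while/try loop. A minimal sketch of that shared drain
+# step (hypothetical helper, shown for clarity only; the functions below inline it):
+def drain_result_queue(result_queue):
+    items = []
+    while True:
+        try:
+            items.append(result_queue.get(False))
+        except queue.Empty:
+            break
+    return items
+
+
+# getCompanyTenderer: for each company in ../data/服务型客户.txt, count its winning
+# bids in neo4j, sum the win prices, collect up to 10 frequent competitors from the
+# mongodb zhongbiao_extraction collection, and write one Excel row per company.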
+def getCompanyTenderer():
+
+    def _handle(item,result_queue):
+        company = item
+        dict_result = {"company":company,"count":0,"competitor":"","project_name":""}
+        graph = getConnect_neo4j()
+        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhao_biao_id,p.zhong_biao_id"%(company)
+        finded = graph.run(cql)
+        finded_ids = json.loads(json.dumps(finded.data()))
+        dict_result["count"] = len(finded_ids)
+        mongoDB = getConnect_mongodb()
+        coll_zb = mongoDB.zhongbiao_extraction
+        if len(finded_ids)>0:
+            cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.project_name limit 3"%(company)
+            finded = graph.run(cql)
+            finded_names = json.loads(json.dumps(finded.data()))
+            list_names = [_i["p.project_name"] for _i in finded_names]
+            dict_result["project_name"] = str(list_names)
+
+            cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company)
+            finded = graph.run(cql)
+            finded_money = json.loads(json.dumps(finded.data()))
+            whole_money = 0
+            for item in finded_money:
+                if item["r.price"] is not None:
+                    whole_money += getUnifyMoney(item["r.price"])
+            dict_result["whole_money"] = str(whole_money)
+            dict_competitor = {}
+            for item in finded_ids:
+                docId = item["p.zhong_biao_id"]
+                if docId is not None:
+                    rows = coll_zb.find({"docId":docId})
+                    for row in rows:
+                        keys = ["second_tenderer","third_tenderer"]
+                        for _key in keys:
+                            if _key in row:
+                                if row[_key] not in dict_competitor:
+                                    dict_competitor[row[_key]] = 0
+                                dict_competitor[row[_key]] += 1
+            list_competitor = []
+            for _key in dict_competitor:
+                list_competitor.append([_key,dict_competitor[_key]])
+            list_competitor.sort(key=lambda x:x[1],reverse=True)
+            list_competitors = [i[0] for i in list_competitor[:10]]
+            dict_result["competitor"] = str(list_competitors)
+        result_queue.put(dict_result)
+
+    # filename = "成交客户匹配中标项目的需求.xlsx"
+    # df = pd.read_excel(filename)
+    # list_company = df["公司名字"]
+    # company = list_company[0]
+    list_company = []
+    filename = "../data/服务型客户.txt"
+    with open(filename,"r",encoding="GBK") as f:
+        while(True):
+            line = f.readline()
+            if not line:
+                break
+            list_company.append(line.strip())
+    task_queue = queue.Queue()
+    for company in list_company:
+        task_queue.put(company)
+    result_queue = queue.Queue()
+    handler = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=10)
+    handler.run()
+    list_company = []
+    list_zb = []
+    list_count = []
+    list_project = []
+    list_money = []
+    list_competitor = []
+    while(True):
+        try:
+            _result = result_queue.get(False)
+            list_company.append(_result.get("company",""))
+            list_zb.append("是" if _result.get("count",0)>0 else "否")
+            list_count.append(_result.get("count",""))
+            list_project.append(_result.get("project_name",""))
+            list_money.append(_result.get("whole_money",""))
+            list_competitor.append(_result.get("competitor",""))
+        except queue.Empty:
+            break
+    df1 = pd.DataFrame({"公司名字":list_company,"是否中标":list_zb,"中标次数":list_count,"中标项目":list_project,"中标金额":list_money,"潜在竞争对手":list_competitor})
+    df1.to_excel("%s_export.xls"%(filename),columns=["公司名字","是否中标","中标次数","中标项目","中标金额","潜在竞争对手"])
+
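+# export_count_includeKeyword: sequential Solr hit counts per 品目 keyword over a fixed
+# publishtime window; export_count_includeKeyword_multiThread below is the threaded variant.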
+def export_count_includeKeyword():
+    filename = "../data/jc001.xlsx"
+    list_name = []
+    list_count = []
+    df = pd.read_excel(filename)
+    _index = 0
+    for row in df["品目"]:
+        _name = row
+        data = solrQuery("document",{"q":'dochtmlcon:"%s"'%_name,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
+        if data is not None:
+            _count = data["response"]["numFound"]
+        else:
+            _count = 0
+        list_name.append(_name)
+        list_count.append(_count)
+        _index += 1
+        print(_index)
+    df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
+    df1.to_excel("%s_export.xls"%filename)
+
+def export_count_includeKeyword_multiThread():
+
+    def _handler(item,result_queue):
+        data = solrQuery("document",{"q":'dochtmlcon:"%s"'%item,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
+        if data is not None:
+            _count = data["response"]["numFound"]
+        else:
+            _count = 0
+        result_queue.put([item,_count])
+
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+    filename = "../data/jc001.xlsx"
+    list_name = []
+    list_count = []
+    df = pd.read_excel(filename)
+    _index = 0
+    for row in df["品目"]:
+        _name = row
+        task_queue.put(_name)
+        _index += 1
+    multHandler = MultiThreadHandler(task_queue,_handler,result_queue,thread_count=20)
+    multHandler.run()
+    while(True):
+        try:
+            item = result_queue.get(False)
+            list_name.append(item[0])
+            list_count.append(item[1])
+        except queue.Empty as e:
+            break
+    df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
+    df1.to_excel("%s_export.xls"%filename)
+
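+# exportKeywords: per 品目 keyword, fetch matching doc ids from Solr, resolve each doc's
+# project_name from mongodb (zhaobiao first, zhongbiao as fallback), then print raw id
+# counts alongside counts de-duplicated by project_name.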
+def exportKeywords():
+    def _handle(item,result_queue,pool_mongo):
+        docId = item["docId"]
+        mongo = pool_mongo.getConnector()
+        zhongbiao = mongo.zhongbiao_extraction
+        zhaobiao = mongo.zhaobiao_extraction
+        _project = ""
+        rows = zhaobiao.find({"docId":docId},{"project_name":1})
+        find_flag = False
+        for row in rows:
+            find_flag = True
+            _project = row.get("project_name","")
+        if not find_flag:
+            rows = zhongbiao.find({"docId":docId},{"project_name":1})
+            for row in rows:
+                _project = row.get("project_name","")
+        item["project_name"] = _project
+        pool_mongo.putConnector(mongo)
+        result_queue.put(item)
+    list_key = []
+    dict_key_ids = dict()
+    with open("../data/品目.txt", "r", encoding="utf8") as f:
+        while(True):
+            row = f.readline()
+            if not row:
+                break
+            row = row.strip()
+            if row=="":
+                continue
+            list_key.append(row)
+            dict_key_ids[row] = []
+            data = solrQuery("document",{"q":'dochtmlcon:"%s" AND dochtmlcon:"法院"'%row,"fq":'(publishtime:[2019-01-01T00:00:00Z TO 2019-12-31T23:59:59Z])',"fl":"id","rows":10000000})
+            for item in data["response"]["docs"]:
+                dict_key_ids[row].append(item["id"])
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+    for _key in dict_key_ids.keys():
+        for item in dict_key_ids[_key]:
+            task_queue.put({"docId":item,"project_name":""})
+    pool_mongo = ConnectorPool(init_num=10,max_num=200,method_init=getConnect_mongodb)
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=200,pool_mongo=pool_mongo)
+    mt.run()
+    dict_docId_projectname = {}
+    while(True):
+        try:
+            item = result_queue.get(False)
+            dict_docId_projectname[item["docId"]] = item["project_name"]
+        except queue.Empty:
+            break
+    dict_key_count = dict()
+    for _key in dict_key_ids.keys():
+        set_key = set()
+        for docId in dict_key_ids[_key]:
+            set_key.add(dict_docId_projectname.get(docId,""))
+        dict_key_count[_key] = len(set_key)
+    print("==")
+    for _key in list_key:
+        print(len(dict_key_ids[_key]))
+    print("==")
+    for _key in list_key:
+        print(dict_key_count[_key])
+    print("==")
+
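+# getIndustryCompany: enrich each company from ../data/同行客户匹配.xlsx with its
+# Tablestore enterprise row, up to 10 same-industry peers from enterprise_index, and
+# its neo4j win count, then dump all collected keys to Excel.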
+def getIndustryCompany():
+    def _handle(item,result_queue,pool_mongo,pool_neo4j,pool_mysql,pool_ots):
+        # mongoDB = getConnect_mongodb()
+        log(item["enterprise_name"])
+        mongoDB = pool_mongo.getConnector()
+        # coll_zb = mongoDB.enterprise_profile
+        # rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1,"legalPersonName":1,"actualCapital":1, "regCapital":1,"estiblishTime":1,"socialStaffNum":1,"legal_person":1,"phone":1,"businessScope":1,"industry":1 })
+        # for row in rows:
+        #     item["regCapital"] = row.get("regCapital","")
+        #     item["legal_person"] = row.get("legal_person","")
+        #     item["phone"] = row.get("phone","")
+        #     item["actualCapital"] = row.get("actualCapital","")
+        #     item["industry"] = row.get("industry","")
+        #     item["estiblishTime"] = row.get("estiblishTime","")
+        #     item["socialStaffNum"] = row.get("socialStaffNum","")
+        #     item["businessScope"] = row.get("businessScope","")
+        # graph = getConnect_neo4j()
+
+        ots_client = pool_ots.getConnector()
+
+        primary_key = [('name',item["enterprise_name"])]
+
+        columns_to_get = ["reg_capital","legal_person","phone","actual_capital","industry","estiblishTime","social_staff_num","business_scope"]
+
+        consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)
+
+        if return_row is not None:
+            for att in return_row.attribute_columns:
+                item[att[0]] = att[1]
+
+        list_same_industry_company = []
+
+        if "industry" in item:
+            bool_query = BoolQuery(must_queries=[TermQuery("industry",item["industry"])])
+
+            col = ColumnsToGet(['enterprise_name'], ColumnReturnType.SPECIFIED)
+            rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
+                                                                              SearchQuery(bool_query, limit=10, get_total_count=True),
+                                                                              col)
+
+            for row in rows:
+                for item1 in row[0]:
+                    list_same_industry_company.append(item1[1])
+
+        # if "industry" in item:
+        #     rows = coll_zb.find({"industry":item["industry"]},{"enterprise_name":1}).limit(10)
+        #     for row in rows:
+        #         print(row)
+        #         list_same_industry_company.append(row.get("enterprise_name",""))
+        item["same_industry_company"] = list_same_industry_company
+
+        graph = pool_neo4j.getConnector()
+        company_name = item["enterprise_name"]
+        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name)
+        finded = graph.run(cql)
+        data = json.loads(json.dumps(finded.data()))
+        _count = data[0]["_c"]
+        # list_project = []
+        # for _data in data:
+        #     if _count<=3:
+        #         if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
+        #             if _data["project_name"] is not None:
+        #                 list_project.append(_data["project_name"])
+        #     _count += 1
+        item["count"] = _count
+        # item["project"] = str(list_project)
+        result_queue.put(item)
+        pool_mongo.putConnector(mongoDB)
+        pool_neo4j.putConnector(graph)
+        pool_ots.putConnector(ots_client)
+
+    log_tofile("export.log")
+    pool_mongo = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_mongodb)
+    pool_neo4j = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_neo4j)
+    pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_mysql)
+    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
+    # list_company = getCompanys()
+    # filename = "".join(["环境","生态","再生","回收","环保"])
+    list_company = []
+    filename = "../data/同行客户匹配.xlsx"
+    df = pd.read_excel(filename,sheet_name=0)
+    for _com in df["公司名称"]:
+        print(_com)
+        if _com is not None and _com.strip()!="":
+            _company = {"enterprise_name":_com}
+            list_company.append(_company)
+    task_queue = queue.Queue()
+    for item in list_company:
+        task_queue.put(item)
+    result_queue = queue.Queue()
+    _muti = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j,pool_mysql=pool_mysql,pool_ots=pool_ots)
+    _muti.run()
+    df_company = {}
+    set_key = set()
+    if len(list_company)>0:
+        for item in list_company:
+            for _key in item.keys():
+                set_key.add(_key)
+                if _key not in df_company:
+                    df_company[_key] = []
+        list_key = list(set_key)
+        for item in list_company:
+            for _key in list_key:
+                df_company[_key].append(item.get(_key,""))
+        df1 = pd.DataFrame(df_company)
+
+        df1.to_excel("%s_export.xlsx"%(filename))
+
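+# exportWin_tenderer: page through 中标信息 documents in [time_from, time_to], flatten
+# sub_docs_json into winner/runner-up fields, enrich winners with enterprise and Solr
+# contact data, and write an Excel report. This and the functions below share the
+# first-page-then-next_token paging shape; a minimal sketch (assuming an OTS client
+# and a process(rows) callback, hypothetical names):
+#     rows, next_token, total_count, ok = client.search(table, index,
+#         SearchQuery(query, sort=..., limit=100, get_total_count=True), columns)
+#     process(rows)
+#     while next_token:
+#         rows, next_token, total_count, ok = client.search(table, index,
+#             SearchQuery(query, next_token=next_token, limit=100, get_total_count=True), columns)
+#         process(rows)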
+def exportWin_tenderer(time_from,time_to):
+    '''
+    Export win-bid documents in [time_from,time_to] with winner contact enrichment
+    :return:
+    '''
+    ost_client = getConnect_ots()
+    last_docid = 0
+    bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
+                                         TermQuery("docchannel",101),
+                                         RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
+                                         RangeQuery('docid', last_docid, include_lower=False)])
+
+    rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
+                                                                      SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
+                                                                      ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
+
+    list_project = []
+    def _getRow(list_project,rows,last_docid):
+        for row in rows:
+            project_name = row[1][0][1]
+            docid = row[0][1][1]
+            last_docid = docid
+            list_pack = json.loads(row[1][1][1])
+            _set_tenderer = set()
+            win_tenderer = ""
+            for _pack in list_pack:
+                if "win_tenderer" in _pack and win_tenderer=="":
+                    win_tenderer = _pack["win_tenderer"]
+                if "second_tenderer" in _pack:
+                    _set_tenderer.add(_pack["second_tenderer"])
+                if "third_tenderer" in _pack:
+                    _set_tenderer.add(_pack["third_tenderer"])
+            list_project.append({"docid":docid,"project_name":project_name,"win_tenderer":win_tenderer,"tenderer":list(_set_tenderer)})
+        return last_docid
+    _getRow(list_project,rows,last_docid)
+    while(next_token):
+        print("%d/%d"%(len(list_project),total_count))
+        rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
+                                                                          ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
+        last_docid = _getRow(list_project,rows,last_docid)
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+    for item in list_project:
+        task_queue.put(item)
+    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
+    def _handle(item,result_queue,pool_ots):
+        if item["win_tenderer"]!="":
+            ots_client = pool_ots.getConnector()
+            consumed, return_row, next_token = ots_client.get_row("enterprise", [("name",item["win_tenderer"])], ["province","reg_capital","estiblish_time","business_scope"], None, 1)
+            _dict = dict()
+            for _item in return_row.attribute_columns:
+                _dict[_item[0]] = _item[1]
+            for _key in _dict.keys():
+                item[_key] = _dict[_key]
+            data = solrQuery("contact",{"q":'company_name:"%s"'%item["win_tenderer"],"fl":"contact_person,mobile_no,phone_no","rows":10})
+            for _item in data["response"]["docs"]:
+                for _key in _item.keys():
+                    item[_key] = _item[_key]
+                break
+            pool_ots.putConnector(ots_client)
+
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
+    mt.run()
+    keys = ["docid","project_name","win_tenderer","tenderer","province","reg_capital","business_scope","estiblish_time","contact_person","mobile_no","phone_no"]
+    df_data = {}
+    for _key in keys:
+        df_data[_key] = []
+    for item in list_project:
+        for _key in keys:
+            if _key in item:
+                df_data[_key].append(item[_key])
+            else:
+                df_data[_key].append("")
+    df = pd.DataFrame(df_data)
+    df.to_excel("../data/%s-%s中标信息.xlsx"%(time_from,time_to),columns=keys)
+
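+# exportContact: split the tenderer lists written by exportWin_tenderer back into
+# company names, de-duplicate them, pull contacts from Solr, and write a competitor
+# contact sheet (stripping control characters openpyxl rejects).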
+def exportContact():
+    time_from = "2021-01-14"
+    time_to = "2021-01-15"
+    filename = "../data/%s-%s中标信息.xlsx"%(time_from,time_to)
+    df1 = pd.read_excel(filename)
+    set_company = set()
+    for item in df1["tenderer"]:
+        list_company = re.split(r"\['|', '|'\]|\[\]",item)
+        for _company in list_company:
+            if _company!="":
+                set_company.add(_company)
+
+    companys = list(set_company)
+    task_queue = queue.Queue()
+    list_company = []
+    for _company in companys:
+        item = {"company_name":_company}
+        list_company.append(item)
+        task_queue.put(item)
+    result_queue = queue.Queue()
+    def _handle(item,result_queue):
+        company = item["company_name"]
+        data = solrQuery("contact",{"q":'company_name:"%s"'%company,"fl":"company_name,contact_person,mobile_no,phone_no","rows":10})
+        for _item in data["response"]["docs"]:
+            for _key in _item.keys():
+                item[_key] = _item[_key]
+            break
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
+    mt.run()
+    keys = ["company_name","contact_person","mobile_no","phone_no"]
+    df_data = {}
+    for _key in keys:
+        df_data[_key] = []
+    ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
+    for item in list_company:
+        for _key in keys:
+            if _key in item:
+                df_data[_key].append(ILLEGAL_CHARACTERS_RE.sub(r'', item[_key]))
+            else:
+                df_data[_key].append("")
+    df = pd.DataFrame(df_data)
+    df.to_excel("../data/%s-%s竞争对手信息.xlsx"%(time_from,time_to),columns=keys)
+
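+# countKeyword: stream one week of 招标公告 html from mysql and count, per 细分类别
+# keyword, how many documents mention it at least once.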
+def countKeyword():
+    conn = getConnection_mysql()
+    cursor = conn.cursor()
+    print(0)
+    sql = "select dochtmlcon from sys_document_22 where docchannel=52 and page_time>='2020-09-01' and page_time<='2020-09-07'"
+    cursor.execute(sql)
+    print(0.1)
+    df = pd.read_excel("万郡绿建细分关键词.xls")
+    list_keywords = df["细分类别"]
+    dict_keywords = dict()
+    for _key in list_keywords:
+        dict_keywords[_key] = 0
+    print(1)
+    from bs4 import BeautifulSoup
+    while(True):
+        rows = cursor.fetchmany(10000)
+        print("==")
+        if not rows:
+            break
+        for row in rows:
+            _html = BeautifulSoup(row[0],"lxml").getText()
+            for _key in list_keywords:
+                if re.search(_key,_html) is not None:
+                    dict_keywords[_key] += 1
+        print(dict_keywords)
+    list_count = []
+    for _key in list_keywords:
+        list_count.append(dict_keywords[_key])
+    df1 = pd.DataFrame({"关键字":list_keywords,"数量":list_count})
+    df1.to_excel("关键词统计.xlsx")
+
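+# countKeyword_solr: the same keyword counting against Solr, split into 招标公告
+# (docchannel 52) and 中标信息 (docchannel 101) columns.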
+def countKeyword_solr():
+    def _handle(item,result_queue):
+        keyword = item["keyword"]
+        data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:101 AND dochtmlcon:"法院" '%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
+        _num = data["response"]["numFound"]
+        item["zhongbiao"] = _num
+        data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:52 AND dochtmlcon:"法院"'%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
+        _num = data["response"]["numFound"]
+        item["zhaobiao"] = _num
+        result_queue.put(item)
+    file = "../data/关键词11.xlsx"
+    df = pd.read_excel(file)
+    task_queue = queue.Queue()
+    print(df.keys())
+    for item in df["业务关键词"]:
+        task_queue.put({"keyword":item,"zhaobiao":0,"zhongbiao":0})
+    result_queue = queue.Queue()
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=10)
+    mt.run()
+    list_keyword = []
+    list_zhaobiao = []
+    list_zhongbiao = []
+    while(True):
+        try:
+            item = result_queue.get(False)
+            list_keyword.append(item["keyword"])
+            list_zhaobiao.append(item["zhaobiao"])
+            list_zhongbiao.append(item["zhongbiao"])
+        except queue.Empty:
+            break
+    df1 = pd.DataFrame({"业务关键词":list_keyword,"招标公告":list_zhaobiao,"中标信息":list_zhongbiao})
+    df1.to_excel("%s_export.xlsx"%file,columns=["业务关键词","招标公告","中标信息"])
+
+def query_from_solr():
+    data = solrQuery("document",{"q":'dochtmlcon:"法律" AND (docchannel:51 OR docchannel:104 or docchannel:52 or docchannel:102) AND province:"湖南" ',"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-01-20T23:59:59Z])',"fl":"id","rows":10})
+    _num = data["response"]["numFound"]
+    print(_num)
+
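+# export_province_keyword_count: per province, fetch the top bidding documents whose
+# titles match at least two keywords (hits inside the tenderee name are excluded via
+# must_not), flatten sub_docs_json, and write one combined Excel report.
+# set_dict_item, list_df_columns and getCurrent_date are assumed to come from the
+# utils wildcard imports above.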
+def export_province_keyword_count():
+    def _handle(item,result_queue,pool_ots):
+        columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
+        ots_client = pool_ots.getConnector()
+        _province = item["province"]
+        print(item)
+        # keywords = item["keyword"]+" "+item["key"]
+        list_keyword = item["keyword"]
+        # for _temp in keywords.split(" "):
+        #     if len(_temp)>0:
+        #         list_keyword.append(_temp)
+        should_queries = []
+        must_not_q = []
+        for _temp in list_keyword:
+            should_queries.append(MatchPhraseQuery("doctitle","%s"%_temp))
+            must_not_q.append(WildcardQuery("tenderee","*%s*"%_temp))
+
+        bool_query_keyword = BoolQuery(should_queries=should_queries,minimum_should_match=2)
+        page_time = item["page_time"]
+        bool_query = BoolQuery(must_queries=[bool_query_keyword
+                                             # ,WildcardQuery("publishtime","%s*"%page_time)
+                                             # ,MatchPhraseQuery("doctitle","服务")
+                                             ,RangeQuery("page_time","2021-04-22","2021-04-27",include_lower=True,include_upper=True),
+                                             TermQuery("docchannel",101),
+                                             RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
+                                             WildcardQuery('province', '%s*'%_province)
+                                             ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_tenderer",0,include_lower=True))
+                                             ],
+                               must_not_queries=must_not_q)
+
+        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("sub_docs_json.bidding_budget",SortOrder.DESC)]), limit=20, get_total_count=True),
+                                                                          ColumnsToGet(column_names=columns,return_type=ColumnReturnType.SPECIFIED))
+
+        item["count"] = total_count
+        list_data = getRow_ots(rows)
+
+        item["list_data"] = list_data
+        print(item)
+
+        pool_ots.putConnector(ots_client)
+
+    df = pd.read_excel("../data/省份信息.xlsx")
+    list_province = []
+    for _name,_type in zip(df["cname"],df["ctype"]):
+        if _type==20:
+            list_province.append(_name)
+
+    # filename = "../data/2021-02关键词导出数据.xlsx"
+    # dict_keyword = {}
+    # df1 = pd.read_excel(filename,dtype=str)
+    # for _key,_keyword in zip(df1["key1"],df1["keyword"]):
+    #     print("===",str(_keyword))
+    #     dict_keyword[_key] = "" if str(_keyword)=="nan" else _keyword
+    # for _key in df1["关键词"]:
+    #     dict_keyword[_key] = ""
+    keyword_str = '''
+    快递 物流 供应链 运输 配送
+仓储 冷链 整车 服务
+    '''
+    list_key = []
+    for _k in re.split(r"\s",keyword_str):
+        _k1 = _k.strip()
+        if len(_k1)>0:
+            list_key.append(_k1)
+
+    list_task = []
+    page_time = "2020-11"
+    for _province in list_province:
+        list_task.append({"page_time":page_time,"province":_province,"key":list_key,"keyword":list_key,"count":0})
+
+    task_queue = queue.Queue()
+    for item in list_task:
+        task_queue.put(item)
+
+    result_queue = queue.Queue()
+    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
+    mt.run()
+    dict_key_data = dict()
+    list_data = []
+    for item in list_task:
+        list_data.extend(item["list_data"])
+    dict_channel = getDict_docchannel()
+    df_data = {}
+    print(list_data)
+    for row in list_data:
+        item = {}
+        _dict = row
+        set_dict_item(item,"docid",_dict.get("docid",""))
+        set_dict_item(item,"公告标题",_dict.get("doctitle",""))
+        set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
+        set_dict_item(item,"省份",_dict.get("province",""))
+        # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
+        set_dict_item(item,"城市",_dict.get("city",""))
+        set_dict_item(item,"发布时间",_dict.get("page_time",""))
+
+        set_dict_item(item,"项目编号",_dict.get("project_code",""))
+        set_dict_item(item,"招标单位",_dict.get("tenderee",""))
+        set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
+        set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
+        set_dict_item(item,"代理单位",_dict.get("agency",""))
+        set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
+        set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
+        set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
+
+        sub_docs_json = _dict.get("sub_docs_json")
+        for _doc in json.loads(sub_docs_json):
+            if "win_tenderer" in _doc:
+                set_dict_item(item,"中标单位",_doc["win_tenderer"])
+            if "win_tenderee_manager" in _doc:
+                set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
+            if "win_tenderee_phone" in _doc:
+                set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
+            if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
+                set_dict_item(item,"中标金额",_doc["win_bid_price"])
+            if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
+                set_dict_item(item,"招标金额",_doc["bidding_budget"])
+        if "招标金额" not in item:
+            set_dict_item(item,"招标金额","")
+        if "中标金额" not in item:
+            set_dict_item(item,"中标金额","")
+        if "中标单位" not in item:
+            set_dict_item(item,"中标单位","")
+        if "中标单位联系人" not in item:
+            set_dict_item(item,"中标单位联系人","")
+        if "中标单位联系电话" not in item:
+            set_dict_item(item,"中标单位联系电话","")
+
+        _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
+        # if _line in set_line:
+        #     continue
+        # if item["招标金额"]=="":
+        #     continue
+        # set_line.add(_line)
+        for k,v in item.items():
+            if k not in df_data:
+                df_data[k] = []
+            df_data[k].append(v)
+    df1 = pd.DataFrame(df_data)
+    df1.to_excel("../data/%s_顺丰中标数据.xlsx"%getCurrent_date('%Y-%m-%d_%H%M%S'),columns=list_df_columns)
+    # for item in list_task:
+    #     dict_key_data[item["key"]][item["province"]] = item
+    # dict_key_province = dict()
+    # dict_key_province["关键词"] = []
+    # for _province in list_province:
+    #     dict_key_province[_province] = []
+    # for _key in dict_keyword.keys():
+    #     dict_key_province["关键词"].append(_key)
+    #     for _province in list_province:
+    #         dict_key_province[_province].append(dict_key_data[_key][_province]["count"])
+    # columns = ["关键词"]
+    # columns.extend(list_province)
+    # df2 = pd.DataFrame(dict_key_province)
+    # df2.to_excel("../data/%s_导出数据.xlsx"%filename,columns=columns)
+
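+# export_keyword_count: total document count per keyword over [range_from, range_to],
+# matched against either doctitle or doctextcon.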
+def export_keyword_count():
+    def _handle(item,result_queue,pool_ots):
+        ots_client = pool_ots.getConnector()
+
+        bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",item["range_from"],item["range_to"]),
+                                             RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
+                                             MatchPhraseQuery(item["type"], item["keyword"])
+                                             ])
+
+        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query, limit=1, get_total_count=True),
+                                                                          ColumnsToGet(return_type=ColumnReturnType.ALL))
+        item["count"] = total_count
+        pool_ots.putConnector(ots_client)
+
+    range_from = "2019-01-01"
+    range_to = "2022-12-23"
+    _type = "doctextcon"
+
+    assert _type in ["doctitle","doctextcon"]
+    list_dict_key_count = []
+    filename = "../data/医院.xlsx"
+    df = pd.read_excel(filename)
+    for item in df["关键词"]:
+        list_dict_key_count.append({"keyword":item,"count":0,"range_from":range_from,"range_to":range_to,"type":_type})
+
+    task_queue = queue.Queue()
+    for item in list_dict_key_count:
+        task_queue.put(item)
+
+    result_queue = queue.Queue()
+    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
+    mt.run()
+    columns = ["keyword","count","range_from","range_to","type"]
+    df_data = {}
+    for _c in columns:
+        df_data[_c] = []
+    for item in list_dict_key_count:
+        for _c in columns:
+            if _c in item:
+                df_data[_c].append(item[_c])
+            else:
+                df_data[_c].append("")
+    df2 = pd.DataFrame(df_data)
+    df2.to_excel("%s_数量导出.xlsx"%filename,columns=columns)
+
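+# export_keyword_title: page through every document whose title matches one of the
+# hospital keyword groups and dump docid/doctitle/tenderee to CSV in 100000-row chunks.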
+def export_keyword_title():
+    ots_client = getConnect_ots()
+    range_from = "2020-01-01"
+    range_to = "2022-12-23"
+    list_condition = [["医务室"],
+                      ["医院"],
+                      ["卫生院"],
+                      ["卫生所"],
+                      ["卫生室"],
+                      ["社区卫生服务中心"]]
+
+    list_should_query = []
+    for _c in list_condition:
+        if len(_c)==1:
+            list_should_query.append(MatchPhraseQuery("doctitle",_c[0]))
+        else:
+            _must_query = []
+            for _q in _c:
+                _must_query.append(MatchPhraseQuery("doctitle",_q))
+            list_should_query.append(BoolQuery(must_queries=_must_query))
+    keyword_query = BoolQuery(should_queries=list_should_query)
+
+    bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",range_from,range_to),
+                                         RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
+                                         keyword_query
+                                         ])
+
+    rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                      SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
+                                                                      ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
+
+    df_data = {"docid":[],"doctitle":[],"tenderee":[]}
+    def setData(df_data,rows):
+        list_dict = getRow_ots(rows)
+        for _dict in list_dict:
+            docid = _dict.get("docid","")
+            doctitle = _dict.get("doctitle","")
+            tenderee = _dict.get("tenderee","")
+
+            df_data["docid"].append(docid)
+            df_data["doctitle"].append(doctitle)
+            df_data["tenderee"].append(tenderee)
+    setData(df_data,rows)
+    _count = len(rows)
+    while next_token:
+        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
+                                                                          ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
+        setData(df_data,rows)
+        _count += 100
+        print(_count,total_count)
+    file_begin = 0
+    file_length = 100000
+    _count = 0
+    while file_begin<len(df_data["docid"]):
+        _dict = dict()
+        for _key,v in df_data.items():
+            _dict[_key] = v[file_begin:file_begin+file_length]
+        _count += 1
+        file_begin += file_length
+        df = pd.DataFrame(_dict)
+        df.to_csv("../data/%s-%s_tenderee_doctitle_%d.csv"%(range_from,range_to,_count))
+
+def exportArticle_by_websource():
+    # conn = getConnection_testmysql()
+    # cursor = conn.cursor()
+    # sql = "select web_source_no from web_source"
+    # cursor.execute(sql)
+    # rows = cursor.fetchmany(10)
+    # dict_websource = dict()
+    # while(rows):
+    #     for row in rows:
+    #         web_source_no = row[0]
+    #         dict_websource[web_source_no] = []
+    #     rows = cursor.fetchmany(1000)
+    #
+    # task_queue = queue.Queue()
+    # for _key in dict_websource.keys():
+    #     task_queue.put({"key":_key,"list":dict_websource[_key]})
+    #
+    # pool_ots = ConnectorPool(init_num=100,max_num=1000,method_init=getConnect_ots)
+    # result_queue = queue.Queue()
+    # def _handle(item,result_queue,pool_ots):
+    #     _key = item["key"]
+    #     print(_key)
+    #     ots_client = pool_ots.getConnector()
+    #     bool_query = BoolQuery(must_queries=[RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
+    #                                          TermQuery('web_source_no', '%s'%_key)
+    #                                          ])
+    #
+    #     is_all_succeed = False
+    #
+    #     while(not is_all_succeed):
+    #         try:
+    #             rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+    #                                                                               SearchQuery(bool_query, limit=100, get_total_count=True),
+    #                                                                               ColumnsToGet(["docid","docchannel","dochtmlcon"],return_type=ColumnReturnType.SPECIFIED))
+    #             list_zhaobiao = []
+    #             list_zhongbiao = []
+    #             for row in rows:
+    #                 _dict = dict()
+    #                 for values in row:
+    #                     for _v in values:
+    #                         _dict[_v[0]] = _v[1]
+    #                 if _dict["docchannel"]==52:
+    #                     list_zhaobiao.append(_dict)
+    #                 elif _dict["docchannel"]==101:
+    #                     list_zhongbiao.append(_dict)
+    #             item["list"].extend(list_zhaobiao[:5])
+    #             item["list"].extend(list_zhongbiao[:5])
+    #         except Exception as e:
+    #             print(str(e))
+    #
+    #     pool_ots.putConnector(ots_client)
+    #
+    # mt = MultiThreadHandler(task_queue = task_queue,task_handler=_handle,result_queue=result_queue,thread_count=100,pool_ots=pool_ots)
+    # mt.run()
+    # df_data = {"docid":[],"web_source_no":[],"docchannel":[],"dochtmlcon":[]}
+    # for k,v in dict_websource.items():
+    #     for item in v:
+    #         df_data["docid"].append(item["docid"])
+    #         df_data["web_source_no"].append(k)
+    #         df_data["docchannel"].append(item["docchannel"])
+    #         df_data["dochtmlcon"].append(item["dochtmlcon"])
+    # df = pd.DataFrame(df_data)
+    # df.to_csv("../data/websouce_doc.csv",columns=["docid","web_source_no","docchannel","dochtmlcon"],encoding="UTF8")
+
+    df = pd.read_csv("../data/websouce_doc.csv")
+    df_2000 = {"document_id":[],"document_text":[]}
+
+    begin = 80000
+    end = 100000
+    _count = 0
+    for _id,_text in zip(df["docid"][begin:end],df["dochtmlcon"][begin:end]):
+        if len(_text)>100000:
+            continue
+        df_2000["document_id"].append(_id)
+        df_2000["document_text"].append(_text)
+    df_2 = pd.DataFrame(df_2000)
+    df_2.to_csv("../data/websouce_doc_%d-%d.csv"%(begin,end),columns=["document_id","document_text"],encoding="utf8",index=False)
+    # save(dict_websource,"../data/dict_websource.pk")
+
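+# getWinTenderer: return the first win_tenderer found in a sub_docs_json string, or ""
+# when absent; used by exportDocument_by_keywords below.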
+def getWinTenderer(sub_doc_json):
+    if sub_doc_json is not None:
+        sub_doc = json.loads(sub_doc_json)
+        for _doc in sub_doc:
+            if "win_tenderer" in _doc:
+                return _doc["win_tenderer"]
+    return ""
+
+
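+# exportDocument_by_keywords: for each keyword, page through matching 中标信息 titles
+# and emit docid/tenderee/win_tenderer rows to CSV.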
+def exportDocument_by_keywords(page_time,
+                               list_keyword = ["创客","STEAM","人工智能","课程服务","机器人中学","机器人小学","机器人幼儿园","机器人学校","Labplus","盛思","makeblock柴火","寓乐湾","美科科技","STEAM","能力风暴","优必选","蘑菇云","Dfrobot","中鸣","飞瑞敖","编程猫培生","八爪鱼","八爪鱼教育","童心制物"]):
+
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+
+    for _k in list_keyword:
+        task_queue.put(_k)
+
+    def _handle(keyword,result_queue):
+        should_queries = []
+        for _temp in [keyword]:
+            should_queries.append(MatchPhraseQuery("doctitle",_temp))
+
+        bool_query_keyword = BoolQuery(should_queries=should_queries)
+
+        ots_client = getConnect_ots()
+
+        bool_query = BoolQuery(must_queries=[RangeQuery('publishtime', range_from='2017-12-20'),
+                                             MatchPhraseQuery("doctitle",keyword),
+                                             TermQuery("docchannel","101")
+                                             ])
+
+        def _putRows(rows):
+            for row in rows:
+                _dict = dict()
+                for values in row:
+                    for _v in values:
+                        _dict[_v[0]] = _v[1]
+                result_queue.put({"docid":_dict.get("docid",""),"keyword":keyword,"tenderee":_dict.get("tenderee",""),"win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))})
+
+        _count = 0
+        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
+                                                                          ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
+        _putRows(rows)
+        print(keyword,next_token,total_count)
+        while(next_token):
+            try:
+                _count += len(rows)
+                print("%s:%d/%d"%(keyword,_count,total_count))
+                rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                                  SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
+                                                                                  ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
+                _putRows(rows)
+            except Exception as e:
+                traceback.print_exc()
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
+    mt.run()
+    df_data = {"docid":[],"keyword":[],"tenderee":[],"win_tenderer":[]}
+    while(True):
+        try:
+            item = result_queue.get(block=True,timeout=1)
+            for _k in df_data.keys():
+                if _k in item:
+                    df_data[_k].append(item[_k])
+                else:
+                    df_data[_k].append("")
+        except queue.Empty as e:
+            break
+        except Exception as e:
+            traceback.print_exc()
+
+    df = pd.DataFrame(df_data)
+    df.to_csv("../data/exportArticle1_title.csv",columns=["docid","keyword","tenderee","win_tenderer"])
+
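+# exportGovement: pull 招标公告 from school/hospital/bureau-style tenderees in the
+# listed provinces and export rows that carry both tenderee contact and phone.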
+def exportGovement():
+    should_queries1 = []
+    for _temp in ["教育局","地化所","税务局","国土局","学校","大学","中学","小学","幼儿园","医院"]:
+        should_queries1.append(WildcardQuery("tenderee","*%s*"%_temp))
+    should_queries2 = []
+    for _temp in ["浙江","江苏","湖北","西北","陕西","甘肃","青海","宁夏","新疆","重庆","四川","云南","贵州"]:
+        should_queries2.append(WildcardQuery("province","*%s*"%_temp))
+    ots_client = getConnect_ots()
+    page_time = "2020-12"
+    bool_query = BoolQuery(must_queries=[BoolQuery(should_queries=should_queries1),
+                                         BoolQuery(should_queries=should_queries2),
+                                         TermQuery("docchannel","52"),
+                                         RangeQuery("publishtime",page_time)])
+    columns = ["tenderee","tenderee_contact","tenderee_phone"]
+    rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                      SearchQuery(bool_query, limit=100, sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),get_total_count=True),
+                                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+
+    print(total_count)
+    def getRow(rows,df_data,columns):
+        for row in rows:
+            _dict = dict()
+            for part in row:
+                for item in part:
+                    _dict[item[0]] = item[1]
+            if "tenderee_contact" in _dict and "tenderee_phone" in _dict:
+                for key in columns:
+                    df_data[key].append(_dict.get(key,""))
+    all_rows = 0
+    df_data = {}
+    for key in columns:
+        df_data[key] = []
+    getRow(rows,df_data,columns)
+    _count = 100
+    while(next_token):
+        print(_count,total_count)
+        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query,next_token=next_token, limit=100,get_total_count=True),
+                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+        _count += 100
+        getRow(rows,df_data,columns)
+
+    df2 = pd.DataFrame(df_data)
+    df2.to_excel("../data/%s政府招标人导出数据.xlsx"%page_time,columns=columns)
+
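+# export_attachment: per keyword, scan matching documents' html for attachment URLs
+# (zip/rar/doc/xls/xlsx/pdf/txt) and download up to ~20 hits per keyword into
+# ../data/attachment/. get_file_name is assumed to come from the utils wildcard import.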
+def export_attachment():
+    filename = "../data/销售部-字段名.xlsx"
+
+    df = pd.read_excel(filename)
+
+    list_dict = []
+    for _key in df["关键词"]:
+        if len(_key)>0:
+            list_dict.append({"keyword":_key})
+
+    def _handle(_dict,result_queue,pool_ots):
+        _keyword = _dict["keyword"]
+        ots_client = pool_ots.getConnector()
+
+        keyword_query = BoolQuery(should_queries=[MatchPhraseQuery("doctextcon",_keyword),
+                                                  MatchPhraseQuery("doctitle",_keyword)])
+
+        bool_query = BoolQuery(must_queries=[RangeQuery("status","201","300",include_upper=True,include_lower=True),
+                                             keyword_query])
+        columns = ["dochtmlcon","docid"]
+        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]), limit=100, get_total_count=True),
+                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+        print(_keyword,total_count)
+        hit_count = 0
+        def getData(rows,hit_count):
+            list_fields = getRow_ots(rows)
+            for _fields in list_fields:
+                dochtmlcon = _fields["dochtmlcon"]
+                docid = _fields["docid"]
+                _flag = False
+                for url,suffix in re.findall(r"(http://[^\'\";;\n]+?\.(zip|rar|doc|xls|xlsx|pdf|txt))",dochtmlcon):
+                    try:
+                        result = requests.get(url,stream=True,timeout=100)
+                        if result.status_code==200:
+                            filename = get_file_name(url,result.headers)
+                            with open("../data/attachment/%s_%d_%s"%(_keyword,docid,filename),"wb") as f:
+                                f.write(result.content)
+                            _flag = True
+                    except Exception:
+                        pass
+                if _flag:
+                    hit_count += 1
+            return hit_count
+
+        hit_count = getData(rows,hit_count)
+        _count = len(rows)
+        while next_token:
+            rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                              SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
+                                                                              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+            _count += len(rows)
+            if _count>20000:
+                break
+            hit_count = getData(rows,hit_count)
+            if hit_count>20:
+                break
+
+        pool_ots.putConnector(ots_client)
+
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+
+    for item in list_dict:
+        task_queue.put(item)
+
+    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
+    mt.run()
+
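+# exportIndustryCount: count documents per industry type over [time_from, time_to] and
+# sample 10 html pages per type into ../data/industry/. Note the early return after
+# print(dict_industry.keys()): it short-circuits the function, so the counting code
+# only runs once that line is removed.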
+def exportIndustryCount():
+    import codecs
+    time_from = "2020-12-21"
+    time_to = "2020-12-25"
+
+    # dict_channel = {"51":{"type":"公告变更"},
+    #                 "52":{"type":"招标公告"},
+    #                 "101":{"type":"中标信息"},
+    #                 "102":{"type":"招标预告"},
+    #                 "103":{"type":"招标答疑"},
+    #                 "104":{"type":"招标文件"},
+    #                 "105":{"type":"资审结果"},
+    #                 "103":{"type":"招标控制价"},
+    #                 "100":{"type":"未知类型"}}
+
+    dict_industry = {}
+    meta_industry = load("../data/class2dalei_menlei.pkl")
+    for _key in meta_industry.keys():
+        dict_industry[_key] = {"type":_key}
+    print(dict_industry.keys())
+    return
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+    for _key in dict_industry.keys():
+        task_queue.put(dict_industry[_key])
+
+    def _handle(item,result_queue,pool_ots):
+        ots_client = pool_ots.getConnector()
+        bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"]),
+                                             RangeQuery("publishtime",time_from,time_to,include_lower=True,include_upper=True)])
+        columns = ["docid"]
+        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query, limit=1,get_total_count=True),
+                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+        item["count"] = total_count
+        columns = ["dochtmlcon"]
+        bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"])])
+        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query, limit=10,sort=Sort(sorters=[FieldSort("publishtime",SortOrder.ASC)]),get_total_count=True),
+                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+        for row in rows:
+            _dict = dict()
+            for part in row:
+                for v in part:
+                    _dict[v[0]] = v[1]
+            with codecs.open("../data/industry/%s_%d.html"%(item["type"],_dict["docid"]),"w",encoding="UTF8") as f:
+                f.write(_dict["dochtmlcon"])
+
+        pool_ots.putConnector(ots_client)
+    pool_ots = ConnectorPool(init_num=20,max_num=30,method_init=getConnect_ots)
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
+    mt.run()
+    columns = ["type","count"]
+    df_data = {}
+    for _c in columns:
+        df_data[_c] = []
+    for _indus in dict_industry.keys():
+        for _c in columns:
+            df_data[_c].append(dict_industry[_indus][_c])
+
+    df = pd.DataFrame(df_data)
+    df.to_excel("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns)
+
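+# exportDocument_By_time: page every status-201..300 document in [time_from, time_to]
+# and write the requested columns to one Excel file under data_path.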
+def exportDocument_By_time(time_from,time_to,columns=["docid","doctitle","project_name","dochtmlcon"]):
+    '''
+    Export announcements in [time_from,time_to] to Excel
+    :return:
+    '''
+    ost_client = getConnect_ots()
+    bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
+                                         RangeQuery('status', '201', '300', include_lower=True, include_upper=True)])
+
+    rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
+                                                                      SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
+                                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+
+    _count = len(rows)
+    df_data = {}
+    def getData(df_data,rows):
+        list_dict = getRow_ots(rows)
+        for _dict in list_dict:
+            for _k,_v in _dict.items():
+                if _k not in df_data:
+                    df_data[_k] = []
+                df_data[_k].append(getLegal_str(_v))
+    getData(df_data,rows)
+    while(next_token):
+        print("%d/%d"%(_count,total_count))
+        rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
+                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+        _count += len(rows)
+        getData(df_data,rows)
+
+    df = pd.DataFrame(df_data)
+    df.to_excel("%s/%s-%s公告信息.xlsx"%(data_path,time_from,time_to),columns=columns)
+
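+# processDocument: currently only converts the exported Excel to CSV and returns; the
+# code after the return (mysql insert / extract-service call) is kept but unreachable.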
+def processDocument():
+    filename = "../data/2021-01-29-2021-01-29公告信息.xlsx"
+    df = pd.read_excel(filename)
+    df.to_csv("../data/2021-01-29-2021-01-29公告信息.csv")
+    return
+    list_dict = []
+    for docid,doctitle,project_name,dochtmlcon in zip(df["docid"],df["doctitle"],df["project_name"],df["dochtmlcon"]):
+        list_dict.append({"docid":docid,"doctitle":doctitle,"project_name":project_name,"dochtmlcon":dochtmlcon})
+
+    task_queue = queue.Queue()
+    for _dict in list_dict:
+        task_queue.put(_dict)
+    result_queue = queue.Queue()
+
+    def _handle(_dict,result_queue,pool_mysql):
+        conn = pool_mysql.getConnector()
+        cursor = conn.cursor()
+        sql = "insert into test_extract(docid,doctitle,page_time) values(%s,%s,%s)"
+        cursor.execute(sql,(_dict["docid"],_dict["doctitle"],_dict["dochtmlcon"]))
+        conn.commit()
+
+        pool_mysql.putConnector(conn)
+        # url = "http://192.168.2.101:15030"
+        # myheaders = {'Content-Type': 'application/json'}
+        # print(int(_dict["docid"]))
+        # data = {"doc_id":int(_dict["docid"]),"title":_dict["doctitle"],"content":_dict["dochtmlcon"]}
+        # resp = requests.post(url,json=data,headers=myheaders, verify=True)
+        # result = json.loads(resp.content.decode("utf8"),"utf8")
+        # _dict["product"] = result["product"]
+
+    pool_mysql = ConnectorPool(init_num=20,max_num=30,method_init=getConnection_testmysql)
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=5,pool_mysql=pool_mysql)
+    mt.run()
+
+    # columns = ["docid","doctitle","project_name","product"]
+    #
+    # df_data = {}
+    # for _c in columns:
+    #     df_data[_c] = []
+    # for _dict in list_dict:
+    #     for _c in columns:
+    #         df_data[_c].append(_dict.get(_c,""))
+    # df = pd.DataFrame(df_data)
+    # df.to_excel("%s.product.xlsx"%(filename),columns=columns)
+
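+# export_extract_check: aggregate per-field online/test/diff counters from
+# exportdb.extract_check into a two-sheet Excel report (global summary plus
+# per-document values).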
+def export_extract_check():
+    '''
+    Export the extract-check comparison results and build the report workbook
+    :return:
+    '''
+
+    conn = getConnection_testmysql()
+    cursor = conn.cursor()
+
+    sql = " select docid,json_result from exportdb.extract_check "
+
+    cursor.execute(sql)
+
+    dict_global = {}
+
+    df_global = {"key_type":[],"online_count":[],"test_count":[],"diff_count":[],"diff_percent":[]}
+    df_document = {"docid":[]}
+
+    while True:
+        rows = cursor.fetchmany(10000)
+        if not rows:
+            break
+        for docid,json_result in rows:
+            df_document["docid"].append(docid)
+            _result = json.loads(json_result)
+            for k,v in _result.items():
+                key = k.split("_")
+                _key = "_".join(key[:-1])
+                if "punish" in _key or "complainants" in _key or "institutions" in _key:
+                    continue
+                if k not in df_document:
+                    df_document[k] = []
+                df_document[k].append(v)
+                key_type = key[-1]
+                if _key not in dict_global:
+                    dict_global[_key] = {}
+                if key_type not in dict_global[_key]:
+                    dict_global[_key][key_type] = 0
+                if key_type=="diff":
+                    dict_global[_key][key_type] += v
+                if key_type in ("online","test"):
+                    if isinstance(v,str):
+                        if v!="":
+                            dict_global[_key][key_type] += 1
+                    elif isinstance(v,list):
+                        dict_global[_key][key_type] += len(v)
+    for k,v in dict_global.items():
+        df_global["key_type"].append(k)
+        df_global["online_count"].append(v["online"])
+        df_global["test_count"].append(v["test"])
+        df_global["diff_count"].append(v["diff"])
+        df_global["diff_percent"].append(v["diff"]/v["online"] if v["online"]>0 else 0)
+
+    filename = "../data/%s_extract_check.xlsx"%(time.strftime("%Y-%m-%d"))
+    with pd.ExcelWriter(filename) as writer:
+        df1 = pd.DataFrame(df_global)
+        df1.to_excel(writer,sheet_name="global")
+        for k,v in df_document.items():
+            print(k,len(v))
+        df2 = pd.DataFrame(df_document)
+        df2.to_excel(writer,sheet_name="document")
+
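+# exportDocument_dump: export one day of documents with a refined title (common filler
+# words stripped) plus flattened sub_docs_json money/winner fields, as input for
+# duplicate detection; the commented block below is an earlier docid-driven variant.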
+def exportDocument_dump():
+    # filename = "../data/重复公告.xlsx"
+    # df = pd.read_excel(filename)
+    ots_client = getConnect_ots()
+    columns = ["docid","docchannel","page_time","web_source_no","doctitle","tenderee","agency","project_code","project_name","sub_docs_json"]
+    df_keys = ["docid","docchannel","page_time","web_source_no","doctitle","doctitle_refine","tenderee","agency","project_code","project_name","bidding_budget","win_bid_price","win_tenderer","URL"]
+    df_data = {}
+    for _key in df_keys:
+        df_data[_key] = []
+
+    bool_query = BoolQuery(must_queries=[TermQuery("page_time","2021-03-03"),
+                                         RangeQuery("status",201,300,True,True)])
+
+    rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                      SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
+                                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+    def getData(df_data,rows):
+        list_data = getRow_ots(rows)
+        for row in list_data:
+            dict_find = {}
+            for _key in df_keys:
+                dict_find[_key] = 0
+            for _k,_v in row.items():
+                if _k in df_keys:
+                    dict_find[_k] = 1
+                    if _k=="project_code":
+                        _v = '"%s"'%_v
+                    df_data[_k].append(_v)
+            doctitle = row.get("doctitle","")
+            df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
+            df_data["URL"].append("http://www.bidizhaobiao.com/info-%d.html"%(row["docid"]))
+            dict_find["URL"] = 1
+            dict_find["doctitle_refine"] = 1
+            sub_docs_json = row.get("sub_docs_json","[{}]")
+            doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
+            if sub_docs_json is not None:
+                for sub_docs in json.loads(sub_docs_json):
+                    for _key_sub_docs in sub_docs.keys():
+                        if _key_sub_docs in doc_columns:
+                            if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
+                                if _key_sub_docs in ["bidding_budget","win_bid_price"]:
+                                    if float(sub_docs[_key_sub_docs])>0:
+                                        doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
+                                else:
+                                    doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
+            for _k,_v in doc_columns.items():
+                dict_find[_k] = 1
+                df_data[_k].append(_v)
+            for _k,_v in dict_find.items():
+                if _v==0:
+                    df_data[_k].append("")
+
+    _count = len(rows)
+    getData(df_data,rows)
+    while next_token:
+        print("%d/%d"%(_count,total_count))
+        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
+                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+        getData(df_data,rows)
+        _count += len(rows)
+
+    # for docid in df["docid"]:
+    #     bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))])
+    #
+    #     rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+    #                                                                       SearchQuery(bool_query , limit=100, get_total_count=True),
+    #                                                                       ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+    #     list_data = getRow_ots(rows)
+    #     if len(list_data)>0:
+    #         dict_find = {}
+    #         for _key in df_keys:
+    #             dict_find[_key] = 0
+    #         for _k,_v in list_data[0].items():
+    #             if _k in df_keys:
+    #                 dict_find[_k] = 1
+    #                 df_data[_k].append(_v)
+    #         doctitle = list_data[0].get("doctitle","")
+    #         df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
+    #         dict_find["doctitle_refine"] = 1
+    #         sub_docs_json = list_data[0].get("sub_docs_json","[{}]")
+    #         doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
+    #         if sub_docs_json is not None:
+    #             for sub_docs in json.loads(sub_docs_json):
+    #                 for _key_sub_docs in sub_docs.keys():
+    #                     if _key_sub_docs in doc_columns:
+    #                         if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
+    #                             if _key_sub_docs in ["bidding_budget","win_bid_price"]:
+    #                                 if float(sub_docs[_key_sub_docs])>0:
+    #                                     doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
+    #                             else:
+    #                                 doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
+    #         for _k,_v in doc_columns.items():
+    #             dict_find[_k] = 1
+    #             df_data[_k].append(_v)
+    #         for _k,_v in dict_find.items():
+    #             if _v==0:
+    #                 df_data[_k].append("")
+    df1 = pd.DataFrame(df_data)
+    df1.to_csv("../data/0303去重.csv",columns=df_keys)
+
+def exportDocument_dump_mysql():
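+ """Export all rows whose group_id occurs more than once in run_dumplicate_document_his (MySQL) to a de-duplication CSV."""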
|
|
|
+ conn = getConnection_testmysql()
|
|
|
+
|
|
|
+ cursor = conn.cursor()
|
|
|
+
|
|
|
+ columns = ["project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer","group_id","docid"]
|
|
|
+
|
|
|
+ df_data = {}
|
|
|
+ for _c in columns:
|
|
|
+ df_data[_c] = []
|
|
|
+ sql = " select "+",".join(columns)+" from run_dumplicate_document_his where group_id in (select group_id from run_dumplicate_document_his group by group_id having count(1)>1)"
|
|
|
+ cursor.execute(sql)
|
|
|
+ while True:
|
|
|
+ rows = cursor.fetchmany(100000)
|
|
|
+ if not rows:
|
|
|
+ break
|
|
|
+ for row in rows:
|
|
|
+ for _i in range(len(columns)):
|
|
|
+ df_data[columns[_i]].append(row[_i])
|
|
|
+ df = pd.DataFrame(df_data)
|
|
|
+ df.to_csv("../data/0304去重.csv",columns=["group_id","docid","project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer"])
|
|
|
+
|
|
|
+
|
|
|
+ print(cursor.description)
|
|
|
+
|
|
|
+def getDict_docchannel():
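+ """Load the sys_channel MySQL table and return a {channel_id: channel_name} dict."""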
|
|
|
+ conn = getConnection_mysql()
|
|
|
+ cursor = conn.cursor()
|
|
|
+ sql = "select channel_id,chnlname from sys_channel "
|
|
|
+ cursor.execute(sql)
|
|
|
+ rows = cursor.fetchall()
|
|
|
+ _dict = dict()
|
|
|
+ for row in rows:
|
|
|
+ _dict[row[0]] = row[1]
|
|
|
+ return _dict
|
|
|
+
|
|
|
+def exportDocument_by_doctitle():
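+ """Search document titles for land/surveying keywords in selected provinces (docchannel 101), de-duplicate by project_code/win_tenderer/win_bid_price and export the rows to Excel."""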
|
|
|
+ columns = ["docid","doctitle","docchannel","bidway","province","city","district","info_type","page_time","crtime","project_code","tenderee","project_name","agency","sub_docs_json","tenderee_contact","tenderee_phone","doctextcon","product","moneysource"]
|
|
|
+
|
|
|
+ dict_channel = getDict_docchannel()
|
|
|
+
|
|
|
+ str_keyword = '''
|
|
|
+ 报批技术服务 不动产 测绘 城市更新 档案整理
|
|
|
+房地一体 拆旧复垦 土地规划 城乡规划 村庄规划
|
|
|
+技术服务 技术审查 建设用地增减挂钩 勘察 垦造水田
|
|
|
+不动产数据建库 不动产数据整合 林权调查 土地调查 城市更新数据调查
|
|
|
+不动产平台 测绘系统 地理信息系统 城乡规划信息系统 一张图信息平台
|
|
|
+测绘信息平台 双评价 城市更新研究 垦造水田研究报告 生态修复研究
|
|
|
+土地规划研究 复垦咨询服务 生态修复咨询服务 城乡规划咨询服务 城市更新咨询服务
|
|
|
+勘测定界 多测合一 用地预审 国土规划数据治理 地名普查
|
|
|
+地形图 垦造水田咨询服务 评估 全域土地综合整治 生态修复
|
|
|
+林权数据建库 权属调查 权籍调查
|
|
|
+
|
|
|
+
|
|
|
+ '''
|
|
|
+ task_queue = queue.Queue()
|
|
|
+ result_queue = queue.Queue()
|
|
|
+
|
|
|
+ for _keyword in re.split(r"\s",str_keyword):
|
|
|
+ if len(_keyword.strip())==0:
|
|
|
+ continue
|
|
|
+ task_queue.put({"keyword":_keyword})
|
|
|
+
|
|
|
+ pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
|
|
|
+
|
|
|
+
|
|
|
+ def _handle(item,result_queue,pool_ots):
|
|
|
+
|
|
|
+ ots_client = pool_ots.getConnector()
|
|
|
+
|
|
|
+ should_q1 = BoolQuery(should_queries=[MatchPhraseQuery("doctitle",item["keyword"])])
|
|
|
+
|
|
|
+ should_q2 = BoolQuery(should_queries=[WildcardQuery('province', '%s*'%"广东"),
|
|
|
+ WildcardQuery('province', '%s*'%"湖南"),
|
|
|
+ WildcardQuery('province', '%s*'%"广西")])
|
|
|
+ bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2021-01-01"),
|
|
|
+ RangeQuery("status",201,300,True,True),
|
|
|
+ TermQuery("docchannel",101),
|
|
|
+ should_q1,should_q2])
|
|
|
+
|
|
|
+ rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
|
|
|
+ SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]), limit=100, get_total_count=True),
|
|
|
+ ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
+ for _data in list_data:
|
|
|
+ _data["keyword"] = item["keyword"]
|
|
|
+ result_queue.put(_data)
|
|
|
+
|
|
|
+ _count = len(list_data)
|
|
|
+ while next_token:
|
|
|
+ print("%d/%d"%(_count,total_count))
|
|
|
+ rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
|
|
|
+ SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
|
|
|
+ ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
+ _count += len(list_data)
|
|
|
+ for _data in list_data:
|
|
|
+ _data["keyword"] = item["keyword"]
|
|
|
+ result_queue.put(_data)
|
|
|
+
|
|
|
+ pool_ots.putConnector(ots_client)
|
|
|
+ mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots)
|
|
|
+ mt.run()
|
|
|
+
|
|
|
+ list_item = []
|
|
|
+ try:
|
|
|
+ while True:
|
|
|
+ _dict = result_queue.get(False)
|
|
|
+ list_item.append(_dict)
|
|
|
+ except queue.Empty:
+     pass
|
|
|
+
|
|
|
+ if len(list_item)==0:
+     print("no rows returned for any keyword")
+     return
+ keys = list_item[0].keys()
|
|
|
+ df_data = {}
|
|
|
+
|
|
|
+ print(len(list_item))
|
|
|
+ set_line = set()
|
|
|
+ for row in list_item:
|
|
|
+ item = {}
|
|
|
+ _dict = row
|
|
|
+
|
|
|
+ set_dict_item(item,"公告id",_dict.get("docid",""))
|
|
|
+ set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
|
|
|
+
|
|
|
+ set_dict_item(item,"公告标题",_dict.get("doctitle",""))
|
|
|
+
|
|
|
+ set_dict_item(item,"省份",_dict.get("province",""))
|
|
|
+ # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
|
|
|
+ set_dict_item(item,"城市",_dict.get("city",""))
|
|
|
+
|
|
|
+ set_dict_item(item,"区",_dict.get("district",""))
|
|
|
+ set_dict_item(item,"关键词",_dict.get("keyword",""))
|
|
|
+
|
|
|
+ set_dict_item(item,"发布时间",_dict.get("page_time",""))
|
|
|
+
|
|
|
+ set_dict_item(item,"项目编号",_dict.get("project_code",""))
|
|
|
+ set_dict_item(item,"招标单位",_dict.get("tenderee",""))
|
|
|
+ set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
|
|
|
+ set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
|
|
|
+ # set_dict_item(item,"代理单位",_dict.get("agency",""))
|
|
|
+ # set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
|
|
|
+ # set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
|
|
|
+ set_dict_item(item,"公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
|
|
|
+
|
|
|
+ tenderee_in = "否"
|
|
|
+ tenderee_keyword = "否"
|
|
|
+ if _dict.get("tenderee","")!="":
|
|
|
+ if _dict.get("tenderee","") in _dict.get("doctitle",""):
|
|
|
+ tenderee_in = "是"
|
|
|
+ if _dict.get("keyword","") in _dict.get("tenderee",""):
|
|
|
+ tenderee_keyword = "是"
|
|
|
+ set_dict_item(item,"标题包含招标人",tenderee_in)
|
|
|
+ set_dict_item(item,"招标人含有关键词",tenderee_keyword)
|
|
|
+ sub_docs_json = _dict.get("sub_docs_json","[{}]")
|
|
|
+ for _doc in json.loads(sub_docs_json):
|
|
|
+ if "win_tenderer" in _doc:
|
|
|
+ set_dict_item(item,"中标单位",_doc["win_tenderer"])
|
|
|
+ # if "win_tenderee_manager" in _doc:
|
|
|
+ # set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
|
|
|
+ # if "win_tenderee_phone" in _doc:
|
|
|
+ # set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
|
|
|
+ if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
|
|
|
+ set_dict_item(item,"中标金额(万元)",float(_doc["win_bid_price"])/10000)
|
|
|
+ if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
|
|
|
+ set_dict_item(item,"招标金额(万元)",float(_doc["bidding_budget"])/10000)
|
|
|
+ if "中标单位" not in item:
|
|
|
+ set_dict_item(item,"中标单位","")
|
|
|
+ if "中标金额(万元)" not in item:
|
|
|
+ set_dict_item(item,"中标金额(万元)","")
|
|
|
+ if "招标金额(万元)" not in item:
|
|
|
+ set_dict_item(item,"招标金额(万元)","")
|
|
|
+
|
|
|
+
|
|
|
+ # if "中标单位联系人" not in item:
|
|
|
+ # set_dict_item(item,"中标单位联系人","")
|
|
|
+ # if "中标单位联系电话" not in item:
|
|
|
+ # set_dict_item(item,"中标单位联系电话","")
|
|
|
+
|
|
|
+
|
|
|
+ _line = "%s-%s-%s"%(item["项目编号"],item["中标单位"],item["中标金额(万元)"])
|
|
|
+ if _line in set_line:
|
|
|
+ continue
|
|
|
+ # if item["招标金额"]=="":
|
|
|
+ # continue
|
|
|
+ set_line.add(_line)
|
|
|
+ for k,v in item.items():
|
|
|
+ if k not in df_data:
|
|
|
+ df_data[k] = []
|
|
|
+ df_data[k].append(v)
|
|
|
+
|
|
|
+
|
|
|
+ df = pd.DataFrame(df_data)
|
|
|
+
|
|
|
+ df.to_excel("../data/2021-04-14_export11.xlsx",columns=list_df_columns)
|
|
|
+
|
|
|
+set_columns = set()
|
|
|
+list_df_columns = []
|
|
|
+
|
|
|
+def set_dict_item(_dict,name,v):
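+ """Store a sanitized value on the row dict and remember first-seen column order in list_df_columns for export."""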
|
|
|
+ _dict[name] = getLegal_str(v)
|
|
|
+ if name not in set_columns:
|
|
|
+ set_columns.add(name)
|
|
|
+ list_df_columns.append(getLegal_str(name))
|
|
|
+
|
|
|
+def exportDocument_medicine(start_time,end_time):
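+ """Export medical-IT announcements between start_time and end_time (body keywords + hospital-type tenderees + channels 52/101/102), topping up missing contacts from the enterprise table."""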
|
|
|
+ # filename = "../data/重复公告.xlsx"
|
|
|
+ # df = pd.read_excel(filename)
|
|
|
+ ots_client = getConnect_ots()
|
|
|
+ columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
|
|
|
+
|
|
|
+ dict_channel = getDict_docchannel()
|
|
|
+ def getData(df_data,rows,set_line):
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
+ for row in list_data:
|
|
|
+ item = {}
|
|
|
+ _dict = row
|
|
|
+ set_dict_item(item,"docid",_dict.get("docid",""))
|
|
|
+ set_dict_item(item,"公告标题",_dict.get("doctitle",""))
|
|
|
+ set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
|
|
|
+ set_dict_item(item,"省份",_dict.get("province",""))
|
|
|
+ # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
|
|
|
+ set_dict_item(item,"城市",_dict.get("city",""))
|
|
|
+ set_dict_item(item,"发布时间",_dict.get("page_time",""))
|
|
|
+
|
|
|
+ set_dict_item(item,"项目编号",_dict.get("project_code",""))
|
|
|
+ set_dict_item(item,"招标单位",_dict.get("tenderee",""))
|
|
|
+ set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
|
|
|
+ set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
|
|
|
+ set_dict_item(item,"代理单位",_dict.get("agency",""))
|
|
|
+ set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
|
|
|
+ set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
|
|
|
+ set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
|
|
|
+
|
|
|
+ sub_docs_json = _dict.get("sub_docs_json","[{}]")
|
|
|
+ for _doc in json.loads(sub_docs_json):
|
|
|
+ if "win_tenderer" in _doc:
|
|
|
+ set_dict_item(item,"中标单位",_doc["win_tenderer"])
|
|
|
+ if "win_tenderee_manager" in _doc:
|
|
|
+ set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
|
|
|
+ if "win_tenderee_phone" in _doc:
|
|
|
+ set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
|
|
|
+ if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
|
|
|
+ set_dict_item(item,"中标金额",_doc["win_bid_price"])
|
|
|
+ if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
|
|
|
+ set_dict_item(item,"招标金额",_doc["bidding_budget"])
|
|
|
+ if "招标金额" not in item:
|
|
|
+ set_dict_item(item,"招标金额","")
|
|
|
+ if "中标金额" not in item:
|
|
|
+ set_dict_item(item,"中标金额","")
|
|
|
+ if "中标单位" not in item:
|
|
|
+ set_dict_item(item,"中标单位","")
|
|
|
+ if "中标单位联系人" not in item:
|
|
|
+ set_dict_item(item,"中标单位联系人","")
|
|
|
+ if "中标单位联系电话" not in item:
|
|
|
+ set_dict_item(item,"中标单位联系电话","")
|
|
|
+
|
|
|
+
|
|
|
+ _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
|
|
|
+ # if _line in set_line:
|
|
|
+ # continue
|
|
|
+ # if item["招标金额"]=="":
|
|
|
+ # continue
|
|
|
+ set_line.add(_line)
|
|
|
+ for k,v in item.items():
|
|
|
+ if k not in df_data:
|
|
|
+ df_data[k] = []
|
|
|
+ df_data[k].append(v)
|
|
|
+
|
|
|
+ # list_province = ["江西","湖南","四川","安徽"]
|
|
|
+ list_province = ["全国"]
|
|
|
+ for _province in list_province:
|
|
|
+ df_data = {}
|
|
|
+
|
|
|
+ str_p = '''
|
|
|
+ 智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件
|
|
|
+县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗
|
|
|
+数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化
|
|
|
+临床应用
|
|
|
+
|
|
|
+ '''
|
|
|
+ list_prov = re.split(r"\s",str_p)
|
|
|
+ list_mu = []
|
|
|
+ for _p in list_prov:
|
|
|
+ if _p.strip()=="":
|
|
|
+ continue
|
|
|
+ print(_p)
|
|
|
+ list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))
|
|
|
+
|
|
|
+ s_tenderee = '医院、卫生院、疗养院、健康局、卫生局'
|
|
|
+ list_should_ten = []
|
|
|
+ for _p in re.split("、",s_tenderee):
|
|
|
+ if _p.strip()=="":
|
|
|
+ continue
|
|
|
+ list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))
|
|
|
+
|
|
|
+ list_should_chan = []
|
|
|
+ list_should_chan.append(TermQuery("docchannel",52))
|
|
|
+ list_should_chan.append(TermQuery("docchannel",101))
|
|
|
+ list_should_chan.append(TermQuery("docchannel",102))
|
|
|
+
|
|
|
+ should_q1 = BoolQuery(should_queries=list_mu)
|
|
|
+ should_q2 = BoolQuery(should_queries=list_should_ten)
|
|
|
+ should_q3 = BoolQuery(should_queries=list_should_chan)
|
|
|
+ bool_query = BoolQuery(must_queries=[RangeQuery("page_time",start_time,end_time,include_lower=True,include_upper=True),
|
|
|
+ RangeQuery("status",201,300,True,True),
|
|
|
+ should_q1,should_q2,should_q3])
|
|
|
+
|
|
|
+
|
|
|
+ # must_q1 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"),
|
|
|
+ # MatchPhraseQuery("doctitle","教学器材")])
|
|
|
+ # must_q2 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"),
|
|
|
+ # MatchPhraseQuery("doctitle","教育设备")])
|
|
|
+ # must_q3 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"),
|
|
|
+ # MatchPhraseQuery("doctitle","教学器材")])
|
|
|
+ # must_q4 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"),
|
|
|
+ # MatchPhraseQuery("doctitle","教育设备")])
|
|
|
+ # must_q5 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校信息化")])
|
|
|
+ # must_q6 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明")])
|
|
|
+ # must_q7 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校弱电")])
|
|
|
+ # must_q8 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校机电安装")])
|
|
|
+ # must_q9 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明改造")])
|
|
|
+ # must_q10 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教学"),
|
|
|
+ # MatchPhraseQuery("doctitle","设备")])
|
|
|
+ #
|
|
|
+ # all_should = BoolQuery(should_queries=[must_q1,must_q2,must_q3,must_q4,must_q5,must_q6,must_q7,must_q8,must_q9,must_q10])
|
|
|
+ # must_not_q = BoolQuery(should_queries=[MatchPhraseQuery("doctitle","社区"),
|
|
|
+ # MatchPhraseQuery("doctitle","医院"),
|
|
|
+ # MatchPhraseQuery("doctitle","工业园"),
|
|
|
+ # MatchPhraseQuery("doctitle","营养"),
|
|
|
+ # MatchPhraseQuery("doctitle","厨房设备")])
|
|
|
+ # bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2018-01"),
|
|
|
+ # RangeQuery("status",201,300,True,True),
|
|
|
+ # TermQuery("docchannel",101),
|
|
|
+ # WildcardQuery('province', '%s*'%_province),
|
|
|
+ # all_should],
|
|
|
+ # must_not_queries=[must_not_q])
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
|
|
|
+ SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
|
|
|
+ ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
+
|
|
|
+
|
|
|
+ set_line = set()
|
|
|
+ _count = len(rows)
|
|
|
+ getData(df_data,rows,set_line)
|
|
|
+ while next_token:
|
|
|
+ print("%d/%d"%(_count,total_count))
|
|
|
+ rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
|
|
|
+ SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
|
|
|
+ ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ getData(df_data,rows,set_line)
|
|
|
+ _count += len(rows)
|
|
|
+ if len(df_data[list(df_data.keys())[0]])>=300:
|
|
|
+ break
|
|
|
+
|
|
|
+ set_enterprise = set()
|
|
|
+ for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
|
|
|
+ set_enterprise.add(_tenderee)
|
|
|
+ set_enterprise.add(_agency)
|
|
|
+ set_enterprise.add(_win_tenderer)
|
|
|
+ if "" in set_enterprise:
|
|
|
+ set_enterprise.remove("")
|
|
|
+ if None in set_enterprise:
|
|
|
+ set_enterprise.remove(None)
|
|
|
+ dict_enterprise = getDictEnterprise(list(set_enterprise))
|
|
|
+ if len(set_enterprise)>0:
|
|
|
+ for _i in range(len(df_data["招标单位"])):
|
|
|
+ _enterprise_name = df_data["招标单位"][_i]
|
|
|
+ if df_data["招标联系人电话"][_i]=="":
|
|
|
+ contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
|
|
|
+ if contacts is not None:
|
|
|
+ _person,_phone = getOneContact(contacts)
|
|
|
+ df_data["招标联系人"][_i] = _person
|
|
|
+ df_data["招标联系人电话"][_i] = _phone
|
|
|
+
|
|
|
+ _enterprise_name = df_data["代理单位"][_i]
|
|
|
+ if df_data["代理联系人电话"][_i]=="":
|
|
|
+ contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
|
|
|
+ if contacts is not None:
|
|
|
+ _person,_phone = getOneContact(contacts)
|
|
|
+ df_data["代理联系人"][_i] = _person
|
|
|
+ df_data["代理联系人电话"][_i] = _phone
|
|
|
+
|
|
|
+ _enterprise_name = df_data["中标单位"][_i]
|
|
|
+ if df_data["中标单位联系电话"][_i]=="":
|
|
|
+ contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
|
|
|
+ if contacts is not None:
|
|
|
+ _person,_phone = getOneContact(contacts)
|
|
|
+ df_data["中标单位联系人"][_i] = _person
|
|
|
+ df_data["中标单位联系电话"][_i] = _phone
|
|
|
+
|
|
|
+ df1 = pd.DataFrame(df_data)
+ df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
+ return df_data
|
|
|
+
|
|
|
+
|
|
|
+def exportDocument_by_pagetime():
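+ """Export all documents of a fixed page_time on docchannel 52 using the same row layout as exportDocument_medicine."""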
|
|
|
+ # filename = "../data/重复公告.xlsx"
|
|
|
+ # df = pd.read_excel(filename)
|
|
|
+ ots_client = getConnect_ots()
|
|
|
+ columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
|
|
|
+
|
|
|
+ dict_channel = getDict_docchannel()
|
|
|
+ def getData(df_data,rows,set_line):
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
+ for row in list_data:
|
|
|
+ item = {}
|
|
|
+ _dict = row
|
|
|
+ set_dict_item(item,"docid",_dict.get("docid",""))
|
|
|
+ set_dict_item(item,"公告标题",_dict.get("doctitle",""))
|
|
|
+ set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
|
|
|
+ set_dict_item(item,"省份",_dict.get("province",""))
|
|
|
+ # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
|
|
|
+ set_dict_item(item,"城市",_dict.get("city",""))
|
|
|
+ set_dict_item(item,"发布时间",_dict.get("page_time",""))
|
|
|
+
|
|
|
+ set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
|
|
|
+
|
|
|
+ set_dict_item(item,"项目编号",_dict.get("project_code",""))
|
|
|
+ set_dict_item(item,"招标单位",_dict.get("tenderee",""))
|
|
|
+ set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
|
|
|
+ set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
|
|
|
+ set_dict_item(item,"代理单位",_dict.get("agency",""))
|
|
|
+ set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
|
|
|
+ set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
|
|
|
+ set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
|
|
|
+
|
|
|
+ sub_docs_json = _dict.get("sub_docs_json","[{}]")
|
|
|
+ for _doc in json.loads(sub_docs_json):
|
|
|
+ if "win_tenderer" in _doc:
|
|
|
+ set_dict_item(item,"中标单位",_doc["win_tenderer"])
|
|
|
+ if "win_tenderee_manager" in _doc:
|
|
|
+ set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
|
|
|
+ if "win_tenderee_phone" in _doc:
|
|
|
+ set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
|
|
|
+ if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
|
|
|
+ set_dict_item(item,"中标金额",_doc["win_bid_price"])
|
|
|
+ if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
|
|
|
+ set_dict_item(item,"招标金额",_doc["bidding_budget"])
|
|
|
+ if "招标金额" not in item:
|
|
|
+ set_dict_item(item,"招标金额","")
|
|
|
+ if "中标金额" not in item:
|
|
|
+ set_dict_item(item,"中标金额","")
|
|
|
+ if "中标单位" not in item:
|
|
|
+ set_dict_item(item,"中标单位","")
|
|
|
+ if "中标单位联系人" not in item:
|
|
|
+ set_dict_item(item,"中标单位联系人","")
|
|
|
+ if "中标单位联系电话" not in item:
|
|
|
+ set_dict_item(item,"中标单位联系电话","")
|
|
|
+
|
|
|
+
|
|
|
+ _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
|
|
|
+ # if _line in set_line:
|
|
|
+ # continue
|
|
|
+ # if item["招标金额"]=="":
|
|
|
+ # continue
|
|
|
+ set_line.add(_line)
|
|
|
+ for k,v in item.items():
|
|
|
+ if k not in df_data:
|
|
|
+ df_data[k] = []
|
|
|
+ df_data[k].append(v)
|
|
|
+
|
|
|
+ # list_province = ["江西","湖南","四川","安徽"]
|
|
|
+ list_province = ["全国"]
|
|
|
+ for _province in list_province:
|
|
|
+ df_data = {}
|
|
|
+
|
|
|
+ str_p = '''
|
|
|
+ 智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件
|
|
|
+县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗
|
|
|
+数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化
|
|
|
+临床应用
|
|
|
+
|
|
|
+ '''
|
|
|
+ list_prov = re.split(r"\s",str_p)
|
|
|
+ list_mu = []
|
|
|
+ for _p in list_prov:
|
|
|
+ if _p.strip()=="":
|
|
|
+ continue
|
|
|
+ print(_p)
|
|
|
+ list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))
|
|
|
+
|
|
|
+ s_tenderee = '医院、卫生院、疗养院、健康局、卫生局'
|
|
|
+ list_should_ten = []
|
|
|
+ for _p in re.split("、",s_tenderee):
|
|
|
+ if _p.strip()=="":
|
|
|
+ continue
|
|
|
+ list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))
|
|
|
+
|
|
|
+ list_should_chan = []
|
|
|
+ list_should_chan.append(TermQuery("docchannel",52))
|
|
|
+ # list_should_chan.append(TermQuery("docchannel",101))
|
|
|
+ # list_should_chan.append(TermQuery("docchannel",102))
|
|
|
+
|
|
|
+ should_q1 = BoolQuery(should_queries=list_mu)
|
|
|
+ should_q2 = BoolQuery(should_queries=list_should_ten)
|
|
|
+ should_q3 = BoolQuery(should_queries=list_should_chan)
|
|
|
+ bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2021-05-07","2021-05-07",include_lower=True,include_upper=True),
|
|
|
+ RangeQuery("status",201,300,True,True)
|
|
|
+ # ,should_q1
|
|
|
+ # ,should_q2
|
|
|
+ ,should_q3])
|
|
|
+
|
|
|
+
|
|
|
+ # must_q1 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"),
|
|
|
+ # MatchPhraseQuery("doctitle","教学器材")])
|
|
|
+ # must_q2 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"),
|
|
|
+ # MatchPhraseQuery("doctitle","教育设备")])
|
|
|
+ # must_q3 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"),
|
|
|
+ # MatchPhraseQuery("doctitle","教学器材")])
|
|
|
+ # must_q4 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"),
|
|
|
+ # MatchPhraseQuery("doctitle","教育设备")])
|
|
|
+ # must_q5 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校信息化")])
|
|
|
+ # must_q6 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明")])
|
|
|
+ # must_q7 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校弱电")])
|
|
|
+ # must_q8 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校机电安装")])
|
|
|
+ # must_q9 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明改造")])
|
|
|
+ # must_q10 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教学"),
|
|
|
+ # MatchPhraseQuery("doctitle","设备")])
|
|
|
+ #
|
|
|
+ # all_should = BoolQuery(should_queries=[must_q1,must_q2,must_q3,must_q4,must_q5,must_q6,must_q7,must_q8,must_q9,must_q10])
|
|
|
+ # must_not_q = BoolQuery(should_queries=[MatchPhraseQuery("doctitle","社区"),
|
|
|
+ # MatchPhraseQuery("doctitle","医院"),
|
|
|
+ # MatchPhraseQuery("doctitle","工业园"),
|
|
|
+ # MatchPhraseQuery("doctitle","营养"),
|
|
|
+ # MatchPhraseQuery("doctitle","厨房设备")])
|
|
|
+ # bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2018-01"),
|
|
|
+ # RangeQuery("status",201,300,True,True),
|
|
|
+ # TermQuery("docchannel",101),
|
|
|
+ # WildcardQuery('province', '%s*'%_province),
|
|
|
+ # all_should],
|
|
|
+ # must_not_queries=[must_not_q])
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
|
|
|
+ SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
|
|
|
+ ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
+
|
|
|
+
|
|
|
+ set_line = set()
|
|
|
+ _count = len(rows)
|
|
|
+ getData(df_data,rows,set_line)
|
|
|
+ while next_token:
|
|
|
+ print("%d/%d"%(_count,total_count))
|
|
|
+ rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
|
|
|
+ SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
|
|
|
+ ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ getData(df_data,rows,set_line)
|
|
|
+ _count += len(rows)
|
|
|
+ # if len(df_data[list(df_data.keys())[0]])>=300:
|
|
|
+ # break
|
|
|
+
|
|
|
+ # set_enterprise = set()
|
|
|
+ # for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
|
|
|
+ # set_enterprise.add(_tenderee)
|
|
|
+ # set_enterprise.add(_agency)
|
|
|
+ # set_enterprise.add(_win_tenderer)
|
|
|
+ # if "" in set_enterprise:
|
|
|
+ # set_enterprise.remove("")
|
|
|
+ # if None in set_enterprise:
|
|
|
+ # set_enterprise.remove(None)
|
|
|
+ # dict_enterprise = getDictEnterprise(list(set_enterprise))
|
|
|
+ # if len(set_enterprise)>0:
|
|
|
+ # for _i in range(len(df_data["招标单位"])):
|
|
|
+ # _enterprise_name = df_data["招标单位"][_i]
|
|
|
+ # if df_data["招标联系人电话"][_i]=="":
|
|
|
+ # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
|
|
|
+ # if contacts is not None:
|
|
|
+ # _person,_phone = getOneContact(contacts)
|
|
|
+ # df_data["招标联系人"][_i] = _person
|
|
|
+ # df_data["招标联系人电话"][_i] = _phone
|
|
|
+ #
|
|
|
+ # _enterprise_name = df_data["代理单位"][_i]
|
|
|
+ # if df_data["代理联系人电话"][_i]=="":
|
|
|
+ # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
|
|
|
+ # if contacts is not None:
|
|
|
+ # _person,_phone = getOneContact(contacts)
|
|
|
+ # df_data["代理联系人"][_i] = _person
|
|
|
+ # df_data["代理联系人电话"][_i] = _phone
|
|
|
+ #
|
|
|
+ # _enterprise_name = df_data["中标单位"][_i]
|
|
|
+ # if df_data["中标单位联系电话"][_i]=="":
|
|
|
+ # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
|
|
|
+ # if contacts is not None:
|
|
|
+ # _person,_phone = getOneContact(contacts)
|
|
|
+ # df_data["中标单位联系人"][_i] = _person
|
|
|
+ # df_data["中标单位联系电话"][_i] = _phone
|
|
|
+
|
|
|
+
|
|
|
+ df1 = pd.DataFrame(df_data)
|
|
|
+ df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
|
|
|
+
|
|
|
+def attachCompanyContact():
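+ """Read previously exported spreadsheets and attach each tenderee's province/city/district from the OTS enterprise table, falling back to getLocation when missing."""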
|
|
|
+ files = ["../data/2021-03-17_四川_关键词导出.csv",
|
|
|
+ "../data/2021-03-17_安徽_关键词导出.csv",
|
|
|
+ "../data/2021-03-17_江西_关键词导出.csv",
|
|
|
+ "../data/2021-03-17_湖南_关键词导出.csv"]
|
|
|
+
|
|
|
+ files = ["../data/欧科自然资源5w以上数据.xlsx"]
|
|
|
+
|
|
|
+ def _handle(item,result_queue,pool_ots):
|
|
|
+ ots_client = pool_ots.getConnector()
|
|
|
+
|
|
|
+
|
|
|
+ primary_key = [('name',str(item["招标单位"]))]
|
|
|
+ columns_to_get = ["province","city","district"]
|
|
|
+ consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ # bool_query = BoolQuery(must_queries=[TermQuery("nicknames",item["中标单位"])])
|
|
|
+ #
|
|
|
+ # rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
|
|
|
+ # SearchQuery(bool_query , limit=1, get_total_count=True),
|
|
|
+ # ColumnsToGet(["contacts"],return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ # _docid = int(item["docid"])
|
|
|
+ # partitionkey = _docid%500+1
|
|
|
+ # primary_key = [('partitionkey',partitionkey),("docid",_docid)]
|
|
|
+ # columns_to_get = ["doctitle"]
|
|
|
+ # consumed, return_row, next_token = ots_client.get_row("document",primary_key, columns_to_get, None, 1)
|
|
|
+ if isinstance(item["招标单位"],str) and item["招标单位"]!="":
|
|
|
+ if return_row is not None:
|
|
|
+ _dict = getRow_ots_primary(return_row)
|
|
|
+ # item["doctitle"] = _dict.get("doctitle","")
|
|
|
+ item["招标人省份"] = _dict.get("province","")
|
|
|
+ item["招标人城市"] = _dict.get("city","")
|
|
|
+ item["招标人区域"] = _dict.get("district","")
|
|
|
+
|
|
|
+ province,city,district = getLocation(item["招标单位"])
|
|
|
+ if item["招标人省份"]=="" or item["招标人省份"]=="未知":
|
|
|
+ item["招标人省份"] = province
|
|
|
+ if item["招标人城市"]=="" or item["招标人城市"]=="未知":
|
|
|
+ item["招标人城市"] = city
|
|
|
+ if item["招标人区域"]=="" or item["招标人区域"]=="未知":
|
|
|
+ item["招标人区域"] = district
|
|
|
+
|
|
|
+ else:
|
|
|
+ province,city,district = getLocation(item["招标单位"])
|
|
|
+ item["招标人省份"] = province
|
|
|
+ item["招标人城市"] = city
|
|
|
+ item["招标人区域"] = district
|
|
|
+
|
|
|
+ else:
|
|
|
+ item["招标人省份"] = item["省份"]
|
|
|
+ item["招标人城市"] = item["城市"]
|
|
|
+ item["招标人区域"] = item["区"]
|
|
|
+ # contacts = json.loads(_dict["contacts"])
|
|
|
+ # contacts.sort(key=lambda x:x["level"],reverse=True)
|
|
|
+ # phone = ""
|
|
|
+ # phone_person = ""
|
|
|
+ # mobile = ""
|
|
|
+ # mobile_person = ""
|
|
|
+ # for contact in contacts:
|
|
|
+ # if mobile=="" and contact.get("mobile_no","")!="":
|
|
|
+ # mobile = contact.get("mobile_no","")
|
|
|
+ # mobile_person = contact.get("contact_person","")
|
|
|
+ # if phone=="" and contact.get("phone_no","")!="":
|
|
|
+ # phone = contact.get("phone_no",'')
|
|
|
+ # phone_person = contact.get("contact_person","")
|
|
|
+ # item["招标联系人"] = ""
|
|
|
+ # item["招标联系人电话"] = ""
|
|
|
+ # if mobile!="":
|
|
|
+ # item["招标联系人"] = mobile_person
|
|
|
+ # item["招标联系人电话"] = mobile
|
|
|
+ # else:
|
|
|
+ # item["中标单位联系人"] = phone_person
|
|
|
+ # item["中标单位联系电话"] = phone
|
|
|
+ pool_ots.putConnector(ots_client)
|
|
|
+
|
|
|
+
|
|
|
+ pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
|
|
|
+ for file in files:
|
|
|
+ task_queue = queue.Queue()
|
|
|
+ df = pd.read_excel(file)
|
|
|
+ keys = df.keys()[1:]
|
|
|
+ list_item = []
|
|
|
+ for row in df.itertuples():
|
|
|
+ _dict = {}
|
|
|
+ for _key in keys:
|
|
|
+ if _key in dir(row):
|
|
|
+ _v = row.__getattribute__(_key)
|
|
|
+ else:
|
|
|
+ _v = ''
|
|
|
+ _dict[_key] = _v
|
|
|
+ list_item.append(_dict)
|
|
|
+ for item in list_item:
|
|
|
+ task_queue.put(item)
|
|
|
+
|
|
|
+ mt = MultiThreadHandler(task_queue,_handle,None,30,pool_ots=pool_ots)
|
|
|
+ mt.run()
|
|
|
+ df_data = {}
|
|
|
+ for _k in keys:
|
|
|
+ df_data[_k] = []
|
|
|
+ for item in list_item:
|
|
|
+ for _k in keys:
|
|
|
+ df_data[_k].append(getLegal_str(item.get(_k,"-")))
|
|
|
+ df1 = pd.DataFrame(df_data)
|
|
|
+ df1.to_excel("%s_attach.xlsx"%file,columns=keys)
|
|
|
+
|
|
|
+def dumpWebSourceNo():
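+ """Copy source_encode/source_name pairs from Oracle bxkc.T_WEBSOURCENUM_INFO into the MySQL webSource table."""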
|
|
|
+ conn_oracle = getConnection_oracle()
|
|
|
+ cursor_oracle = conn_oracle.cursor()
|
|
|
+ sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO "
|
|
|
+ cursor_oracle.execute(sql)
|
|
|
+ rows = cursor_oracle.fetchall()
|
|
|
+
|
|
|
+ conn_mysql = getConnection_testmysql()
|
|
|
+ cursor_mysql = conn_mysql.cursor()
|
|
|
+ for row in rows:
|
|
|
+ sql = " insert into webSource(web_source_no,web_source_name) values('%s','%s')"%(row[0],row[1])
|
|
|
+ print(sql)
|
|
|
+ cursor_mysql.execute(sql)
|
|
|
+ conn_mysql.commit()
|
|
|
+
|
|
|
+def exportNzj():
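+ """Export proposed-construction projects (rows that have docids) from the OTS designed_project table to Excel."""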
|
|
|
+ # filename = "../data/重复公告.xlsx"
|
|
|
+ # df = pd.read_excel(filename)
|
|
|
+ ots_client = getConnect_ots()
|
|
|
+ columns = ["contacts","covered_area","follows","docids","page_time","progress","project_description","project_follow","project_code","project_name","project_type"]
|
|
|
+
|
|
|
+ def getData(df_data,rows,set_line):
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
+ for row in list_data:
|
|
|
+ item = {}
|
|
|
+ _dict = row
|
|
|
+ set_dict_item(item,"docids",_dict.get("docids",""))
|
|
|
+ set_dict_item(item,"contacts",_dict.get("contacts",""))
|
|
|
+ set_dict_item(item,"covered_area",_dict.get("covered_area",""))
|
|
|
+ set_dict_item(item,"follows",_dict.get("follows",""))
|
|
|
+ set_dict_item(item,"project_type",_dict.get("project_type",""))
|
|
|
+ # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
|
|
|
+ set_dict_item(item,"page_time",_dict.get("page_time",""))
|
|
|
+ set_dict_item(item,"progress",_dict.get("progress",""))
|
|
|
+
|
|
|
+ set_dict_item(item,"project_description",_dict.get("project_description",""))
|
|
|
+ set_dict_item(item,"project_follow",_dict.get("project_follow",""))
|
|
|
+ set_dict_item(item,"project_code",_dict.get("project_code",""))
|
|
|
+ set_dict_item(item,"project_name",_dict.get("project_name",""))
|
|
|
+
|
|
|
+ for k,v in item.items():
|
|
|
+ if k not in df_data:
|
|
|
+ df_data[k] = []
|
|
|
+ df_data[k].append(v)
|
|
|
+
|
|
|
+ df_data = {}
|
|
|
+
|
|
|
+
|
|
|
+ bool_query = BoolQuery(must_queries=[ExistsQuery("docids")])
|
|
|
+
|
|
|
+
|
|
|
+ rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
|
|
|
+ SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True),
|
|
|
+ ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
+
|
|
|
+
|
|
|
+ set_line = set()
|
|
|
+ _count = len(rows)
|
|
|
+ getData(df_data,rows,set_line)
|
|
|
+ while next_token:
|
|
|
+ print("%d/%d"%(_count,total_count))
|
|
|
+ rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
|
|
|
+ SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
|
|
|
+ ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ getData(df_data,rows,set_line)
|
|
|
+ _count += len(rows)
|
|
|
+
|
|
|
+ df1 = pd.DataFrame(df_data)
|
|
|
+ df1.to_excel("../data/2021-03-31_拟在建数据导出1.xlsx",columns=list_df_columns)
|
|
|
+
|
|
|
+def turn_status():
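+ """Insert the docids of an exported spreadsheet into the MySQL turn_status table with partitionkey = docid%500+1."""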
|
|
|
+ df = pd.read_excel("../data/欧科自然资源5w以上数据.xlsx")
|
|
|
+
|
|
|
+ conn = getConnection_testmysql()
|
|
|
+ cursor = conn.cursor()
|
|
|
+ for docid in df["公告id"]:
|
|
|
+ partitionkey = int(docid)%500+1
|
|
|
+ sql = " insert into turn_status(partitionkey,docid) values(%d,%d)"%(partitionkey,docid)
|
|
|
+ cursor.execute(sql)
|
|
|
+
|
|
|
+ conn.commit()
|
|
|
+
|
|
|
+
|
|
|
+def attachBidding_budget():
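+ """Backfill empty bidding_budget values in the MySQL analysis_r2 table by looking each docid up in the OTS project2 index."""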
|
|
|
+
|
|
|
+ conn_mysql = getConnection_testmysql()
|
|
|
+ cursor = conn_mysql.cursor()
|
|
|
+
|
|
|
+ sql = "select docid from analysis_r2 where bidding_budget=''"
|
|
|
+
|
|
|
+ task_queue = queue.Queue()
|
|
|
+ result_queue = queue.Queue()
|
|
|
+ cursor.execute(sql)
|
|
|
+ rows = cursor.fetchmany(10000)
|
|
|
+ while rows:
|
|
|
+ for row in rows:
|
|
|
+ task_queue.put(row[0])
|
|
|
+ rows = cursor.fetchmany(10000)
|
|
|
+
|
|
|
+ pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_testmysql)
|
|
|
+ pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
|
|
|
+
|
|
|
+ def _handle(item,result_queue,pool_mysql,pool_ots):
|
|
|
+ ots_client = pool_ots.getConnector()
|
|
|
+
|
|
|
+ bool_query = BoolQuery(must_queries=[TermQuery("docids",item)])
|
|
|
+ rows, next_token, total_count, is_all_succeed = ots_client.search("project2", "project2_index",
|
|
|
+ SearchQuery(bool_query , limit=1, get_total_count=True),
|
|
|
+ ColumnsToGet(["bidding_budget"],return_type=ColumnReturnType.SPECIFIED))
|
|
|
+
|
|
|
+ list_dict = getRow_ots(rows)
|
|
|
+ if len(list_dict)>0:
|
|
|
+ conn = pool_mysql.getConnector()
|
|
|
+ cursor = conn.cursor()
|
|
|
+
|
|
|
+ sql = " update analysis_r2 set bidding_budget='%s' where docid=%d"%(str(list_dict[0].get("bidding_budget","")),item)
|
|
|
+ cursor.execute(sql)
|
|
|
+ conn.commit()
|
|
|
+
|
|
|
+ pool_mysql.putConnector(conn)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ pool_ots.putConnector(ots_client)
|
|
|
+
|
|
|
+ mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mysql=pool_mysql,pool_ots=pool_ots)
|
|
|
+ mt.run()
|
|
|
+
|
|
|
+
|
|
|
+def debug_documentMerge():
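+ """Verify that every merged docid group logged in project_group_final_log still matches a row in the OTS project2 index, printing the groups that do not."""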
|
|
|
+ conn = getConnection_testmysql()
|
|
|
+
|
|
|
+ cursor = conn.cursor()
|
|
|
+
|
|
|
+ sql = "select merge_docids from project_group_final_log "
|
|
|
+ cursor.execute(sql)
|
|
|
+
|
|
|
+ task_queue = queue.Queue()
|
|
|
+ for row in cursor.fetchall():
|
|
|
+ task_queue.put(row[0])
|
|
|
+
|
|
|
+ print(task_queue.qsize())
|
|
|
+ def _handle(item,result_queue,pool_ots):
|
|
|
+ ots_client = pool_ots.getConnector()
|
|
|
+ list_docids = item.split(",")
|
|
|
+
|
|
|
+ must_q = []
|
|
|
+ for _docid in list_docids:
|
|
|
+ must_q.append(TermQuery("docids",_docid))
|
|
|
+ bool_query = BoolQuery(must_queries=must_q)
|
|
|
+
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
|
|
|
+ SearchQuery(bool_query,limit=1,get_total_count=True),
|
|
|
+ ColumnsToGet(column_names=["docids"],return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ if total_count==0:
|
|
|
+ print(item)
|
|
|
+ result_queue.put(item)
|
|
|
+ pool_ots.putConnector(ots_client)
|
|
|
+ result_queue = queue.Queue()
|
|
|
+ pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
|
|
|
+ mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots)
|
|
|
+ mt.run()
|
|
|
+ while(True):
|
|
|
+ try:
|
|
|
+ item = result_queue.get(False)
|
|
|
+ print(item)
|
|
|
+ except queue.Empty:
+     break
|
|
|
+
|
|
|
+def signDocument():
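+ """Tag each segword row with the first failure keyword found (流标/废标/中止/终止/撤销/采购失败) and write the tags to a _sign.xlsx next to the input file."""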
|
|
|
+ filename = "C:\\Users\\Administrator\\Desktop\\中标信息1.xlsx"
|
|
|
+ sign_filename = "%s_sign.xlsx"%filename
|
|
|
+ df = pd.read_excel(filename)
|
|
|
+ df_data = {"sign":[]}
|
|
|
+
|
|
|
+ for item in df["segword"]:
|
|
|
+ content = re.sub(r"\s","",item)
|
|
|
+ _find = re.search("(?P<key>流标|废标|中止|终止|撤销|采购失败)",content)
|
|
|
+ if _find is not None:
|
|
|
+ df_data["sign"].append(_find.groupdict().get("key"))
|
|
|
+ else:
|
|
|
+ df_data["sign"].append("无")
|
|
|
+ df1 = pd.DataFrame(df_data)
|
|
|
+ df1.to_excel(sign_filename)
|
|
|
+
|
|
|
+if __name__=="__main__":
|
|
|
+ # exportDocument_By_time(time_from="2021-01-29",time_to="2021-01-29",columns=["docid","doctitle","project_name","dochtmlcon"])
|
|
|
+ # processDocument()
|
|
|
+ # export_extract_check()
|
|
|
+ # exportArticle_by_websource()
|
|
|
+ # export_keyword_count()
|
|
|
+ # export_province_keyword_count()
|
|
|
+ # exportDocument_dump()
|
|
|
+ # exportDocument_dump_mysql()
|
|
|
+ # export_attachment()
|
|
|
+ # exportDocument_by_doctitle()
|
|
|
+ # exportIndustryCount()
|
|
|
+ # exportDocument_by_pagetime()
|
|
|
+ # attachCompanyContact()
|
|
|
+ # dumpWebSourceNo()
|
|
|
+ # print("http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%138306357))))
|
|
|
+ # exportNzj()
|
|
|
+ # turn_status()
|
|
|
+ # attachBidding_budget()
|
|
|
+ # debug_documentMerge()
|
|
|
+ exportDocument_medicine("2021-05-02","2021-05-08")
|
|
|
+ # signDocument()
|
|
|
+
|
|
|
+
|