#encoding:GBK import sys import os sys.path.append("../") import pandas as pd from dataSource.source import * import json from utils.multiThread import MultiThreadHandler import queue from utils.Utils import * from dataSource.pool import ConnectorPool import re from tablestore import * import traceback from utils.hashUtil import aesCipher from export.exportEnterprice import getDictEnterprise,getOneContact data_path = "../data/" def getCompanyTenderer(): def _handle(item,result_queue): company = item dict_result = {"company":company,"count":0,"competitor":"","project_name":""} dict_result["company"] = company graph = getConnect_neo4j() cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhao_biao_id,p.zhong_biao_id"%(company) finded = graph.run(cql) finded_ids = json.loads(json.dumps(finded.data())) dict_result["count"] = len(finded_ids) mongoDB = getConnect_mongodb() coll_zb = mongoDB.zhongbiao_extraction if len(finded_ids)>0: cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.project_name limit 3"%(company) finded = graph.run(cql) finded_names = json.loads(json.dumps(finded.data())) list_names = [_i["p.project_name"] for _i in finded_names] dict_result["project_name"] = str(list_names) cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company) finded = graph.run(cql) finded_money = json.loads(json.dumps(finded.data())) whole_money = 0 for item in finded_money: if item["r.price"] is not None: whole_money += getUnifyMoney(item["r.price"]) dict_result["whole_money"] = str(whole_money) dict_competitor = {} for item in finded_ids: docId = item["p.zhong_biao_id"] if docId is not None: rows = coll_zb.find({"docId":docId}) for row in rows: keys = ["second_tenderer","third_tenderer"] for _key in keys: if _key in row: if row[_key] not in dict_competitor: dict_competitor[row[_key]] = 0 dict_competitor[row[_key]] += 1 list_competitor = [] for _key in dict_competitor: list_competitor.append([_key,dict_competitor[_key]]) list_competitor.sort(key=lambda x:x[1],reverse=True) list_competitors = [i[0] for i in list_competitor[:10]] dict_result["competitor"] = str(list_competitors) result_queue.put(dict_result) # filename = "成交客户匹配中标项目的需求.xlsx" # df = pd.read_excel(filename) # list_company = df["公司名字"] # company = list_company[0] list_company = [] filename = "../data/服务型客户.txt" with open(filename,"r",encoding="GBK") as f: while(True): line = f.readline() if not line: break list_company.append(line.strip()) task_queue = queue.Queue() for company in list_company: task_queue.put(company) result_queue = queue.Queue() handler = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=10) handler.run() list_company = [] list_zb = [] list_count = [] list_project = [] list_money = [] list_competitor = [] while(True): try: _result = result_queue.get(False) list_company.append(_result.get("company","")) list_zb.append("是" if _result.get("count","")>0 else "否") list_count.append(_result.get("count","")) list_project.append(_result.get("project_name","")) list_money.append(_result.get("whole_money","")) list_competitor.append(_result.get("competitor","")) except Exception as e: print(e) break df1 = pd.DataFrame({"公司名字":list_company,"是否中标":list_zb,"中标次数":list_count,"中标项目":list_project,"中标金额":list_money,"潜在竞争对手":list_competitor}) df1.to_excel("%s_export.xls"%(filename),columns=["公司名字","是否中标","中标次数","中标项目","中标金额","潜在竞争对手"]) def export_count_includeKeyword(): filename = "../data/jc001.xlsx" list_name = [] list_count = [] df = pd.read_excel(filename) _index = 0 for row in df["品目"]: _name = row data = solrQuery("document",{"q":'dochtmlcon:"%s"'%_name,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1}) if data is not None: _count = data["response"]["numFound"] else: _count = 0 list_name.append(_name) list_count.append(_count) _index += 1 print(_index) df1 = pd.DataFrame({"品目":list_name,"数量":list_count}) df1.to_excel("%s_export.xls"%filename) def export_count_includeKeyword_multiThread(): def _handler(item,result_queue): data = solrQuery("document",{"q":'dochtmlcon:"%s"'%item,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1}) if data is not None: _count = data["response"]["numFound"] else: _count = 0 result_queue.put([item,_count]) task_queue = queue.Queue() result_queue = queue.Queue() filename = "../data/jc001.xlsx" list_name = [] list_count = [] df = pd.read_excel(filename) _index = 0 for row in df["品目"]: _name = row task_queue.put(_name) _index += 1 multHandler = MultiThreadHandler(task_queue,_handler,result_queue,thread_count=20) multHandler.run() while(True): try: item = result_queue.get(False) list_name.append(item[0]) list_count.append(item[1]) except queue.Empty as e: break df1 = pd.DataFrame({"品目":list_name,"数量":list_count}) df1.to_excel("%s_export.xls"%filename) def exportKeywords(): def _handle(item,result_queue,pool_mongo): docId = item["docId"] mongo = pool_mongo.getConnector() zhongbiao = mongo.zhongbiao_extraction zhaobiao = mongo.zhaobiao_extraction _project = "" rows = zhaobiao.find({"docId":docId},{"project_name":1}) find_flag = False for row in rows: find_flag = True _project = row.get("project_name","") if not find_flag: rows = zhongbiao.find({"docId":docId},{"project_name":1}) for row in rows: _project = row.get("project_name","") item["project_name"] = _project pool_mongo.putConnector(mongo) result_queue.put(item) list_key = [] dict_key_ids = dict() with open("../data/品目.txt", "r", encoding="utf8") as f: while(True): row = f.readline() if not row: break list_key.append(row) dict_key_ids[row] = [] data = solrQuery("document",{"q":'dochtmlcon:"%s" AND dochtmlcon:"法院"'%row,"fq":'(publishtime:[2019-01-01T00:00:00Z TO 2019-12-31T23:59:59Z])',"fl":"id","rows":10000000}) for item in data["response"]["docs"]: dict_key_ids[row].append(item["id"]) task_queue = queue.Queue() result_queue = queue.Queue() for _key in dict_key_ids.keys(): for item in dict_key_ids[_key]: task_queue.put({"docId":item,"project_name":""}) pool_mongo = ConnectorPool(init_num=10,max_num=200,method_init=getConnect_mongodb) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=200,pool_mongo=pool_mongo) mt.run() dict_docId_projectname = {} while(True): try: item = result_queue.get(False) dict_docId_projectname[item["docId"]] = item["project_name"] except Exception: break dict_key_count = dict() for _key in dict_key_ids.keys(): set_key = set() for docId in dict_key_ids[_key]: set_key.add(dict_docId_projectname.get(docId,"")) dict_key_count[_key] = len(set_key) print("==") for _key in list_key: print(len(dict_key_ids[_key])) print("==") for _key in list_key: print(dict_key_count[_key]) print("==") def getIndustryCompany(): def _handle(item,result_queue,pool_mongo,pool_neo4j,pool_mysql,pool_ots): # mongoDB = getConnect_mongodb() log(item["enterprise_name"]) mongoDB = pool_mongo.getConnector() # coll_zb = mongoDB.enterprise_profile # rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1,"legalPersonName":1,"actualCapital":1, "regCapital":1,"estiblishTime":1,"socialStaffNum":1,"legal_person":1,"phone":1,"businessScope":1,"industry":1 }) # for row in rows: # item["regCapital"] = row.get("regCapital","") # item["legal_person"] = row.get("legal_person","") # item["phone"] = row.get("phone","") # item["actualCapital"] = row.get("actualCapital","") # item["industry"] = row.get("industry","") # item["estiblishTime"] = row.get("estiblishTime","") # item["socialStaffNum"] = row.get("socialStaffNum","") # item["businessScope"] = row.get("businessScope","") # graph = getConnect_neo4j() ots_client = pool_ots.getConnector() primary_key = [('name',item["enterprise_name"])] columns_to_get = ["reg_capital","legal_person","phone","actual_capital","industry","estiblishTime","social_staff_num","business_scope"] consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1) if return_row is not None: for att in return_row.attribute_columns: item[att[0]] = att[1] list_same_industry_company = [] if "industry" in item: bool_query = BoolQuery(must_queries=[TermQuery("industry",item["industry"])]) col = ColumnsToGet(['enterprise_name'], ColumnReturnType.SPECIFIED) rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index", SearchQuery(bool_query, limit=10, get_total_count=True), col) for row in rows: for item1 in row[0]: list_same_industry_company.append(item1[1]) # if "industry" in item: # rows = coll_zb.find({"industry":item["industry"]},{"enterprise_name":1}).limit(10) # for row in rows: # print(row) # list_same_industry_company.append(row.get("enterprise_name","")) item["same_industry_company"] = list_same_industry_company graph = pool_neo4j.getConnector() company_name = item["enterprise_name"] cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name) finded = graph.run(cql) data = json.loads(json.dumps(finded.data())) _count = data[0]["_c"] # list_project = [] # for _data in data: # if _count<=3: # if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01": # if _data["project_name"] is not None: # list_project.append(_data["project_name"]) # _count += 1 item["count"] = _count # item["project"] = str(list_project) result_queue.put(item) pool_mongo.putConnector(mongoDB) pool_neo4j.putConnector(graph) pool_ots.putConnector(ots_client) log_tofile("export.log") pool_mongo = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_mongodb) pool_neo4j = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_neo4j) pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_mysql) pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) # list_company = getCompanys() # filename = "".join(["环境","生态","再生","回收","环保"]) list_company = [] filename = "../data/同行客户匹配.xlsx" df = pd.read_excel(filename,sheetname=0) for _com in df["公司名称"]: print(_com) if _com is not None and _com.strip()!="": _company = {"enterprise_name":""} _company["enterprise_name"] = _com list_company.append(_company) task_queue = queue.Queue() for item in list_company: task_queue.put(item) result_queue = queue.Queue() _muti = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j,pool_mysql=pool_mysql,pool_ots=pool_ots) _muti.run() df_company = {} set_key = set() if len(list_company)>0: for item in list_company: for _key in item.keys(): set_key.add(_key) if _key not in df_company: df_company[_key] = [] list_key = list(set_key) for item in list_company: for _key in list_key: df_company[_key].append(item.get(_key,"")) df1 = pd.DataFrame(df_company) df1.to_excel("%s_export.xlsx"%(filename)) def exportWin_tenderer(time_from,time_to): ''' :return: ''' ost_client = getConnect_ots() last_docid = 0 bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True), TermQuery("docchannel",101), RangeQuery('status', '201', '300', include_lower=True, include_upper=True), RangeQuery('docid', last_docid, include_lower=False)]) rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True), ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED)) list_project = [] def _getRow(list_project,rows,last_docid): for row in rows: project_name = row[1][0][1] docid = row[0][1][1] last_docid = docid list_pack = json.loads(row[1][1][1]) _set_tenderer = set() win_tenderer = "" for _pack in list_pack: if "win_tenderer" in _pack and win_tenderer=="": win_tenderer = _pack["win_tenderer"] if "second_tenderer" in _pack: _set_tenderer.add(_pack["second_tenderer"]) if "third_tenderer" in _pack: _set_tenderer.add(_pack["third_tenderer"]) list_project.append({"docid":docid,"project_name":project_name,"win_tenderer":win_tenderer,"tenderer":list(_set_tenderer)}) return last_docid _getRow(list_project,rows,last_docid) while(next_token): print("%d/%d"%(len(list_project),total_count)) rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index", SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED)) last_docid = _getRow(list_project,rows,last_docid) task_queue = queue.Queue() result_queue = queue.Queue() for item in list_project: task_queue.put(item) pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) def _handle(item,result_queue,pool_ots): if item["win_tenderer"]!="": ots_client = pool_ots.getConnector() consumed, return_row, next_token = ost_client.get_row("enterprise", [("name",item["win_tenderer"])], ["province","reg_capital","estiblish_time","business_scope"], None, 1) _dict = dict() for _item in return_row.attribute_columns: _dict[_item[0]] = _item[1] for _key in _dict.keys(): item[_key] = _dict[_key] data = solrQuery("contact",{"q":'company_name:"%s"'%item["win_tenderer"],"fl":"contact_person,mobile_no,phone_no","rows":10}) for _item in data["response"]["docs"]: for _key in _item.keys(): item[_key] = _item[_key] break pool_ots.putConnector(ots_client) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots); mt.run() keys = ["docid","project_name","win_tenderer","tenderer","province","reg_capital","business_scope","estiblish_time","contact_person","mobile_no","phone_no"] df_data = {} for _key in keys: df_data[_key] = [] for item in list_project: for _key in keys: if _key in item: df_data[_key].append(item[_key]) else: df_data[_key].append("") df = pd.DataFrame(df_data) df.to_excel("../data/%s-%s中标信息.xlsx"%(time_from,time_to),columns=keys) def exportContact(): time_from = "2021-01-14" time_to = "2021-01-15" filename = "../data/%s-%s中标信息.xlsx"%(time_from,time_to) df1 = pd.read_excel(filename) set_company = set() for item in df1["tenderer"]: list_company = re.split("\['|', '|'\]|\[\]",item) for _company in list_company: if _company!="": set_company.add(_company) companys = list(set_company) task_queue = queue.Queue() list_company = [] for _company in companys: item = {"company_name":_company} list_company.append(item) task_queue.put(item) result_queue = queue.Queue() def _handle(item,result_queue): company = item["company_name"] data = solrQuery("contact",{"q":'company_name:"%s"'%company,"fl":"company_name,contact_person,mobile_no,phone_no","rows":10}) for _item in data["response"]["docs"]: for _key in _item.keys(): item[_key] = _item[_key] break mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30); mt.run() keys = ["company_name","contact_person","mobile_no","phone_no"] df_data = {} for _key in keys: df_data[_key] = [] ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]') for item in list_company: for _key in keys: if _key in item: df_data[_key].append(ILLEGAL_CHARACTERS_RE.sub(r'', item[_key])) else: df_data[_key].append("") df = pd.DataFrame(df_data) df.to_excel("../data/%s-%s竞争对手信息.xlsx"%(time_from,time_to),columns=keys) def countKeyword(): conn = getConnection_mysql() cursor = conn.cursor() print(0) sql = "select dochtmlcon from sys_document_22 where docchannel=52 and page_time>='2020-09-01' and page_time<='2020-09-07'" cursor.execute(sql) print(0.1) df = pd.read_excel("万郡绿建细分关键词.xls") list_keywords = df["细分类别"] dict_keywords = dict() for _key in list_keywords: dict_keywords[_key] = 0 print(1) from bs4 import BeautifulSoup while(True): rows = cursor.fetchmany(10000) print("==") if not rows: break for row in rows: _html = BeautifulSoup(row[0],"lxml").getText() for _key in list_keywords: if re.search(_key,_html) is not None: dict_keywords[_key] += 1 print(dict_keywords) list_count = [] for _key in list_keywords: list_count.append(dict_keywords[_key]) df1 = pd.DataFrame({"关键字":list_keywords,"数量":list_count}) df1.to_excel("关键词统计.xlsx") def countKeyword_solr(): def _handle(item,result_queue): keyword = item["keyword"] data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:101 AND dochtmlcon:"法院" '%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10}) _num = data["response"]["numFound"] item["zhongbiao"] = _num data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:52 AND dochtmlcon:"法院"'%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10}) _num = data["response"]["numFound"] item["zhaobiao"] = _num result_queue.put(item) file = "../data/关键词11.xlsx" df = pd.read_excel(file) task_queue = queue.Queue() print(df.keys()) for item in df["业务关键词"]: task_queue.put({"keyword":item,"zhaobiao":0,"zhongbiao":0}) result_queue = queue.Queue() mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=10) mt.run() list_keyword = [] list_zhaobiao = [] list_zhongbiao = [] while(True): try: item = result_queue.get(False) list_keyword.append(item["keyword"]) list_zhaobiao.append(item["zhaobiao"]) list_zhongbiao.append(item["zhongbiao"]) except Exception: break df1 = pd.DataFrame({"业务关键词":list_keyword,"招标公告":list_zhaobiao,"中标信息":list_zhongbiao}) df1.to_excel("%s_export.xlsx"%file,columns=["业务关键词","招标公告","中标信息"]) def query_from_solr(): data = solrQuery("document",{"q":'dochtmlcon:"法律" AND (docchannel:51 OR docchannel:104 or docchannel:52 or docchannel:102) AND province:"湖南" ',"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-01-20T23:59:59Z])',"fl":"id","rows":10}) _num = data["response"]["numFound"] print(_num) def export_province_keyword_count(): def _handle(item,result_queue,pool_ots): columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"] ots_client = pool_ots.getConnector() _province = item["province"] print(item) # keywords = item["keyword"]+" "+item["key"] list_keyword = item["keyword"] # for _temp in keywords.split(" "): # if len(_temp)>0: # list_keyword.append(_temp) should_queries = [] must_not_q = [] for _temp in list_keyword: should_queries.append(MatchPhraseQuery("doctitle","%s"%_temp)) must_not_q.append(WildcardQuery("tenderee","*%s*"%_temp)) bool_query_keyword = BoolQuery(should_queries=should_queries,minimum_should_match=2) page_time = item["page_time"] bool_query = BoolQuery(must_queries=[bool_query_keyword # ,WildcardQuery("publishtime","%s*"%page_time) # ,MatchPhraseQuery("doctitle","服务") ,RangeQuery("page_time","2021-04-22","2021-04-27",include_lower=True,include_upper=True), TermQuery("docchannel",101), RangeQuery('status', '201', '300', include_lower=True, include_upper=True), WildcardQuery('province', '%s*'%_province) ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_tenderer",0,include_lower=True)) ], must_not_queries=must_not_q) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("sub_docs_json.bidding_budget",SortOrder.DESC)]), limit=20, get_total_count=True), ColumnsToGet(column_names=columns,return_type=ColumnReturnType.SPECIFIED)) item["count"] = total_count list_data = getRow_ots(rows) item["list_data"] = list_data print(item) pool_ots.putConnector(ots_client) df = pd.read_excel("../data/省份信息.xlsx") list_province = [] for _name,_type in zip(df["cname"],df["ctype"]): if _type==20: list_province.append(_name) # filename = "../data/2021-02关键词导出数据.xlsx" # dict_keyword = {} # df1 = pd.read_excel(filename,dtype=str) # for _key,_keyword in zip(df1["key1"],df1["keyword"]): # print("===",str(_keyword)) # dict_keyword[_key] = "" if str(_keyword)=="nan" else _keyword # for _key in df1["关键词"]: # dict_keyword[_key] = "" keyword_str = ''' 快递 物流 供应链 运输 配送 仓储 冷链 整车 服务 ''' list_key = [] for _k in re.split("\s",keyword_str): _k1 = _k.strip() if len(_k1)>0: list_key.append(_k1) list_task = [] page_time = "2020-11" for _province in list_province: list_task.append({"page_time":page_time,"province":_province,"key":list_key,"keyword":list_key,"count":0}) task_queue = queue.Queue() for item in list_task: task_queue.put(item) result_queue = queue.Queue() pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots) mt.run() dict_key_data = dict() list_data = [] for item in list_task: list_data.extend(item["list_data"]) dict_channel = getDict_docchannel() df_data= {} print(list_data) for row in list_data: item = {} _dict = row set_dict_item(item,"docid",_dict.get("docid","")) set_dict_item(item,"公告标题",_dict.get("doctitle","")) set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),"")) set_dict_item(item,"省份",_dict.get("province","")) # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district","")) set_dict_item(item,"城市",_dict.get("city","")) set_dict_item(item,"发布时间",_dict.get("page_time","")) set_dict_item(item,"项目编号",_dict.get("project_code","")) set_dict_item(item,"招标单位",_dict.get("tenderee","")) set_dict_item(item,"招标联系人",_dict.get("tenderee_contact","")) set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone","")) set_dict_item(item,"代理单位",_dict.get("agency","")) set_dict_item(item,"代理联系人",_dict.get("agency_contact","")) set_dict_item(item,"代理联系人电话",_dict.get("agency_phone","")) set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid"))))) sub_docs_json = _dict.get("sub_docs_json") for _doc in json.loads(sub_docs_json): if "win_tenderer" in _doc: set_dict_item(item,"中标单位",_doc["win_tenderer"]) if "win_tenderee_manager" in _doc: set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"]) if "win_tenderee_phone" in _doc: set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"]) if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0: set_dict_item(item,"中标金额",_doc["win_bid_price"]) if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0: set_dict_item(item,"招标金额",_doc["bidding_budget"]) if "招标金额" not in item: set_dict_item(item,"招标金额","") if "中标金额" not in item: set_dict_item(item,"中标金额","") if "中标单位" not in item: set_dict_item(item,"中标单位","") if "中标单位联系人" not in item: set_dict_item(item,"中标单位联系人","") if "中标单位联系电话" not in item: set_dict_item(item,"中标单位联系电话","") _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"])) # if _line in set_line: # continue # if item["招标金额"]=="": # continue # set_line.add(_line) for k,v in item.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) df1 = pd.DataFrame(df_data) df1.to_excel("../data/%s_顺丰中标数据.xlsx"%getCurrent_date('%Y-%m-%d_%H%M%S'),columns=list_df_columns) # for item in list_task: # dict_key_data[item["key"]][item["province"]] = item # dict_key_province = dict() # dict_key_province["关键词"] = [] # for _province in list_province: # dict_key_province[_province] = [] # for _key in dict_keyword.keys(): # dict_key_province["关键词"].append(_key) # for _province in list_province: # dict_key_province[_province].append(dict_key_data[_key][_province]["count"]) # columns = ["关键词"] # columns.extend(list_province) # df2 = pd.DataFrame(dict_key_province) # df2.to_excel("../data/%s_导出数据.xlsx"%filename,columns=columns) def export_keyword_count(): def _handle(item,result_queue,pool_ots): ots_client = pool_ots.getConnector() bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",item["range_from"],item["range_to"]), RangeQuery('status', '201', '300', include_lower=True, include_upper=True), MatchPhraseQuery(item["type"], item["keyword"]) ]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query, limit=1, get_total_count=True), ColumnsToGet(return_type=ColumnReturnType.ALL)) item["count"] = total_count pool_ots.putConnector(ots_client) range_from = "2019-01-01" range_to = "2022-12-23" _type = "doctextcon" assert _type in ["doctitle","doctextcon"] list_dict_key_count = [] filename = "../data/医院.xlsx" df = pd.read_excel(filename) for item in df["关键词"]: list_dict_key_count.append({"keyword":item,"count":0,"range_from":range_from,"range_to":range_to,"type":_type}) task_queue = queue.Queue() for item in list_dict_key_count: task_queue.put(item) result_queue = queue.Queue() pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots) mt.run() columns = ["keyword","count","range_from","range_to","type"] df_data = {} for _c in columns: df_data[_c] = [] for item in list_dict_key_count: for _c in columns: if _c in item: df_data[_c].append(item[_c]) else: df_data[_c].append("") df2 = pd.DataFrame(df_data) df2.to_excel("%s_数量导出.xlsx"%filename,columns=columns) def export_keyword_title(): ots_client = getConnect_ots() range_from = "2020-01-01" range_to = "2022-12-23" list_condition = [["医务室"], ["医院"], ["卫生院"], ["卫生所"], ["卫生室"], ["社区卫生服务中心"]] list_should_query = [] for _c in list_condition: if len(_c)==1: list_should_query.append(MatchPhraseQuery("doctitle",_c[0])) else: _must_query = [] for _q in _c: _must_query.append(MatchPhraseQuery("doctitle",_q)) list_should_query.append(BoolQuery(must_queries=_must_query)) keyword_query = BoolQuery(should_queries=list_should_query) bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",range_from,range_to), RangeQuery('status', '201', '300', include_lower=True, include_upper=True), keyword_query ]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True), ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED)) df_data = {"docid":[],"doctitle":[],"tenderee":[]} def setData(df_data,rows): list_dict = getRow_ots(rows) for _dict in list_dict: docid = _dict.get("docid","") doctitle = _dict.get("doctitle","") tenderee = _dict.get("tenderee","") df_data["docid"].append(docid) df_data["doctitle"].append(doctitle) df_data["tenderee"].append(tenderee) setData(df_data,rows) _count = len(rows) while next_token: rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED)) setData(df_data,rows) _count += 100 print(_count,total_count) file_begin = 0 file_length = 100000 _count = 0 while file_begin100000: continue df_2000["document_id"].append(_id) df_2000["document_text"].append(_text) df_2 = pd.DataFrame(df_2000) df_2.to_csv("../data/websouce_doc_%d-%d.csv"%(begin,end),columns=["document_id","document_text"],encoding="utf8",index=False) # save(dict_websource,"../data/dict_websource.pk") def getWinTenderer(sub_doc_json): if sub_doc_json is not None: sub_doc = json.loads(sub_doc_json) for _doc in sub_doc: if "win_tenderer" in _doc: return _doc["win_tenderer"] return "" def exportDocument_by_keywords(page_time, list_keyword = ["创客","STEAM","人工智能","课程服务","机器人中学","机器人小学","机器人幼儿园","机器人学校","Labplus","盛思","makeblock柴火","寓乐湾","美科科技","STEAM","能力风暴","优必选","蘑菇云","Dfrobot","中鸣","飞瑞敖","编程猫培生","八爪鱼","八爪鱼教育","童心制物"]): task_queue = queue.Queue() result_queue = queue.Queue() for _k in list_keyword: task_queue.put(_k) def _handle(keyword,result_queue): should_queries = [] for _temp in [keyword]: should_queries.append(MatchPhraseQuery("doctitle",_temp)) bool_query_keyword = BoolQuery(should_queries=should_queries) ots_client = getConnect_ots() bool_query = BoolQuery(must_queries=[RangeQuery('publishtime', range_from='2017-12-20'), MatchPhraseQuery("doctitle",keyword), TermQuery("docchannel","101") ]) is_all_succeed = False _count = 0 total_count = 1 next_token = None rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True), ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED)) for row in rows: _dict = dict() for values in row: for _v in values: _dict[_v[0]] = _v[1] result_queue.put({"docid":_dict.get("docid",""),"keyword":keyword,"tenderee":_dict.get("tenderee",""),"win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))}) print(keyword,next_token,total_count) while(next_token): try: # print(next_token) _count += len(rows) print("%s:%d/%d"%(keyword,_count,total_count)) for row in rows: _dict = dict() for values in row: for _v in values: _dict[_v[0]] = _v[1] result_queue.put({"docid":_dict.get("docid",""),"keyword":keyword,"tenderee":_dict.get("tenderee",""),"win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))}) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED)) except Exception as e: traceback.print_exc() mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30) mt.run() df_data = {"docid":[],"keyword":[],"tenderee":[],"win_tenderer":[]} while(True): try: item = result_queue.get(block=True,timeout=1) for _k in df_data.keys(): if _k in item: df_data[_k].append(item[_k]) else: df_data[_k].append("") except queue.Empty as e: break except Exception as e: traceback.print_exc() df = pd.DataFrame(df_data) df.to_csv("../data/exportArticle1_title.csv",columns=["docid","keyword","tenderee","win_tenderer"]) def exportGovement(): should_queries1 = [] for _temp in ["教育局","地化所","税务局","国土局","学校","大学","中学","小学","幼儿园","医院"]: should_queries1.append(WildcardQuery("tenderee","*%s*"%_temp)) should_queries2 = [] for _temp in ["浙江","江苏","湖北","西北","陕西","甘肃","青海","宁夏","新疆","重庆","四川","云南","贵州"]: should_queries2.append(WildcardQuery("province","*%s*"%_temp)) ots_client = getConnect_ots() page_time = "2020-12" bool_query = BoolQuery(must_queries=[BoolQuery(should_queries=should_queries1), BoolQuery(should_queries=should_queries2), TermQuery("docchannel","52"), RangeQuery("publishtime",page_time)]) columns = ["tenderee","tenderee_contact","tenderee_phone"] rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query, limit=100, sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) print(total_count) def getRow(rows,df_data,columns): for row in rows: _dict = dict() for part in row: for item in part: _dict[item[0]] = item[1] if "tenderee_contact" in _dict and "tenderee_phone" in _dict: for key in columns: df_data[key].append(_dict.get(key,"")) all_rows = 0 df_data = {} for key in columns: df_data[key] = [] getRow(rows,df_data,columns) _count = 100 while(next_token): print(_count,total_count) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,next_token=next_token, limit=100,get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) _count += 100 getRow(rows,df_data,columns) df2 = pd.DataFrame(df_data) df2.to_excel("../data/%s政府招标人导出数据.xlsx"%page_time,columns=columns) def export_attachment(): filename = "../data/销售部-字段名.xlsx" df = pd.read_excel(filename) list_dict = [] for _key in df["关键词"]: if len(_key)>0: list_dict.append({"keyword":_key}) def _handle(_dict,result_queue,pool_ots): _keyword = _dict["keyword"] ots_client = pool_ots.getConnector() keyword_query = BoolQuery(should_queries=[MatchPhraseQuery("doctextcon",_keyword), MatchPhraseQuery("doctitle",_keyword)]) bool_query = BoolQuery(must_queries=[RangeQuery("status","201","300",include_upper=True,include_lower=True), keyword_query]) columns = ["dochtmlcon","docid"] rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]), limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) print(_keyword,total_count) hit_count = 0 def getData(rows,hit_count): list_fields = getRow_ots(rows) for _fields in list_fields: dochtmlcon = _fields["dochtmlcon"] docid = _fields["docid"] _flag = False for url,suffix in re.findall("(http://[^\'\";;\n]+?\.(zip|rar|doc|xls|xlsx|pdf|txt))",dochtmlcon): try: result = requests.get(url,stream=True,timeout=100) if result.status_code==200: filename = get_file_name(url,result.headers) with open("../data/attachment/%s_%d_%s"%(_keyword,docid,filename),"wb") as f: f.write(result.content) _flag = True except Exception: pass if _flag: hit_count += 1 return hit_count hit_count = getData(rows,hit_count) _count = len(rows) while next_token: rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) _count += len(rows) if _count>20000: break hit_count = getData(rows,hit_count) if hit_count>20: break pool_ots.putConnector(ots_client) task_queue = queue.Queue() result_queue = queue.Queue() for item in list_dict: task_queue.put(item) pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots) mt.run() def exportIndustryCount(): import codecs time_from = "2020-12-21" time_to = "2020-12-25" # dict_channel = {"51":{"type":"公告变更"}, # "52":{"type":"招标公告"}, # "101":{"type":"中标信息"}, # "102":{"type":"招标预告"}, # "103":{"type":"招标答疑"}, # "104":{"type":"招标文件"}, # "105":{"type":"资审结果"}, # "103":{"type":"招标控制价"}, # "100":{"type":"未知类型"}} dict_industry = {} meta_industry = load("../data/class2dalei_menlei.pkl") for _key in meta_industry.keys(): dict_industry[_key] = {"type":_key} print(dict_industry.keys()) return task_queue = queue.Queue() result_queue = queue.Queue() for _key in dict_industry.keys(): task_queue.put(dict_industry[_key]) def _handle(item,result_queue,pool_ots): ots_client = pool_ots.getConnector() bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"]), RangeQuery("publishtime",time_from,time_to,include_lower=True,include_upper=True)]) columns = ["docid"] rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query, limit=1,get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) item["count"] = total_count columns = ["dochtmlcon"] bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"])]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query, limit=10,sort=Sort(sorters=[FieldSort("publishtime",SortOrder.ASC)]),get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) for row in rows: _dict = dict() for part in row: for v in part: _dict[v[0]] = v[1] with codecs.open("../data/industry/%s_%d.html"%(item["type"],_dict["docid"]),"w",encoding="UTF8") as f: f.write(_dict["dochtmlcon"]) pool_ots.putConnector(ots_client) pool_ots = ConnectorPool(init_num=20,max_num=30,method_init=getConnect_ots) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots) mt.run() columns = ["type","count"] df_data = {} for _c in columns: df_data[_c] = [] for _indus in dict_industry.keys(): for _c in columns: df_data[_c].append(dict_industry[_indus][_c]) df = pd.DataFrame(df_data) df.to_excel("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns) def exportDocument_By_time(time_from,time_to,columns=["docid","doctitle","project_name","dochtmlcon"]): ''' :return: ''' ost_client = getConnect_ots() last_docid = 0 bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True), RangeQuery('status', '201', '300', include_lower=True, include_upper=True)]) rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) _count = len(rows) df_data = {} def getData(df_data,rows): list_dict = getRow_ots(rows) for _dict in list_dict: for _k,_v in _dict.items(): if _k not in df_data: df_data[_k] = [] df_data[_k].append(getLegal_str(_v)) getData(df_data,rows) while(next_token): print("%d/%d"%(_count,total_count)) rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index", SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) _count += len(rows) getData(df_data,rows) df = pd.DataFrame(df_data) df.to_excel("%s/%s-%s公告信息.xlsx"%(data_path,time_from,time_to),columns=columns) def processDocument(): filename = "../data/2021-01-29-2021-01-29公告信息.xlsx" df = pd.read_excel(filename) df.to_csv("../data/2021-01-29-2021-01-29公告信息.csv") return list_dict = [] for docid,doctitle,project_name,dochtmlcon in zip(df["docid"],df["doctitle"],df["project_name"],df["dochtmlcon"]): list_dict.append({"docid":docid,"doctitle":doctitle,"project_name":project_name,"dochtmlcon":dochtmlcon}) task_queue = queue.Queue() for _dict in list_dict: task_queue.put(_dict) result_queue = queue.Queue() def _handle(_dict,result_queue,pool_mysql): conn = pool_mysql.getConnector() cursor = conn.cursor() sql = "insert into test_extract(docid,doctitle,page_time) values(%d,%s,%s)"%(_dict["docid"],_dict["doctitle"],_dict["dochtmlcon"]) cursor.execute(sql) conn.commit() pool_mysql.putConnector(conn) # url = "http://192.168.2.101:15030" # myheaders = {'Content-Type': 'application/json'} # print(int(_dict["docid"])) # data = {"doc_id":int(_dict["docid"]),"title":_dict["doctitle"],"content":_dict["dochtmlcon"]} # resp = requests.post(url,json=data,headers=myheaders, verify=True) # result = json.loads(resp.content.decode("utf8"),"utf8") # _dict["product"] = result["product"] pool_mysql = ConnectorPool(init_num=20,max_num=30,method_init=getConnection_testmysql) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=5,pool_mysql=pool_mysql) mt.run() # columns = ["docid","doctitle","project_name","product"] # # df_data = {} # for _c in columns: # df_data[_c] = [] # for _dict in list_dict: # for _c in columns: # df_data[_c].append(_dict.get(_c,"")) # df = pd.DataFrame(df_data) # df.to_excel("%s.product.xlsx"%(filename),columns=columns) def export_extract_check(): ''' :return:导出数据提取校验的结果并生成报告 ''' conn = getConnection_testmysql() cursor = conn.cursor() sql = " select docid,json_result from exportdb.extract_check " cursor.execute(sql) dict_global = {} df_global = {"key_type":[],"online_count":[],"test_count":[],"diff_count":[],"diff_percent":[]} df_document = {"docid":[]} while True: rows = cursor.fetchmany(10000) if not rows: break for docid,json_result in rows: df_document["docid"].append(docid) _result = json.loads(json_result) for k,v in _result.items(): key = k.split("_") _key = "_".join(key[:-1]) if "punish" in _key or "complainants" in _key or "institutions" in _key: continue if k not in df_document: df_document[k] = [] df_document[k].append(v) key_type = key[-1] if _key not in dict_global: dict_global[_key] = {} if key_type not in dict_global[_key]: dict_global[_key][key_type] = 0 if key_type=="diff": dict_global[_key][key_type] += v if key_type in ("online","test"): if isinstance(v,str): if v!="": dict_global[_key][key_type] += 1 elif isinstance(v,list): dict_global[_key][key_type] += len(v) for k,v in dict_global.items(): df_global["key_type"].append(k) df_global["online_count"].append(v["online"]) df_global["test_count"].append(v["test"]) df_global["diff_count"].append(v["diff"]) df_global["diff_percent"].append(v["diff"]/v["online"] if v["online"]>0 else 0) filename = "../data/%s_extract_check.xlsx"%(time.strftime("%Y-%m-%d")) with pd.ExcelWriter(filename) as writer: df1 = pd.DataFrame(df_global) df1.to_excel(writer,sheet_name="global") for k,v in df_document.items(): print(k,len(v)) df2 = pd.DataFrame(df_document) df2.to_excel(writer,sheet_name="document") writer.save() writer.close() def exportDocument_dump(): # filename = "../data/重复公告.xlsx" # df = pd.read_excel(filename) ots_client = getConnect_ots() columns = ["docid","docchannel","page_time","web_source_no","doctitle","tenderee","agency","project_code","project_name","sub_docs_json"] df_keys = ["docid","docchannel","page_time","web_source_no","doctitle","doctitle_refine","tenderee","agency","project_code","project_name","bidding_budget","win_bid_price","win_tenderer","URL"] df_data = {} for _key in df_keys: df_data[_key] = [] bool_query = BoolQuery(must_queries=[TermQuery("page_time","2021-03-03"), RangeQuery("status",201,300,True,True)]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) def getData(df_data,rows): list_data = getRow_ots(rows) for row in list_data: dict_find = {} for _key in df_keys: dict_find[_key] = 0 for _k,_v in row.items(): if _k in df_keys: dict_find[_k] = 1 if _k=="project_code": _v = '"%s"'%_v df_data[_k].append(_v) doctitle = row.get("doctitle","") df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle)) df_data["URL"].append("http://www.bidizhaobiao.com/info-%d.html"%(row["docid"])) dict_find["URL"] = 1 dict_find["doctitle_refine"] = 1 sub_docs_json = row.get("sub_docs_json","[{}]") doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""} if sub_docs_json is not None: for sub_docs in json.loads(sub_docs_json): for _key_sub_docs in sub_docs.keys(): if _key_sub_docs in doc_columns: if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]: if _key_sub_docs in ["bidding_budget","win_bid_price"]: if float(sub_docs[_key_sub_docs])>0: doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs]) else: doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs]) for _k,_v in doc_columns.items(): dict_find[_k] = 1 df_data[_k].append(_v) for _k,_v in dict_find.items(): if _v==0: df_data[_k].append("") _count = len(rows) getData(df_data,rows) while next_token: print("%d/%d"%(_count,total_count)) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) getData(df_data,rows) _count += len(rows) # for docid in df["docid"]: # bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))]) # # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", # SearchQuery(bool_query , limit=100, get_total_count=True), # ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) # list_data = getRow_ots(rows) # if len(list_data)>0: # dict_find = {} # for _key in df_keys: # dict_find[_key] = 0 # for _k,_v in list_data[0].items(): # if _k in df_keys: # dict_find[_k] = 1 # df_data[_k].append(_v) # doctitle = list_data[0].get("doctitle","") # df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle)) # dict_find["doctitle_refine"] = 1 # sub_docs_json = list_data[0].get("sub_docs_json","[{}]") # doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""} # if sub_docs_json is not None: # for sub_docs in json.loads(sub_docs_json): # for _key_sub_docs in sub_docs.keys(): # if _key_sub_docs in doc_columns: # if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]: # if _key_sub_docs in ["bidding_budget","win_bid_price"]: # if float(sub_docs[_key_sub_docs])>0: # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs]) # else: # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs]) # for _k,_v in doc_columns.items(): # dict_find[_k] = 1 # df_data[_k].append(_v) # for _k,_v in dict_find.items(): # if _v==0: # df_data[_k].append("") df1 = pd.DataFrame(df_data) df1.to_csv("../data/0303去重.csv",columns=df_keys) def exportDocument_dump_mysql(): conn = getConnection_testmysql() cursor = conn.cursor() columns = ["project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer","group_id","docid"] df_data = {} for _c in columns: df_data[_c] = [] sql = " select "+",".join(columns)+" from run_dumplicate_document_his where group_id in (select group_id from run_dumplicate_document_his group by group_id having count(1)>1)" cursor.execute(sql) while True: rows = cursor.fetchmany(100000) if not rows: break for row in rows: for _i in range(len(columns)): df_data[columns[_i]].append(row[_i]) df = pd.DataFrame(df_data) df.to_csv("../data/0304去重.csv",columns=["group_id","docid","project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer"]) print(cursor.description) def getDict_docchannel(): conn = getConnection_mysql() cursor = conn.cursor() sql = "select channel_id,chnlname from sys_channel " cursor.execute(sql) rows = cursor.fetchall() _dict = dict() for row in rows: _dict[row[0]] = row[1] return _dict def exportDocument_by_doctitle(): columns = ["docid","doctitle","docchannel","bidway","province","city","district","info_type","page_time","crtime","project_code","tenderee","project_name","agency","sub_docs_json","tenderee_contact","tenderee_phone","doctextcon","product","moneysource"] dict_channel = getDict_docchannel() str_keyword = ''' 报批技术服务 不动产 测绘 城市更新 档案整理 房地一体 拆旧复垦 土地规划 城乡规划 村庄规划 技术服务 技术审查 建设用地增减挂钩 勘察 垦造水田 不动产数据建库 不动产数据整合 林权调查 土地调查 城市更新数据调查 不动产平台 测绘系统 地理信息系统 城乡规划信息系统 一张图信息平台 测绘信息平台 双评价 城市更新研究 垦造水田研究报告 生态修复研究 土地规划研究 复垦咨询服务 生态修复咨询服务 城乡规划咨询服务 城市更新咨询服务 勘测定界 多测合一 用地预审 国土规划数据治理 地名普查 地形图 垦造水田咨询服务 评估 全域土地综合整治 生态修复 林权数据建库 权属调查 权籍调查 ''' task_queue = queue.Queue() result_queue = queue.Queue() for _keyword in re.split("\s",str_keyword): if len(_keyword.strip())==0: continue task_queue.put({"keyword":_keyword}) pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) def _handle(item,result_queue,pool_ots): ots_client = pool_ots.getConnector() should_q1 = BoolQuery(should_queries=[MatchPhraseQuery("doctitle",item["keyword"])]) should_q2 = BoolQuery(should_queries=[WildcardQuery('province', '%s*'%"广东"), WildcardQuery('province', '%s*'%"湖南"), WildcardQuery('province', '%s*'%"广西")]) bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2021-01-01"), RangeQuery("status",201,300,True,True), TermQuery("docchannel",101), should_q1,should_q2]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]), limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) list_data = getRow_ots(rows) for _data in list_data: _data["keyword"] = item["keyword"] result_queue.put(_data) _count = len(list_data) while next_token: print("%d/%d"%(_count,total_count)) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) list_data = getRow_ots(rows) _count += len(list_data) for _data in list_data: _data["keyword"] = item["keyword"] result_queue.put(_data) pool_ots.putConnector(ots_client) mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots) mt.run() list_item = [] try: while True: _dict = result_queue.get(False) list_item.append(_dict) except Exception as e: print(e) keys = list_item[0].keys() df_data = {} print(len(list_item)) set_line = set() for row in list_item: item = {} _dict = row set_dict_item(item,"公告id",_dict.get("docid","")) set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),"")) set_dict_item(item,"公告标题",_dict.get("doctitle","")) set_dict_item(item,"省份",_dict.get("province","")) # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district","")) set_dict_item(item,"城市",_dict.get("city","")) set_dict_item(item,"区",_dict.get("district","")) set_dict_item(item,"关键词",_dict.get("keyword","")) set_dict_item(item,"发布时间",_dict.get("page_time","")) set_dict_item(item,"项目编号",_dict.get("project_code","")) set_dict_item(item,"招标单位",_dict.get("tenderee","")) set_dict_item(item,"招标联系人",_dict.get("tenderee_contact","")) set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone","")) # set_dict_item(item,"代理单位",_dict.get("agency","")) # set_dict_item(item,"代理联系人",_dict.get("agency_contact","")) # set_dict_item(item,"代理联系人电话",_dict.get("agency_phone","")) set_dict_item(item,"公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid"))))) tenderee_in = "否" tenderee_keyword = "否" if _dict.get("tenderee","")!="": if _dict.get("tenderee","") in _dict.get("doctitle",""): tenderee_in = "是" if _dict.get("keyword","") in _dict.get("tenderee",""): tenderee_keyword = "是" set_dict_item(item,"标题包含招标人",tenderee_in) set_dict_item(item,"招标人含有关键词",tenderee_keyword) sub_docs_json = _dict.get("sub_docs_json") for _doc in json.loads(sub_docs_json): if "win_tenderer" in _doc: set_dict_item(item,"中标单位",_doc["win_tenderer"]) # if "win_tenderee_manager" in _doc: # set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"]) # if "win_tenderee_phone" in _doc: # set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"]) if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0: set_dict_item(item,"中标金额(万元)",float(_doc["win_bid_price"])/10000) if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0: set_dict_item(item,"招标金额(万元)",float(_doc["bidding_budget"])/10000) if "中标单位" not in item: set_dict_item(item,"中标单位","") if "中标金额(万元)" not in item: set_dict_item(item,"中标金额(万元)","") if "招标金额(万元)" not in item: set_dict_item(item,"招标金额(万元)","") # if "中标单位联系人" not in item: # set_dict_item(item,"中标单位联系人","") # if "中标单位联系电话" not in item: # set_dict_item(item,"中标单位联系电话","") _line = "%s-%s-%s"%(item["项目编号"],item["中标单位"],item["中标金额(万元)"]) if _line in set_line: continue # if item["招标金额"]=="": # continue set_line.add(_line) for k,v in item.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) df = pd.DataFrame(df_data) df.to_excel("../data/2021-04-14_export11.xlsx",columns=list_df_columns) set_columns = set() list_df_columns = [] def set_dict_item(_dict,name,v): _dict[name] = getLegal_str(v) if name not in set_columns: set_columns.add(name) list_df_columns.append(getLegal_str(name)) def exportDocument_medicine(start_time,end_time): # filename = "../data/重复公告.xlsx" # df = pd.read_excel(filename) ots_client = getConnect_ots() columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"] dict_channel = getDict_docchannel() def getData(df_data,rows,set_line): list_data = getRow_ots(rows) for row in list_data: item = {} _dict = row set_dict_item(item,"docid",_dict.get("docid","")) set_dict_item(item,"公告标题",_dict.get("doctitle","")) set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),"")) set_dict_item(item,"省份",_dict.get("province","")) # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district","")) set_dict_item(item,"城市",_dict.get("city","")) set_dict_item(item,"发布时间",_dict.get("page_time","")) set_dict_item(item,"项目编号",_dict.get("project_code","")) set_dict_item(item,"招标单位",_dict.get("tenderee","")) set_dict_item(item,"招标联系人",_dict.get("tenderee_contact","")) set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone","")) set_dict_item(item,"代理单位",_dict.get("agency","")) set_dict_item(item,"代理联系人",_dict.get("agency_contact","")) set_dict_item(item,"代理联系人电话",_dict.get("agency_phone","")) set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid"))))) sub_docs_json = _dict.get("sub_docs_json") for _doc in json.loads(sub_docs_json): if "win_tenderer" in _doc: set_dict_item(item,"中标单位",_doc["win_tenderer"]) if "win_tenderee_manager" in _doc: set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"]) if "win_tenderee_phone" in _doc: set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"]) if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0: set_dict_item(item,"中标金额",_doc["win_bid_price"]) if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0: set_dict_item(item,"招标金额",_doc["bidding_budget"]) if "招标金额" not in item: set_dict_item(item,"招标金额","") if "中标金额" not in item: set_dict_item(item,"中标金额","") if "中标单位" not in item: set_dict_item(item,"中标单位","") if "中标单位联系人" not in item: set_dict_item(item,"中标单位联系人","") if "中标单位联系电话" not in item: set_dict_item(item,"中标单位联系电话","") _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"])) # if _line in set_line: # continue # if item["招标金额"]=="": # continue set_line.add(_line) for k,v in item.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) # list_province = ["江西","湖南","四川","安徽"] list_province = ["全国"] for _province in list_province: df_data = {} str_p = ''' 智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件 县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗 数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化 临床应用 ''' list_prov = re.split("\s",str_p) list_mu = [] for _p in list_prov: if _p.strip()=="": continue print(_p) list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip())) s_tenderee = '医院、卫生院、疗养院、健康局、卫生局' list_should_ten = [] for _p in re.split("、",s_tenderee): if _p.split()=="": continue list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip())) list_should_chan = [] list_should_chan.append(TermQuery("docchannel",52)) list_should_chan.append(TermQuery("docchannel",101)) list_should_chan.append(TermQuery("docchannel",102)) should_q1 = BoolQuery(should_queries=list_mu) should_q2 = BoolQuery(should_queries=list_should_ten) should_q3 = BoolQuery(should_queries=list_should_chan) bool_query = BoolQuery(must_queries=[RangeQuery("page_time",start_time,end_time,include_lower=True,include_upper=True), RangeQuery("status",201,300,True,True), should_q1,should_q2,should_q3]) # must_q1 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"), # MatchPhraseQuery("doctitle","教学器材")]) # must_q2 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"), # MatchPhraseQuery("doctitle","教育设备")]) # must_q3 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"), # MatchPhraseQuery("doctitle","教学器材")]) # must_q4 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"), # MatchPhraseQuery("doctitle","教育设备")]) # must_q5 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校信息化")]) # must_q6 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明")]) # must_q7 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校弱电")]) # must_q8 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校机电安装")]) # must_q9 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明改造")]) # must_q10 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教学"), # MatchPhraseQuery("doctitle","设备")]) # # all_should = BoolQuery(should_queries=[must_q1,must_q2,must_q3,must_q4,must_q5,must_q6,must_q7,must_q8,must_q9,must_q10]) # must_not_q = BoolQuery(should_queries=[MatchPhraseQuery("doctitle","社区"), # MatchPhraseQuery("doctitle","医院"), # MatchPhraseQuery("doctitle","工业园"), # MatchPhraseQuery("doctitle","营养"), # MatchPhraseQuery("doctitle","厨房设备")]) # bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2018-01"), # RangeQuery("status",201,300,True,True), # TermQuery("docchannel",101), # WildcardQuery('province', '%s*'%_province), # all_should], # must_not_queries=[must_not_q]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) set_line = set() _count = len(rows) getData(df_data,rows,set_line) while next_token: print("%d/%d"%(_count,total_count)) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) getData(df_data,rows,set_line) _count += len(rows) if len(df_data[list(df_data.keys())[0]])>=300: break set_enterprise = set() for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]): set_enterprise.add(_tenderee) set_enterprise.add(_agency) set_enterprise.add(_win_tenderer) if "" in set_enterprise: set_enterprise.remove("") if None in set_enterprise: set_enterprise.remove(None) dict_enterprise = getDictEnterprise(list(set_enterprise)) if len(set_enterprise)>0: for _i in range(len(df_data["招标单位"])): _enterprise_name = df_data["招标单位"][_i] if df_data["招标联系人电话"][_i]=="": contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts") if contacts is not None: _person,_phone = getOneContact(contacts) df_data["招标联系人"][_i] = _person df_data["招标联系人电话"][_i] = _phone _enterprise_name = df_data["代理单位"][_i] if df_data["代理联系人电话"][_i]=="": contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts") if contacts is not None: _person,_phone = getOneContact(contacts) df_data["代理联系人"][_i] = _person df_data["代理联系人电话"][_i] = _phone _enterprise_name = df_data["中标单位"][_i] if df_data["中标单位联系电话"][_i]=="": contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts") if contacts is not None: _person,_phone = getOneContact(contacts) df_data["中标单位联系人"][_i] = _person df_data["中标单位联系电话"][_i] = _phone return df_data df1 = pd.DataFrame(df_data) df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns) def exportDocument_by_pagetime(): # filename = "../data/重复公告.xlsx" # df = pd.read_excel(filename) ots_client = getConnect_ots() columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"] dict_channel = getDict_docchannel() def getData(df_data,rows,set_line): list_data = getRow_ots(rows) for row in list_data: item = {} _dict = row set_dict_item(item,"docid",_dict.get("docid","")) set_dict_item(item,"公告标题",_dict.get("doctitle","")) set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),"")) set_dict_item(item,"省份",_dict.get("province","")) # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district","")) set_dict_item(item,"城市",_dict.get("city","")) set_dict_item(item,"发布时间",_dict.get("page_time","")) set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle",""))) set_dict_item(item,"项目编号",_dict.get("project_code","")) set_dict_item(item,"招标单位",_dict.get("tenderee","")) set_dict_item(item,"招标联系人",_dict.get("tenderee_contact","")) set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone","")) set_dict_item(item,"代理单位",_dict.get("agency","")) set_dict_item(item,"代理联系人",_dict.get("agency_contact","")) set_dict_item(item,"代理联系人电话",_dict.get("agency_phone","")) set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid"))))) sub_docs_json = _dict.get("sub_docs_json") for _doc in json.loads(sub_docs_json): if "win_tenderer" in _doc: set_dict_item(item,"中标单位",_doc["win_tenderer"]) if "win_tenderee_manager" in _doc: set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"]) if "win_tenderee_phone" in _doc: set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"]) if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0: set_dict_item(item,"中标金额",_doc["win_bid_price"]) if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0: set_dict_item(item,"招标金额",_doc["bidding_budget"]) if "招标金额" not in item: set_dict_item(item,"招标金额","") if "中标金额" not in item: set_dict_item(item,"中标金额","") if "中标单位" not in item: set_dict_item(item,"中标单位","") if "中标单位联系人" not in item: set_dict_item(item,"中标单位联系人","") if "中标单位联系电话" not in item: set_dict_item(item,"中标单位联系电话","") _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"])) # if _line in set_line: # continue # if item["招标金额"]=="": # continue set_line.add(_line) for k,v in item.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) # list_province = ["江西","湖南","四川","安徽"] list_province = ["全国"] for _province in list_province: df_data = {} str_p = ''' 智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件 县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗 数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化 临床应用 ''' list_prov = re.split("\s",str_p) list_mu = [] for _p in list_prov: if _p.strip()=="": continue print(_p) list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip())) s_tenderee = '医院、卫生院、疗养院、健康局、卫生局' list_should_ten = [] for _p in re.split("、",s_tenderee): if _p.split()=="": continue list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip())) list_should_chan = [] list_should_chan.append(TermQuery("docchannel",52)) # list_should_chan.append(TermQuery("docchannel",101)) # list_should_chan.append(TermQuery("docchannel",102)) should_q1 = BoolQuery(should_queries=list_mu) should_q2 = BoolQuery(should_queries=list_should_ten) should_q3 = BoolQuery(should_queries=list_should_chan) bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2021-05-07","2021-05-07",include_lower=True,include_upper=True), RangeQuery("status",201,300,True,True) # ,should_q1 # ,should_q2 ,should_q3]) # must_q1 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"), # MatchPhraseQuery("doctitle","教学器材")]) # must_q2 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"), # MatchPhraseQuery("doctitle","教育设备")]) # must_q3 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"), # MatchPhraseQuery("doctitle","教学器材")]) # must_q4 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"), # MatchPhraseQuery("doctitle","教育设备")]) # must_q5 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校信息化")]) # must_q6 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明")]) # must_q7 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校弱电")]) # must_q8 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校机电安装")]) # must_q9 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明改造")]) # must_q10 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教学"), # MatchPhraseQuery("doctitle","设备")]) # # all_should = BoolQuery(should_queries=[must_q1,must_q2,must_q3,must_q4,must_q5,must_q6,must_q7,must_q8,must_q9,must_q10]) # must_not_q = BoolQuery(should_queries=[MatchPhraseQuery("doctitle","社区"), # MatchPhraseQuery("doctitle","医院"), # MatchPhraseQuery("doctitle","工业园"), # MatchPhraseQuery("doctitle","营养"), # MatchPhraseQuery("doctitle","厨房设备")]) # bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2018-01"), # RangeQuery("status",201,300,True,True), # TermQuery("docchannel",101), # WildcardQuery('province', '%s*'%_province), # all_should], # must_not_queries=[must_not_q]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) set_line = set() _count = len(rows) getData(df_data,rows,set_line) while next_token: print("%d/%d"%(_count,total_count)) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) getData(df_data,rows,set_line) _count += len(rows) # if len(df_data[list(df_data.keys())[0]])>=300: # break # set_enterprise = set() # for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]): # set_enterprise.add(_tenderee) # set_enterprise.add(_agency) # set_enterprise.add(_win_tenderer) # if "" in set_enterprise: # set_enterprise.remove("") # if None in set_enterprise: # set_enterprise.remove(None) # dict_enterprise = getDictEnterprise(list(set_enterprise)) # if len(set_enterprise)>0: # for _i in range(len(df_data["招标单位"])): # _enterprise_name = df_data["招标单位"][_i] # if df_data["招标联系人电话"][_i]=="": # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts") # if contacts is not None: # _person,_phone = getOneContact(contacts) # df_data["招标联系人"][_i] = _person # df_data["招标联系人电话"][_i] = _phone # # _enterprise_name = df_data["代理单位"][_i] # if df_data["代理联系人电话"][_i]=="": # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts") # if contacts is not None: # _person,_phone = getOneContact(contacts) # df_data["代理联系人"][_i] = _person # df_data["代理联系人电话"][_i] = _phone # # _enterprise_name = df_data["中标单位"][_i] # if df_data["中标单位联系电话"][_i]=="": # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts") # if contacts is not None: # _person,_phone = getOneContact(contacts) # df_data["中标单位联系人"][_i] = _person # df_data["中标单位联系电话"][_i] = _phone df1 = pd.DataFrame(df_data) df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns) def attachCompanyContact(): files = ["../data/2021-03-17_四川_关键词导出.csv", "../data/2021-03-17_安徽_关键词导出.csv", "../data/2021-03-17_江西_关键词导出.csv", "../data/2021-03-17_湖南_关键词导出.csv"] files = ["../data/欧科自然资源5w以上数据.xlsx"] def _handle(item,result_queue,pool_ots): ots_client = pool_ots.getConnector() primary_key = [('name',str(item["招标单位"]))] columns_to_get = ["province","city","district"] consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1) # bool_query = BoolQuery(must_queries=[TermQuery("nicknames",item["中标单位"])]) # # rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index", # SearchQuery(bool_query , limit=1, get_total_count=True), # ColumnsToGet(["contacts"],return_type=ColumnReturnType.SPECIFIED)) # _docid = int(item["docid"]) # partitionkey = _docid%500+1 # primary_key = [('partitionkey',partitionkey),("docid",_docid)] # columns_to_get = ["doctitle"] # consumed, return_row, next_token = ots_client.get_row("document",primary_key, columns_to_get, None, 1) if isinstance(item["招标单位"],str) and item["招标单位"]!="": if return_row is not None: _dict = getRow_ots_primary(return_row) # item["doctitle"] = _dict.get("doctitle","") item["招标人省份"] = _dict.get("province","") item["招标人城市"] = _dict.get("city","") item["招标人区域"] = _dict.get("district","") province,city,district = getLocation(item["招标单位"]) if item["招标人省份"]=="" or item["招标人省份"]=="未知": item["招标人省份"] = province if item["招标人城市"]=="" or item["招标人城市"]=="未知": item["招标人城市"] = city if item["招标人区域"]=="" or item["招标人区域"]=="未知": item["招标人区域"] = district else: province,city,district = getLocation(item["招标单位"]) item["招标人省份"] = province item["招标人城市"] = city item["招标人区域"] = district else: item["招标人省份"] = item["省份"] item["招标人城市"] = item["城市"] item["招标人区域"] = item["区"] # contacts = json.loads(_dict["contacts"]) # contacts.sort(key=lambda x:x["level"],reverse=True) # phone = "" # phone_person = "" # mobile = "" # mobile_person = "" # for contact in contacts: # if mobile=="" and contact.get("mobile_no","")!="": # mobile = contact.get("mobile_no","") # mobile_person = contact.get("contact_person","") # if phone=="" and contact.get("phone_no","")!="": # phone = contact.get("phone_no",'') # phone_person = contact.get("contact_person","") # item["招标联系人"] = "" # item["招标联系人电话"] = "" # if mobile!="": # item["招标联系人"] = mobile_person # item["招标联系人电话"] = mobile # else: # item["中标单位联系人"] = phone_person # item["中标单位联系电话"] = phone pool_ots.putConnector(ots_client) pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) for file in files: task_queue = queue.Queue() df = pd.read_excel(file,encoding="UTF8") keys = df.keys()[1:] list_item = [] for row in df.itertuples(): _dict = {} for _key in keys: if _key in dir(row): _v = row.__getattribute__(_key) else: _v = '' _dict[_key] = _v list_item.append(_dict) for item in list_item: task_queue.put(item) mt = MultiThreadHandler(task_queue,_handle,None,30,pool_ots=pool_ots) mt.run() df_data = {} for _k in keys: df_data[_k] = [] for item in list_item: for _k in keys: df_data[_k].append(getLegal_str(item.get(_k,"-"))) df1 = pd.DataFrame(df_data) df1.to_excel("%s_attach.xlsx"%file,columns=keys) def dumpWebSourceNo(): conn_oracle = getConnection_oracle() cursor_oracle = conn_oracle.cursor() sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO " cursor_oracle.execute(sql) rows = cursor_oracle.fetchall() conn_mysql = getConnection_testmysql() cursor_mysql = conn_mysql.cursor() for row in rows: sql = " insert into webSource(web_source_no,web_source_name) values('%s','%s')"%(row[0],row[1]) print(sql) cursor_mysql.execute(sql) conn_mysql.commit() def exportNzj(): # filename = "../data/重复公告.xlsx" # df = pd.read_excel(filename) ots_client = getConnect_ots() columns = ["contacts","covered_area","follows","docids","page_time","progress","project_description","project_follow","project_code","project_name","project_type"] def getData(df_data,rows,set_line): list_data = getRow_ots(rows) for row in list_data: item = {} _dict = row set_dict_item(item,"docids",_dict.get("docids","")) set_dict_item(item,"contacts",_dict.get("contacts","")) set_dict_item(item,"covered_area",_dict.get("covered_area","")) set_dict_item(item,"follows",_dict.get("follows","")) set_dict_item(item,"project_type",_dict.get("project_type","")) # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district","")) set_dict_item(item,"page_time",_dict.get("page_time","")) set_dict_item(item,"progress",_dict.get("progress","")) set_dict_item(item,"project_description",_dict.get("project_description","")) set_dict_item(item,"project_follow",_dict.get("project_follow","")) set_dict_item(item,"project_code",_dict.get("project_code","")) set_dict_item(item,"project_name",_dict.get("project_name","")) for k,v in item.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) df_data = {} bool_query = BoolQuery(must_queries=[ExistsQuery("docids")]) rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index", SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) set_line = set() _count = len(rows) getData(df_data,rows,set_line) while next_token: print("%d/%d"%(_count,total_count)) rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index", SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) getData(df_data,rows,set_line) _count += len(rows) df1 = pd.DataFrame(df_data) df1.to_excel("../data/2021-03-31_拟在建数据导出1.xlsx",columns=list_df_columns) def turn_status(): df = pd.read_excel("../data/欧科自然资源5w以上数据.xlsx") conn = getConnection_testmysql() cursor = conn.cursor() for docid in df["公告id"]: partitionkey = int(docid)%500+1 sql = " insert into turn_status(partitionkey,docid) values(%d,%d)"%(partitionkey,docid) cursor.execute(sql) conn.commit() def attachBidding_budget(): conn_mysql = getConnection_testmysql() cursor = conn_mysql.cursor() sql = "select docid from analysis_r2 where bidding_budget=''" task_queue = queue.Queue() result_queue = queue.Queue() cursor.execute(sql) rows = cursor.fetchmany(10000) while(rows): for row in rows: task_queue.put(row[0]) rows = cursor.fetchmany(10000) pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_testmysql) pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) def _handle(item,result_queue,pool_mysql,pool_ots): ots_client = pool_ots.getConnector() bool_query = BoolQuery(must_queries=[TermQuery("docids",item)]) rows, next_token, total_count, is_all_succeed = ots_client.search("project2", "project2_index", SearchQuery(bool_query , limit=1, get_total_count=True), ColumnsToGet(["bidding_budget"],return_type=ColumnReturnType.SPECIFIED)) list_dict = getRow_ots(rows) if len(list_dict)>0: conn = pool_mysql.getConnector() cursor = conn.cursor() sql = " update analysis_r2 set bidding_budget='%s' where docid=%d"%(str(list_dict[0].get("bidding_budget","")),item) cursor.execute(sql) conn.commit() pool_mysql.putConnector(conn) pool_ots.putConnector(ots_client) mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mysql=pool_mysql,pool_ots=pool_ots) mt.run() def debug_documentMerge(): conn = getConnection_testmysql() cursor = conn.cursor() sql = "select merge_docids from project_group_final_log " cursor.execute(sql) task_queue = queue.Queue() for row in cursor.fetchall(): task_queue.put(row[0]) print(task_queue.qsize()) def _handle(item,result_queue,pool_ots): ots_client = pool_ots.getConnector() list_docids = item.split(",") must_q = [] for _docid in list_docids: must_q.append(TermQuery("docids",_docid)) bool_query = BoolQuery(must_queries=must_q) rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index", SearchQuery(bool_query,limit=1,get_total_count=True), ColumnsToGet(column_names=["docids"],return_type=ColumnReturnType.SPECIFIED)) if total_count==0: print(item) result_queue.put(item) pool_ots.putConnector(ots_client) result_queue = queue.Queue() pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots) mt.run() while(True): try: item = result_queue.get(True) print(item) except Exception as e: print(str(e)) break def signDocument(): filename = "C:\\Users\\Administrator\\Desktop\\中标信息1.xlsx" sign_filename = "%s_sign.xlsx"%filename df = pd.read_excel(filename) df_data = {"sign":[]} for item in df["segword"]: content = re.sub("\s*","",item) _find = re.search("(?P流标|废标|中止|终止|撤销|采购失败)",content) if _find is not None: df_data["sign"].append(_find.groupdict().get("key")) else: df_data["sign"].append("无") df1 = pd.DataFrame(df_data) df1.to_excel(sign_filename) if __name__=="__main__": # exportDocument_By_time(time_from="2021-01-29",time_to="2021-01-29",columns=["docid","doctitle","project_name","dochtmlcon"]) # processDocument() # export_extract_check() # exportArticle_by_websource() # export_keyword_count() # export_province_keyword_count() # exportDocument_dump() # exportDocument_dump_mysql() # export_attachment() # exportDocument_by_doctitle() # exportIndustryCount() # exportDocument_by_pagetime() # attachCompanyContact() # dumpWebSourceNo() # print("http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%138306357)))) # exportNzj() # turn_status() # attachBidding_budget() # debug_documentMerge() exportDocument_medicine("2021-05-02","2021-05-08") # signDocument()