#encoding:UTF8 import sys import os sys.path.append("..") print(sys.path) import pandas as pd from dataSource.source import * import json from utils.multiThread import MultiThreadHandler import queue from utils.Utils import * from dataSource.pool import ConnectorPool import re from tablestore import * import traceback from utils.hashUtil import aesCipher from uuid import uuid4 from export.exportUtils import * data_path = "../data/" def getCompanyTenderer(): def _handle(item,result_queue): company = item dict_result = {"company":company,"count":0,"competitor":"","project_name":""} dict_result["company"] = company graph = getConnect_neo4j() cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhao_biao_id,p.zhong_biao_id"%(company) finded = graph.run(cql) finded_ids = json.loads(json.dumps(finded.data())) dict_result["count"] = len(finded_ids) mongoDB = getConnect_mongodb() coll_zb = mongoDB.zhongbiao_extraction if len(finded_ids)>0: cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.project_name limit 3"%(company) finded = graph.run(cql) finded_names = json.loads(json.dumps(finded.data())) list_names = [_i["p.project_name"] for _i in finded_names] dict_result["project_name"] = str(list_names) cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company) finded = graph.run(cql) finded_money = json.loads(json.dumps(finded.data())) whole_money = 0 for item in finded_money: if item["r.price"] is not None: whole_money += getUnifyMoney(item["r.price"]) dict_result["whole_money"] = str(whole_money) dict_competitor = {} for item in finded_ids: docId = item["p.zhong_biao_id"] if docId is not None: rows = coll_zb.find({"docId":docId}) for row in rows: keys = ["second_tenderer","third_tenderer"] for _key in keys: if _key in row: if row[_key] not in dict_competitor: dict_competitor[row[_key]] = 0 dict_competitor[row[_key]] += 1 list_competitor = [] for _key in dict_competitor: list_competitor.append([_key,dict_competitor[_key]]) list_competitor.sort(key=lambda x:x[1],reverse=True) list_competitors = [i[0] for i in list_competitor[:10]] dict_result["competitor"] = str(list_competitors) result_queue.put(dict_result) # filename = "成交客户匹配中标项目的需求.xlsx" # df = pd.read_excel(filename) # list_company = df["公司名字"] # company = list_company[0] list_company = [] filename = "../data/服务型客户.txt" with open(filename,"r",encoding="GBK") as f: while(True): line = f.readline() if not line: break list_company.append(line.strip()) task_queue = queue.Queue() for company in list_company: task_queue.put(company) result_queue = queue.Queue() handler = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=10) handler.run() list_company = [] list_zb = [] list_count = [] list_project = [] list_money = [] list_competitor = [] while(True): try: _result = result_queue.get(False) list_company.append(_result.get("company","")) list_zb.append("是" if _result.get("count","")>0 else "否") list_count.append(_result.get("count","")) list_project.append(_result.get("project_name","")) list_money.append(_result.get("whole_money","")) list_competitor.append(_result.get("competitor","")) except Exception as e: print(e) break df1 = pd.DataFrame({"公司名字":list_company,"是否中标":list_zb,"中标次数":list_count,"中标项目":list_project,"中标金额":list_money,"潜在竞争对手":list_competitor}) df1.to_excel("%s_export.xls"%(filename),columns=["公司名字","是否中标","中标次数","中标项目","中标金额","潜在竞争对手"]) def export_count_includeKeyword(): filename = "../data/other/jc001.xlsx" list_name = [] list_count = [] df = pd.read_excel(filename) _index = 0 for row in df["品目"]: _name = row data = solrQuery("document",{"q":'dochtmlcon:"%s"'%_name,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1}) if data is not None: _count = data["response"]["numFound"] else: _count = 0 list_name.append(_name) list_count.append(_count) _index += 1 print(_index) df1 = pd.DataFrame({"品目":list_name,"数量":list_count}) df1.to_excel("%s_export.xls"%filename) def export_count_includeKeyword_multiThread(): def _handler(item,result_queue): data = solrQuery("document",{"q":'dochtmlcon:"%s"'%item,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1}) if data is not None: _count = data["response"]["numFound"] else: _count = 0 result_queue.put([item,_count]) task_queue = queue.Queue() result_queue = queue.Queue() filename = "../data/other/jc001.xlsx" list_name = [] list_count = [] df = pd.read_excel(filename) _index = 0 for row in df["品目"]: _name = row task_queue.put(_name) _index += 1 multHandler = MultiThreadHandler(task_queue,_handler,result_queue,thread_count=20) multHandler.run() while(True): try: item = result_queue.get(False) list_name.append(item[0]) list_count.append(item[1]) except queue.Empty as e: break df1 = pd.DataFrame({"品目":list_name,"数量":list_count}) df1.to_excel("%s_export.xls"%filename) def exportKeywords(): def _handle(item,result_queue,pool_mongo): docId = item["docId"] mongo = pool_mongo.getConnector() zhongbiao = mongo.zhongbiao_extraction zhaobiao = mongo.zhaobiao_extraction _project = "" rows = zhaobiao.find({"docId":docId},{"project_name":1}) find_flag = False for row in rows: find_flag = True _project = row.get("project_name","") if not find_flag: rows = zhongbiao.find({"docId":docId},{"project_name":1}) for row in rows: _project = row.get("project_name","") item["project_name"] = _project pool_mongo.putConnector(mongo) result_queue.put(item) list_key = [] dict_key_ids = dict() with open("../data/品目.txt", "r", encoding="utf8") as f: while(True): row = f.readline() if not row: break list_key.append(row) dict_key_ids[row] = [] data = solrQuery("document",{"q":'dochtmlcon:"%s" AND dochtmlcon:"法院"'%row,"fq":'(publishtime:[2019-01-01T00:00:00Z TO 2019-12-31T23:59:59Z])',"fl":"id","rows":10000000}) for item in data["response"]["docs"]: dict_key_ids[row].append(item["id"]) task_queue = queue.Queue() result_queue = queue.Queue() for _key in dict_key_ids.keys(): for item in dict_key_ids[_key]: task_queue.put({"docId":item,"project_name":""}) pool_mongo = ConnectorPool(init_num=10,max_num=200,method_init=getConnect_mongodb) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=200,pool_mongo=pool_mongo) mt.run() dict_docId_projectname = {} while(True): try: item = result_queue.get(False) dict_docId_projectname[item["docId"]] = item["project_name"] except Exception: break dict_key_count = dict() for _key in dict_key_ids.keys(): set_key = set() for docId in dict_key_ids[_key]: set_key.add(dict_docId_projectname.get(docId,"")) dict_key_count[_key] = len(set_key) print("==") for _key in list_key: print(len(dict_key_ids[_key])) print("==") for _key in list_key: print(dict_key_count[_key]) print("==") def getIndustryCompany(): def _handle(item,result_queue,pool_mongo,pool_neo4j,pool_mysql,pool_ots): # mongoDB = getConnect_mongodb() log(item["enterprise_name"]) mongoDB = pool_mongo.getConnector() # coll_zb = mongoDB.enterprise_profile # rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1,"legalPersonName":1,"actualCapital":1, "regCapital":1,"estiblishTime":1,"socialStaffNum":1,"legal_person":1,"phone":1,"businessScope":1,"industry":1 }) # for row in rows: # item["regCapital"] = row.get("regCapital","") # item["legal_person"] = row.get("legal_person","") # item["phone"] = row.get("phone","") # item["actualCapital"] = row.get("actualCapital","") # item["industry"] = row.get("industry","") # item["estiblishTime"] = row.get("estiblishTime","") # item["socialStaffNum"] = row.get("socialStaffNum","") # item["businessScope"] = row.get("businessScope","") # graph = getConnect_neo4j() ots_client = pool_ots.getConnector() primary_key = [('name',item["enterprise_name"])] columns_to_get = ["reg_capital","legal_person","phone","actual_capital","industry","estiblishTime","social_staff_num","business_scope"] consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1) if return_row is not None: for att in return_row.attribute_columns: item[att[0]] = att[1] list_same_industry_company = [] if "industry" in item: bool_query = BoolQuery(must_queries=[TermQuery("industry",item["industry"])]) col = ColumnsToGet(['enterprise_name'], ColumnReturnType.SPECIFIED) rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index", SearchQuery(bool_query, limit=10, get_total_count=True), col) for row in rows: for item1 in row[0]: list_same_industry_company.append(item1[1]) # if "industry" in item: # rows = coll_zb.find({"industry":item["industry"]},{"enterprise_name":1}).limit(10) # for row in rows: # print(row) # list_same_industry_company.append(row.get("enterprise_name","")) item["same_industry_company"] = list_same_industry_company graph = pool_neo4j.getConnector() company_name = item["enterprise_name"] cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name) finded = graph.run(cql) data = json.loads(json.dumps(finded.data())) _count = data[0]["_c"] # list_project = [] # for _data in data: # if _count<=3: # if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01": # if _data["project_name"] is not None: # list_project.append(_data["project_name"]) # _count += 1 item["count"] = _count # item["project"] = str(list_project) result_queue.put(item) pool_mongo.putConnector(mongoDB) pool_neo4j.putConnector(graph) pool_ots.putConnector(ots_client) log_tofile("export.log") pool_mongo = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_mongodb) pool_neo4j = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_neo4j) pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_mysql) pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) # list_company = getCompanys() # filename = "".join(["环境","生态","再生","回收","环保"]) list_company = [] filename = "../data/同行客户匹配.xlsx" df = pd.read_excel(filename,sheetname=0) for _com in df["公司名称"]: print(_com) if _com is not None and _com.strip()!="": _company = {"enterprise_name":""} _company["enterprise_name"] = _com list_company.append(_company) task_queue = queue.Queue() for item in list_company: task_queue.put(item) result_queue = queue.Queue() _muti = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j,pool_mysql=pool_mysql,pool_ots=pool_ots) _muti.run() df_company = {} set_key = set() if len(list_company)>0: for item in list_company: for _key in item.keys(): set_key.add(_key) if _key not in df_company: df_company[_key] = [] list_key = list(set_key) for item in list_company: for _key in list_key: df_company[_key].append(item.get(_key,"")) df1 = pd.DataFrame(df_company) df1.to_excel("%s_export.xlsx"%(filename)) def exportWin_tenderer(time_from,time_to): ''' :return: ''' ost_client = getConnect_ots() last_docid = 0 bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True), TermQuery("docchannel",101), RangeQuery('status', '201', '300', include_lower=True, include_upper=True), RangeQuery('docid', last_docid, include_lower=False)]) rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True), ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED)) list_project = [] def _getRow(list_project,rows,last_docid): for row in rows: project_name = row[1][0][1] docid = row[0][1][1] last_docid = docid list_pack = json.loads(row[1][1][1]) _set_tenderer = set() win_tenderer = "" for _pack in list_pack: if "win_tenderer" in _pack and win_tenderer=="": win_tenderer = _pack["win_tenderer"] if "second_tenderer" in _pack: _set_tenderer.add(_pack["second_tenderer"]) if "third_tenderer" in _pack: _set_tenderer.add(_pack["third_tenderer"]) list_project.append({"docid":docid,"project_name":project_name,"win_tenderer":win_tenderer,"tenderer":list(_set_tenderer)}) return last_docid _getRow(list_project,rows,last_docid) while(next_token): print("%d/%d"%(len(list_project),total_count)) rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index", SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED)) last_docid = _getRow(list_project,rows,last_docid) task_queue = queue.Queue() result_queue = queue.Queue() for item in list_project: task_queue.put(item) pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) def _handle(item,result_queue,pool_ots): if item["win_tenderer"]!="": ots_client = pool_ots.getConnector() consumed, return_row, next_token = ost_client.get_row("enterprise", [("name",item["win_tenderer"])], ["province","reg_capital","estiblish_time","business_scope"], None, 1) _dict = dict() for _item in return_row.attribute_columns: _dict[_item[0]] = _item[1] for _key in _dict.keys(): item[_key] = _dict[_key] data = solrQuery("contact",{"q":'company_name:"%s"'%item["win_tenderer"],"fl":"contact_person,mobile_no,phone_no","rows":10}) for _item in data["response"]["docs"]: for _key in _item.keys(): item[_key] = _item[_key] break pool_ots.putConnector(ots_client) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots); mt.run() keys = ["docid","project_name","win_tenderer","tenderer","province","reg_capital","business_scope","estiblish_time","contact_person","mobile_no","phone_no"] df_data = {} for _key in keys: df_data[_key] = [] for item in list_project: for _key in keys: if _key in item: df_data[_key].append(item[_key]) else: df_data[_key].append("") df = pd.DataFrame(df_data) df.to_excel("../data/%s-%s中标信息.xlsx"%(time_from,time_to),columns=keys) def exportContact(): time_from = "2021-01-14" time_to = "2021-01-15" filename = "../data/%s-%s中标信息.xlsx"%(time_from,time_to) df1 = pd.read_excel(filename) set_company = set() for item in df1["tenderer"]: list_company = re.split("\['|', '|'\]|\[\]",item) for _company in list_company: if _company!="": set_company.add(_company) companys = list(set_company) task_queue = queue.Queue() list_company = [] for _company in companys: item = {"company_name":_company} list_company.append(item) task_queue.put(item) result_queue = queue.Queue() def _handle(item,result_queue): company = item["company_name"] data = solrQuery("contact",{"q":'company_name:"%s"'%company,"fl":"company_name,contact_person,mobile_no,phone_no","rows":10}) for _item in data["response"]["docs"]: for _key in _item.keys(): item[_key] = _item[_key] break mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30); mt.run() keys = ["company_name","contact_person","mobile_no","phone_no"] df_data = {} for _key in keys: df_data[_key] = [] ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]') for item in list_company: for _key in keys: if _key in item: df_data[_key].append(ILLEGAL_CHARACTERS_RE.sub(r'', item[_key])) else: df_data[_key].append("") df = pd.DataFrame(df_data) df.to_excel("../data/%s-%s竞争对手信息.xlsx"%(time_from,time_to),columns=keys) def countKeyword(): conn = getConnection_mysql() cursor = conn.cursor() print(0) sql = "select dochtmlcon from sys_document_22 where docchannel=52 and page_time>='2020-09-01' and page_time<='2020-09-07'" cursor.execute(sql) print(0.1) df = pd.read_excel("万郡绿建细分关键词.xls") list_keywords = df["细分类别"] dict_keywords = dict() for _key in list_keywords: dict_keywords[_key] = 0 print(1) from bs4 import BeautifulSoup while(True): rows = cursor.fetchmany(10000) print("==") if not rows: break for row in rows: _html = BeautifulSoup(row[0],"lxml").getText() for _key in list_keywords: if re.search(_key,_html) is not None: dict_keywords[_key] += 1 print(dict_keywords) list_count = [] for _key in list_keywords: list_count.append(dict_keywords[_key]) df1 = pd.DataFrame({"关键字":list_keywords,"数量":list_count}) df1.to_excel("关键词统计.xlsx") def countKeyword_solr(): def _handle(item,result_queue): keyword = item["keyword"] data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:101 AND dochtmlcon:"法院" '%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10}) _num = data["response"]["numFound"] item["zhongbiao"] = _num data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:52 AND dochtmlcon:"法院"'%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10}) _num = data["response"]["numFound"] item["zhaobiao"] = _num result_queue.put(item) file = "../data/关键词11.xlsx" df = pd.read_excel(file) task_queue = queue.Queue() print(df.keys()) for item in df["业务关键词"]: task_queue.put({"keyword":item,"zhaobiao":0,"zhongbiao":0}) result_queue = queue.Queue() mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=10) mt.run() list_keyword = [] list_zhaobiao = [] list_zhongbiao = [] while(True): try: item = result_queue.get(False) list_keyword.append(item["keyword"]) list_zhaobiao.append(item["zhaobiao"]) list_zhongbiao.append(item["zhongbiao"]) except Exception: break df1 = pd.DataFrame({"业务关键词":list_keyword,"招标公告":list_zhaobiao,"中标信息":list_zhongbiao}) df1.to_excel("%s_export.xlsx"%file,columns=["业务关键词","招标公告","中标信息"]) def query_from_solr(): data = solrQuery("document",{"q":'dochtmlcon:"法律" AND (docchannel:51 OR docchannel:104 or docchannel:52 or docchannel:102) AND province:"湖南" ',"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-01-20T23:59:59Z])',"fl":"id","rows":10}) _num = data["response"]["numFound"] print(_num) def export_province_keyword_count(): def _handle(item,result_queue,pool_ots): columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"] ots_client = pool_ots.getConnector() _province = item["province"] print(item) # keywords = item["keyword"]+" "+item["key"] list_keyword = item["keyword"] # for _temp in keywords.split(" "): # if len(_temp)>0: # list_keyword.append(_temp) should_queries = [] must_not_q = [] for _temp in list_keyword: should_queries.append(MatchPhraseQuery("doctitle","%s"%_temp)) must_not_q.append(WildcardQuery("tenderee","*%s*"%_temp)) bool_query_keyword = BoolQuery(should_queries=should_queries,minimum_should_match=2) page_time = item["page_time"] bool_query = BoolQuery(must_queries=[#bool_query_keyword # ,WildcardQuery("publishtime","%s*"%page_time) # ,MatchPhraseQuery("doctitle","服务") RangeQuery("page_time","2018-01-01","2019-01-01",include_lower=True,include_upper=False), TermQuery("docchannel",52), RangeQuery('status', '201', '300', include_lower=True, include_upper=True), WildcardQuery('city', '%s*'%_province) # ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_tenderer",0,include_lower=True)) ] # ,must_not_queries=must_not_q ) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("sub_docs_json.bidding_budget",SortOrder.DESC)]), limit=20, get_total_count=True), ColumnsToGet(column_names=columns,return_type=ColumnReturnType.SPECIFIED)) item["count"] = total_count list_data = getRow_ots(rows) item["list_data"] = list_data print(item) pool_ots.putConnector(ots_client) df = pd.read_excel("../data/省份信息.xlsx") list_province = [] for _name,_type,_parent in zip(df["cname"],df["ctype"],df["parentid"]): if _type==30 and _parent==4: list_province.append(_name) # filename = "../data/2021-02关键词导出数据.xlsx" # dict_keyword = {} # df1 = pd.read_excel(filename,dtype=str) # for _key,_keyword in zip(df1["key1"],df1["keyword"]): # print("===",str(_keyword)) # dict_keyword[_key] = "" if str(_keyword)=="nan" else _keyword # for _key in df1["关键词"]: # dict_keyword[_key] = "" keyword_str = ''' 快递 物流 供应链 运输 配送 仓储 冷链 整车 服务 ''' list_key = [] for _k in re.split("\s",keyword_str): _k1 = _k.strip() if len(_k1)>0: list_key.append(_k1) list_task = [] page_time = "2020-11" for _province in list_province: list_task.append({"page_time":page_time,"province":_province,"key":list_key,"keyword":list_key,"count":0}) task_queue = queue.Queue() for item in list_task: task_queue.put(item) result_queue = queue.Queue() pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots) mt.run() dict_key_data = dict() # list_data = [] # for item in list_task: # list_data.extend(item["list_data"]) # dict_channel = getDict_docchannel() # df_data= {} # print(list_data) # for row in list_data: # item = {} # _dict = row # set_dict_item(item,"docid",_dict.get("docid","")) # set_dict_item(item,"公告标题",_dict.get("doctitle","")) # set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),"")) # set_dict_item(item,"省份",_dict.get("province","")) # # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district","")) # set_dict_item(item,"城市",_dict.get("city","")) # set_dict_item(item,"发布时间",_dict.get("page_time","")) # # set_dict_item(item,"项目编号",_dict.get("project_code","")) # set_dict_item(item,"招标单位",_dict.get("tenderee","")) # set_dict_item(item,"招标联系人",_dict.get("tenderee_contact","")) # set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone","")) # set_dict_item(item,"代理单位",_dict.get("agency","")) # set_dict_item(item,"代理联系人",_dict.get("agency_contact","")) # set_dict_item(item,"代理联系人电话",_dict.get("agency_phone","")) # set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid"))))) # # sub_docs_json = _dict.get("sub_docs_json") # for _doc in json.loads(sub_docs_json): # if "win_tenderer" in _doc: # set_dict_item(item,"中标单位",_doc["win_tenderer"]) # if "win_tenderee_manager" in _doc: # set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"]) # if "win_tenderee_phone" in _doc: # set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"]) # if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0: # set_dict_item(item,"中标金额",_doc["win_bid_price"]) # if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0: # set_dict_item(item,"招标金额",_doc["bidding_budget"]) # if "招标金额" not in item: # set_dict_item(item,"招标金额","") # if "中标金额" not in item: # set_dict_item(item,"中标金额","") # if "中标单位" not in item: # set_dict_item(item,"中标单位","") # if "中标单位联系人" not in item: # set_dict_item(item,"中标单位联系人","") # if "中标单位联系电话" not in item: # set_dict_item(item,"中标单位联系电话","") # # # _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"])) # # if _line in set_line: # # continue # # if item["招标金额"]=="": # # continue # # set_line.add(_line) # for k,v in item.items(): # if k not in df_data: # df_data[k] = [] # df_data[k].append(v) # df1 = pd.DataFrame(df_data) # df1.to_excel("../data/%s_顺丰中标数据.xlsx"%getCurrent_date('%Y-%m-%d_%H%M%S'),columns=list_df_columns) for item in list_task: print("%s\t%d"%(item["province"],item["count"])) # for item in list_task: # dict_key_data[item["key"]][item["province"]] = item # dict_key_province = dict() # dict_key_province["关键词"] = [] # for _province in list_province: # dict_key_province[_province] = [] # for _key in dict_keyword.keys(): # dict_key_province["关键词"].append(_key) # for _province in list_province: # dict_key_province[_province].append(dict_key_data[_key][_province]["count"]) # columns = ["关键词"] # columns.extend(list_province) # df2 = pd.DataFrame(dict_key_province) # df2.to_excel("../data/%s_导出数据.xlsx"%filename,columns=columns) def export_keyword_count(): def _handle(item,result_queue,ots_client): shoud_q_docchannel = BoolQuery(should_queries=[ # RangeQuery("docchannel",51,105,True,True) TermQuery("docchannel",101), RangeQuery("docchannel",118,120,True,True) ] ) should_q_keyword = BoolQuery(should_queries=[ MatchPhraseQuery("doctitle",item["keyword"]), MatchPhraseQuery("doctextcon",item["keyword"]), MatchPhraseQuery("attachmenttextcon",item["keyword"]) ]) bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False), RangeQuery('status', '201', '300', include_lower=True, include_upper=True), generateBoolShouldQuery(["docchannel"],[51, 52, 101, 118, 119, 120, 114, 51, 103],TermQuery), # TermQuery("docchannel",101), # shoud_q_docchannel, should_q_keyword # MatchPhraseQuery(item["type"], item["keyword"]) ]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query, limit=1, get_total_count=True), ColumnsToGet(return_type=ColumnReturnType.ALL)) item["total_count"] = total_count if total_count>0: item["exists"] =1 else: item["exists"] =0 bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False), RangeQuery('status', '201', '300', include_lower=True, include_upper=True), TermQuery("docchannel",52), # shoud_q_docchannel, should_q_keyword # MatchPhraseQuery(item["type"], item["keyword"]) ]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query, limit=1, get_total_count=True), ColumnsToGet(return_type=ColumnReturnType.ALL)) item["zhaobiao_count"] = total_count bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False), RangeQuery('status', '201', '300', include_lower=True, include_upper=True), # TermQuery("docchannel",101), shoud_q_docchannel, should_q_keyword, # MatchPhraseQuery(item["type"], item["keyword"]) ]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query, limit=1, get_total_count=True), ColumnsToGet(return_type=ColumnReturnType.ALL)) item["zhongbiao_count"] = total_count range_from = "2022-02-21" range_to = "2022-02-22" _type = "doctitle" assert _type in ["doctitle","doctextcon"] list_dict_key_count = [] filename = "注册资金大于3kw.csv" df = pd.read_csv(filename) for item in df["name"]: list_dict_key_count.append({"keyword":item,"count":0,"exists":0,"range_from":range_from,"range_to":range_to,"type":_type}) # str_keys = ''' # 智慧税务 # 发票管理 # # ''' # for item in re.split("\s|\r|\n|,|,|、",str_keys): # if item.strip()!="": # list_dict_key_count.append({"keyword":item,"total_count":0,"zhaobiao_count":0,"zhongbiao_count":0,"range_from":range_from,"range_to":range_to,"type":_type}) task_queue = queue.Queue() for item in list_dict_key_count: task_queue.put(item) result_queue = queue.Queue() ots_client = getConnect_ots() # pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=60,ots_client=ots_client) mt.run() columns = ["keyword","total_count","exists","zhaobiao_count","zhongbiao_count","range_from","range_to","type"] df_data = {} for _c in columns: df_data[_c] = [] for item in list_dict_key_count: print(item) for _c in columns: if _c in item: df_data[_c].append(item[_c]) else: df_data[_c].append("") for k,v in df_data.items(): print(k,len(v)) df2 = pd.DataFrame(df_data) df2.to_excel("%s_数量导出全部类别.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=columns) def export_keyword_title(): ots_client = getConnect_ots() range_from = "2020-01-01" range_to = "2022-12-23" list_condition = [["医务室"], ["医院"], ["卫生院"], ["卫生所"], ["卫生室"], ["社区卫生服务中心"]] list_should_query = [] for _c in list_condition: if len(_c)==1: list_should_query.append(MatchPhraseQuery("doctitle",_c[0])) else: _must_query = [] for _q in _c: _must_query.append(MatchPhraseQuery("doctitle",_q)) list_should_query.append(BoolQuery(must_queries=_must_query)) keyword_query = BoolQuery(should_queries=list_should_query) bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",range_from,range_to), RangeQuery('status', '201', '300', include_lower=True, include_upper=True), keyword_query ]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True), ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED)) df_data = {"docid":[],"doctitle":[],"tenderee":[]} def setData(df_data,rows): list_dict = getRow_ots(rows) for _dict in list_dict: docid = _dict.get("docid","") doctitle = _dict.get("doctitle","") tenderee = _dict.get("tenderee","") df_data["docid"].append(docid) df_data["doctitle"].append(doctitle) df_data["tenderee"].append(tenderee) setData(df_data,rows) _count = len(rows) while next_token: rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED)) setData(df_data,rows) _count += 100 print(_count,total_count) file_begin = 0 file_length = 100000 _count = 0 while file_begin100000: continue df_2000["document_id"].append(_id) df_2000["document_text"].append(_text) df_2 = pd.DataFrame(df_2000) df_2.to_csv("../data/websouce_doc_%d-%d.csv"%(begin,end),columns=["document_id","document_text"],encoding="utf8",index=False) # save(dict_websource,"../data/dict_websource.pk") def getWinTenderer(sub_doc_json): if sub_doc_json is not None: sub_doc = json.loads(sub_doc_json) for _doc in sub_doc: if "win_tenderer" in _doc: return _doc["win_tenderer"] return "" def exportDocument_by_keywords(page_time, list_keyword = ["创客","STEAM","人工智能","课程服务","机器人中学","机器人小学","机器人幼儿园","机器人学校","Labplus","盛思","makeblock柴火","寓乐湾","美科科技","STEAM","能力风暴","优必选","蘑菇云","Dfrobot","中鸣","飞瑞敖","编程猫培生","八爪鱼","八爪鱼教育","童心制物"]): task_queue = queue.Queue() result_queue = queue.Queue() for _k in list_keyword: task_queue.put(_k) def _handle(keyword,result_queue): should_queries = [] for _temp in [keyword]: should_queries.append(MatchPhraseQuery("doctitle",_temp)) bool_query_keyword = BoolQuery(should_queries=should_queries) ots_client = getConnect_ots() bool_query = BoolQuery(must_queries=[RangeQuery('publishtime', range_from='2017-12-20'), MatchPhraseQuery("doctitle",keyword), TermQuery("docchannel","101") ]) is_all_succeed = False _count = 0 total_count = 1 next_token = None rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True), ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED)) for row in rows: _dict = dict() for values in row: for _v in values: _dict[_v[0]] = _v[1] result_queue.put({"docid":_dict.get("docid",""),"keyword":keyword,"tenderee":_dict.get("tenderee",""),"win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))}) print(keyword,next_token,total_count) while(next_token): try: # print(next_token) _count += len(rows) print("%s:%d/%d"%(keyword,_count,total_count)) for row in rows: _dict = dict() for values in row: for _v in values: _dict[_v[0]] = _v[1] result_queue.put({"docid":_dict.get("docid",""),"keyword":keyword,"tenderee":_dict.get("tenderee",""),"win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))}) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED)) except Exception as e: traceback.print_exc() mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30) mt.run() df_data = {"docid":[],"keyword":[],"tenderee":[],"win_tenderer":[]} while(True): try: item = result_queue.get(block=True,timeout=1) for _k in df_data.keys(): if _k in item: df_data[_k].append(item[_k]) else: df_data[_k].append("") except queue.Empty as e: break except Exception as e: traceback.print_exc() df = pd.DataFrame(df_data) df.to_csv("../data/exportArticle1_title.csv",columns=["docid","keyword","tenderee","win_tenderer"]) def exportGovement(): should_queries1 = [] for _temp in ["教育局","地化所","税务局","国土局","学校","大学","中学","小学","幼儿园","医院"]: should_queries1.append(WildcardQuery("tenderee","*%s*"%_temp)) should_queries2 = [] for _temp in ["浙江","江苏","湖北","西北","陕西","甘肃","青海","宁夏","新疆","重庆","四川","云南","贵州"]: should_queries2.append(WildcardQuery("province","*%s*"%_temp)) ots_client = getConnect_ots() page_time = "2020-12" bool_query = BoolQuery(must_queries=[BoolQuery(should_queries=should_queries1), BoolQuery(should_queries=should_queries2), TermQuery("docchannel","52"), RangeQuery("publishtime",page_time)]) columns = ["tenderee","tenderee_contact","tenderee_phone"] rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query, limit=100, sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) print(total_count) def getRow(rows,df_data,columns): for row in rows: _dict = dict() for part in row: for item in part: _dict[item[0]] = item[1] if "tenderee_contact" in _dict and "tenderee_phone" in _dict: for key in columns: df_data[key].append(_dict.get(key,"")) all_rows = 0 df_data = {} for key in columns: df_data[key] = [] getRow(rows,df_data,columns) _count = 100 while(next_token): print(_count,total_count) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,next_token=next_token, limit=100,get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) _count += 100 getRow(rows,df_data,columns) df2 = pd.DataFrame(df_data) df2.to_excel("../data/%s政府招标人导出数据.xlsx"%page_time,columns=columns) def export_attachment(): filename = "../data/销售部-字段名.xlsx" df = pd.read_excel(filename) list_dict = [] for _key in df["关键词"]: if len(_key)>0: list_dict.append({"keyword":_key}) def _handle(_dict,result_queue,pool_ots): _keyword = _dict["keyword"] ots_client = pool_ots.getConnector() keyword_query = BoolQuery(should_queries=[MatchPhraseQuery("doctextcon",_keyword), MatchPhraseQuery("doctitle",_keyword)]) bool_query = BoolQuery(must_queries=[RangeQuery("status","201","300",include_upper=True,include_lower=True), keyword_query]) columns = ["dochtmlcon","docid"] rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]), limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) print(_keyword,total_count) hit_count = 0 def getData(rows,hit_count): list_fields = getRow_ots(rows) for _fields in list_fields: dochtmlcon = _fields["dochtmlcon"] docid = _fields["docid"] _flag = False for url,suffix in re.findall("(http://[^\'\";;\n]+?\.(zip|rar|doc|xls|xlsx|pdf|txt))",dochtmlcon): try: result = requests.get(url,stream=True,timeout=100) if result.status_code==200: filename = get_file_name(url,result.headers) with open("../data/attachment/%s_%d_%s"%(_keyword,docid,filename),"wb") as f: f.write(result.content) _flag = True except Exception: pass if _flag: hit_count += 1 return hit_count hit_count = getData(rows,hit_count) _count = len(rows) while next_token: rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) _count += len(rows) if _count>20000: break hit_count = getData(rows,hit_count) if hit_count>20: break pool_ots.putConnector(ots_client) task_queue = queue.Queue() result_queue = queue.Queue() for item in list_dict: task_queue.put(item) pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots) mt.run() def exportIndustryCount(): import codecs time_from = "2020-12-21" time_to = "2020-12-25" # dict_channel = {"51":{"type":"公告变更"}, # "52":{"type":"招标公告"}, # "101":{"type":"中标信息"}, # "102":{"type":"招标预告"}, # "103":{"type":"招标答疑"}, # "104":{"type":"招标文件"}, # "105":{"type":"资审结果"}, # "103":{"type":"招标控制价"}, # "100":{"type":"未知类型"}} dict_industry = {} meta_industry = load("../data/other/class2dalei_menlei.pkl") for _key in meta_industry.keys(): dict_industry[_key] = {"type":_key} print(dict_industry.keys()) return task_queue = queue.Queue() result_queue = queue.Queue() for _key in dict_industry.keys(): task_queue.put(dict_industry[_key]) def _handle(item,result_queue,pool_ots): ots_client = pool_ots.getConnector() bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"]), RangeQuery("publishtime",time_from,time_to,include_lower=True,include_upper=True)]) columns = ["docid"] rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query, limit=1,get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) item["count"] = total_count columns = ["dochtmlcon"] bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"])]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query, limit=10,sort=Sort(sorters=[FieldSort("publishtime",SortOrder.ASC)]),get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) for row in rows: _dict = dict() for part in row: for v in part: _dict[v[0]] = v[1] with codecs.open("../data/industry/%s_%d.html"%(item["type"],_dict["docid"]),"w",encoding="UTF8") as f: f.write(_dict["dochtmlcon"]) pool_ots.putConnector(ots_client) pool_ots = ConnectorPool(init_num=20,max_num=30,method_init=getConnect_ots) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots) mt.run() columns = ["type","count"] df_data = {} for _c in columns: df_data[_c] = [] for _indus in dict_industry.keys(): for _c in columns: df_data[_c].append(dict_industry[_indus][_c]) df = pd.DataFrame(df_data) # df.to_excel("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns) df.to_csv("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns) def exportDocument_By_time(time_from,time_to,columns=["docid","doctitle","project_name","dochtmlcon"]): ''' :return: ''' ost_client = getConnect_ots() last_docid = 0 bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True), RangeQuery('status', '201', '300', include_lower=True, include_upper=True)]) rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) _count = len(rows) df_data = {} def getData(df_data,rows): list_dict = getRow_ots(rows) for _dict in list_dict: for _k,_v in _dict.items(): if _k not in df_data: df_data[_k] = [] df_data[_k].append(getLegal_str(_v)) getData(df_data,rows) while(next_token): print("%d/%d"%(_count,total_count)) rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index", SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) _count += len(rows) getData(df_data,rows) df = pd.DataFrame(df_data) df.to_excel("%s/%s-%s公告信息.xlsx"%(data_path,time_from,time_to),columns=columns) def processDocument(): filename = "../data/2021-01-29-2021-01-29公告信息.xlsx" df = pd.read_excel(filename) df.to_csv("../data/2021-01-29-2021-01-29公告信息.csv") return list_dict = [] for docid,doctitle,project_name,dochtmlcon in zip(df["docid"],df["doctitle"],df["project_name"],df["dochtmlcon"]): list_dict.append({"docid":docid,"doctitle":doctitle,"project_name":project_name,"dochtmlcon":dochtmlcon}) task_queue = queue.Queue() for _dict in list_dict: task_queue.put(_dict) result_queue = queue.Queue() def _handle(_dict,result_queue,pool_mysql): conn = pool_mysql.getConnector() cursor = conn.cursor() sql = "insert into test_extract(docid,doctitle,page_time) values(%d,%s,%s)"%(_dict["docid"],_dict["doctitle"],_dict["dochtmlcon"]) cursor.execute(sql) conn.commit() pool_mysql.putConnector(conn) # url = "http://192.168.2.101:15030" # myheaders = {'Content-Type': 'application/json'} # print(int(_dict["docid"])) # data = {"doc_id":int(_dict["docid"]),"title":_dict["doctitle"],"content":_dict["dochtmlcon"]} # resp = requests.post(url,json=data,headers=myheaders, verify=True) # result = json.loads(resp.content.decode("utf8"),"utf8") # _dict["product"] = result["product"] pool_mysql = ConnectorPool(init_num=20,max_num=30,method_init=getConnection_testmysql) mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=5,pool_mysql=pool_mysql) mt.run() # columns = ["docid","doctitle","project_name","product"] # # df_data = {} # for _c in columns: # df_data[_c] = [] # for _dict in list_dict: # for _c in columns: # df_data[_c].append(_dict.get(_c,"")) # df = pd.DataFrame(df_data) # df.to_excel("%s.product.xlsx"%(filename),columns=columns) def export_extract_check(): ''' :return:导出数据提取校验的结果并生成报告 ''' conn = getConnection_testmysql() cursor = conn.cursor() sql = " select docid,json_result from exportdb.extract_check " cursor.execute(sql) dict_global = {} df_global = {"key_type":[],"online_count":[],"test_count":[],"diff_count":[],"diff_percent":[]} df_document = {"docid":[]} while True: rows = cursor.fetchmany(10000) if not rows: break for docid,json_result in rows: df_document["docid"].append(docid) _result = json.loads(json_result) for k,v in _result.items(): key = k.split("_") _key = "_".join(key[:-1]) if "punish" in _key or "complainants" in _key or "institutions" in _key: continue if k not in df_document: df_document[k] = [] df_document[k].append(v) key_type = key[-1] if _key not in dict_global: dict_global[_key] = {} if key_type not in dict_global[_key]: dict_global[_key][key_type] = 0 if key_type=="diff": dict_global[_key][key_type] += v if key_type in ("online","test"): if isinstance(v,str): if v!="": dict_global[_key][key_type] += 1 elif isinstance(v,list): dict_global[_key][key_type] += len(v) for k,v in dict_global.items(): df_global["key_type"].append(k) df_global["online_count"].append(v["online"]) df_global["test_count"].append(v["test"]) df_global["diff_count"].append(v["diff"]) df_global["diff_percent"].append(v["diff"]/v["online"] if v["online"]>0 else 0) filename = "../data/%s_extract_check.xlsx"%(time.strftime("%Y-%m-%d")) with pd.ExcelWriter(filename) as writer: df1 = pd.DataFrame(df_global) df1.to_excel(writer,sheet_name="global") for k,v in df_document.items(): print(k,len(v)) df2 = pd.DataFrame(df_document) df2.to_excel(writer,sheet_name="document") writer.save() writer.close() def exportDocument_dump(): # filename = "../data/重复公告.xlsx" # df = pd.read_excel(filename) ots_client = getConnect_ots() columns = ["docid","docchannel","page_time","web_source_no","doctitle","tenderee","agency","project_code","project_name","sub_docs_json"] df_keys = ["docid","docchannel","page_time","web_source_no","doctitle","doctitle_refine","tenderee","agency","project_code","project_name","bidding_budget","win_bid_price","win_tenderer","URL"] df_data = {} for _key in df_keys: df_data[_key] = [] bool_query = BoolQuery(must_queries=[TermQuery("page_time","2021-03-03"), RangeQuery("status",201,300,True,True)]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) def getData(df_data,rows): list_data = getRow_ots(rows) for row in list_data: dict_find = {} for _key in df_keys: dict_find[_key] = 0 for _k,_v in row.items(): if _k in df_keys: dict_find[_k] = 1 if _k=="project_code": _v = '"%s"'%_v df_data[_k].append(_v) doctitle = row.get("doctitle","") df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle)) df_data["URL"].append("http://www.bidizhaobiao.com/info-%d.html"%(row["docid"])) dict_find["URL"] = 1 dict_find["doctitle_refine"] = 1 sub_docs_json = row.get("sub_docs_json","[{}]") doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""} if sub_docs_json is not None: for sub_docs in json.loads(sub_docs_json): for _key_sub_docs in sub_docs.keys(): if _key_sub_docs in doc_columns: if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]: if _key_sub_docs in ["bidding_budget","win_bid_price"]: if float(sub_docs[_key_sub_docs])>0: doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs]) else: doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs]) for _k,_v in doc_columns.items(): dict_find[_k] = 1 df_data[_k].append(_v) for _k,_v in dict_find.items(): if _v==0: df_data[_k].append("") _count = len(rows) getData(df_data,rows) while next_token: print("%d/%d"%(_count,total_count)) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) getData(df_data,rows) _count += len(rows) # for docid in df["docid"]: # bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))]) # # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", # SearchQuery(bool_query , limit=100, get_total_count=True), # ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) # list_data = getRow_ots(rows) # if len(list_data)>0: # dict_find = {} # for _key in df_keys: # dict_find[_key] = 0 # for _k,_v in list_data[0].items(): # if _k in df_keys: # dict_find[_k] = 1 # df_data[_k].append(_v) # doctitle = list_data[0].get("doctitle","") # df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle)) # dict_find["doctitle_refine"] = 1 # sub_docs_json = list_data[0].get("sub_docs_json","[{}]") # doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""} # if sub_docs_json is not None: # for sub_docs in json.loads(sub_docs_json): # for _key_sub_docs in sub_docs.keys(): # if _key_sub_docs in doc_columns: # if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]: # if _key_sub_docs in ["bidding_budget","win_bid_price"]: # if float(sub_docs[_key_sub_docs])>0: # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs]) # else: # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs]) # for _k,_v in doc_columns.items(): # dict_find[_k] = 1 # df_data[_k].append(_v) # for _k,_v in dict_find.items(): # if _v==0: # df_data[_k].append("") df1 = pd.DataFrame(df_data) df1.to_csv("../data/0303去重.csv",columns=df_keys) def exportDocument_dump_mysql(): conn = getConnection_testmysql() cursor = conn.cursor() columns = ["project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer","group_id","docid"] df_data = {} for _c in columns: df_data[_c] = [] sql = " select "+",".join(columns)+" from run_dumplicate_document_his where group_id in (select group_id from run_dumplicate_document_his group by group_id having count(1)>1)" cursor.execute(sql) while True: rows = cursor.fetchmany(100000) if not rows: break for row in rows: for _i in range(len(columns)): df_data[columns[_i]].append(row[_i]) df = pd.DataFrame(df_data) df.to_csv("../data/0304去重.csv",columns=["group_id","docid","project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer"]) print(cursor.description) def getDict_docchannel(): filename = "docchannel.pk" if os.path.exists(filename): _dict = load(filename) return _dict conn = getConnection_mysql() cursor = conn.cursor() sql = "select channel_id,chnlname from sys_channel " cursor.execute(sql) rows = cursor.fetchall() _dict = dict() for row in rows: _dict[row[0]] = row[1] save(_dict,filename) return _dict def exportDocument_by_doctitle(): def timeAdd_minute(_time,minutes): a = time.mktime(time.strptime(_time,'%Y-%m-%d'))+60*minutes _time1 = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(a)) return _time1 columns = ["docid","attachmenttextcon","doctitle","docchannel","bidway","province","city","district","info_type","page_time","crtime","project_code","tenderee","project_name","agency","sub_docs_json","tenderee_contact","tenderee_phone","doctextcon","product","moneysource","time_bidclose","time_bidopen"] columns = ["doctitle","attachmenttextcon","doctextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no"] columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no"] dict_channel = getDict_docchannel() task_queue = queue.Queue() result_queue = queue.Queue() list_keyword = [] # for _keyword in re.split("\s|/|、",str_keyword): # if len(_keyword.strip())==0 and isinstance(_keyword,(str)): # continue # print(_keyword) # item = {"keyword":_keyword.strip()} # list_keyword.append(item) # task_queue.put(item) filename = "有效调用次数统计_20220830_v1.xlsx" df = pd.read_excel(filename) for company in df["enterpriseName"]: _dict = {"keyword":company} task_queue.put(_dict) list_keyword.append(_dict) start_day = "2019-01-01" count_days = 90 # for _i in range(count_days): # # current_date = timeAdd(start_day,_i) # for _j in range(24*6): # start_minute = timeAdd_minute(current_date,10*_j) # end_minute = timeAdd_minute(current_date,10*(_j+1)) # # item = {"start_minute":start_minute,"end_minute":end_minute} # list_keyword.append(item) # task_queue.put(item) ots_client = getConnect_ots() def _handle(item,result_queue,ots_client): # should_q_keyword = BoolQuery(should_queries=[ # # MatchPhraseQuery("tenderee",item["keyword"]), # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["keyword"])), # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.second_tenderer",item["keyword"])), # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.third_tenderer",item["keyword"])) # MatchPhraseQuery("doctextcon",item["keyword"]), # MatchPhraseQuery("doctitle",item["keyword"]), # MatchPhraseQuery("attachmenttextcon",item["keyword"]) # ]) # # should_q2 = BoolQuery(should_queries=[WildcardQuery('province', '%s*'%"广东") # # ,WildcardQuery('province', '%s*'%"湖南") # # ,WildcardQuery('province', '%s*'%"广西") # ]) # # should_q_tenderee = BoolQuery(should_queries=[WildcardQuery("tenderee","*中学*"), # WildcardQuery("tenderee","*大学*"), # WildcardQuery("tenderee","*小学*"), # WildcardQuery("tenderee","*教育局*")]) bool_query = BoolQuery(must_queries=[ # RangeQuery("page_time","2019-01-01","2023-01-01"), # TermQuery("page_time","2022-02-18"), generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[item["keyword"]],MatchPhraseQuery), # RangeQuery("crtime",item["start_minute"],item["end_minute"]) RangeQuery("status",151,300,True,True), # TermQuery("tenderee",item["keyword"]) # ,TermQuery("docchannel",52) # ,should_q_keyword # ,should_q_tenderee # ,should_q2 ]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("crtime",SortOrder.DESC)]), limit=1, get_total_count=True), ColumnsToGet(return_type=ColumnReturnType.NONE)) item["total_count"] = total_count list_data = getRow_ots(rows) for _data in list_data: _data["keyword"] = item["keyword"] result_queue.put(_data) _count = len(list_data) while next_token: print("%d/%d"%(_count,total_count)) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) list_data = getRow_ots(rows) _count += len(list_data) for _data in list_data: _data["keyword"] = item["keyword"] result_queue.put(_data) mt = MultiThreadHandler(task_queue,_handle,result_queue,40,ots_client=ots_client) mt.run() set_docid = set() list_item = [] # for item in list_keyword: # print(item["keyword"]) total_count = 0 for item in list_keyword: total_count += item["total_count"] print(item["total_count"]) print("total_count:%d"%(total_count)) keys = list_keyword[0].keys() df_data = {} for item in list_keyword: for k,v in item.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) dumplicate = False try: while True: _dict = result_queue.get(False) _docid = _dict.get("docid") if _docid in set_docid and not dumplicate: continue set_docid.add(_docid) list_item.append(_dict) except Exception as e: print(e) list_docid = list(set_docid) with open("list_docid.txt","w",encoding="utf8") as f: for _docid in list_docid: f.write(str(_docid)) f.write("\n") f.flush() # log("get document taotal_count:%d"%len(list_item)) # set_line = set() # getRowData(df_data,list_item,set_line,list_keyword,dict_channel,dumplicate) # set_enterprise = set() # for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]): # set_enterprise.add(_tenderee) # set_enterprise.add(_agency) # set_enterprise.add(_win_tenderer) # if "" in set_enterprise: # set_enterprise.remove("") # if None in set_enterprise: # set_enterprise.remove(None) # dict_enterprise = getDictEnterprise(list(set_enterprise)) # if len(set_enterprise)>0: # for _i in range(len(df_data["招标单位"])): # # _enterprise_name = df_data["招标单位"][_i] # # if df_data["招标联系人电话"][_i]=="": # # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts") # # if contacts is not None: # # _person,_phone = getOneContact(contacts) # # df_data["招标联系人"][_i] = _person # # df_data["招标联系人电话"][_i] = _phone # # _enterprise_name = df_data["代理单位"][_i] # if df_data["代理联系人电话"][_i]=="": # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts") # if contacts is not None: # _person,_phone = getOneContact(contacts) # df_data["代理联系人"][_i] = _person # df_data["代理联系人电话"][_i] = _phone # # _enterprise_name = df_data["中标单位"][_i] # if df_data["中标单位联系电话"][_i]=="": # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts") # if contacts is not None: # _person,_phone = getOneContact(contacts) # df_data["中标单位联系人"][_i] = _person # df_data["中标单位联系电话"][_i] = _phone df = pd.DataFrame(df_data) df.to_excel("../data/%s_export11.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S"))) set_columns = set() list_df_columns = [] def set_dict_item(_dict,name,v): _dict[name] = getLegal_str(v) if name not in set_columns: set_columns.add(name) list_df_columns.append(getLegal_str(name)) def set_dict_item_columns(set_columns1,list_df_columns1,_dict,name,v): _dict[name] = getLegal_str(v) if name not in set_columns1: set_columns1.add(name) list_df_columns1.append(name) def exportDocument_medicine(start_time,end_time): # filename = "../data/重复公告.xlsx" # df = pd.read_excel(filename) ots_client = getConnect_ots() # columns = ["doctitle","docchannel","time_bidopen","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"] columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen"] set_enter = set() str_enter = ''' 北京嘉和美康信息技术有限公司 阿里健康科技(中国)有限公司 北大医疗信息技术有限公司 创业慧康科技股份有限公司 东华医为科技有限公司 望海康信(北京)科技股份公司 国新健康保障服务有限公司 南京海泰医疗信息系统有限公司 南京海泰信息技术有限公司 浙江和仁科技股份有限公司 北京惠每科技有限公司 金蝶医疗软件科技有限公司 北京京东健康有限公司 四川久远银海软件股份有限公司 零氪科技(北京)有限公司 北京麦迪克斯科技有限公司 苏州麦迪斯顿医疗科技股份有限公司 江苏曼荼罗软件股份有限公司 北京平安联想智慧医疗信息技术有限公司 青岛百洋智能科技股份有限公司 上海森亿医疗科技有限公司 万达信息股份有限公司 微医集团(浙江)有限公司 卫宁健康科技集团股份有限公司 心医国际数字医疗系统(大连)有限公司 医渡云(北京)技术有限公司 医惠科技有限公司 易联众信息技术股份有限公司 智业软件股份有限公司 中电数据服务有限公司 重庆中联信息产业有限责任公司 杭州卓健信息科技股份有限公司 大连万达集团股份有限公司 ''' for a in re.split("\s+",str_enter): if a.strip()!="": set_enter.add(a.strip()) dict_channel = getDict_docchannel() # list_province = ["江西","湖南","四川","安徽"] list_province = ["全国"] for _province in list_province: df_data = {} str_p = ''' 智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件 县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗 数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化 临床应用 ''' list_prov = re.split("\s",str_p) list_mu = [] for _p in list_prov: if _p.strip()=="": continue print(_p) list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip())) s_tenderee = '医院、卫生院、疗养院、健康局、卫生局' list_should_ten = [] for _p in re.split("、",s_tenderee): if _p.split()=="": continue list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip())) list_should_win = [] for _win in list(set_enter): list_should_win.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_win))) list_should_chan = [] list_should_chan.append(TermQuery("docchannel",52)) list_should_chan.append(TermQuery("docchannel",101)) list_should_chan.append(TermQuery("docchannel",102)) should_q1 = BoolQuery(should_queries=list_mu) should_q2 = BoolQuery(should_queries=list_should_ten) should_q3 = BoolQuery(should_queries=list_should_chan) bool_query = BoolQuery(must_queries=[ BoolQuery(should_queries=list_should_win), RangeQuery("page_time",start_time,end_time,include_lower=True,include_upper=True), RangeQuery("status",151,300,True,True), # should_q1, # should_q2, # should_q3, ], # must_not_queries=[ # MatchPhraseQuery("doctextcon","器械"), # MatchPhraseQuery("doctextcon","仪器"), # ] ) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) set_line = set() _count = len(rows) # getData(df_data,rows,set_line) list_row = getRow_ots(rows) getRowData(df_data,list_row,set_line,[],dict_channel,False) while next_token: print("%d/%d"%(_count,total_count)) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) list_row = getRow_ots(rows) getRowData(df_data,list_row,set_line,[],dict_channel,False) _count += len(rows) if len(list(df_data.keys()))>0: if len(df_data[list(df_data.keys())[0]])>=300: break set_enterprise = set() for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]): set_enterprise.add(_tenderee) set_enterprise.add(_agency) set_enterprise.add(_win_tenderer) if "" in set_enterprise: set_enterprise.remove("") if None in set_enterprise: set_enterprise.remove(None) dict_enterprise = getDictEnterprise(list(set_enterprise)) if len(set_enterprise)>0: for _i in range(len(df_data["招标单位"])): _enterprise_name = df_data["招标单位"][_i] if df_data["招标联系人电话"][_i]=="": contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts") if contacts is not None: _person,_phone = getOneContact(contacts) df_data["招标联系人"][_i] = _person df_data["招标联系人电话"][_i] = _phone _enterprise_name = df_data["代理单位"][_i] if df_data["代理联系人电话"][_i]=="": contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts") if contacts is not None: _person,_phone = getOneContact(contacts) df_data["代理联系人"][_i] = _person df_data["代理联系人电话"][_i] = _phone _enterprise_name = df_data["中标单位"][_i] if df_data["中标单位联系电话"][_i]=="": contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts") if contacts is not None: _person,_phone = getOneContact(contacts) df_data["中标单位联系人"][_i] = _person df_data["中标单位联系电话"][_i] = _phone return df_data df1 = pd.DataFrame(df_data) print(len(df_data["docid"])) df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns) def getRowDataWithKey(df_data,rows,columns): global list_df_columns list_df_columns = columns for row in rows: for c in columns: if c not in df_data: df_data[c] = [] df_data[c].append(row.get(c)) def getRowData(df_data,rows,set_line,list_keyword,dict_channel,dumplicate): dict_line = {} # list_data = getRow_ots(rows) _index = 0 for row in rows: _index += 1 item = {} _dict = row set_dict_item(item,"docid",_dict.get("docid","")) # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status","")) set_dict_item(item,"crtime",_dict.get("crtime","")) # set_dict_item(item,"要素数",_dict.get("extract_count",0)) set_dict_item(item,"公告标题",_dict.get("doctitle","")) set_dict_item(item,"web_source_no",_dict.get("web_source_no","")) set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),"")) # set_dict_item(item,"web_source_name",_dict.get("web_source_name","")) # set_dict_item(item,"原公告类别",dict_channel.get(_dict.get("original_docchannel",""),"")) # set_dict_item(item,"detail_link",_dict.get("detail_link","")) set_dict_item(item,"公告内容",getLegal_str(_dict.get("doctextcon",""))) set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon",""))) if "keyword" in _dict: set_dict_item(item,"关键词",_dict["keyword"]) else: _wholeword = re.sub("\s","",str(row.get("doctitle","")+row.get("doctextcon","")[:30000]+row.get("attachmenttextcon","")[:30000]).replace("(","(").replace(")",")")) _pattern = "|".join([re.escape(str(a).replace("(","(").replace(")",")")) for a in list_keyword]) set_dict_item(item,"关键词",",".join(list(set(re.findall(_pattern,_wholeword))))) # set_dict_item(item,"关键词",_dict.get("keyword","")) set_dict_item(item,"产品",_dict.get("product","")) set_dict_item(item,"服务期限",_dict.get("service_time","")) set_dict_item(item,"省份",_dict.get("province","")) # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district","")) set_dict_item(item,"城市",_dict.get("city","")) set_dict_item(item,"区县",_dict.get("district","")) set_dict_item(item,"发布时间",_dict.get("page_time","")) set_dict_item(item,"截标时间",_dict.get("time_bidclose","")) set_dict_item(item,"开标时间",_dict.get("time_bidopen","")) # set_dict_item(item,"创建时间",_dict.get("crtime","")) set_dict_item(item,"招标方式",_dict.get("bidway","")) set_dict_item(item,"行业一级分类",_dict.get("industry","")) set_dict_item(item,"行业二级分类",_dict.get("info_type","")) set_dict_item(item,"uuid",_dict.get("uuid")) # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle",""))) set_dict_item(item,"项目编号",_dict.get("project_code","")) set_dict_item(item,"项目名称",_dict.get("project_name","")) set_dict_item(item,"招标单位",_dict.get("tenderee","")) set_dict_item(item,"招标联系人",_dict.get("tenderee_contact","")) set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone","")) set_dict_item(item,"代理单位",_dict.get("agency","")) set_dict_item(item,"代理联系人",_dict.get("agency_contact","")) set_dict_item(item,"代理联系人电话",_dict.get("agency_phone","")) set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid"))))) set_dict_item(item,"评审专家",_dict.get("person_review","")) set_dict_item(item,"开标时间",_dict.get("time_bidopen","")) set_dict_item(item,"截标时间",_dict.get("time_bidclose","")) set_dict_item(item,"获取文件开始时间",_dict.get("time_get_file_start","")) set_dict_item(item,"获取文件结束时间",_dict.get("time_get_file_end","")) set_dict_item(item,"保证金递交开始时间",_dict.get("time_earnest_money_start","")) set_dict_item(item,"保证金递交结束时间",_dict.get("time_earnest_money_end","")) sub_docs_json = _dict.get("sub_docs_json") set_tenderer = set() if sub_docs_json is not None: docs = json.loads(sub_docs_json) docs.sort(key=lambda x:float(x.get("win_bid_price",0))) for _doc in docs: if "win_tenderer" in _doc: set_dict_item(item,"中标单位",_doc["win_tenderer"]) if "second_tenderer" in _doc: set_dict_item(item,"第二候选单位",_doc["second_tenderer"]) set_tenderer.add(_doc.get("second_tenderer")) if "third_tenderer" in _doc: set_dict_item(item,"第三候选单位",_doc["third_tenderer"]) set_tenderer.add(_doc.get("third_tenderer")) if "win_tenderee_manager" in _doc: set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"]) if "win_tenderee_phone" in _doc: set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"]) if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0: set_dict_item(item,"中标金额",_doc["win_bid_price"]) if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0: set_dict_item(item,"招标金额",_doc["bidding_budget"]) set_dict_item(item,"入围供应商",",".join(list(set_tenderer))) if "第二候选单位" not in item: set_dict_item(item,"第二候选单位","") if "第三候选单位" not in item: set_dict_item(item,"第三候选单位","") if "招标金额" not in item: set_dict_item(item,"招标金额","") if "中标金额" not in item: set_dict_item(item,"中标金额","") if "中标单位" not in item: set_dict_item(item,"中标单位","") if "中标单位联系人" not in item: set_dict_item(item,"中标单位联系人","") if "中标单位联系电话" not in item: set_dict_item(item,"中标单位联系电话","") # if item["中标单位"] not in set_enter: # continue if not dumplicate: if item["项目编号"]!="": _line = "%s-%s-%s-%s-%s-%s"%(item["公告类别"],item["项目编号"],item["招标单位"],item["中标单位"],str(item["招标金额"]),str(item["中标金额"])) if _line in dict_line: dict_line[_line].append(item) continue dict_line[_line] = [item] _line2 = "%s-%s-%s-%s-%s-%s"%(item["公告标题"],item["公告类别"],item["招标单位"],str(item["招标金额"]),item["中标单位"],str(item["中标金额"])) if _line2 in dict_line: dict_line[_line2].append(item) continue dict_line[_line2] = [item] # if re.search("[大中小]学|幼儿园|医院|公司",item["招标单位"]) is not None: # continue # if _dict.get("docid","") in set_ig_docid: # continue # if item["招标金额"]=="": # continue for k,v in item.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) if not dumplicate: dict_dump = {} columns = ["group_id"] columns.extend(list_df_columns) for k in columns: dict_dump[k] = [] group_id = 1 for k,v in dict_line.items(): if len(v)==1: continue for item in v: dict_dump["group_id"].append(group_id) for k in list_df_columns: dict_dump[k].append(item.get(k)) group_id += 1 df_dump = pd.DataFrame(dict_dump) df_dump.to_excel("%s/../data/dumplicate/%s_重复数据.xlsx"%(os.path.dirname(__file__),getCurrent_date("%Y-%m-%d_%H%M%S"))) def getRowData_sp1(df_data,rows,set_line,list_keyword,dict_channel,dumplicate): dict_line = {} # list_data = getRow_ots(rows) _index = 0 set_id = set() for row in rows: _index += 1 item = {} _dict = row _id = _dict.get("id") if _id is not None and _id in set_id: continue set_id.add(_id) set_dict_item(item,"id",_dict.get("id","")) # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status","")) # set_dict_item(item,"crtime",_dict.get("crtime","")) set_dict_item(item,"detaillink",_dict.get("detaillink","")) # set_dict_item(item,"web_source_no",_dict.get("web_source_no","")) set_dict_item(item,"公告类别",dict_sptype.get(str(_dict.get("sp_type","")),"")) set_dict_item(item,"page_time",getLegal_str(_dict.get("page_time",""))) # set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon",""))) set_dict_item(item,"page_title",_dict.get("page_title","")) set_dict_item(item,"record_id",_dict.get("record_id","")) set_dict_item(item,"web_source_name",_dict.get("web_source_name","")) # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district","")) set_dict_item(item,"web_source_no",_dict.get("web_source_no","")) for k,v in item.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) def getRowData_sp(df_data,rows,set_line,list_keyword,dict_channel,dumplicate): dict_line = {} # list_data = getRow_ots(rows) _index = 0 set_id = set() for row in rows: _index += 1 item = {} _dict = row set_dict_item(item,"docid",_dict.get("docid","")) # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status","")) # set_dict_item(item,"crtime",_dict.get("crtime","")) set_dict_item(item,"公告标题",_dict.get("page_title","")) # set_dict_item(item,"web_source_no",_dict.get("web_source_no","")) set_dict_item(item,"公告类别",dict_sptype.get(str(_dict.get("sp_type","")),"")) set_dict_item(item,"公告内容",getLegal_str(_dict.get("page_content",""))) # set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon",""))) if "keyword" in _dict: set_dict_item(item,"关键词",_dict["keyword"]) else: set_dict_item(item,"关键词",",".join(list(set(re.findall("|".join([re.escape(str(a).replace("(","(").replace(")",")")) for a in list_keyword]),re.sub("\s","",str(row.get("doctitle","")+row.get("doctextcon","")[:30000]+row.get("attachmenttextcon","")[:30000]).replace("(","(").replace(")",")"))))))) # set_dict_item(item,"关键词",_dict.get("keyword","")) set_dict_item(item,"产品",_dict.get("product","")) set_dict_item(item,"服务期限",_dict.get("service_time","")) set_dict_item(item,"省份",_dict.get("province","")) # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district","")) set_dict_item(item,"城市",_dict.get("city","")) set_dict_item(item,"区县",_dict.get("district","")) set_dict_item(item,"发布时间",_dict.get("page_time","")) set_dict_item(item,"截标时间",_dict.get("time_bidclose","")) set_dict_item(item,"开标时间",_dict.get("time_bidopen","")) # set_dict_item(item,"创建时间",_dict.get("crtime","")) set_dict_item(item,"招标方式",_dict.get("bidway","")) set_dict_item(item,"行业一级分类",_dict.get("industry","")) set_dict_item(item,"行业二级分类",_dict.get("info_type","")) set_dict_item(item,"来源",_dict.get("web_source_name","")) set_dict_item(item,"uuid",_dict.get("uuid")) # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle",""))) set_dict_item(item,"项目编号",_dict.get("page_code","")) set_dict_item(item,"项目名称",_dict.get("project_name","")) set_dict_item(item,"招标单位",_dict.get("tenderee","")) set_dict_item(item,"招标联系人",_dict.get("tenderee_contact","")) set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone","")) set_dict_item(item,"代理单位",_dict.get("agency","")) set_dict_item(item,"代理联系人",_dict.get("agency_contact","")) set_dict_item(item,"代理联系人电话",_dict.get("agency_phone","")) # set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid"))))) set_dict_item(item,"评审专家",_dict.get("person_review","")) set_dict_item(item,"开标时间",_dict.get("time_bidopen","")) set_dict_item(item,"截标时间",_dict.get("time_bidclose","")) set_dict_item(item,"获取文件开始时间",_dict.get("time_get_file_start","")) set_dict_item(item,"获取文件结束时间",_dict.get("time_get_file_end","")) set_dict_item(item,"保证金递交开始时间",_dict.get("time_earnest_money_start","")) set_dict_item(item,"保证金递交结束时间",_dict.get("time_earnest_money_end","")) sub_docs_json = _dict.get("sub_docs_json") set_tenderer = set() if sub_docs_json is not None: docs = json.loads(sub_docs_json) docs.sort(key=lambda x:x.get("win_bid_price",0)) for _doc in docs: if "win_tenderer" in _doc: set_dict_item(item,"中标单位",_doc["win_tenderer"]) if "second_tenderer" in _doc: set_dict_item(item,"第二候选单位",_doc["second_tenderer"]) set_tenderer.add(_doc.get("second_tenderer")) if "third_tenderer" in _doc: set_dict_item(item,"第三候选单位",_doc["third_tenderer"]) set_tenderer.add(_doc.get("third_tenderer")) if "win_tenderee_manager" in _doc: set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"]) if "win_tenderee_phone" in _doc: set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"]) if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0: set_dict_item(item,"中标金额",_doc["win_bid_price"]) if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0: set_dict_item(item,"招标金额",_doc["bidding_budget"]) set_dict_item(item,"入围供应商",",".join(list(set_tenderer))) if "第二候选单位" not in item: set_dict_item(item,"第二候选单位","") if "第三候选单位" not in item: set_dict_item(item,"第三候选单位","") if "招标金额" not in item: set_dict_item(item,"招标金额","") if "中标金额" not in item: set_dict_item(item,"中标金额","") if "中标单位" not in item: set_dict_item(item,"中标单位","") if "中标单位联系人" not in item: set_dict_item(item,"中标单位联系人","") if "中标单位联系电话" not in item: set_dict_item(item,"中标单位联系电话","") # if item["中标单位"] not in set_enter: # continue if not dumplicate: if item["项目编号"]!="": _line = "%s-%s-%s-%s-%s-%s"%(item["公告类别"],item["项目编号"],item["招标单位"],item["中标单位"],str(item["招标金额"]),str(item["中标金额"])) if _line in dict_line: dict_line[_line].append(item) continue dict_line[_line] = [item] _line2 = "%s-%s-%s-%s-%s-%s"%(item["公告标题"],item["公告类别"],item["招标单位"],str(item["招标金额"]),item["中标单位"],str(item["中标金额"])) if _line2 in dict_line: dict_line[_line2].append(item) continue dict_line[_line2] = [item] # if re.search("[大中小]学|幼儿园|医院|公司",item["招标单位"]) is not None: # continue # if _dict.get("docid","") in set_ig_docid: # continue # if item["招标金额"]=="": # continue for k,v in item.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) if not dumplicate: dict_dump = {} columns = ["group_id"] columns.extend(list_df_columns) for k in columns: dict_dump[k] = [] group_id = 1 for k,v in dict_line.items(): if len(v)==1: continue for item in v: dict_dump["group_id"].append(group_id) for k in list_df_columns: dict_dump[k].append(item.get(k)) group_id += 1 df_dump = pd.DataFrame(dict_dump) df_dump.to_excel("%s/../data/dumplicate/%s_重复数据.xlsx"%(os.path.dirname(__file__),getCurrent_date("%Y-%m-%d_%H%M%S"))) def filterRow(list_row,column,list_not_keywrods): list_result = [] for row in list_row: _product = row.get(column,"") sub_docs_json = row.get("sub_docs_json","") doctitle = row.get("doctitle","") if re.search("设计",sub_docs_json) is not None: if re.search("装修",str(doctitle)+str(sub_docs_json)) is None: list_result.append(row) else: print("===",_product) # if re.search("|".join([re.escape(i) for i in list_not_keywrods]),_product) is not None: # continue # list_result.append(row) # if row.get("关键词",1)==row.get("招标单位",2) or row.get("关键词",2)==row.get("中标单位",3): # list_result.append(row) # doctitle = row.get("doctitle") # doctextcon = row.get("doctextcon") # if len(re.sub('\s','',doctextcon))==len(doctitle)+4: # list_result.append(row) # tenderee_phone = row.get("tenderee_phone","") # if len(tenderee_phone)==11: # list_result.append(row) return list_result dict_sptype = {"2":"审批信息", "4":"审批结果", "8":"核准公示", "16":"核准结果", "32":"备案公示", "64":"备案结果", "128":"推介项目", "256":"推介结果", "512":"项目撤销", "1024":"筹备阶段"} def getKeywordByFile(): filename = "审批标题对比检查结果(20220831).xlsx" df = pd.read_excel(filename,sheetname=1) list_data = [] for _title,_no,_type in zip(df["标题"],df["编号"],df["检查结果"]): if _type not in ["接口错"]: continue _dict = {"title":_title, "web_source":_no} list_data.append(_dict) return list_data def exportDocument_by_pagetime(): ots_client = getConnect_ots() # columns = ["doctitle","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"] # columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"] # columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_name"] columns = ["doctitle","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"] # columns = ["docchannel","docid","project_name","product","doctitle","page_time","province","city","time_get_file_end","time_bidclose","project_code","sub_docs_json","tenderee","info_type","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","detail_link","bidway","crtime","extract_count","products"] # columns = ["page_time","doctitle","crtime","web_source_no","web_source_name","detail_link","original_docchannel","uuid"] dict_channel = getDict_docchannel() # columns = ["doctitle","dochtmlcon","page_time","web_source_no","web_source_name","sub_docs_json"] # columns = ["tenderee","tenderee_contact","tenderee_phone"] list_query = [] str_keyword = ''' 重大安全风险防控 大范围速扫设备 气体泄露监测 ''' list_keyword = splitIntoList(str_keyword,"[\s\n、,,|]") str_con_keyword = ''' 信息化、弱电、pis、通讯 ''' con_keyword = splitIntoList(str_con_keyword,'[\s\n、,,|]') print(con_keyword) should_q_win = [] for _keyword in list_keyword: should_q_win.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword))) q_win = BoolQuery(should_queries=should_q_win) str_not_keyword = ''' ''' list_not_key = splitIntoList(str_not_keyword,"[\s\n、,,]") tenderee_keywrod = "医院、大学、高校、高中" list_t_key = splitIntoList(tenderee_keywrod,"[\s\n、,,]") should_q_system = [TermQuery("procurement_system","企业采购系统"), TermQuery("procurement_system","部队采购系统")] q_system = BoolQuery(should_queries=should_q_system) log(str(list_keyword)) s_province = "北京,天津,深圳,上海,浙江,江苏,安徽" list_province = splitIntoList(s_province,"[,,\s]") st = "环境监测中心、环境监测总站、环保局、水务局、水利局" list_tenderee = splitIntoList(st,"、|\s") # list_title = getKeywordByFile() # # for _d in list_title: # _title = _d["title"] # web_source = _d["web_source"] # bool_query = BoolQuery(must_queries=[ # generateBoolShouldQuery(["page_title"],[_title],MatchPhraseQuery), # TermQuery("web_source_no",web_source) # ]) # list_query.append({"query":bool_query}) for _keyword in list_keyword: bool_query = BoolQuery(must_queries=[ # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword)), # generateBoolShouldQuery(["doctitle"],list_keyword,MatchPhraseQuery), # generateBoolShouldQuery(["doctitle"],["实景三维"],MatchPhraseQuery), # generateBoolShouldQuery(["doctitle"],["电子站牌","智慧公交站"],MatchPhraseQuery), # ExistsQuery("tenderee"), # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["食堂","饭堂"],MatchPhraseQuery), # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["学校","大学","中学","小学"],MatchPhraseQuery), # generateBoolShouldQuery(["web_source_name"],list_keyword,TermQuery), generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_keyword,MatchPhraseQuery), # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],con_keyword,MatchPhraseQuery), # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["上海泛微网络科技股份有限公司"],MatchPhraseQuery), # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["广州政通信息科技有限公司"],MatchPhraseQuery), # BoolQuery(should_queries=[ # generateBoolShouldQuery(["doctitle"],["公告","公示","招标","中标","采购","工程","项目","询价","施工","比价","服务","监理","设计"],MatchPhraseQuery), # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["项目名称","招标人","中标人","项目编号","采购组织机构","采购人","招标范围","投标保证金","报价地址","询价通知书"],MatchPhraseQuery) # ]), # generateBoolShouldQuery(["doctitle"],["中标"],MatchPhraseQuery), # generateBoolShouldQuery(["docid"],list_keyword,TermQuery), # q_win, # should_q, # generateBoolShouldQuery(["tenderee"],[company],TermQuery), # generateBoolShouldQuery(["doctitle"],["实验室"],MatchPhraseQuery), # generateBoolShouldQuery(["docchannel"],[52,51,102,103,104],TermQuery), # generateBoolShouldQuery(["docchannel"],[101,118,119,120,121,122],TermQuery), generateBoolShouldQuery(["docchannel"],[101,119,120,121,122],TermQuery), # generateBoolShouldQuery(["docchannel"],[51,52,101,102,103,104,105,114,118,119,120],TermQuery), # generateBoolShouldQuery(["docchannel"],[102,52,101,114,119,120],TermQuery), # RangeQuery("page_time","2020-12-05","2023-12-12",True,False), # NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")) # WildcardQuery("web_source_no","00219*"), # generateBoolShouldQuery(["district"],["南沙","中山","江门","珠海"],TermQuery), # TermQuery("info_type","物业管理"), RangeQuery("page_time","2023-01-01","2025-01-01"), RangeQuery("status",201,300,True,True), # RangeQuery("crtime","2020-05-01 09:00:00","2022-05-31 10:00:00",True,True), # RangeQuery("page_time",range_from="2022-01-01",range_to="2023-01-01"), # BoolQuery(should_queries=[TermQuery("page_time","2022-09-15"), # TermQuery("page_time","2022-10-20"), # TermQuery("page_time","2022-10-31")]) # TermQuery("page_time","2023-07-21"), # RangeQuery("page_time","2023-05-16","2024-10-14"), # TermQuery("docid",237163857), # WildcardQuery("tenderee","*雅居乐*"), # RangeQuery("crtime","2023-07-22 00:00:00"), # BoolQuery(should_queries=[NestedQuery("products",RangeQuery("products.unit_price",1)), # NestedQuery("products",RangeQuery("products.total_price",1)),]) # NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",1000)), # NestedQuery("page_attachments",TermQuery("page_attachments.fileMd5","92775529171409a32513f134a61d73c8")), # TermQuery("province","广东"), # TermQuery("city","无锡"), # generateBoolShouldQuery(["tenderee"],list_tenderee,WildcardQuery), # generateBoolShouldQuery(["tenderee"],["应急管理局","城市管理局","大数据局","政务服务管理局","消防局"],WildcardQuery), # WildcardQuery("tenderee","公安*"), # BoolQuery(should_queries=[WildcardQuery("tenderee","*地铁*"), # WildcardQuery("tenderee","*轨道交通*")]) # BoolQuery(should_queries=[NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐禾*")), # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*彩食鲜*")), # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*望家欢*")), # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐味*")), # ]) # TermQuery("procurement_system","公安系统"), # generateBoolShouldQuery(["province"],["重庆"],WildcardQuery), # generateBoolShouldQuery(["tenderee"],list_t_key,WildcardQuery) # generateBoolShouldQuery(["docchannel"],[101,118,119],TermQuery), ], # should_queries=[NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐禾*")), # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*彩食鲜*")), # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*望家欢*")), # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐味*")), # ], # must_not_queries=[ # # TermQuery("exist_table",1), # # WildcardQuery("tenderee","公安县*"), # # TermQuery("attachment_extract_status",1), # # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_not_key,MatchPhraseQuery), # generateBoolShouldQuery(["province"],["湖南","广西","广东"],MatchPhraseQuery), # # q_system, # ] ) list_query.append({"query":bool_query,"limit":300000}) break list_row = getDocument(list_query,columns,"document","document_index") # list_row = getDocument(list_query,columns,"t_shen_pi_xiang_mu","t_shen_pi_xiang_mu_index") log("get document %d rows"%len(list_row)) # list_query = [] # # for _row in list_row: # _uuid = uuid4().hex # page_attachments = json.loads(_row.get("page_attachments")) # l_s = [] # for _at in page_attachments: # l_s.append(NestedQuery("page_attachments",TermQuery("page_attachments.fileMd5",_at.get("fileMd5")))) # list_query.append({"query":BoolQuery(should_queries=l_s),"limit":500,"keyword":_uuid}) # list_row = getDocument(list_query,columns,"document","document_index") df_data = {} set_line = set() # list_row = filterRow(list_row,"doctitle",list_not_key) log("get document %d rows"%len(list_row)) # getRowDataWithKey(df_data,list_row,columns) getRowData(df_data,list_row,set_line,list_keyword,dict_channel,True) # getRowData_sp1(df_data,list_row,set_line,list_keyword,dict_sptype,True) # fixContactPerson(df_data,list_df_columns,get_legal_person=False) df1 = pd.DataFrame(df_data) df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns) # keys = df_data.keys() # print("keys",keys) # dict_company = {} # set_dup_keys = set() # set_docid = set() # for _i in range(len(df_data[list(keys)[0]])): # if df_data["关键词"][_i]==df_data["招标单位"][_i] or df_data["关键词"][_i]==df_data["中标单位"][_i]: # company_name = df_data["关键词"][_i] # if company_name not in dict_company: # dict_company[company_name] = {"企业名称":company_name,"招标":[],"中标":[]} # if df_data["关键词"][_i]==df_data["招标单位"][_i]: # if str(df_data["招标金额"][_i])!="nan": # _key = "%s-%s"%(company_name,str(df_data["招标金额"][_i])) # if _key not in set_dup_keys: # set_docid.add(df_data["docid"][_i]) # dict_company[company_name]["招标"].append({"标题":df_data["公告标题"][_i], # "招标方式":df_data["招标方式"][_i], # "招标单位":df_data["招标单位"][_i], # "招标金额":df_data["招标金额"][_i]}) # set_dup_keys.add(_key) # if df_data["关键词"][_i]==df_data["中标单位"][_i]: # if str(df_data["中标金额"][_i])!="nan": # _key = "%s-%s"%(str(df_data["中标单位"][_i]),str(df_data["中标金额"][_i])) # if _key not in set_dup_keys: # set_docid.add(df_data["docid"][_i]) # dict_company[company_name]["中标"].append({"标题1":df_data["公告标题"][_i], # "招标方式1":df_data["招标方式"][_i], # "中标单位1":df_data["中标单位"][_i], # "中标金额1":df_data["中标金额"][_i]}) # set_dup_keys.add(_key) # df_keys = ["企业名称","标题","招标方式","招标单位","招标金额","标题1","招标方式1","中标单位1","中标金额1"] # df_da = {} # for k in df_keys: # df_da[k] = [] # for k,v in dict_company.items(): # list_zhaobiao = v["招标"] # list_zhongbiao = v["中标"] # _nums = max(min(len(list_zhaobiao),5),min(len(list_zhongbiao),5)) # for i in range(_nums): # df_da["企业名称"].append(k) # if i>=len(list_zhaobiao): # df_da["标题"].append("") # df_da["招标方式"].append("") # df_da["招标单位"].append("") # df_da["招标金额"].append("") # else: # df_da["标题"].append(list_zhaobiao[i]["标题"]) # df_da["招标方式"].append(list_zhaobiao[i]["招标方式"]) # df_da["招标单位"].append(list_zhaobiao[i]["招标单位"]) # df_da["招标金额"].append(list_zhaobiao[i]["招标金额"]) # # if i>=len(list_zhongbiao): # df_da["标题1"].append("") # df_da["招标方式1"].append("") # df_da["中标单位1"].append("") # df_da["中标金额1"].append("") # else: # df_da["标题1"].append(list_zhongbiao[i]["标题1"]) # df_da["招标方式1"].append(list_zhongbiao[i]["招标方式1"]) # df_da["中标单位1"].append(list_zhongbiao[i]["中标单位1"]) # df_da["中标金额1"].append(list_zhongbiao[i]["中标金额1"]) # df2 = pd.DataFrame(df_da) # df2.to_excel("tmp333.xlsx",columns=df_keys) # # df_3 = {} # for k in keys: # df_3[k] = [] # for _i in range(len(df_data[list(keys)[0]])): # docid = df_data["docid"][_i] # if docid in set_docid: # for k in keys: # df_3[k].append(df_data[k][_i]) # df3 = pd.DataFrame(df_3) # df3.to_excel("tmp_333_mx.xlsx",columns=keys) # fixContactPerson(df_data,list_df_columns) # def findProjects(): df = pd.read_excel("两广地区中标时间为空标注_预匹配1.xlsx",0) list_items = [] for docids,project_code,project_name,tenderee,zhao_biao_page_time in zip(df["docids"],df["project_code"],df["project_name"],df["tenderee"],df["zhao_biao_page_time"]): if not isinstance(project_code,(str)): project_code = "$$$" if not isinstance(project_name,(str)): project_name = "$$$" if not isinstance(tenderee,(str)): tenderee = "" print(dir(zhao_biao_page_time)) _dict = {"docids":docids, "project_code":project_code, "project_name":project_name, "tenderee":tenderee, "zhao_biao_page_time":zhao_biao_page_time.strftime("%Y-%m-%d"), "end_time":timeAdd(zhao_biao_page_time.strftime("%Y-%m-%d"),180)} list_items.append(_dict) task_queue = queue.Queue() for item in list_items: task_queue.put(item) def _handle(item,result_queue,ots_client): docids = item.get("docids") list_s_n = [] for docid in re.split(",",str(docids)): list_s_n.append(TermQuery("docid",docid)) query_not = BoolQuery(should_queries=list_s_n) # bool_query =BoolQuery(must_queries=[query_not]) # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", # SearchQuery(bool_query,limit=50,get_total_count=True), # ColumnsToGet(["sub_docs_json"],ColumnReturnType.SPECIFIED)) # if total_count>0: # dict_rows = getRow_ots(rows) # _find = False # for _row in dict_rows: # sub_docs_json = _row.get("sub_docs_json",'[]') # sub_docs = json.loads(sub_docs_json) # for _doc in sub_docs: # if "bidding_budget" in _doc and _doc.get("bidding_budget",0)>0: # item["new_budding_budget"] = _doc.get("bidding_budget",0) # _find = True # break # if _find: # break # # return _find = True bool_query =BoolQuery(must_queries=[ generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_code","$$$$"),item.get("project_name","$$$$")],MatchPhraseQuery), generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery), RangeQuery("status",151,301), NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")), RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time")) ],must_not_queries=[query_not]) rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,limit=50,get_total_count=True), ColumnsToGet(["doctitle","tenderee","sub_docs_json"],ColumnReturnType.SPECIFIED)) if total_count>0: dict_rows = getRow_ots(rows) str_docid = "" for _row in dict_rows: str_docid+="%d,"%_row.get("docid") sub_docs_json = _row.get("sub_docs_json",'[]') sub_docs = json.loads(sub_docs_json) if item.get("tenderee","--")!=_row.get("tenderee","-#"): continue for _doc in sub_docs: if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0: item["new_win_bid_price"] = _doc.get("win_bid_price") item["new_win_tenderer"] = _doc.get("win_tenderer") item["new_finded_docid"] = _row.get("docid") _find = True break if _find: return item["maybe_docids"] = str_docid bool_query =BoolQuery(must_queries=[ generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_code","$$$$")],MatchPhraseQuery), generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery), RangeQuery("status",151,301), NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")), RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time")) ],must_not_queries=[query_not]) rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,limit=50,get_total_count=True), ColumnsToGet(["doctitle","tenderee","sub_docs_json"],ColumnReturnType.SPECIFIED)) if total_count>0: dict_rows = getRow_ots(rows) str_docid = "" for _row in dict_rows: str_docid+="%d,"%_row.get("docid") sub_docs_json = _row.get("sub_docs_json",'[]') sub_docs = json.loads(sub_docs_json) if item.get("tenderee","--")!=_row.get("tenderee","-#"): continue for _doc in sub_docs: if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0: item["new_win_bid_price"] = _doc.get("win_bid_price") item["new_win_tenderer"] = _doc.get("win_tenderer") item["new_finded_docid"] = _row.get("docid") _find = True break if _find: return item["maybe_docids"] = str_docid bool_query =BoolQuery(must_queries=[ generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_name","$$$$")],MatchPhraseQuery), generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery), RangeQuery("status",151,301), NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")), RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time")) ],must_not_queries=[query_not]) rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,limit=50,get_total_count=True), ColumnsToGet(["doctitle"],ColumnReturnType.SPECIFIED)) if total_count>0: dict_rows = getRow_ots(rows) str_docid = "" for _row in dict_rows: str_docid+="%d,"%_row.get("docid") sub_docs_json = _row.get("sub_docs_json",'[]') sub_docs = json.loads(sub_docs_json) if item.get("tenderee","--")!=_row.get("tenderee","-#"): continue for _doc in sub_docs: if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0: item["new_win_bid_price"] = _doc.get("win_bid_price") item["new_win_tenderer"] = _doc.get("win_tenderer") item["new_finded_docid"] = _row.get("docid") _find = True break if _find: return item["maybe_docids"] = str_docid return mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=getConnect_ots()) mt.run() df_data = {"docids":[], "project_code":[], "project_name":[], "maybe_docids":[], "new_budding_budget":[], "new_win_bid_price":[], "new_win_tenderer":[], "new_finded_docid":[]} keys = df_data.keys() for item in list_items: for k in keys: df_data[k].append(item.get(k)) df2 = pd.DataFrame(df_data) df2.to_excel("两广补充数据.xlsx") def attachCompanyContact(): files = ["../data/2021-03-17_四川_关键词导出.csv", "../data/2021-03-17_安徽_关键词导出.csv", "../data/2021-03-17_江西_关键词导出.csv", "../data/2021-03-17_湖南_关键词导出.csv"] files = ["../data/20210609(最新).xlsx"] pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) set_enter = set() for file in files: df = pd.read_excel(file) columns = ["招标单位","中标单位","代理单位"] for _c in columns: for item in df[_c]: if isinstance(item,str): item = item.strip() if item!="": set_enter.add(item) dict_enter = getDictEnterprise(list(set_enter)) for file in files: task_queue = queue.Queue() df = pd.read_excel(file,encoding="UTF8") keys = df.keys()[1:] list_item = [] for row in df.itertuples(): _dict = {} for _key in keys: if _key in dir(row): _v = row.__getattribute__(_key) else: _v = '' _dict[_key] = _v if str(_dict["招标联系人"]) in ("","nan") or str(_dict["招标联系人电话"]) in ("","nan"): contact_person,mobile = getOneContact(dict_enter.get(_dict["招标单位"],{}).get("contacts","[]")) if contact_person!="": _dict["招标联系人"] = contact_person _dict["招标联系人电话"] = mobile if str(_dict["中标联系人"]) in ("","nan") or str(_dict["中标联系人电话"]) in ("","nan"): contact_person,mobile = getOneContact(dict_enter.get(_dict["中标单位"],{}).get("contacts","[]")) if contact_person!="": _dict["中标联系人"] = contact_person _dict["中标联系人电话"] = mobile if str(_dict["代理联系人"]) in ("","nan") or str(_dict["代理联系人电话"]) in ("","nan"): contact_person,mobile = getOneContact(dict_enter.get(_dict["代理单位"],{}).get("contacts","[]")) if contact_person!="": _dict["代理联系人"] = contact_person _dict["代理联系人电话"] = mobile list_item.append(_dict) for item in list_item: task_queue.put(item) df_data = {} for _k in keys: df_data[_k] = [] for item in list_item: for _k in keys: df_data[_k].append(getLegal_str(item.get(_k,"-"))) df1 = pd.DataFrame(df_data) df1.to_excel("%s_attach.xlsx"%file,columns=keys) def dumpWebSourceNo(): conn_oracle = getConnection_oracle() cursor_oracle = conn_oracle.cursor() sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO " cursor_oracle.execute(sql) rows = cursor_oracle.fetchall() conn_mysql = getConnection_testmysql() cursor_mysql = conn_mysql.cursor() for row in rows: sql = " insert into webSource(web_source_no,web_source_name) values('%s','%s')"%(row[0],row[1]) print(sql) cursor_mysql.execute(sql) conn_mysql.commit() def exportNzj(): # filename = "../data/重复公告.xlsx" # df = pd.read_excel(filename) ots_client = getConnect_ots() columns = ["contacts","covered_area","follows","docids","page_time","progress","project_description","project_follow","project_code","project_name","project_type"] def getData(df_data,rows,set_line): list_data = getRow_ots(rows) for row in list_data: item = {} _dict = row set_dict_item(item,"docids",_dict.get("docids","")) set_dict_item(item,"contacts",_dict.get("contacts","")) set_dict_item(item,"covered_area",_dict.get("covered_area","")) set_dict_item(item,"follows",_dict.get("follows","")) set_dict_item(item,"project_type",_dict.get("project_type","")) # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district","")) set_dict_item(item,"page_time",_dict.get("page_time","")) set_dict_item(item,"progress",_dict.get("progress","")) set_dict_item(item,"project_description",_dict.get("project_description","")) set_dict_item(item,"project_follow",_dict.get("project_follow","")) set_dict_item(item,"project_code",_dict.get("project_code","")) set_dict_item(item,"project_name",_dict.get("project_name","")) for k,v in item.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) df_data = {} bool_query = BoolQuery(must_queries=[ExistsQuery("docids")]) rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index", SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) set_line = set() _count = len(rows) getData(df_data,rows,set_line) while next_token: print("%d/%d"%(_count,total_count)) rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index", SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) getData(df_data,rows,set_line) _count += len(rows) df1 = pd.DataFrame(df_data) df1.to_excel("../data/2021-03-31_拟在建数据导出1.xlsx",columns=list_df_columns) def turn_status(): df = pd.read_excel("../data/欧科自然资源5w以上数据.xlsx") conn = getConnection_testmysql() cursor = conn.cursor() for docid in df["公告id"]: partitionkey = int(docid)%500+1 sql = " insert into turn_status(partitionkey,docid) values(%d,%d)"%(partitionkey,docid) cursor.execute(sql) conn.commit() def attachBidding_budget(): conn_mysql = getConnection_testmysql() cursor = conn_mysql.cursor() sql = "select docid from analysis_r2 where bidding_budget=''" task_queue = queue.Queue() result_queue = queue.Queue() cursor.execute(sql) rows = cursor.fetchmany(10000) while(rows): for row in rows: task_queue.put(row[0]) rows = cursor.fetchmany(10000) pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_testmysql) pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) def _handle(item,result_queue,pool_mysql,pool_ots): ots_client = pool_ots.getConnector() bool_query = BoolQuery(must_queries=[TermQuery("docids",item)]) rows, next_token, total_count, is_all_succeed = ots_client.search("project2", "project2_index", SearchQuery(bool_query , limit=1, get_total_count=True), ColumnsToGet(["bidding_budget"],return_type=ColumnReturnType.SPECIFIED)) list_dict = getRow_ots(rows) if len(list_dict)>0: conn = pool_mysql.getConnector() cursor = conn.cursor() sql = " update analysis_r2 set bidding_budget='%s' where docid=%d"%(str(list_dict[0].get("bidding_budget","")),item) cursor.execute(sql) conn.commit() pool_mysql.putConnector(conn) pool_ots.putConnector(ots_client) mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mysql=pool_mysql,pool_ots=pool_ots) mt.run() def debug_documentMerge(): conn = getConnection_testmysql() cursor = conn.cursor() sql = "select merge_docids from project_group_final_log " cursor.execute(sql) task_queue = queue.Queue() for row in cursor.fetchall(): task_queue.put(row[0]) print(task_queue.qsize()) def _handle(item,result_queue,pool_ots): ots_client = pool_ots.getConnector() list_docids = item.split(",") must_q = [] for _docid in list_docids: must_q.append(TermQuery("docids",_docid)) bool_query = BoolQuery(must_queries=must_q) rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index", SearchQuery(bool_query,limit=1,get_total_count=True), ColumnsToGet(column_names=["docids"],return_type=ColumnReturnType.SPECIFIED)) if total_count==0: print(item) result_queue.put(item) pool_ots.putConnector(ots_client) result_queue = queue.Queue() pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots) mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots) mt.run() while(True): try: item = result_queue.get(True) print(item) except Exception as e: print(str(e)) break def signDocument(): filename = "C:\\Users\\Administrator\\Desktop\\中标信息1.xlsx" sign_filename = "%s_sign.xlsx"%filename df = pd.read_excel(filename) df_data = {"sign":[]} for item in df["segword"]: content = re.sub("\s*","",item) _find = re.search("(?P采购失败|招标失败)",content) if _find is not None: df_data["sign"].append(_find.groupdict().get("key")) else: df_data["sign"].append("无") df1 = pd.DataFrame(df_data) df1.to_excel(sign_filename) def exportWin_tenderer_count(): ots_client = getConnect_ots() str_enter = ''' 红旗渠建设集团有限公司 河南瑞华建筑集团有限公司 林州瑞达工程管理有限公司 河南鸿盛建筑工程有限公司 天一建设发展有限公司 河南省大成建设工程有限公司 中润昌弘建工集团有限公司 河南省中创建筑工程有限公司 河南锦达建设有限公司 林州宏基建筑工程有限公司 河南富世建筑工程有限公司 中恒方圆建筑工程有限公司 河南华隆建设工程有限公司 河南昊锦建设集团有限公司 河南新隆建工集团有限公司 中城华安建设集团有限公司 河南恒通公路桥梁建设有限公司 林州二建集团建设有限公司 河南华安建设集团有限公司 河南裕鸿建筑工程有限公司 中商建投建设有限公司 河南鑫利恒工程有限公司 林州市永盛建筑有限公司 林州市顺鑫建筑有限公司 中水京林建设有限公司 河南宏海建设有限公司 河南宏岳建设有限公司 河南元筑实业有限公司 河南基兆建筑工程有限公司 林州金瑞建筑工程有限公司 林州建工集团有限公司 河南万融建筑工程有限公司 林州东风建设有限公司 河南鸿泰建筑安装有限公司 河南源泰建筑有限公司 河南优德建筑工程有限公司 安阳鸿盛建设劳务有限公司 河南省安阳市安装工程有限责任公司 河南港城建设工程有限公司 河南天汇建筑工程有限公司 河南省惠浦建设发展有限公司 林州市建筑工程有限公司 河南正天建筑工程有限公司 河南颂邦建筑工程有限公司 林州市华源建设有限公司 河南中投建设有限公司 林州华林建筑劳务有限公司 河南基祥建设工程有限公司 河南文水水电工程有限公司 林州兴业建筑工程有限公司 河南中州建筑有限公司 河南省佳禾园林建设有限公司 林州万亚建筑工程有限公司 河南林正建设工程有限公司 河南鼎兴建设工程有限公司 河南平原建工集团有限公司 河南林九建设工程有限公司 林州市三才建筑工程有限公司 安阳建设(集团)有限责任公司 盛世恒达建设有限公司 河南城洲建设工程有限公司 河南国埔建筑工程有限公司 中创市政建设发展有限公司 河南正祥建筑工程有限公司 河南宏九建筑工程有限公司 河南金阳建筑工程有限公司 河南天容建设工程有限责任公司 河南聚宇建筑工程有限公司 河南瑞旗建设工程有限公司 河南利盛建设工程有限公司 林州四海建设有限公司 林州市建工城建集团有限公司 河南众佑建设工程有限公司 河南德诚建设有限公司 河南景华建筑工程有限公司 河南华江建筑工程有限公司 林州永丰建设集团有限公司 林州福东建设工程有限公司 河南恒森建筑工程有限公司 河南朝泓建设工程有限公司 河南润京建设有限公司 林州市红旗渠公路工程有限公司 林州中宇建设工程有限公司 河南长锦建设工程有限公司 河南汇商建筑工程有限公司 河南省豫鹤建设工程有限公司 河南江城建筑工程有限公司 中海华祥建设发展有限公司 河南宁中路桥建筑有限公司 河南天河建设工程有限公司 林州市路桥建筑工程有限公司 河南省中北建设有限公司 河南汇亿建筑工程有限公司 河南金帝建筑安装有限公司 河南省望安建筑工程有限公司 泰欣建设有限公司 河南筑鑫建筑工程有限公司 元熙建设工程有限公司 旭隆建设集团有限公司 河南省城控建工集团有限公司 河南晨丰建筑工程有限公司 河南嘉丰建设有限公司 林州市合众建筑劳务有限公司 河南金瓦刀建筑劳务有限公司 河南中实建筑工程有限公司 畅通路桥工程建设有限责任公司 河南军恒建设有限公司 中钊建设集团有限公司 河南德宁建设集团有限公司 林州兴鸿建筑工程有限公司 林州市明泽建筑工程有限公司 河南紫光建筑工程有限公司 河南誉天建筑工程有限公司 林州景丰建筑劳务有限公司 河南江瀚建筑劳务有限公司 河南弘之昌建筑工程有限公司 河南祥泰钻井工程有限公司 河南迅阳建筑劳务有限公司 河南嘉成建筑工程有限公司 河南兴锦建设工程有限公司 河南邦坤建设工程有限公司 河南锦毅市政工程建筑有限公司 河南广益建筑工程有限公司 河南创胜建筑工程有限公司 河南勤铭建筑工程有限公司 河南铭锋建设工程有限公司 平源建设有限公司 河南隆通建筑工程有限公司 河南省基本建设有限公司 河南丰茂建筑劳务有限公司 河南城安建筑工程有限公司 林州市富源建筑劳务有限公司 德方建设有限公司 河南泰联建筑工程有限公司 河南新建投工程有限公司 河南省鲁班建工集团有限公司 林州方超建筑劳务有限公司 林州市采桑建筑劳务输出有限公司 河南省仁昱建筑工程有限公司 河南鸾林建设工程有限公司 宜民建设集团有限公司 林州聚兴建筑工程有限公司 河南省聚国建筑工程有限公司 林州市大东建筑劳务有限公司 河南欣东劳务有限公司 中建润德景观建筑工程有限公司 河南辰宇建设工程有限公司 号东建设工程有限公司 河南润北建筑工程有限公司 河南邦昊建设工程有限公司 林州市建设投资有限责任公司 林州市太行建设工程有限公司 河南峡安建筑工程有限公司 河南安疆建筑工程有限公司 河南淇河建设工程有限公司 河南晶品建设有限公司 河南翔固建筑工程有限公司 纵横九州路桥建设有限公司 河南青林建筑工程有限公司 合久建设有限公司 河南明昊建筑工程有限公司 河南滨河建设工程有限公司 河南群腾建筑工程有限公司 河南隆亨建筑工程有限公司 骏达建设有限公司 河南仁安建设工程有限公司 河南旻尚园林建筑工程有限公司 河南省匡正建设工程有限公司 河南金凡建筑工程有限公司 河南佰丰建筑工程有限公司 德普建设有限公司 国润新天地工程技术有限公司 中潮建设发展有限公司 河南捷正建筑工程有限公司 林州百万工匠建筑劳务有限公司 河南祥彬建筑工程有限公司 河南林祥工程建设有限公司 河南唐尧建筑劳务有限公司 河南汇祥建设有限公司 河南友信建设有限公司 林州市鼎昇建筑工程有限公司 林州市富兴建筑劳务有限公司 林州厚德建筑劳务有限公司 河南振亚工程建设有限公司 河南英茂建筑工程有限公司 河南丰岩建设工程有限公司 林州市昌都建筑工程有限公司 林州四建建筑工程有限公司 林州和兴建筑劳务有限公司 林州市鸿升建筑工程有限公司 河南润泰建设工程有限公司 河南鑫路通建筑劳务有限公司 河南信守建筑劳务有限公司 林州安达鸿昌建筑劳务有限公司 河南意达建设有限公司 河南金穗来建筑工程有限公司 河南东风建筑工程有限公司 河南筑胜建筑劳务有限公司 河南民润建筑工程有限公司 林州市中锦路桥建设工程有限公司 林州一建建筑工程有限公司 林州市宏瑞建筑劳务有限公司 林州鸿恩建筑劳务有限公司 河南晟元建筑工程有限公司 中国建筑第六工程局有限公司 河南筑泰建筑工程有限公司 河南省亚建建筑工程有限公司 河南辰弘建筑工程有限公司 河南先创建筑工程有限公司 林豫建工集团有限公司 河南省盛民建筑工程有限公司 河南泓发市政工程有限公司 河南帝恩建筑劳务有限公司 河南天泉建设工程有限公司 河南恒升工程建设有限公司 林州市浩远电力建筑工程有限公司 河南友瑞建筑工程有限公司 河南冠州路桥工程有限公司 三角鼎建设工程有限公司 河南富坤建筑工程有限公司 林州市恒源建筑工程有限公司 河南广汇建筑工程有限公司 河南隆豫建设有限公司 林州市九洲工程劳务有限公司 林州瑜辉建筑工程有限公司 河南福恩建筑工程有限公司 河南通盛路桥建设有限公司 河南央泰建设工程有限公司 林州市红旗渠公路养护工程有限公司 林州大兴建设工程有限公司 河南锐丰建设工程有限公司 林州市中泰建筑劳务有限公司 林州成业建筑工程有限公司 河南建创建筑工程有限公司 河南宏兴建设工程有限公司 河南隆鼎建筑工程有限公司 林州市天罡建筑劳务有限公司 汇聚建设发展有限公司 中铁中城工程有限公司 河南景天建筑劳务有限公司 林州蒙建建设工程有限公司 富华建设工程有限公司 河南殿轩建筑劳务有限公司 河南瑞通建设工程有限公司 林州金桥劳务工程有限公司 河南省景隆实业有限公司 河南升洲建筑工程有限公司 河南里程建筑劳务有限公司 林州市润景建设工程有限公司 河南巨坤建筑工程有限公司 河南九牛建设劳务有限公司 吉修建设工程有限公司 河南图润建筑工程有限公司 河南鼎鑫建筑劳务有限公司 河南港航建设工程有限公司 河南省盛飞建设工程有限公司 林州市兴义建筑劳务有限公司 河南秉程建筑工程有限公司 河南硕亚水电路桥工程有限公司 河南科才建筑劳务有限公司 河南荣泰建筑安装工程有限公司 河南省天丰建筑工程有限公司 河南方元建筑工程有限公司 恒上建设有限公司 河南省德信建筑工程有限公司 河南诚宸建设工程有限公司 河南置信建筑工程有限公司 河南省鑫河建设有限公司 河南成兴建设工程有限公司 林州中港建筑工程有限公司 河南富春建设工程有限公司 中科豫资建设发展有限公司 河南京都建筑安装有限公司 安阳市宇豪爆破工程有限公司 河南华特建筑工程有限公司 河南颍淮建工有限公司 林州市八建工程有限公司 河南展辉建筑工程有限公司 河南中博建筑有限公司 河南方圆建设有限公司 河南大鼎建筑工程有限公司 林州中天建设有限公司 河南久东建筑工程有限公司 河南九一建设工程有限公司 九州水文建设集团有限公司 河南省建安防水防腐工程有限公司 中建宏图建设发展有限公司 筑宇建设有限公司 林州市宏图建设工程有限公司 河南林润建设工程有限公司 嘉泰建设发展有限公司 河南丰茂建筑安装工程有限公司 河南万泰建设工程有限公司 林州市红旗渠市政工程有限公司 林州建总建筑工程有限公司 河南聚之祥建设有限公司 河南鼎之信建设工程有限公司 河南省华瑞建设工程有限公司 河南世光电力工程有限公司 河南地远建筑工程有限公司 河南鑫品建筑工程有限公司 河南省东旗建筑工程有限公司 润华建设有限公司 林州富民建筑劳务有限公司 林州市晨诚建筑劳务有限公司 河南万胜建设有限公司 河南龙磐建筑工程有限公司 河南顺昌建筑劳务有限公司 林州恒瑞建设工程有限公司 河南大成建设劳务有限公司 河南大一建筑劳务有限公司 河南盛威建筑工程有限公司 河南坤之宇建筑工程有限公司 众信电力工程有限公司 河南昱佛建筑工程有限公司 河南淇源建筑工程有限公司 林州凤宝建筑安装有限公司 河南中发岩土工程有限公司 河南中都建设工程有限公司 河南祥凯建筑工程有限公司 河南乐泰建筑工程有限公司 林州宏达建筑劳务有限公司 河南华盛建设集团有限公司 河南凯通建设工程有限公司 国腾路桥工程有限公司 中建方达建设工程有限公司 河南省天都建设工程有限公司 昌隆建设工程有限公司 河南洹上村园林绿化工程有限公司 河南双锦建设工程有限公司 河南子丰市政工程有限公司 林州首创建筑工程有限公司 河南众鑫建筑工程有限公司 河南宁崴建筑工程有限公司 林州市航安建筑劳务有限公司 林州益成建设工程有限公司 林州市昌弘建筑工程有限公司 河南正耀建设有限公司 河南鑫鹏建设工程有限公司 林州恒泰建筑工程有限公司 林竣建设有限公司 河南朝众建筑工程有限公司 林州科鸿建筑工程有限公司 东辰建设发展有限公司 河南创新新能源科技有限公司 河南省永业建筑工程有限公司 林州市煜凯建筑工程有限公司 宝鼎建设工程有限公司 林州市航安建筑工程有限公司 河南业展建设工程有限公司 河南联竣建筑工程有限公司 河南聚超建筑工程有限公司 林州远方电力工程有限公司 河南蒙寅建筑劳务有限公司 方元建筑劳务有限公司 龙兴建设工程有限公司 河南春谦建设工程有限公司 河南正博公路工程有限公司 林州市汇鑫安装工程有限公司 林州市祥隆劳务有限公司 河南胜杰建筑工程有限公司 河南恩普建筑工程有限公司 河南港津建筑工程有限公司 河南昌明建筑工程有限公司 中豫城控建设集团有限公司 林州晨宇建设工程有限公司 河南豫柯建筑工程有限公司 河南捷润建筑工程有限公司 中方通建设工程有限公司 河南多果建筑工程有限公司 河南尚伟建筑工程有限公司 林州新航程建筑工程有限公司 河南金华建筑工程有限公司 国云工程技术有限公司 河南路威路桥工程有限公司 林州中盛建设工程有限公司 林州市恒基建设有限公司 河南润恒建筑工程有限公司 河南华安水利工程有限公司 中城易通建设发展有限公司 河南浚洲建筑工程有限公司 林州市锦晟建筑劳务有限公司 河南省北安建筑工程有限公司 林州泰岳建设工程有限公司 河南联洋建筑工程有限公司 河南港大市政建筑工程有限公司 林州东盛建筑劳务有限公司 河南省天鉴建设工程有限公司 河南瑞凝建筑工程有限公司 林州市东瑞建筑劳务有限公司 河南众达建筑劳务有限公司 河南省帝增建筑工程有限公司 河南省升灿建筑工程有限公司 河南苑景建筑劳务分包有限公司 林州众立建设工程有限公司 红旺建筑工程有限公司 林州市圣兴建筑劳务有限公司 林州诚林建筑劳务有限公司 林州建工劳务有限公司 河南巨业建筑工程有限公司 中科华夏建设开发有限公司 君晟建筑工程有限公司 郑州新动力建筑劳务分包有限公司 河南省福德建筑工程有限公司 林州源大建筑工程有限公司 河南大瑞园林建设有限公司 河南秋禾建筑劳务有限公司 河南腾翔建筑工程有限公司 河南天之华建设工程有限公司 河南祥和建筑安装有限公司 河南省鼎文建设工程有限公司 河南周城建设发展有限公司 河南庆泰建筑工程有限公司 中科信合建设发展有限公司 林州恒隆建设工程有限公司 河南省力恒建筑工程有限公司 林州市四季青绿化有限责任公司 林州市景盛建筑工程有限公司 河南建基建设工程有限公司 河南宝凯建筑工程有限公司 林州市四合建筑劳务有限公司 河南和耀建筑工程有限公司 林州市凯达建筑劳务有限公司 林州市恒信建筑劳务有限公司 开翔建设工程有限公司 河南省新创达建设工程有限公司 林州鑫龙建筑工程有限公司 河南省昌博建筑工程有限公司 河南君利泰建筑工程有限公司 林州杏林建筑工程有限公司 河南千禧建设工程有限公司 中建诚正建筑工程有限公司 河南省聚千建筑工程有限公司 林州海之鸿建筑工程有限公司 河南振鼎建筑工程有限公司 林州方成建筑劳务有限公司 河南众众建设工程有限公司 林州市万润建筑劳务有限公司 启创建设工程有限公司 河南子明建筑工程有限公司 安阳市兴鼎路桥工程有限公司 河南智擎建筑劳务有限公司 河南鼎平市政工程有限公司 林州宏阳建筑工程有限公司 河南豫泰建筑工程有限公司 林州市鸿浩建筑劳务有限公司 林州市锦华建筑工程有限公司 河南瑞锋建设有限公司 河南欧信建筑劳务有限公司 林州市中兴建筑劳务有限公司 林州市大德建设工程有限公司 河南华文建设有限公司 河南凌焜建筑工程有限公司 河南安居建设有限公司 林州鲲鹏建筑工程有限公司 林州经纬建筑工程有限公司 林州祥川建筑工程有限公司 林州市鑫淼建筑劳务有限公司 河南祥泰路桥有限公司 景祥建设工程有限公司 河南省兴华建安工程有限公司 河南古森建筑劳务有限公司 平祥建设工程有限公司 河南大博建设工程有限公司 河南华普建设工程有限公司 河南东邦建设工程有限公司 卓冠建设工程有限公司 河南品瑞建筑工程有限公司 河南宝金建设工程有限公司 中城鑫邦建设有限公司 河南省鸿运建设工程有限公司 林州明奥建筑工程有限公司 河南金手指建设工程有限公司 林州市弘顺建筑劳务有限公司 林州市林海建筑劳务有限公司 河南艺兆市政工程有限公司 林州誉峰建筑工程有限公司 河南卓骏建筑工程有限公司 林州众成建筑工程有限公司 河南城通市政工程有限公司 林州市晋源建筑工程有限公司 河南飞越建筑工程有限公司 林州鑫泰建筑工程有限公司 林州市太行建筑劳务有限公司 河南筑丰建设发展有限公司 林州一帆建筑劳务有限公司 林州宏久建筑工程有限公司 林州市盛祥建筑劳务有限公司 河南黎润建设工程有限公司 林州市永安建筑劳务有限公司 河南省长江建设实业有限公司 河南腾润建设工程有限公司 河南国梁建设工程有限公司 河南诚聚建筑工程有限公司 河南德邦市政工程有限公司 河南安德建设工程有限公司 河南森川建筑工程有限公司 林州市顺通公路工程有限公司 河南领邦建筑工程有限公司 河南博兴建设工程有限公司 东泽消防工程有限公司 ''' list_enter = [] for _p in re.split("\s",str_enter): if _p.strip()=="": continue list_enter.append({"name":_p.strip()}) def _handle(item,result_queue,pool_ots): ots_client = pool_ots.getConnector() try: bool_query = BoolQuery(must_queries=[ NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["name"])) ,RangeQuery("status",201,300,include_lower=True,include_upper=True) ,RangeQuery("page_time","2020-01-01") ]) rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", SearchQuery(bool_query, limit=1, get_total_count=True), ColumnsToGet(['docid'], ColumnReturnType.SPECIFIED)) item["total_count"] = total_count # bool_query = BoolQuery(must_queries=[ # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["name"])) # ,RangeQuery("status",201,300,include_lower=True,include_upper=True) # ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",0,1000000,include_upper=True)) # ]) # # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index", # SearchQuery(bool_query, limit=1, get_total_count=True), # ColumnsToGet(['docid'], ColumnReturnType.SPECIFIED)) # item["l_total_count"] = total_count except Exception as e: pass pool_ots.putConnector(ots_client) pool_ots = ConnectorPool(init_num=30,max_num=40,method_init=getConnect_ots) task_queue = queue.Queue() for item in list_enter: task_queue.put(item) mt = MultiThreadHandler(task_queue,_handle,None,30,pool_ots=pool_ots) mt.run() df_data = {} for item in list_enter: for k,v in item.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) df = pd.DataFrame(df_data) df.to_excel("../data/%s.xls"%getCurrent_date("%Y-%m-%d_%H%M%S")) from bs4 import BeautifulSoup def downloadAttach(_url,_path): try: result = requests.get(_url,stream=True,timeout=20) if result.status_code==200: with open(_path,"wb") as f: f.write(result.content) else: log("download failed with code %d of url:%s"%(result.status_code,_url)) except Exception: log("download failed of url:%s"%(_url)) def extract_pageAttachments(_html): fileSuffix = [".zip", ".rar", ".tar", ".7z", ".wim", ".docx", ".doc", ".xlsx", ".xls", ".pdf", ".txt", ".hnzf", ".bmp", ".jpg", ".jpeg", ".png", ".tif", ".swf"] _soup = BeautifulSoup(_html,"lxml") list_a = _soup.find_all("a") list_img = _soup.find_all("img") page_attachments = [] for _a in list_a: _text =_a.get_text() _url = _a.attrs.get("href","") if _url.find("http://www.bidizhaobiao.com")>=0: continue is_attach = False for suf in fileSuffix: if _text.find(suf)>=0 or _url.find(suf)>=0: is_attach = True if is_attach: page_attachments.append({"fileLink":_url,"fileTitle":_text}) for _a in list_img: _text =_a.get_text() _url = _a.attrs.get("src","") if _url.find("http://www.bidizhaobiao.com")>=0: continue is_attach = False for suf in fileSuffix: if _text.find(suf)>=0 or _url.find(suf)>=0: is_attach = True if is_attach: page_attachments.append({"fileLink":_url,"fileTitle":_text}) return page_attachments def exportDocument_attachment(): ots_client = getConnect_ots() bool_query = BoolQuery(must_queries=[TermQuery("docid",165528701)]) rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True), columns_to_get=ColumnsToGet(["dochtmlcon"],ColumnReturnType.SPECIFIED)) list_data = getRow_ots(rows) for _data in list_data: extract_pageAttachments(_data["dochtmlcon"]) def transUUid(): conn_oracle = getConnection_oracle() cursor = conn_oracle.cursor() tables = ['T_ZHAO_BIAO_GONG_GAO','T_ZHONG_BIAO_XIN_XI'] conn_mysql = getConnection_testmysql() cursor_mysql = conn_mysql.cursor() for _t in tables: sql = " select id,page_time,'%s' from bxkc.%s where page_time>='%s' and page_time<='%s' order by page_time "%(_t,_t,"2021-06-01","2021-08-31") print(sql) cursor.execute(sql) _count = 0 while(True): insert_sql = "insert into fix_document(uuid,page_time,table_name) values" rows = cursor.fetchmany(10000) if not rows: break _count += len(rows) print(_count) for row in rows: _uuid = row[0] page_time = row[1] table_name = row[2] insert_sql += "('%s','%s','%s'),"%(_uuid,page_time,table_name) insert_sql = insert_sql[:-1] cursor_mysql.execute(insert_sql) conn_mysql.commit() def fix_document(): conn_oracle = getConnection_oracle() cursor_oracle = conn_oracle.cursor() conn_mysql = getConnection_testmysql() cursor_mysql = conn_mysql.cursor() sql = "select uuid,page_time,table_name from fix_document_final where page_time>='2021-06-24' " cursor_mysql.execute(sql) _count = 0 while True: rows = cursor_mysql.fetchmany(1000) if not rows: break _count += len(rows) print(_count) insert_sql = "" for row in rows: _uuid = row[0] page_time = row[1] table_name = row[2] insert_sql += " insert into BXKC.fix_document_final(id,page_time,TABLENAME) values('%s','%s','%s');"%(_uuid,page_time,table_name) insert_sql = "begin %s end;"%(insert_sql) cursor_oracle.execute(insert_sql) conn_oracle.commit() def exportDocument_forRecommen(): filename = "../data/推荐 (1).csv" df = pd.read_csv(filename,encoding="GBK") ots_client = getConnect_ots() columns = ["province","city","page_time","doctitle","product"] current_date = getCurrent_date("%Y-%m-%d") adict_data = [] _index = 0 for company,json_docid in zip(df["company"][:10000],df["json_docid"][:10000]): _index += 1 _province = "" # consumed, return_row, next_token = ots_client.get_row("enterprise",[("name",company)],columns_to_get=["province"]) # dict_k = getRow_ots_primary(return_row) # _province = dict_k.get("province","") print("序号:%d,%s,%s"%(_index,company,_province)) dict_recommen = json.loads(json_docid) for str_way,str_docid in dict_recommen.items(): should_q = [] for _docid in str_docid.split(","): should_q.append(TermQuery("docid",_docid)) bool_query = BoolQuery(must_queries=[ # TermQuery("province",_province) # ,RangeQuery("page_time",timeAdd(current_date,-7),current_date,True,True) # , BoolQuery(should_queries=should_q)] ) rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,get_total_count=True,limit=100), ColumnsToGet(columns,ColumnReturnType.SPECIFIED)) adict_row = getRow_ots(rows) for dict_row in adict_row: dict_item = dict() set_dict_item(dict_item,"公司名称",company) set_dict_item(dict_item,"推荐路径",str_way) set_dict_item(dict_item,"公告id",dict_row.get("docid","")) set_dict_item(dict_item,"省份",dict_row.get("province","")) set_dict_item(dict_item,"城市",dict_row.get("city","")) set_dict_item(dict_item,"page_time",dict_row.get("page_time","")) set_dict_item(dict_item,"doctitle",dict_row.get("doctitle","")) set_dict_item(dict_item,"product",dict_row.get("product","")) adict_data.append(dict_item) dict_data = {} for dict_item in adict_data: for k in list_df_columns: if k not in dict_data: dict_data[k] = [] dict_data[k].append(dict_item.get(k,"")) df1 = pd.DataFrame(dict_data) df1.to_excel("../data/%s_推荐.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=list_df_columns) def exportDocument_by_days(page_time): dict_channel = getDict_docchannel() ots_client = getConnect_ots() filename = "供货贷含[建筑]企业名单.xlsx" df = pd.read_excel(filename) bool_query = BoolQuery(must_queries=[TermQuery("page_time",page_time), # RangeQuery("status",201,301), ]) # columns = ["doctitle","docchannel","product","province","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","original_docchannel","detail_link"] columns = ["doctitle","docchannel","product","bidway","moneysource","province","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","original_docchannel","detail_link","page_attachments","service_time"] dict_channel = getDict_docchannel() def hidePhone(phone): if phone is None or phone=="": return "" return "*"*(len(phone)-4)+phone[-4:] def getData(df_data,rows,set_line,list_keyword,set_columns,df_columns): list_data = getRow_ots(rows) for row in list_data: item = {} _dict = row set_dict_item_columns(set_columns,df_columns,item,"docid",_dict.get("docid","")) set_dict_item_columns(set_columns,df_columns,item,"公告标题",_dict.get("doctitle","")) # set_dict_item_columns(set_columns,df_columns,item,"公告内容",_dict.get("doctextcon","")) set_dict_item_columns(set_columns,df_columns,item,"公告类别",dict_channel.get(_dict.get("docchannel",""),"")) # set_dict_item_columns(set_columns,df_columns,item,"关键词",",".join(list(set(re.findall("|".join(list_keyword),_dict.get("doctextcon","")))))) set_dict_item_columns(set_columns,df_columns,item,"产品",_dict.get("product","")) set_dict_item_columns(set_columns,df_columns,item,"省份",_dict.get("province","")) # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district","")) set_dict_item_columns(set_columns,df_columns,item,"资金来源",_dict.get("moneysource","")) set_dict_item_columns(set_columns,df_columns,item,"招标方式",_dict.get("bidway","")) set_dict_item_columns(set_columns,df_columns,item,"服务期限",_dict.get("service_time","")) set_dict_item_columns(set_columns,df_columns,item,"城市",_dict.get("city","")) set_dict_item_columns(set_columns,df_columns,item,"区县",_dict.get("district","")) set_dict_item_columns(set_columns,df_columns,item,"发布时间",_dict.get("page_time","")) set_dict_item_columns(set_columns,df_columns,item,"创建时间",_dict.get("crtime","")) set_dict_item_columns(set_columns,df_columns,item,"行业一级分类",_dict.get("industry","")) set_dict_item_columns(set_columns,df_columns,item,"行业二级分类",_dict.get("info_type","")) # set_dict_item_columns(set_columns,df_columns,item,"uuid",_dict.get("uuid")) # set_dict_item_columns(set_columns,df_columns,item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle",""))) set_dict_item_columns(set_columns,df_columns,item,"公告类别",dict_channel.get(_dict.get("docchannel",""),"")) set_dict_item_columns(set_columns,df_columns,item,"原网公告类别",dict_channel.get(_dict.get("original_docchannel",""),"")) set_dict_item_columns(set_columns,df_columns,item,"status","正常" if _dict.get("status",201) <=300 else "去重") set_dict_item_columns(set_columns,df_columns,item,"detail_link",_dict.get("detail_link")) set_dict_item_columns(set_columns,df_columns,item,"web_source_no",_dict.get("web_source_no","")) set_dict_item_columns(set_columns,df_columns,item,"web_source_name",_dict.get("web_source_name","")) set_dict_item_columns(set_columns,df_columns,item,"项目名称",_dict.get("project_name","")) set_dict_item_columns(set_columns,df_columns,item,"项目编号",_dict.get("project_code","")) set_dict_item_columns(set_columns,df_columns,item,"招标单位",_dict.get("tenderee","")) set_dict_item_columns(set_columns,df_columns,item,"招标联系人",_dict.get("tenderee_contact","")) set_dict_item_columns(set_columns,df_columns,item,"招标联系人电话",_dict.get("tenderee_phone","")) set_dict_item_columns(set_columns,df_columns,item,"代理单位",_dict.get("agency","")) set_dict_item_columns(set_columns,df_columns,item,"代理联系人",_dict.get("agency_contact","")) set_dict_item_columns(set_columns,df_columns,item,"代理联系人电话",_dict.get("agency_phone","")) set_dict_item_columns(set_columns,df_columns,item,"url","http://www.bidizhaobiao.com/info-%d.html"%(_dict.get("docid",""))) set_dict_item_columns(set_columns,df_columns,item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid"))))) set_dict_item_columns(set_columns,df_columns,item,"截标时间",_dict.get("time_bidclose","")) set_dict_item_columns(set_columns,df_columns,item,"page_attachments",_dict.get("page_attachments","[]")) sub_docs_json = _dict.get("sub_docs_json") set_tenderer = set() if sub_docs_json is not None: docs = json.loads(sub_docs_json) docs.sort(key=lambda x:x.get("win_bid_price",0)) for _doc in docs: if "win_tenderer" in _doc: set_dict_item_columns(set_columns,df_columns,item,"中标单位",_doc["win_tenderer"]) if "second_tenderer" in _doc: set_dict_item_columns(set_columns,df_columns,item,"第二候选单位",_doc["second_tenderer"]) set_tenderer.add(_doc.get("second_tenderer")) if "third_tenderer" in _doc: set_dict_item_columns(set_columns,df_columns,item,"第三候选单位",_doc["third_tenderer"]) set_tenderer.add(_doc.get("third_tenderer")) if "win_tenderee_manager" in _doc: set_dict_item_columns(set_columns,df_columns,item,"中标单位联系人",_doc["win_tenderee_manager"]) if "win_tenderee_phone" in _doc: set_dict_item_columns(set_columns,df_columns,item,"中标单位联系电话",_doc["win_tenderee_phone"]) if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0: set_dict_item_columns(set_columns,df_columns,item,"中标金额",_doc["win_bid_price"]) if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0: set_dict_item_columns(set_columns,df_columns,item,"招标金额",_doc["bidding_budget"]) set_dict_item_columns(set_columns,df_columns,item,"入围供应商",",".join(list(set_tenderer))) if "第二候选单位" not in item: set_dict_item_columns(set_columns,df_columns,item,"第二候选单位","") if "第三候选单位" not in item: set_dict_item_columns(set_columns,df_columns,item,"第三候选单位","") if "招标金额" not in item: set_dict_item_columns(set_columns,df_columns,item,"招标金额","") if "中标金额" not in item: set_dict_item_columns(set_columns,df_columns,item,"中标金额","") if "中标单位" not in item: set_dict_item_columns(set_columns,df_columns,item,"中标单位","") if "中标单位联系人" not in item: set_dict_item_columns(set_columns,df_columns,item,"中标单位联系人","") if "中标单位联系电话" not in item: set_dict_item_columns(set_columns,df_columns,item,"中标单位联系电话","") # if item["中标单位"] not in set_enter: # continue _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"])) # if _line in set_line: # continue # if item["招标金额"]=="": # continue # set_line.add(_line) for k,v in item.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) df_data = {} set_columns = set() df_columns = [] # for name in df["ent_name_real"]: # if isinstance(name,str) and name!="": # list_should_q = [] # # list_should_q.append(MatchPhraseQuery("doctextcon",name)) # # list_should_q.append(MatchPhraseQuery("attachmenttextcon",name)) # NestedQuery("sub_docs_json","sub_docs_json.win_tenderer",name) # bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2018-01-01"), # RangeQuery("status",201,301), # # BoolQuery(should_queries=list_should_q), # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",name)) # ]) # # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", # SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True), # ColumnsToGet(columns,ColumnReturnType.SPECIFIED)) # # # while True: # getData(df_data,rows,set(),"",set_columns,df_columns) # if not next_token: # break # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", # SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True), # ColumnsToGet(columns,ColumnReturnType.SPECIFIED)) # if len(df_data.keys())>0: # print(len(df_data[list(df_data.keys())[0]]),total_count) rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True), ColumnsToGet(columns,ColumnReturnType.SPECIFIED)) while True: getData(df_data,rows,set(),"",set_columns,df_columns) if not next_token: break rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True), ColumnsToGet(columns,ColumnReturnType.SPECIFIED)) if len(df_data.keys())>0: print(len(df_data[list(df_data.keys())[0]]),total_count) # appendAttachmentPath(df_data,"page_attachments","附件链接") # df_columns.append("附件链接") # df_columns = ["docid","公告标题","公告类别","发布时间","公告内容","省份","城市","项目编号","招标单位","招标金额","资金来源","招标方式","代理单位","中标单位","中标金额","第二候选单位","第三候选单位","url","附件链接"] df = pd.DataFrame(df_data) df.to_excel("../data/%s_%s.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S"),page_time),columns=df_columns) def appendAttachmentPath(df_data,key,new_key): list_data = [] for _attach in df_data[key]: _dict = {key:_attach} list_data.append(_dict) task_queue = Queue() for _d in list_data: task_queue.put(_d) auth = oss2.Auth("LTAI5tFuoxHm8Uxrr5nT8wTZ", "Yp01bylJFx0al6teCaccY8hbtllBGg") bucket_url = "http://oss-cn-hangzhou.aliyuncs.com" attachment_bucket_name = "attachment-hub" bucket = oss2.Bucket(auth,bucket_url,attachment_bucket_name) ots_client = getConnect_ots() def search(ots_client,table_name,key_tuple,columns_to_get): try: # 调用get_row接口查询,最后一个参数值1表示只需要返回一个版本的值。 consumed, return_row, next_token = ots_client.get_row(table_name, key_tuple, columns_to_get, None, 1) if return_row is not None: _dict = getRow_ots_primary(return_row) return _dict return None # 客户端异常,一般为参数错误或者网络异常。 except OTSClientError as e: traceback.print_exc() log("get row failed, http_status:%d, error_message:%s" % (e.get_http_status(), e.get_error_message())) # 服务端异常,一般为参数错误或者流控错误。 except OTSServiceError as e: traceback.print_exc() log("get row failed, http_status:%d, error_code:%s, error_message:%s, request_id:%s" % (e.get_http_status(), e.get_error_code(), e.get_error_message(), e.get_request_id())) def _handle(item,result_queue): page_attachments = json.loads(item.get(key,"[]")) list_url = [] for _a in page_attachments: fileMd5 = _a.get("fileMd5") print("==",fileMd5) _s_dict = search(ots_client,"attachment",[("filemd5",fileMd5)],["path"]) if _s_dict is not None: _path = _s_dict.get("path") if _path is not None: _url = bucket.sign_url("GET",_path,86500*5) list_url.append(_url) item[new_key] = json.dumps(list_url) mt = MultiThreadHandler(task_queue,_handle,None,30) mt.run() df_data[new_key] = [] for _d in list_data: df_data[new_key].append(_d.get(new_key)) def export_competition(): file = "select___from_province_indus_entity_top1.xlsx" df1 = pd.read_excel(file) ots_client = getConnect_ots() task_queue = queue.Queue() list_entity = [] for province,industry,entitys in zip(df1["province"],df1["industry"],df1["entitys"]): l_e = json.loads(entitys) for l in l_e: list_entity.append({"province":province, "industry":industry, "win_tenderer":l.get("win_tenderee","")}) for item in list_entity: task_queue.put(item) def _handle(item,result_queue): def getData(rows,_set): dict_rows = getRow_ots(rows) for _dict in dict_rows: sub_docs_json = _dict.get("sub_docs_json") if sub_docs_json is not None: for sub_docs in json.loads(sub_docs_json): if sub_docs.get("win_tenderer") is not None: _set.add(sub_docs.get("win_tenderer")) if sub_docs.get("second_tenderer") is not None: _set.add(sub_docs.get("second_tenderer")) if sub_docs.get("third_tenderer") is not None: _set.add(sub_docs.get("third_tenderer")) columns = ["sub_docs_json"] _company = item.get("win_tenderer") should_q = BoolQuery(should_queries=[NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_company)), NestedQuery("sub_docs_json",TermQuery("sub_docs_json.second_tenderer",_company)), NestedQuery("sub_docs_json",TermQuery("sub_docs_json.third_tenderer",_company))]) bool_query = BoolQuery(must_queries=[ # should_q, MatchPhraseQuery("doctextcon",_company), RangeQuery("docchannel",101)]) _set = set() rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True), ColumnsToGet(columns,ColumnReturnType.SPECIFIED)) getData(rows,_set) _count = 0 _page = 0 while next_token: rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True), ColumnsToGet(columns,ColumnReturnType.SPECIFIED)) getData(rows,_set) _count += 1 _page += 1 if len(_set)>20 or _page>20: break if item["win_tenderer"] in _set: _set.remove(item["win_tenderer"]) item["competition"] = ",".join(list(_set)) mt = MultiThreadHandler(task_queue,_handle,None,30) mt.run() df_data = {} keys = ["province","industry","win_tenderer","competition"] for key in keys: if key not in df_data: df_data[key] = [] for item in list_entity: for key in keys: df_data[key].append(item.get(key)) df2 = pd.DataFrame(df_data) df2.to_excel("competition.xlsx",columns=keys) def document_dumplicate(): df = pd.read_excel("../data/2022-01-19_214329_export11.xlsx") print(df.keys()) def export_document_no_price(): df = pd.read_csv("select___from_document_no_price_tmp.csv",encoding="gbk") conn_oracle = getConnection_oracle() cursor = conn_oracle.cursor() sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO" cursor.execute(sql) dict_source = {} while 1: rows = cursor.fetchmany(10000) if not rows: break for row in rows: dict_source[row[0]] = row[1] dict_source[row[0].split("(")[0]] = row[1] list_name = [] set_web_source = set() for web_source_no in df["web_source_no"]: set_web_source.add(web_source_no) list_name.append(dict_source.get(web_source_no,"")) dict_source_year = {} for web_source_no,year,counts_no_price,counts_all,rate in zip(df["web_source_no"],df["year"],df["counts_no_price"],df["counts_all"],df["rate"]): dict_source_year["%s&%s"%(web_source_no,year)] = {"counts_no_price":counts_no_price,"counts_all":counts_all,"rate":rate} new_data = {"web_source_no":[], "web_source_name":[], "counts_no_price":[], "counts_all":[], "rate":[], "counts_no_price1":[], "counts_all1":[], "rate1":[]} for web_source_no in list(set_web_source): new_data["web_source_no"].append(web_source_no) new_data["web_source_name"].append(dict_source.get(web_source_no,"")) d_2020 = dict_source_year.get("%s&%s"%(web_source_no,"2020"),{}) d_2021 = dict_source_year.get("%s&%s"%(web_source_no,"2021"),{}) new_data["counts_no_price"].append(d_2020.get("counts_no_price")) new_data["counts_all"].append(d_2020.get("counts_all")) new_data["rate"].append(d_2020.get("rate")) new_data["counts_no_price1"].append(d_2021.get("counts_no_price")) new_data["counts_all1"].append( d_2021.get("counts_all")) new_data["rate1"].append(d_2021.get("rate")) # new_data = {"year":df["year"], # "web_source_no":df["web_source_no"], # "web_source_name":list_name, # "counts_no_price":df["counts_no_price"], # "counts_all":df["counts_all"], # "rate":df["rate"]} df2 = pd.DataFrame(new_data) df2.to_excel("websource_no_price1.xlsx",columns=["web_source_no","web_source_name","counts_no_price","counts_all","rate","counts_no_price1","counts_all1","rate1"]) def exportDetailLink(): df = pd.read_excel("招投标数据测试反馈表3.xlsx") list_item = [] for docid in df["docid"]: list_item.append({"docid":docid}) task_queue = queue.Queue() for item in list_item: task_queue.put(item) def _handle(item,result_queue,ots_client,pool_oracle): try: conn = pool_oracle.getConnector() docid = int(item["docid"]) partitionkey = int(docid%500+1) consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",int(docid))],["original_docchannel","detail_link","uuid"]) _dict = getRow_ots_primary(return_row) if _dict.get("detail_link") is not None and len(_dict.get("detail_link"))>0: item["detail_link"] = _dict.get("detail_link") else: original_docchannel = _dict.get("original_docchannel") _uuid = _dict.get("uuid") d_tablename = {"51":"T_GONG_GAO_BIAN_GENG", "52":"T_ZHAO_BIAO_GONG_GAO", "101":"T_ZHONG_BIAO_XIN_XI", "102":"T_ZHAO_BIAO_YU_GAO", "103":"T_ZHAO_BIAO_DA_YI", "104":"T_ZHAO_BIAO_WEN_JIAN", "114":"T_CAI_GOU_YI_XIANG" } _tablename = d_tablename.get(str(original_docchannel)) if _tablename is not None: cursor = conn.cursor() sql = "select detail_link from bxkc.%s where id='%s'"%(_tablename,_uuid) print(sql) cursor.execute(sql) rows = cursor.fetchall() if len(rows)>0: item["detail_link"] = rows[0][0] cursor.close() except Exception as e: traceback.print_exc() finally: pool_oracle.putConnector(conn) ots_client = getConnect_ots() pool_oracle = ConnectorPool(10,30,getConnection_oracle) mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=ots_client,pool_oracle=pool_oracle) mt.run() df_data = {"docid":[], "detail_link":[]} for item in list_item: for k,v in df_data.items(): v.append(item.get(k,"")) df2 = pd.DataFrame(df_data) df2.to_excel("222.xlsx") def process_doc(): df = pd.read_excel("../data/2022-03-16_154617_数据导出.xlsx",1) list_check = [] set_process_docid = set() for docid in df["process_docid"]: set_process_docid.add(docid) df = pd.read_excel("../data/2022-03-16_154617_数据导出.xlsx",0) for docid in df["docid"]: if docid in set_process_docid: list_check.append("1") else: list_check.append("0") df["check"] = list_check df.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S'))) def export_extract2(): ots_client = getConnect_ots() df_keys = ["docid","extract_json","status"] df_data = {} for _key in df_keys: df_data[_key] = [] bool_query = BoolQuery(must_queries=[ RangeQuery("status",1,1000,True,True)]) rows, next_token, total_count, is_all_succeed = ots_client.search("document_extract2", "document_extract2_index", SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True), ColumnsToGet(df_keys,return_type=ColumnReturnType.SPECIFIED)) list_dict = getRow_ots((rows)) for _dict in list_dict: if re.search("false",_dict.get("extract_json","")) is None: continue for k in df_keys: df_data[k].append(_dict.get(k)) _count = len(list_dict) while next_token: rows, next_token, total_count, is_all_succeed = ots_client.search("document_extract2", "document_extract2_index", SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True), ColumnsToGet(df_keys,return_type=ColumnReturnType.SPECIFIED)) list_dict = getRow_ots((rows)) for _dict in list_dict: if re.search("false",_dict.get("extract_json","")) is None: continue for k in df_keys: df_data[k].append(_dict.get(k)) _count += len(list_dict) print("%d/%d"%(_count,total_count)) df = pd.DataFrame(df_data) df.to_excel("../data/%s_extract2.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S"))) def export_by_file(): df = pd.read_csv("../data/2022-04-01_121315_数据导出.csv",encoding="gbk") keys = df.keys() df_data = {} set_win = set() set_ruwei = set() for k in keys: df_data[k] = [] for _i in range(len(df["产品"])): product = df["产品"][_i] if product is None or not isinstance(product,str): continue print(product) win_tenderer = df["中标单位"][_i] if win_tenderer is not None and isinstance(win_tenderer,str): set_win.add(win_tenderer) set_ruwei.add(win_tenderer) ruwei = df["入围供应商"][_i] if ruwei is not None and isinstance(ruwei,str): l_s = ruwei.split(",") for _s in l_s: set_ruwei.add(_s) if re.search("公路|道路|路基|路面|快速通道|高速|隧道|飞机跑道|桥梁|养护|路段|市政|照明工程|照明设施|亮灯|灯光改造|灯光工程|管道|架线|园林|景观|绿化|排水|河道整治|环境治理|交通|地铁|跌路|高铁|桥梁|大桥|桥段",product) is not None: for k in keys: df_data[k].append(df[k][_i]) print("win count:%d ruwei:%d"%(len(set_win),len(set_ruwei))) # df1 = pd.DataFrame(df_data) # df1.to_excel("../data/%s_文件导出.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")),columns=keys) def export_dump(): import pandas as pd df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=0) _set_number = set() _set_number |= set(df["号码"]) print(len(_set_number)) df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=1) _set_number |= set(df["号码"]) print(len(_set_number)) df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=2) keys = df.keys() df_data = {} for k in keys: df_data[k] = [] for _i in range(len(df[keys[0]])): if df["号码"][_i] not in _set_number: for k in keys: df_data[k].append(df[k][_i]) _set_number.add(df["号码"][_i]) df2 = pd.DataFrame(df_data) df2.to_excel("tmp222.xlsx") def check_data_synchronization(): filepath = "C:\\Users\\Administrator\\Desktop\\to_check.log" list_uuid = [] _regrex = "ID='(?P.+)'" with open(filepath,"r",encoding="utf8") as f: while 1: _line = f.readline() if not _line: break _match = re.search(_regrex,_line) if _match is not None: _uuid = _match.groupdict().get("uuid") if _uuid is not None: list_uuid.append(_uuid) print(len(list_uuid)) task_queue = Queue() list_data = [] for _uuid in list_uuid: _dict = {"uuid":_uuid} list_data.append(_dict) task_queue.put(_dict) ots_client = getConnect_ots() def _handle(_item,result_queue): bool_query = BoolQuery(must_queries=[TermQuery("uuid",_item.get("uuid"))]) rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,get_total_count=True), columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE)) _item["exists"] = total_count mt = MultiThreadHandler(task_queue,_handle,None,30) mt.run() df_data = {"uuid":[], "exists":[]} for _data in list_data: for k,v in df_data.items(): v.append(_data.get(k)) import pandas as pd df2 = pd.DataFrame(df_data) df2.to_excel("check.xlsx") def group_xlsx(): filename = "厂商&赛道列表.xlsx" df0 = pd.read_excel(filename,0) df1 = pd.read_excel(filename,1) df2 = pd.read_excel(filename,2) set_1 = set(df0["中国厂商"]) | set(df1["国际厂商"]) set_2 = set(df2["a"]) | set(df2["b"]) | set(df2["c"]) filename = "../data/2022-05-24_185801_数据导出.xlsx" df = pd.read_excel(filename) dict_docid = {} for docid,keyword in zip(df["docid"],df["关键词"]): if docid not in dict_docid: dict_docid[docid] = [[],[]] if keyword in set_1: dict_docid[docid][0].append(keyword) else: dict_docid[docid][1].append(keyword) set_docid = set() for k,v in dict_docid.items(): if len(v[0])>=1 and len(v[1])>=1: set_docid.add(k) keys = df.keys() print(keys) df_data = {} for i in range(len(df["docid"])): print(i) docid = df["docid"][i] if docid in set_docid: for k in keys: if k not in df_data: df_data[k] = [] df_data[k].append(df[k][i]) df_data["关键词"][-1] = str(dict_docid[docid][0][0])+"+"+str(dict_docid[docid][1][0]) df1 = pd.DataFrame(df_data) df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=keys) def static_process_time(): ots_client = getConnect_ots() bool_query = BoolQuery(must_queries=[ RangeQuery("crtime","2022-05-26","2022-05-27"), TermQuery("page_time","2022-05-26") ]) rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),get_total_count=True,limit=100), ColumnsToGet(column_names=["crtime","opertime","publishtime","page_attachments"],return_type=ColumnReturnType.SPECIFIED)) list_data = [] _l = getRow_ots(rows) list_data.extend(_l) while next_token: rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index", SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100), ColumnsToGet(column_names=["crtime","opertime","publishtime","page_attachments"],return_type=ColumnReturnType.SPECIFIED)) _l = getRow_ots(rows) list_data.extend(_l) print("%d/%d"%(len(list_data),total_count)) list_dis = [] list_dis_a = [] list_dis_n_a = [] for _data in list_data: crtime = _data.get("crtime") opertime = _data.get("opertime") page_attachments = _data.get("page_attachments","[]") _d = time.mktime(time.strptime(opertime,"%Y-%m-%d %H:%M:%S"))-time.mktime(time.strptime(crtime,"%Y-%m-%d %H:%M:%S")) list_dis.append(_d) if page_attachments=="[]": list_dis_n_a.append(_d) else: list_dis_a.append(_d) print("avg_time:",sum(list_dis)/len(list_dis),max(list_dis),min(list_dis)) print("avg_time:",sum(list_dis_a)/len(list_dis_a),max(list_dis_a),min(list_dis_a)) print("avg_time:",sum(list_dis_n_a)/len(list_dis_n_a),max(list_dis_n_a),min(list_dis_n_a)) def export_dump_by_id(): filename = "遗漏待验证1.csv" df = pd.read_csv(filename) list_k = [] ots_client = getConnect_ots() for _main_url,_other_url in zip(df["_c0"],df["_c1"]): _d = {} main_docid = re.split("[-.]",_main_url)[3] l_other = [] for _l in _other_url.split(","): _docid = re.split("[-.]",_l)[3] l_other.append(_docid) _d["main_docid"] = main_docid _d["other_docid"] = l_other list_k.append(_d) task_queue = Queue() for _q in list_k: task_queue.put(_q) def _handle(item,result_queue): columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","page_attachments"] main_docid = item["main_docid"] other_docid = item["other_docid"] list_should_q = [] list_should_q.append(TermQuery("docid",main_docid)) for _d in other_docid: list_should_q.append(TermQuery("docid",_d)) _query = BoolQuery(should_queries=list_should_q) l_rows = [] rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) dict_row = getRow_ots(rows) l_rows.extend(dict_row) log("total count:%d"%total_count) _count = len(dict_row) while next_token: rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True), ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)) dict_row = getRow_ots(rows) l_rows.extend(dict_row) _count += len(dict_row) item["data"] = l_rows mt = MultiThreadHandler(task_queue,_handle,None,30) mt.run() df_data = {"main_docid":[]} set_line = set() dict_channel = getDict_docchannel() for _d in list_k: list_row = _d.get("data") if list_row is not None: main_docid = _d.get("main_docid") getRowData(df_data,list_row,set_line,[],dict_channel,True) for _ in list_row: df_data["main_docid"].append(main_docid) df1 = pd.DataFrame(df_data) list_df_columns1 = ["main_docid"].extend(list_df_columns) df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns1) def count_product(): filename = "../data/2022-06-24_152201_数据导出.xlsx" df = pd.read_excel(filename) # _product = df["产品"] # dict_p_c = {} # for _p in _product: # if isinstance(_p,str) and _p!="": # l_p = _p.split(",") # for _p1 in l_p: # if _p1 not in dict_p_c: # dict_p_c[_p1] = 0 # dict_p_c[_p1] += 1 # df_data = {"产品":[], # "次数":[]} # for k,v in dict_p_c.items(): # df_data["产品"].append(k) # df_data["次数"].append(v) # df1 = pd.DataFrame(df_data) # df1.to_excel("222.xlsx") keys = df.keys() df_data = {} for k in keys: df_data[k] = [] product_pattern = "电脑|台式机|电脑|主机|网络|软件|开发|通信|系统|信息技术" df1 = pd.read_excel("222.xlsx") list_p = [] for _p,_n in zip(df1["产品"],df1["need"]): if _n==1: list_p.append(_p) product_pattern = product_pattern+"|"+"|".join(list_p) _product = df["产品"] for _i in range(len(_product)): if re.search(product_pattern,str(_product[_i])) is not None: for k in keys: df_data[k].append(df[k][_i]) df2 = pd.DataFrame(df_data) df2.to_excel("333.xlsx",columns=keys) from dataSource.source import getConnect_capacity def exportHonors_item_info(): ots_capacity = getConnect_capacity() bool_query = BoolQuery(should_queries=[ # TermQuery("ryjx","海河杯"), WildcardQuery("hjdw","*合肥建工集团有限公司*") ]) rows,next_token,total_count,is_all_succeed = ots_capacity.search("honors_item_info","honors_item_info_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("create_time")]),get_total_count=True,limit=100), columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL)) list_data = getRow_ots(rows) while next_token: rows,next_token,total_count,is_all_succeed = ots_capacity.search("honors_item_info","honors_item_info_index", SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100), columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL)) list_data.extend(getRow_ots(rows)) df_data = {} set_columns1 = set() list_df_columns1 = [] for _data in list_data: _dict = {} set_dict_item_columns(set_columns1,list_df_columns1,_dict,"record_id",_data.get("record_id")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"bfdw",_data.get("bfdw")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"create_time",_data.get("create_time")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"cs",_data.get("cs")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"detail_link",_data.get("detail_link")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"fbsj",_data.get("fbsj")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"hjdw",_data.get("hjdw")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"hjdwjs",_data.get("hjdwjs")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"hjxm",_data.get("hjxm")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"jxjb",_data.get("jxjb")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"jxlx",_data.get("jxlx")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"ryjx",_data.get("ryjx")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"sf",_data.get("sf")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"xmfzr",_data.get("xmfzr")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"zgdw",_data.get("zgdw")) set_dict_item_columns(set_columns1,list_df_columns1,_dict,"zxj",_data.get("zxj")) for k,v in _dict.items(): if k not in df_data: df_data[k] = [] df_data[k].append(v) df = pd.DataFrame(df_data) df.to_excel("honor_export.xlsx",columns=list_df_columns1) def check_dump_data(): ots_client = getConnect_ots() bool_query = BoolQuery(must_queries=[ generateBoolShouldQuery(["docchannel"],[52,102,114],TermQuery), RangeQuery("crtime","2022-09-07 20:00:00","2022-09-08 06:00:00"), RangeQuery("page_time","2022-09-07","2022-09-08"), RangeQuery("status",201,301) ]) list_data = getDocument([{"query":bool_query}],["docid"],table_name="document",table_index="document_index") bool_query1 = BoolQuery(must_queries=[ generateBoolShouldQuery(["docchannel"],[52,102,114],TermQuery), RangeQuery("crtime","2022-09-07 20:00:00","2022-09-08 06:00:00"), RangeQuery("page_time","2022-09-07","2022-09-08"), RangeQuery("status",81,100), TermQuery("save",1) ]) list_data1 = getDocument([{"query":bool_query1}],["docid"],table_name="document_tmp",table_index="document_tmp_index") set_docid = set() set_docid_tmp = set() for _data in list_data: set_docid.add(_data.get("docid")) for _data in list_data1: set_docid_tmp.add(_data.get("docid")) print("document - tmp",set_docid-set_docid_tmp) print("tmp - document",set_docid_tmp-set_docid) def search_title_count(): filename = "数据样例.xlsx" df = pd.read_excel(filename) list_title_dict = [] for _title in df["标题"]: _dict = {"标题":_title} list_title_dict.append(_dict) task_queue = Queue() for _d in list_title_dict: task_queue.put(_d) ots_client = getConnect_ots() def _handle(item,result_queue): columns = ["status","tenderee","agency","sub_docs_json"] _title = item.get("标题","") if _title!="": bool_query = BoolQuery(must_queries=[MatchPhraseQuery("doctitle",_title)]) rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,get_total_count=True,limit=10), columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED)) list_data = getRow_ots(rows) item["比地数量"] = total_count if len(list_data)>0: _str_docid = ",".join([str(a.get("docid")) for a in list_data]) item["比地_docid"] = _str_docid tenderee = list_data[0].get("tenderee") item["比地_招标人"] = tenderee agency = list_data[0].get("agency") item["比地_代理人"] = agency sub_docs_json = list_data[0].get("sub_docs_json") if sub_docs_json is not None: sub_docs = json.loads(sub_docs_json) win_tenderer = "" win_bid_price = "" for _doc in sub_docs: if _doc.get("win_tenderer","")!="": win_tenderer = _doc.get("win_tenderer") win_bid_price = _doc.get("win_bid_price") item["比地_中标人"] = win_tenderer item["比地_中标金额"] = win_bid_price mt = MultiThreadHandler(task_queue,_handle,None,30) mt.run() df_data = {} keys = ["标题","比地数量","比地_docid","比地_招标人","比地_代理人","比地_中标人","比地_中标金额"] for _d in list_title_dict: for k in keys: if k not in df_data: df_data[k] = [] df_data[k].append(_d.get(k,"")) df1 = pd.DataFrame(df_data) df1.to_excel("比地对比数据.xlsx",columns=keys) def getDumplicate_docid(): filename = "2022-11-02_154222_数据导出.xlsx" df = pd.read_excel(filename) list_docid = df["docid"] task_queue = Queue() list_d = [] for _docid in list_docid: _dict = {"docid":_docid} list_d.append(_dict) task_queue.put(_dict) ots_client = getConnect_ots() def _handle(item,result_queue): _docid = item.get("docid") bool_query = BoolQuery(must_queries=[TermQuery("docid",int(_docid))]) rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index", SearchQuery(bool_query), columns_to_get=ColumnsToGet(["save"],return_type=ColumnReturnType.SPECIFIED)) list_data = getRow_ots(rows) if len(list_data)>0: _save = list_data[0].get("save") item["save"] = _save mt = MultiThreadHandler(task_queue,_handle,None,30) mt.run() list_d_docid = [] for _data in list_d: docid = _data.get("docid") save = _data.get("save") if save==0: list_d_docid.append(str(docid)) print(",".join(list_d_docid)) def getDocumentHtml(): filename = "../data/2023-02-20_154118_数据导出.xlsx" df = pd.read_excel(filename) ots_client = getConnect_capacity() list_html_data = [] _count = 0 for docid in df["docid"][:10000]: partitionkey = int(docid)%500+1 docid = int(docid) try: consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["dochtmlcon"]) _dict = getRow_ots_primary(return_row) list_html_data.append(_dict) _count += 1 print("%d/%d"%(_count,len(df["docid"]))) except Exception as e: pass save(list_html_data,"list_html_data.pk") def exportAgencyCount(): filename = "广州招标协会.xlsx" df = pd.read_excel(filename) a = df["a"] df_data = {} set_c = set() for line in a: list_c = line.split(" ") for _i in range(len(list_c)): _key = "c_%s"%(str(_i).rjust(2,'0')) if _key not in df_data: df_data[_key] = [] set_c.add(_key) df_data[_key].append(list_c[_i]) list_data = [] list_query = [] ots_client = getConnect_ots() for _agency in df_data["c_00"]: query = BoolQuery(must_queries=[TermQuery("city","广州"), TermQuery("docchannel",52), RangeQuery("status",201,301), RangeQuery("page_time","2022-01-01","2023-01-01"), TermQuery("agency",_agency), BoolQuery(should_queries=[ BoolQuery(should_queries=[MatchPhraseQuery("doctitle","工程施工"), MatchPhraseQuery("doctextcon","建造师"), MatchPhraseQuery("attachmenttextcon","建造师")]), BoolQuery(should_queries=[MatchPhraseQuery("doctitle","监理"), MatchPhraseQuery("doctextcon","监理工程师"), MatchPhraseQuery("attachmenttextcon","监理工程师")]), BoolQuery(should_queries=[MatchPhraseQuery("doctitle","造价咨询"), MatchPhraseQuery("doctitle","预算"), MatchPhraseQuery("doctitle","造价审核"), MatchPhraseQuery("doctitle","结算"), MatchPhraseQuery("doctitle","概算")]), ]) ], must_not_queries=[generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["广州公共资源交易中心"],MatchPhraseQuery)]) # list_row = getDocument([{"query":query}],["agency","page_time","sub_docs_json"],thread_count=1) list_query.append({"query":query}) df_data = {} set_line = set() columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"] list_row = getDocument(list_query,columns,thread_count=30) # list_row = filterRow(list_row,"product",list_not_key) log("get document %d rows"%len(list_row)) # getRowDataWithKey(df_data,list_row,columns) dict_channel = getDict_docchannel() getRowData(df_data,list_row,set_line,[''],dict_channel,True) # getRowData_sp1(df_data,list_row,set_line,list_keyword,dict_sptype,True) # fixContactPerson(df_data,list_df_columns,get_legal_person=False) df1 = pd.DataFrame(df_data) df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns) return # _dict = {"一季度预算":0,"一季度总数":0,"一季度有金额占比":0, # "二季度预算":0,"二季度总数":0,"二季度有金额占比":0, # "三季度预算":0,"三季度总数":0,"三季度有金额占比":0, # "四季度预算":0,"四季度总数":0,"四季度有金额占比":0, # } # print(_agency,len(list_row)) # # # for _row in list_row: # print(_row.get("docid")) # page_time = _row.get("page_time","") # sub_docs_json = _row.get("sub_docs_json","") # _t = None # if page_time!="": # if page_time<="2022-03-31": # _t = "一季度" # elif page_time<="2022-06-31": # _t = "二季度" # elif page_time<="2022-09-31": # _t = "三季度" # elif page_time<="2022-12-31": # _t = "四季度" # if sub_docs_json != "": # sub_docs = json.loads(sub_docs_json) # for _doc in sub_docs: # bidding_budget = _doc.get("bidding_budget",0) # _dict["%s预算"%_t] += bidding_budget # _dict["%s总数"%_t] += 1 # if bidding_budget>0: # _dict["%s有金额占比"%_t] += 1 # print(_dict) # _sum = 0 # _sum_n = 0 # _sum_hm = 0 # for k in ["一季度","二季度","三季度","四季度"]: # km = "%s预算"%k # kn = "%s总数"%k # khm = "%s有金额占比"%k # _sum += _dict[km] # _sum_n += _dict[kn] # _sum_hm += _dict[khm] # _dict["全年预算"] = _sum # _dict["全年总数"] = _sum_n # _dict["全年有金额占比"] = _sum_hm # for k in ["一季度","二季度","三季度","四季度","全年"]: # km = "%s预算"%k # kn = "%s总数"%k # khm = "%s有金额占比"%k # _dict[khm] = _dict[khm]/_dict[kn] if _dict[kn]>0 else 0 # for k,v in _dict.items(): # if k not in df_data: # df_data[k] = [] # df_data[k].append(v) # # list_c = list(set_c) # list_c.sort(key=lambda x:x) # for k,v in df_data.items(): # print(k,len(v)) # df1 = pd.DataFrame(df_data) # list_c.append("一季度预算") # list_c.append("一季度总数") # list_c.append("一季度有金额占比") # list_c.append("二季度预算") # list_c.append("二季度总数") # list_c.append("二季度有金额占比") # list_c.append("三季度预算") # list_c.append("三季度总数") # list_c.append("三季度有金额占比") # list_c.append("四季度预算") # list_c.append("四季度总数") # list_c.append("四季度有金额占比") # list_c.append("全年预算") # list_c.append("全年总数") # list_c.append("全年有金额占比") # df1.to_excel("%s_1.xlsx"%(filename),columns=list_c) def attachAttachment(): filename = "北京电信ICT样例(2023一季度)v1.1(2).xlsx" df = pd.read_excel(filename,1) list_data = [] task_queue = Queue() for _docid in df["docid"]: _d = {"docid":_docid} list_data.append(_d) task_queue.put(_d) print("len_docid",len(df["docid"]),len(list_data)) capacity = getConnect_capacity() def _handle(item,result_queue): docid = item["docid"] consumed, return_row, next_token = capacity.get_row("document",[("partitionkey",int(docid)%500+1),("docid",int(docid))],["dochtmlcon"]) _d = getRow_ots_primary(return_row) _dochtmlcon = _d["dochtmlcon"] _dochtmlcon = re.sub("|||","",_dochtmlcon) _soup = BeautifulSoup(_dochtmlcon,"lxml") _div = _soup.find("div",attrs={"class":"richTextFetch"}) if _div is None: _div = "" item["attachment"] = _div mt = MultiThreadHandler(task_queue,_handle,None,30) mt.run() list_attachment = [] for _d in list_data: list_attachment.append(getLegal_str(_d.get("attachment",""))) df_data = {} df_data["附件html"] = list_attachment df_1 = pd.DataFrame(df_data) df_1.to_excel("附加html_"+filename) def compareData(): filename = "D:\\BaiduNetdiskDownload\\bidi_check.csv" list_data = [] with open(filename,"r",encoding="utf8") as f: list_lines = f.readlines() for _line in list_lines: docid,docchannel,win_tenderer,tenderee,win_bid_price,bidding_budget = [None if a[:2]=='\\N' else a for a in _line.split("\t")] _d = {"docid":int(docid), "docchannel":docchannel, "win_tenderer":win_tenderer, "tenderee":tenderee, "win_bid_price":float(win_bid_price) if win_bid_price is not None else None, "bidding_budget":float(bidding_budget) if bidding_budget is not None else None} list_data.append(_d) del list_lines # for _i in range(len(list_data)): # print(list_lines[_i]) # print(list_data[_i]) ots_client = getConnect_ots() task_queue = Queue() for _d in list_data: task_queue.put(_d) def _handle(item,result_queue): docid = item.get("docid") win_tenderer = item.get("win_tenderer") win_bid_price = item.get("win_bid_price") tenderee = item.get("tenderee") bidding_budget = item.get("bidding_budget") must_q = [TermQuery("docid",int(docid))] if tenderee is not None: must_q.append(TermQuery("tenderee",tenderee)) if win_tenderer is not None: must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",win_tenderer))) if win_bid_price is not None: must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_bid_price",win_bid_price))) if bidding_budget is not None: must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.bidding_budget",bidding_budget))) rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(BoolQuery(must_queries=must_q),get_total_count=True), columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE)) item["total_count"] = total_count if total_count==0: print("docid %d total_count is 0",docid) rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(BoolQuery(must_queries=[TermQuery("docid",docid)]),get_total_count=True), columns_to_get=ColumnsToGet(["tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED)) l_d = getRow_ots(rows) item["return_row"] = l_d mt = MultiThreadHandler(task_queue,_handle,None,30) mt.run() list_new_data = [] for data in list_data: if data.get("total_count")==0: new_d = {"docid":data.get("docid"), "docchannel":data.get("docchannel")} return_row = data.get("return_row") if len(return_row)>0: # print(return_row) _row = return_row[0] tenderee = _row.get("tenderee") sub_docs_json = _row.get("sub_docs_json") bidding_budget = None win_tenderer = None win_bid_price = None if sub_docs_json is not None: sub_docs = json.loads(sub_docs_json) for _doc in sub_docs: if _doc.get("bidding_budget") is not None: bidding_budget = _doc.get("bidding_budget") if _doc.get("win_tenderer") is not None: win_tenderer = _doc.get("win_tenderer") win_bid_price = _doc.get("win_bid_price") new_d["tenderee"] = tenderee new_d["bidding_budget"] = bidding_budget new_d["win_tenderer"] = win_tenderer new_d["win_bid_price"] = win_bid_price list_new_data.append(new_d) df_data_c = ["docid","docchannel","win_tenderer","tenderee","win_bid_price","bidding_budget"] df_data = {} for c in df_data_c: df_data[c] = [] for _d in list_new_data: for c in df_data_c: df_data[c].append(_d.get(c)) df = pd.DataFrame(df_data) df.to_csv("bid_check_result.csv",columns=df_data_c) def exportProducts(): filename = "货物关键词.xlsx" dict_channel = getDict_docchannel() df = pd.read_excel(filename) list_products = df["货物关键词"] list_q = [] list_result = [] ots_client = getConnect_ots() columns = ["产品","总数","匹配模式"] _index = 0 task_queue = Queue() for _product in list_products: _index += 1 print(_product,"%d/%d"%(_index,len(list_products))) bool_query = BoolQuery(must_queries=[NestedQuery("products",TermQuery("products.product",_product)), # RangeQuery("page_time","2021-01-01"), RangeQuery("status",201,301), TermQuery("docchannel",101)]) _q = {"query":bool_query,"product":_product,"匹配模式":"精准"} task_queue.put(_q) bool_query = BoolQuery(must_queries=[NestedQuery("products",WildcardQuery("products.product","*%s*"%_product)), # RangeQuery("page_time","2021-01-01"), RangeQuery("status",201,301), TermQuery("docchannel",101)]) _q = {"query":bool_query,"product":_product,"匹配模式":"包括"} task_queue.put(_q) def _handle(item,result_queue): bool_query = item["query"] _product = item["product"] _type = item["匹配模式"] rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,get_total_count=True), columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE)) list_result.append({"产品":_product,"总数":total_count,"匹配模式":_type}) mt = MultiThreadHandler(task_queue,_handle,None,30) mt.run() print("done result length:%d"%(len(list_result))) df_data = {} for _d in list_result: for c in columns: if c not in df_data: df_data[c] = [] df_data[c].append(_d.get(c)) df1 = pd.DataFrame(df_data) df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=columns) def statics_attachment_counts(): bool_query = BoolQuery(must_queries=[ generateBoolShouldQuery(["industry"],["土木工程建筑业","建筑装饰和其他建筑业","房屋建筑业","专业施工","修缮工程","建筑安装业"],TermQuery), RangeQuery("page_time","2023-08-07","2023-08-14"), RangeQuery("status",201,301), NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5")) ]) ots_client = getConnect_ots() rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True), ColumnsToGet(["industry","docchannel","page_attachments"],return_type=ColumnReturnType.SPECIFIED)) list_data = getRow_ots(rows) while next_token: rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True), ColumnsToGet(["industry","docchannel","page_attachments"],return_type=ColumnReturnType.SPECIFIED)) list_data.extend(getRow_ots(rows)) # if len(list_data)>1000: # break task_queue = Queue() for _data in list_data: task_queue.put(_data) def _handle(item,result_queue): page_attachments = item.get("page_attachments") _size = 0 if page_attachments is not None and page_attachments!="": list_attach = json.loads(page_attachments) for _attach in list_attach: _md5 = _attach.get("fileMd5") if _md5 is not None: consumed, return_row, next_token = ots_client.get_row("attachment",[("filemd5",_md5)],["size"]) _d = getRow_ots_primary(return_row) _size += _d.get("size",0) item["size"] = _size mt = MultiThreadHandler(task_queue,_handle,None,30) mt.run() dict_result = {} for data in list_data: industry = data.get("industry") docchannel = data.get("docchannel") _type = "" if docchannel==52: _type = "招标" elif docchannel in (101,118,119,120): _type = "中标" else: _type = "其他" _key = "%s-%s"%(industry,_type) if _key not in dict_result: dict_result[_key] = 0 dict_result[_key] += data.get("size",0) print(dict_result) for k,v in dict_result.items(): print(k,"%.2fM"%(v/7/1024/1024)) def static_dump(): import pandas as pd filename = "select___from_bxkc_bxkc_delete_document_.csv" df = pd.read_csv(filename) print(df.keys()) list_docid = df["docid"] list_dup_docid = df["dup_docid"] list_operate_time = df["operate_time"] list_a = [] for docid,dup_docid in zip(list_docid,list_dup_docid): docid = int(docid) _flag = False if isinstance(dup_docid,str) and dup_docid is not None and dup_docid!="": _l = dup_docid.split(",") for _i in _l: if _i.strip()!="": docid1 = int(_i) if docid1>docid: _flag = True break if _flag: list_a.append("是") else: list_a.append("否") df_data = {"被去重docid":list_docid, "重复id":list_dup_docid, "是否展示后删除":list_a} df1 = pd.DataFrame(df_data) df1.to_csv("16号去重统计.csv") def append_title(): import pandas as pd filename = "去重记录.xlsx" df = pd.read_excel(filename) list_docid = df["被去重id"] list_keep_id = df["保留id"] list_data = [] task_queue = Queue() for _docid,keep_docid in zip(list_docid,list_keep_id): _d = {"dup_docid":int(_docid), "keep_docid":int(keep_docid)} list_data.append(_d) task_queue.put(_d) ots_client = getConnect_ots() def _handle(item,result_queue): dup_docid = item.get("dup_docid") keep_docid = item.get("keep_docid") dup_partitionkey = dup_docid%500+1 keep_partitionkey = keep_docid%500+1 consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",dup_partitionkey),("docid",dup_docid)],["status","doctitle","extract_count"]) _d = getRow_ots_primary(return_row) if _d is not None: doctitle = _d.get("doctitle") item["dup_title"] = doctitle extract_count = _d.get("extract_count") item["dup_extract_count"] = extract_count consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",keep_partitionkey),("docid",keep_docid)],["status","doctitle","extract_count","extract_count"]) _d = getRow_ots_primary(return_row) if _d is not None: doctitle = _d.get("doctitle") item["keep_title"] = doctitle status = _d.get("status") extract_count = _d.get("extract_count") item["keep_extract_count"] = extract_count if status>=201 and status<=300: item["保留id状态"] = "正常" elif status>=401: item["保留id状态"] = "去重" else: item["保留id状态"] = "" mt = MultiThreadHandler(task_queue,_handle,None,39) mt.run() keys = ["dup_docid","keep_docid","dup_title","keep_title","保留id状态","dup_extract_count","keep_extract_count"] df_data = {} for data in list_data: for k in keys: if k not in df_data: df_data[k] = [] df_data[k].append(data.get(k)) df1 = pd.DataFrame(df_data) df1.to_excel("%s.xlsx"%(filename),columns=keys) def get_follows(): _json = ''' [ ] ''' ots_client = getConnect_ots() list_follows = json.loads(_json) new_list = [] for follow in list_follows: docid = follow.get("docid") partitionkey = docid%500+1 consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["tenderee"]) _d = getRow_ots_primary(return_row) print("docid",_d.get("tenderee")) if _d.get("tenderee")=="泗阳意杨产业科技园实业有限公司": new_list.append(follow) print(json.dumps(new_list,ensure_ascii=False)) def validateTitle(title): rstr = r"[\/\\\:\*\?\"\<\>\|\r\n]" # '/ \ : * ? " < > |' new_title = re.sub(rstr, "_", title) # 替换为下划线 return new_title def exportParameters(): from glob import glob attach_path = "F:/Workspace2016/BaseDataMaintenance/BaseDataMaintenance/maintenance/product/download" ots_client = getConnect_ots() bool_query = BoolQuery(must_queries=[TermQuery("parameter_status",1)]) save_dir = "product" if not os.path.exists(save_dir): os.mkdir(save_dir) rows,next_token,total_count,is_all_succeed = ots_client.search("document_product2","document_product2_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True), ColumnsToGet(["parameter","bid_filemd5s","name","original_name"],return_type=ColumnReturnType.SPECIFIED)) list_data = getRow_ots(rows) write_count = 0 for _data in list_data: bid_filemd5s = _data["bid_filemd5s"] parameter = _data["parameter"] name = _data["name"] original_name = _data["original_name"] list_md5s = bid_filemd5s.split(",") if len(list_md5s)==1: list_path = glob(os.path.join(attach_path,bid_filemd5s)+"*") for _path in list_path: if not _path.endswith(".html"): filename = _path.split("\\")[-1] with open(os.path.join(save_dir,filename),"wb") as f: f.write(open(_path,"rb").read()) pname = "%s_name%s_original_name%s.html"%(bid_filemd5s,name,original_name[:10]) pname = validateTitle(pname) with open(os.path.join(save_dir,pname),"w",encoding="utf8") as f: f.write(parameter) write_count += 1 while next_token: rows,next_token,total_count,is_all_succeed = ots_client.search("document_product2","document_product2_index", SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True), ColumnsToGet(["parameter","bid_filemd5s","name","original_name"],return_type=ColumnReturnType.SPECIFIED)) list_data = getRow_ots(rows) for _data in list_data: bid_filemd5s = _data["bid_filemd5s"] parameter = _data["parameter"] name = _data["name"] original_name = _data["original_name"] list_md5s = bid_filemd5s.split(",") if len(list_md5s)==1: list_path = glob(os.path.join(attach_path,bid_filemd5s)+"*") for _path in list_path: if not _path.endswith(".html"): filename = _path.split("\\")[-1] with open(os.path.join(save_dir,filename),"wb") as f: f.write(open(_path,"rb").read()) pname = "%s_name%s_original_name%s.html"%(bid_filemd5s,name,original_name[:10]) pname = validateTitle(pname) with open(os.path.join(save_dir,pname),"w",encoding="utf8") as f: f.write(parameter) write_count += 1 if write_count>=2000: return def exportProjects(): bool_query = BoolQuery(must_queries=[ TermQuery("docid_number",1), ExistsQuery("zhong_biao_page_time"), RangeQuery("page_time","2023-01-01","2023-10-10") ], must_not_queries=[ MatchPhraseQuery("doctitles","网上超市") ]) ots_client = getConnect_ots() rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index", SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=100,get_total_count=True), ColumnsToGet(["docids","doctitles","project_codes"],return_type=ColumnReturnType.SPECIFIED)) list_data = getRow_ots(rows) while next_token: rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index", SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True), ColumnsToGet(["docids","doctitles","project_codes"],return_type=ColumnReturnType.SPECIFIED)) list_data.extend(getRow_ots(rows)) if len(list_data)>10000: break task_queue = Queue() for data in list_data: task_queue.put(data) def _handle(item,result_queue): docids = item["docids"] project_codes = item.get("project_codes","") if len(project_codes)>0: list_codes = project_codes.split(",") should_q = [] for code in list_codes: should_q.append(MatchPhraseQuery("doctextcon",code)) should_q.append(MatchPhraseQuery("attachmenttextcon",code)) _query = BoolQuery(must_queries=[BoolQuery(should_queries=should_q),RangeQuery("status",201,301)], must_not_queries=[TermQuery("docid",int(docids))]) rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index", SearchQuery(_query,limit=100), ColumnsToGet(["doctitle"],return_type=ColumnReturnType.SPECIFIED)) item["result"] = json.dumps(getRow_ots(rows),ensure_ascii=False) mt = MultiThreadHandler(task_queue,_handle,None,30) mt.run() columns = ["docids","doctitles","project_codes","result"] df_data = {} for data in list_data: for c in columns: if c not in df_data: df_data[c] = [] df_data[c].append(data.get(c,"")) df = pd.DataFrame(df_data) df.to_excel("toMerge.xlsx",columns=columns) if __name__=="__main__": # compareData() # attachAttachment() # exportDocument_By_time(time_from="2021-01-29",time_to="2021-01-29",columns=["docid","doctitle","project_name","dochtmlcon"]) # processDocument() # export_extract_check() # exportArticle_by_websource() # export_keyword_count() # export_province_keyword_count() # exportDocument_dump() # exportDocument_dump_mysql() # export_attachment() # statics_attachment_counts() # get_follows() # append_title() # exportDocument_by_doctitle() # exportIndustryCount() exportDocument_by_pagetime() # exportProjects() # exportProducts() # exportParameters() # exportAgencyCount() # getDocumentHtml() # getDumplicate_docid() # exportHonors_item_info() # check_dump_data() # search_title_count() # count_product() # export_dump_by_id() # group_xlsx() # static_process_time() # check_data_synchronization() # process_doc() # export_competition() # for page_time in ["2022-08-01"]: # exportDocument_by_days(page_time) # exportDocument_forRecommen() # exportDocument_attachment() # exportWin_tenderer_count() # attachCompanyContact() # dumpWebSourceNo() # print("http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%103571618)))) # exportNzj() # turn_status() # attachBidding_budget() # debug_documentMerge() # exportDocument_medicine("2021-05-24","2021-05-30") # signDocument() # transUUid() # fix_document() # export_document_no_price() # findProjects() # exportDetailLink() # export_extract_check() # export_extract2() # export_by_file() # export_dump()