#encoding:GBK
"""Export designed-project records from Tablestore into an Excel workbook.

Run as a script; the query (keywords, date range) is configured by the
constants inside ``exportDocument_by_pagetime``.
"""
import sys
import os
sys.path.append("../")

import pandas as pd
from dataSource.source import *
import json
from utils.multiThread import MultiThreadHandler
import queue
from utils.Utils import *
from dataSource.pool import ConnectorPool
import re
from tablestore import *
import traceback
from utils.hashUtil import aesCipher
from export.exportEnterprise import getDictEnterprise,getOneContact

# Module-wide registry of the output columns, in first-seen order.
set_columns = set()
list_df_columns = []

# A cellphone is accepted when its last space-separated token is 11 digits
# starting with "1".
_MOBILE_PATTERN = re.compile(r"^1\d{10}$")
# Value of a contact's "type" field that marks the project owner.
_OWNER_TYPE = "业主单位"

_TABLE_NAME = "designed_project"
_INDEX_NAME = "designed_project_index"
_PAGE_SIZE = 100
_DETAIL_URL = "http://www.bidizhaobiao.com/nzjxm-%s.html"

# (output column title, source field) pairs copied verbatim from each row.
# NOTE(review): "层数" is filled from "floor_space" -- confirm this mapping.
_PROJECT_FIELDS = (
    ("发布时间", "page_time"),
    ("建筑面积", "covered_area"),
    ("层数", "floor_space"),
    ("阶段", "progress"),
    ("项目地址", "project_address"),
    ("简介", "project_description"),
    ("项目投资", "project_investment"),
)


def set_dict_item_columns(set_columns1,list_df_columns1,_dict,name,v):
    """Store ``v`` under ``name`` in ``_dict`` and register the column.

    Both the value and the registered column title are passed through
    ``getLegal_str`` (imported helper; presumably strips characters that are
    illegal in Excel -- verify against utils.Utils).

    :param set_columns1: set of column names already registered
    :param list_df_columns1: ordered list of registered column titles
    :param _dict: row dictionary to update
    :param name: column name
    :param v: raw cell value
    """
    _dict[name] = getLegal_str(v)
    if name not in set_columns1:
        set_columns1.add(name)
        list_df_columns1.append(getLegal_str(name))


def set_dict_item(_dict,name,v):
    """Same as ``set_dict_item_columns`` but using the module-level registry."""
    set_dict_item_columns(set_columns,list_df_columns,_dict,name,v)


def getTenderee(contacts):
    """Pick the owner contact out of a JSON-encoded contact list.

    Owner entries (``type == "业主单位"``) with a mobile-looking ``cellphone``
    are preferred; otherwise the first owner entry is used together with its
    ``phone`` field.

    :param contacts: JSON string holding a list of contact dicts
    :return: ``(company_name, contact_name, phone)``; three empty strings when
        ``contacts`` is empty, is not a JSON list, or has no owner entry
    :raises ValueError: if ``contacts`` is not valid JSON
    """
    if not contacts:
        return "","",""
    list_contacts = json.loads(contacts)
    if not isinstance(list_contacts,list):
        return "","",""

    owners = [_c for _c in list_contacts
              if isinstance(_c,dict) and _c.get("type")==_OWNER_TYPE]

    for _contact in owners:
        _phone = _contact.get("cellphone")
        # Only the last space-separated token is checked, as the field may
        # hold several numbers.
        if _phone and _MOBILE_PATTERN.search(str(_phone).split(" ")[-1]) is not None:
            return _contact.get("company_name"),_contact.get("contact_name"),_phone

    if owners:
        _first = owners[0]
        return _first.get("company_name"),_first.get("contact_name"),_first.get("phone")
    return "","",""


def _append_project_rows(df_data,rows):
    """Convert one page of search rows and append them column-wise to ``df_data``."""
    for _dict in getRow_ots(rows):
        item = {}
        tenderee,contact_name,contact_phone = getTenderee(_dict.get("contacts","[]"))
        set_dict_item(item,"业主",tenderee)
        set_dict_item(item,"业主联系人",contact_name)
        set_dict_item(item,"业主联系电话",contact_phone)
        for _title,_field in _PROJECT_FIELDS:
            set_dict_item(item,_title,_dict.get(_field,""))

        # A row without an id gets an empty address instead of raising on the
        # format operation.
        _id = _dict.get("id")
        set_dict_item(item,"地址",_DETAIL_URL%_id if _id is not None else "")

        for k,v in item.items():
            df_data.setdefault(k,[]).append(v)


def _warn_if_incomplete(is_all_succeed):
    """Report a search page for which the service flagged partial failure."""
    if not is_all_succeed:
        print("warning: search did not succeed on all partitions, result may be incomplete")


def exportDocument_by_pagetime():
    """Query designed projects by keyword and publish date, then export to Excel.

    Rows whose ``full_text`` phrase-matches any configured keyword and whose
    ``page_time`` lies within the configured range are fetched page by page
    and written to ``../data/<timestamp>_数据导出.xlsx``. Nothing is written
    when the query returns no rows.
    """
    ots_client = getConnect_ots()

    columns = ["page_time","contacts","covered_area","floor_space","progress","project_address","project_description","project_investment","project_name"]
    columns_to_get = ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)

    str_keywords = '''
    医院 养老院 疗养院 老人院
    '''
    list_keyword = []
    list_should_keyword = []
    for _p in re.split(r"\s|、|,|,|/",str_keywords):
        if _p.strip()=="":
            continue
        list_keyword.append(_p)
        print(_p)
        list_should_keyword.append(MatchPhraseQuery('full_text', '%s'%_p.strip()))

    bool_query = BoolQuery(must_queries=[
        RangeQuery("page_time","2020-01-01","2022-01-01",True,True),
        BoolQuery(should_queries=list_should_keyword),
    ])

    df_data = {}

    # First page: the sort order is given here; following pages continue
    # from next_token.
    rows, next_token, total_count, is_all_succeed = ots_client.search(
        _TABLE_NAME, _INDEX_NAME,
        SearchQuery(bool_query,
                    sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]),
                    limit=_PAGE_SIZE, get_total_count=True),
        columns_to_get)
    _warn_if_incomplete(is_all_succeed)
    _count = len(rows)
    print(list_keyword)
    print("total_count:%d"%total_count)
    _append_project_rows(df_data,rows)

    while next_token:
        print("%d/%d"%(_count,total_count))
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            _TABLE_NAME, _INDEX_NAME,
            SearchQuery(bool_query, next_token=next_token,
                        limit=_PAGE_SIZE, get_total_count=True),
            columns_to_get)
        _warn_if_incomplete(is_all_succeed)
        _append_project_rows(df_data,rows)
        _count += len(rows)

    if not df_data:
        # to_excel raises KeyError when none of the requested columns exist
        # in the frame, which is the case for an empty result.
        print("no rows matched, nothing exported")
        return

    df1 = pd.DataFrame(df_data)
    df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)


if __name__=="__main__":
    exportDocument_by_pagetime()