|
- #encoding:GBK
- import sys
- import os
- sys.path.append("../")
- import pandas as pd
- from dataSource.source import *
- import json
- from utils.multiThread import MultiThreadHandler
- import queue
- from utils.Utils import *
- from dataSource.pool import ConnectorPool
- import re
- from tablestore import *
- import traceback
- from utils.hashUtil import aesCipher
- from export.exportEnterprise import getDictEnterprise,getOneContact
# Column names already registered by set_dict_item; used for the membership check.
set_columns = set()
# Registered column names in first-seen order; passed as `columns=` when the
# DataFrame is written to Excel.
list_df_columns = []
def set_dict_item(_dict, name, v):
    """Store a sanitised value in a row dict and register its column globally.

    ``_dict[name]`` is set to ``getLegal_str(v)``. The first time ``name`` is
    seen it is added to the module-level ``set_columns`` and
    ``getLegal_str(name)`` is appended to the module-level
    ``list_df_columns``, so the export keeps columns in first-seen order.

    Args:
        _dict: row dictionary to update in place.
        name: column header for the value.
        v: raw value to store.
    """
    # Same bookkeeping as set_dict_item_columns, bound to the module-level trackers.
    set_dict_item_columns(set_columns, list_df_columns, _dict, name, v)
def set_dict_item_columns(set_columns1, list_df_columns1, _dict, name, v):
    """Store a sanitised value in a row dict and register its column.

    Works like ``set_dict_item`` but records the column in the trackers
    supplied by the caller rather than in module-level state.

    Args:
        set_columns1: set of column names already registered (mutated).
        list_df_columns1: ordered list of registered column names (mutated).
        _dict: row dictionary to update in place.
        name: column header for the value.
        v: raw value to store.
    """
    _dict[name] = getLegal_str(v)
    if name in set_columns1:
        # Column already known; only the value needed updating.
        return
    set_columns1.add(name)
    # NOTE(review): the dict key is the raw name while the column list stores
    # getLegal_str(name); this assumes the two are equal for header names.
    list_df_columns1.append(getLegal_str(name))
def getTenderee(contacts):
    """Pick the owner ("业主单位") contact out of a JSON-encoded contact list.

    Selection order:
      1. the first owner entry whose ``cellphone`` is non-empty and whose last
         space-separated token is ``1`` followed by ten digits -- its
         ``cellphone`` is returned as the phone;
      2. otherwise the first owner entry -- its ``phone`` is returned;
      3. otherwise three empty strings.

    Args:
        contacts: JSON text encoding a list of contact dicts. ``None``, an
            empty string and JSON ``null`` are treated as "no contacts".

    Returns:
        Tuple ``(company_name, contact_name, phone)``. Fields missing from
        the chosen entry come back as ``None``.

    Raises:
        ValueError: ``contacts`` is non-empty but not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    if not contacts:
        return "", "", ""

    # Filter once; non-dict entries are skipped instead of crashing on .get().
    owners = [
        _contact
        for _contact in (json.loads(contacts) or [])
        if isinstance(_contact, dict) and _contact.get("type") == "业主单位"
    ]

    # Prefer an owner that carries a mobile-looking number.
    for _contact in owners:
        _phone = _contact.get("cellphone")
        if _phone and re.search(r"^1\d{10}$", _phone.split(" ")[-1]) is not None:
            return _contact.get("company_name"), _contact.get("contact_name"), _phone

    # Fall back to the first owner's landline field.
    if owners:
        _contact = owners[0]
        return _contact.get("company_name"), _contact.get("contact_name"), _contact.get("phone")

    return "", "", ""
def exportDocument_by_pagetime():
    """Export keyword-matched rows of ``designed_project`` to an Excel file.

    Queries the ``designed_project`` table through ``designed_project_index``
    for rows whose ``page_time`` falls in the range 2020-01-01 .. 2022-01-01
    (both bounds inclusive flags set) combined with a should-clause of
    phrase matches on ``full_text`` for each configured keyword. Results are
    fetched 100 rows per request, following ``next_token`` until exhausted,
    and written to ``../data/<timestamp>_数据导出.xlsx`` with the columns
    registered through ``set_dict_item``.

    Progress is printed to stdout. Nothing is returned.
    """
    ots_client = getConnect_ots()

    columns = ["page_time","contacts","covered_area","floor_space","progress","project_address","project_description","project_investment","project_name"]

    # (Excel header, source field) pairs that are copied over unchanged.
    # NOTE(review): "层数" is fed from floor_space and "建筑面积" from
    # covered_area, exactly as before -- confirm this mapping is intended.
    plain_fields = [
        ("发布时间", "page_time"),
        ("建筑面积", "covered_area"),
        ("层数", "floor_space"),
        ("阶段", "progress"),
        ("项目地址", "project_address"),
        ("简介", "project_description"),
        ("项目投资", "project_investment"),
    ]

    def getData(df_data, rows):
        """Append one output record per fetched row to the column lists in df_data."""
        for _dict in getRow_ots(rows):
            item = {}
            # A present-but-null contacts value must not reach json.loads.
            tenderee, contact_name, contact_phone = getTenderee(_dict.get("contacts") or "[]")
            set_dict_item(item, "业主", tenderee)
            set_dict_item(item, "业主联系人", contact_name)
            set_dict_item(item, "业主联系电话", contact_phone)
            for header, field in plain_fields:
                set_dict_item(item, header, _dict.get(field, ""))

            # "%d" raises TypeError on a missing id, so emit an empty link instead.
            project_id = _dict.get("id")
            if project_id is None or project_id == "":
                url = ""
            else:
                url = "http://www.bidizhaobiao.com/nzjxm-%d.html" % project_id
            set_dict_item(item, "地址", url)

            for k, v in item.items():
                df_data.setdefault(k, []).append(v)

    list_province = ["全国"]
    for _province in list_province:
        df_data = {}

        str_keywords = '''
        医院 养老院 疗养院 老人院
        '''
        list_keyword = []
        list_should_keyword = []
        # Keywords may be separated by whitespace, "、", an ASCII or
        # full-width comma, or "/".
        for _p in re.split(r"\s|、|,|,|/", str_keywords):
            if _p.strip() == "":
                continue
            list_keyword.append(_p)
            print(_p)
            list_should_keyword.append(MatchPhraseQuery('full_text', _p.strip()))

        should_q_keyword = BoolQuery(should_queries=list_should_keyword)
        bool_query = BoolQuery(must_queries=[
            RangeQuery("page_time", "2020-01-01", "2022-01-01", True, True),
            should_q_keyword,
        ])

        # First page: the sort order is supplied here only; follow-up
        # requests continue from next_token.
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "designed_project", "designed_project_index",
            SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("id", SortOrder.ASC)]), limit=100, get_total_count=True),
            ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
        _count = len(rows)
        print(list_keyword)
        print("total_count:%d" % total_count)
        getData(df_data, rows)

        while next_token:
            print("%d/%d" % (_count, total_count))
            rows, next_token, total_count, is_all_succeed = ots_client.search(
                "designed_project", "designed_project_index",
                SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
            getData(df_data, rows)
            _count += len(rows)

        df1 = pd.DataFrame(df_data)
        df1.to_excel("../data/%s_数据导出.xlsx" % (getCurrent_date('%Y-%m-%d_%H%M%S')), columns=list_df_columns)
# Run the export only when this file is executed as a script, not on import.
if __name__=="__main__":
    exportDocument_by_pagetime()
|