exportDesigned.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. #encoding:GBK
  2. import sys
  3. import os
  4. sys.path.append("../")
  5. import pandas as pd
  6. from dataSource.source import *
  7. import json
  8. from utils.multiThread import MultiThreadHandler
  9. import queue
  10. from utils.Utils import *
  11. from dataSource.pool import ConnectorPool
  12. import re
  13. from tablestore import *
  14. import traceback
  15. from utils.hashUtil import aesCipher
  16. from export.exportEnterprise import getDictEnterprise,getOneContact
  17. set_columns = set()
  18. list_df_columns = []
  19. def set_dict_item(_dict,name,v):
  20. _dict[name] = getLegal_str(v)
  21. if name not in set_columns:
  22. set_columns.add(name)
  23. list_df_columns.append(getLegal_str(name))
  24. def set_dict_item_columns(set_columns1,list_df_columns1,_dict,name,v):
  25. _dict[name] = getLegal_str(v)
  26. if name not in set_columns1:
  27. set_columns1.add(name)
  28. list_df_columns1.append(getLegal_str(name))
  29. def getTenderee(contacts):
  30. list_contacts = json.loads(contacts)
  31. for _contact in list_contacts:
  32. if _contact.get("type")=="业主单位":
  33. _phone = _contact.get("cellphone")
  34. if _phone and re.search("^1\d{10}$",_phone.split(" ")[-1]) is not None:
  35. return _contact.get("company_name"),_contact.get("contact_name"),_contact.get("cellphone")
  36. for _contact in list_contacts:
  37. if _contact.get("type")=="业主单位":
  38. return _contact.get("company_name"),_contact.get("contact_name"),_contact.get("phone")
  39. return "","",""
  40. def exportDocument_by_pagetime():
  41. # filename = "../data/重复公告.xlsx"
  42. # df = pd.read_excel(filename)
  43. ots_client = getConnect_ots()
  44. set_enter = set()
  45. str_enter = '''
  46. '''
  47. for a in re.split("\s+",str_enter):
  48. if a.strip()!="":
  49. set_enter.add(a.strip())
  50. columns = ["doctitle","doctextcon","docchannel","product","province","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose"]
  51. columns = ["page_time","contacts","covered_area","floor_space","progress","project_address","project_description","project_investment","project_name"]
  52. def getData(df_data,rows,set_line,list_keyword):
  53. list_data = getRow_ots(rows)
  54. for row in list_data:
  55. item = {}
  56. _dict = row
  57. # set_dict_item(item,"",_dict.get("docid",""))
  58. tenderee,contact_name,contact_phone = getTenderee(_dict.get("contacts","[]"))
  59. set_dict_item(item,"业主",tenderee)
  60. set_dict_item(item,"业主联系人",contact_name)
  61. set_dict_item(item,"业主联系电话",contact_phone)
  62. set_dict_item(item,"发布时间",_dict.get("page_time",""))
  63. set_dict_item(item,"建筑面积",_dict.get("covered_area",""))
  64. set_dict_item(item,"层数",_dict.get("floor_space",""))
  65. set_dict_item(item,"阶段",_dict.get("progress",""))
  66. set_dict_item(item,"项目地址",_dict.get("project_address",""))
  67. set_dict_item(item,"简介",_dict.get("project_description",""))
  68. set_dict_item(item,"项目投资",_dict.get("project_investment",""))
  69. set_dict_item(item,"地址","http://www.bidizhaobiao.com/nzjxm-%d.html"%_dict.get("id",""))
  70. # # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
  71. # set_dict_item(item,"项目名称",_dict.get("project_name",""))
  72. # set_dict_item(item,"区县",_dict.get("district",""))
  73. # set_dict_item(item,"发布时间",_dict.get("page_time",""))
  74. # set_dict_item(item,"创建时间",_dict.get("crtime",""))
  75. #
  76. # set_dict_item(item,"行业一级分类",_dict.get("industry",""))
  77. # set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
  78. #
  79. # set_dict_item(item,"uuid",_dict.get("uuid"))
  80. #
  81. # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
  82. #
  83. # set_dict_item(item,"项目编号",_dict.get("project_code",""))
  84. # set_dict_item(item,"招标单位",_dict.get("tenderee",""))
  85. # set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
  86. # set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
  87. # set_dict_item(item,"代理单位",_dict.get("agency",""))
  88. # set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
  89. # set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
  90. # set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
  91. #
  92. # set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
  93. # sub_docs_json = _dict.get("sub_docs_json")
  94. # if sub_docs_json is not None:
  95. # for _doc in json.loads(sub_docs_json):
  96. # if "win_tenderer" in _doc:
  97. # set_dict_item(item,"中标单位",_doc["win_tenderer"])
  98. # if "win_tenderee_manager" in _doc:
  99. # set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
  100. # if "win_tenderee_phone" in _doc:
  101. # set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
  102. # if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
  103. # set_dict_item(item,"中标金额",_doc["win_bid_price"])
  104. # if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
  105. # set_dict_item(item,"招标金额",_doc["bidding_budget"])
  106. # if "招标金额" not in item:
  107. # set_dict_item(item,"招标金额","")
  108. # if "中标金额" not in item:
  109. # set_dict_item(item,"中标金额","")
  110. # if "中标单位" not in item:
  111. # set_dict_item(item,"中标单位","")
  112. #
  113. # if "中标单位联系人" not in item:
  114. # set_dict_item(item,"中标单位联系人","")
  115. # if "中标单位联系电话" not in item:
  116. # set_dict_item(item,"中标单位联系电话","")
  117. #
  118. # # if item["中标单位"] not in set_enter:
  119. # # continue
  120. #
  121. # _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
  122. # if _line in set_line:
  123. # continue
  124. # if item["招标金额"]=="":
  125. # continue
  126. # set_line.add(_line)
  127. for k,v in item.items():
  128. if k not in df_data:
  129. df_data[k] = []
  130. df_data[k].append(v)
  131. # list_province = ["江西","湖南","四川","安徽"]
  132. list_province = ["全国"]
  133. for _province in list_province:
  134. df_data = {}
  135. str_keywords = '''
  136. 医院 养老院 疗养院 老人院
  137. '''
  138. list_keyword = []
  139. list_should_keyword = []
  140. for _p in re.split("\s|、|,|,|/",str_keywords):
  141. if _p.strip()=="":
  142. continue
  143. list_keyword.append(_p)
  144. print(_p)
  145. list_should_keyword.append(MatchPhraseQuery('full_text', '%s'%_p.strip()))
  146. # list_should_keyword.append(Ma('attachmenttextcon','%s'%_p.strip()))
  147. s_tenderee = '酒店、地产'
  148. list_should_ten = []
  149. for _p in re.split("、",s_tenderee):
  150. if _p.strip()=="":
  151. continue
  152. list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))
  153. list_should_chan = []
  154. list_should_chan.append(TermQuery("docchannel",101))
  155. # list_should_chan.append(TermQuery("docchannel",101))
  156. # list_should_chan.append(TermQuery("docchannel",102))
  157. list_should_bidway = []
  158. s_bidway = "公开招标、邀请招标、竞争性谈判、竞争性磋商、询价采购、单一来源采购"
  159. for _b in re.split("、",s_bidway):
  160. if _b.strip()=="":
  161. continue
  162. list_should_bidway.append(MatchPhraseQuery("doctextcon",_b.strip()))
  163. str_area = '北京、天津'
  164. list_should_area = []
  165. for _p in str_area.split("、"):
  166. list_should_area.append(TermQuery("province",_p))
  167. must_not_q = []
  168. not_str = '校园电视台 虚拟演播室'
  169. for _s in not_str.split(" "):
  170. must_not_q.append(MatchPhraseQuery("doctextcon",_s))
  171. should_q_keywrod = BoolQuery(should_queries=list_should_keyword)
  172. bool_query = BoolQuery(must_queries=[
  173. RangeQuery("page_time","2020-01-01","2022-01-01",True,True)
  174. ,should_q_keywrod
  175. ]
  176. )
  177. rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
  178. SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True),
  179. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  180. set_line = set()
  181. _count = len(rows)
  182. print(list_keyword)
  183. print("total_count:%d"%total_count)
  184. getData(df_data,rows,set_line,list_keyword)
  185. while next_token:
  186. print("%d/%d"%(_count,total_count))
  187. rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
  188. SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
  189. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  190. getData(df_data,rows,set_line,list_keyword)
  191. _count += len(rows)
  192. # if len(df_data[list(df_data.keys())[0]])>=100:
  193. # break
  194. # list_df_columns.append('信用代码')
  195. # list_df_columns.append('原网地址')
  196. df1 = pd.DataFrame(df_data)
  197. df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
  198. if __name__=="__main__":
  199. exportDocument_by_pagetime()