exportProject.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607
  1. #encoding:GBK
  2. import sys
  3. import os
  4. sys.path.append("../")
  5. import pandas as pd
  6. from dataSource.source import *
  7. import json
  8. from utils.multiThread import MultiThreadHandler
  9. import queue
  10. from utils.Utils import *
  11. from dataSource.pool import ConnectorPool
  12. import re
  13. from tablestore import *
  14. import traceback
  15. from utils.hashUtil import aesCipher
  16. from export.exportEnterprise import getDictEnterprise,getOneContact
  17. from export.exportUtils import generateBoolShouldQuery
  18. from queue import Queue
# Relative path of the data directory used by this script.
data_path = "../data/"
# Column names already registered through set_dict_item (fast membership test).
set_columns = set()
# The same column names in first-seen order; passed as the Excel column order
# by exportProject_by_pagetime.
list_df_columns = []
  22. def set_dict_item(_dict,name,v):
  23. _dict[name] = getLegal_str(v)
  24. if name not in set_columns:
  25. set_columns.add(name)
  26. list_df_columns.append(getLegal_str(name))
  27. def getDict_docchannel():
  28. conn = getConnection_mysql()
  29. cursor = conn.cursor()
  30. sql = "select channel_id,chnlname from sys_channel "
  31. cursor.execute(sql)
  32. rows = cursor.fetchall()
  33. _dict = dict()
  34. for row in rows:
  35. _dict[row[0]] = row[1]
  36. return _dict
  37. def exportProject_by_pagetime():
  38. # filename = "../data/重复公告.xlsx"
  39. # df = pd.read_excel(filename)
  40. ots_client = getConnect_ots()
  41. set_enter = set()
  42. str_enter = '''
  43. 成都四方伟业软件股份有限公司
  44. 北京数字冰雹信息技术有限公司
  45. 北京睿呈时代信息科技有限公司
  46. 北京五一视界数字孪生科技股份有限公司
  47. 易达云图(深圳)科技有限公司
  48. 北京优锘科技有限公司
  49. 深圳市鸿普森科技股份有限公司
  50. 厦门图扑软件科技有限公司
  51. 四川相数科技有限公司
  52. '''
  53. for a in re.split("\s+",str_enter):
  54. if a.strip()!="":
  55. set_enter.add(a.strip())
  56. columns = ["docids","doctitle","docchannel","bidway","province","city","district","info_type","page_time","crtime","project_code","tenderee","project_name","agency","sub_docs_json","tenderee_contact","tenderee_phone","doctextcon","product","moneysource","win_bid_price","win_tenderer","bidding_budget"]
  57. columns = ["page_time","province","city","win_tenderer"]
  58. dict_channel = getDict_docchannel()
  59. def getData(df_data,rows,set_line):
  60. list_data = getRow_ots(rows)
  61. for row in list_data:
  62. item = {}
  63. _dict = row
  64. set_dict_item(item,"docids",_dict.get("docids",""))
  65. set_dict_item(item,"项目名称",_dict.get("project_name",""))
  66. set_dict_item(item,"项目编号",_dict.get("project_code",""))
  67. # set_dict_item(item,"公告标题",_dict.get("doctitle",""))
  68. # set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
  69. set_dict_item(item,"省份",_dict.get("province",""))
  70. # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
  71. set_dict_item(item,"城市",_dict.get("city",""))
  72. set_dict_item(item,"发布时间",_dict.get("page_time",""))
  73. set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
  74. set_dict_item(item,"招标单位",_dict.get("tenderee",""))
  75. set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
  76. set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
  77. set_dict_item(item,"代理单位",_dict.get("agency",""))
  78. set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
  79. set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
  80. # set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
  81. set_dict_item(item,"招标金额",_dict.get("bidding_budget",""))
  82. set_dict_item(item,"中标金额",_dict.get("win_bid_price",""))
  83. set_dict_item(item,"中标单位",_dict.get("win_tenderer",""))
  84. sub_docs_json = _dict.get("sub_docs_json")
  85. if sub_docs_json is not None:
  86. for _doc in json.loads(sub_docs_json):
  87. if "win_tenderer" in _doc:
  88. set_dict_item(item,"中标单位",_doc["win_tenderer"])
  89. if "win_tenderee_manager" in _doc:
  90. set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
  91. if "win_tenderee_phone" in _doc:
  92. set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
  93. if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
  94. set_dict_item(item,"中标金额",_doc["win_bid_price"])
  95. if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
  96. set_dict_item(item,"招标金额",_doc["bidding_budget"])
  97. if "招标金额" not in item:
  98. set_dict_item(item,"招标金额","")
  99. if "中标金额" not in item:
  100. set_dict_item(item,"中标金额","")
  101. if "中标单位" not in item:
  102. set_dict_item(item,"中标单位","")
  103. if "中标单位联系人" not in item:
  104. set_dict_item(item,"中标单位联系人","")
  105. if "中标单位联系电话" not in item:
  106. set_dict_item(item,"中标单位联系电话","")
  107. # if item["中标单位"] not in set_enter:
  108. # continue
  109. _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
  110. # if _line in set_line:
  111. # continue
  112. # if item["招标金额"]=="":
  113. # continue
  114. # set_line.add(_line)
  115. for k,v in item.items():
  116. if k not in df_data:
  117. df_data[k] = []
  118. df_data[k].append(v)
  119. # list_province = ["江西","湖南","四川","安徽"]
  120. list_province = ["全国"]
  121. for _province in list_province:
  122. df_data = {}
  123. str_p = '''
  124. 家具
  125. '''
  126. # str_p = '''
  127. # 教育信息化 教学设备 智慧校园 互联网教育
  128. # '''
  129. list_prov = re.split("\s|、",str_p)
  130. list_mu = []
  131. for _p in list_prov:
  132. if _p.strip()=="":
  133. continue
  134. print(_p)
  135. list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))
  136. s_tenderee = '教育局、中学、小学'
  137. list_should_ten = []
  138. for _p in re.split("、",s_tenderee):
  139. if _p.split()=="":
  140. continue
  141. list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))
  142. # list_should_ten.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))
  143. list_should_chan = []
  144. list_should_chan.append(TermQuery("docchannel",101))
  145. # list_should_chan.append(TermQuery("docchannel",101))
  146. # list_should_chan.append(TermQuery("docchannel",102))
  147. should_q1 = BoolQuery(should_queries=list_mu)
  148. should_q2 = BoolQuery(should_queries=list_should_ten)
  149. should_q3 = BoolQuery(should_queries=list_should_chan)
  150. bool_query = BoolQuery(must_queries=[
  151. generateBoolShouldQuery(["doctextcon"],["家具"],MatchPhraseQuery),
  152. generateBoolShouldQuery(["province"],["广东","安徽","江苏","浙江","四川","北京"],TermQuery),
  153. WildcardQuery("win_tenderer","*"),
  154. ])
  155. table_name = "project2"
  156. rows, next_token, total_count, is_all_succeed = ots_client.search(table_name, "%s_index"%table_name,
  157. SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("page_time",SortOrder.ASC)]), limit=100, get_total_count=True),
  158. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  159. print(total_count)
  160. set_line = set()
  161. _count = len(rows)
  162. getData(df_data,rows,set_line)
  163. while next_token:
  164. print("%d/%d"%(_count,total_count))
  165. rows, next_token, total_count, is_all_succeed = ots_client.search(table_name, "%s_index"%table_name,
  166. SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
  167. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  168. getData(df_data,rows,set_line)
  169. _count += len(rows)
  170. # if len(df_data[list(df_data.keys())[0]])>=300:
  171. # break
  172. set_enterprise = set()
  173. for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
  174. set_enterprise.add(_tenderee)
  175. set_enterprise.add(_agency)
  176. set_enterprise.add(_win_tenderer)
  177. if "" in set_enterprise:
  178. set_enterprise.remove("")
  179. if None in set_enterprise:
  180. set_enterprise.remove(None)
  181. # dict_enterprise = getDictEnterprise(list(set_enterprise))
  182. # if len(set_enterprise)>0:
  183. # for _i in range(len(df_data["招标单位"])):
  184. # _enterprise_name = df_data["招标单位"][_i]
  185. # if df_data["招标联系人电话"][_i]=="":
  186. # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
  187. # if contacts is not None:
  188. # _person,_phone = getOneContact(contacts)
  189. # df_data["招标联系人"][_i] = _person
  190. # df_data["招标联系人电话"][_i] = _phone
  191. #
  192. # _enterprise_name = df_data["代理单位"][_i]
  193. # if df_data["代理联系人电话"][_i]=="":
  194. # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
  195. # if contacts is not None:
  196. # _person,_phone = getOneContact(contacts)
  197. # df_data["代理联系人"][_i] = _person
  198. # df_data["代理联系人电话"][_i] = _phone
  199. #
  200. # _enterprise_name = df_data["中标单位"][_i]
  201. # if df_data["中标单位联系电话"][_i]=="":
  202. # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
  203. # if contacts is not None:
  204. # _person,_phone = getOneContact(contacts)
  205. # df_data["中标单位联系人"][_i] = _person
  206. # df_data["中标单位联系电话"][_i] = _phone
  207. # print(df_data)
  208. df1 = pd.DataFrame(df_data)
  209. df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
  210. def exportProjectWithOneDocid():
  211. ots_client = getConnect_ots()
  212. list_data = []
  213. bool_query = BoolQuery(must_queries=[TermQuery("page_time","2021-05-28")])
  214. columns = ["docids","project_name"]
  215. rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
  216. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.ASC)]),get_total_count=True,limit=100),
  217. columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
  218. list_dict = getRow_ots(rows)
  219. for _dict in list_dict:
  220. if len(_dict["docids"].split(","))==1:
  221. list_data.append(_dict)
  222. _count = len(list_dict)
  223. while True:
  224. if not next_token:
  225. break
  226. rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
  227. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  228. columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
  229. list_dict = getRow_ots(rows)
  230. _count += len(list_dict)
  231. print("%d/%d"%(_count,total_count))
  232. for _dict in list_dict:
  233. if len(_dict["docids"].split(","))==1:
  234. list_data.append(_dict)
  235. _index = 0
  236. task_queue = queue.Queue()
  237. for _dict in list_data:
  238. task_queue.put(_dict)
  239. def _handle(_dict,result_queue):
  240. docid = _dict["docids"]
  241. project_name = _dict["project_name"]
  242. _dict["candidate"] = []
  243. _dict["total_count"] = 0
  244. if len(project_name)>0:
  245. doc_query = BoolQuery(must_queries=[MatchPhraseQuery("doctextcon",project_name)
  246. ,RangeQuery("status",201,300,True,True)],
  247. must_not_queries=[TermQuery("docid",docid)])
  248. rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
  249. SearchQuery(doc_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=10,get_total_count=True),
  250. columns_to_get=ColumnsToGet(["doctitle"],ColumnReturnType.SPECIFIED))
  251. l_d = getRow_ots(rows)
  252. for _d in l_d:
  253. _dict["candidate"].append(_d["docid"])
  254. _dict["total_count"] = total_count
  255. mt = MultiThreadHandler(task_queue,_handle,None,30)
  256. mt.run()
  257. df_data = {}
  258. for _d in list_data:
  259. for k,v in _d.items():
  260. if k not in df_data:
  261. df_data[k] = []
  262. df_data[k].append(v)
  263. df = pd.DataFrame(df_data)
  264. df.to_excel("../data/%s_未合并.xlsx"%(getCurrent_date("%Y-%m-%d %H%M%S")))
  265. def getPayStaffName():
  266. conn = getConnection_mysql()
  267. cursor = conn.cursor()
  268. sql = " select company,userid,phone,contactname,aftermarket from bxkc.b2c_mall_staff_basic_info where MEMBERLEVELID is not null and MEMBERLEVELID <> 81"
  269. cursor.execute(sql)
  270. rows = cursor.fetchall()
  271. dict_staff = {}
  272. for row in rows:
  273. company,userid,phone,contactname,aftermarket = row
  274. if company is not None:
  275. dict_staff[company] = {"userid":userid,"phone":phone,"contactname":contactname,"aftermarket":aftermarket}
  276. return dict_staff
  277. def exportCompanyByCycleProduct():
  278. filename = "../data/周期项目识别.csv"
  279. df = pd.read_csv(filename,encoding='gbk')
  280. task_queue = queue.Queue()
  281. result_queue = queue.Queue()
  282. pool_conn = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_mysql)
  283. _count = 0
  284. for tenderee,product,last_time,avg_period,min_period,max_period,json_docid in zip(df["tenderee"],df["product"],df["last_time"],df["avg_period"],df["min_period"],df["max_period"],df["json_docid"]):
  285. _dict = {"tenderee":tenderee,"product":product,"last_time":last_time,"avg_period":avg_period,"min_period":min_period,
  286. "max_period":max_period,"json_docid":json_docid}
  287. task_queue.put(_dict)
  288. _count += 1
  289. sstr_staff = getPayStaffName()
  290. ots_client = getConnect_ots()
  291. def _comsumer(_dict,result_queue,ots_client,sstr_staff,pool_conn):
  292. new_dict = {"招标人":_dict["tenderee"],"产品":_dict["product"],"上次招标":_dict["last_time"],
  293. "预计招标范围":"%s-%s"%(timeAdd(_dict["last_time"],_dict["min_period"]),timeAdd(_dict["last_time"],_dict["max_period"])),
  294. "周期":_dict["avg_period"],"历史招标":_dict["json_docid"]}
  295. aint_docid = json.loads(_dict["json_docid"])
  296. aobj_should_q_docid = []
  297. consumed, return_row, next_token = ots_client.get_row("enterprise",[("name",_dict["tenderee"])], ["contacts"], None, 1)
  298. dict_tmp = getRow_ots_primary(return_row)
  299. contacts = dict_tmp.get("contacts")
  300. phone_person,phone_no = getOneContact(contacts)
  301. new_dict["招标人联系人"] = phone_person
  302. new_dict["招标人联系电话"] = phone_no
  303. for int_docid in aint_docid:
  304. aobj_should_q_docid.append(TermQuery("docids",int_docid))
  305. bool_query = BoolQuery(should_queries=aobj_should_q_docid)
  306. columns = ['win_tenderer','second_tenderer','third_tenderer']
  307. rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
  308. SearchQuery(bool_query,limit=100,get_total_count=True),
  309. ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
  310. adict_rows = getRow_ots(rows)
  311. for dict_row in adict_rows:
  312. for _k,_company in dict_row.items():
  313. if _k in columns and _company is not None and _company!="":
  314. _succeed = True
  315. new_dict1 = {}
  316. for k,v in new_dict.items():
  317. new_dict1[k] = v
  318. new_dict1["潜在客户"] = _company
  319. consumed, return_row, next_token = ots_client.get_row("enterprise",[("name",_company)], ["contacts"], None, 1)
  320. dict_tmp = getRow_ots_primary(return_row)
  321. contacts = dict_tmp.get("contacts")
  322. phone_person,phone_no = getOneContact(contacts)
  323. new_dict1["潜在客户联系人"] = phone_person
  324. new_dict1["潜在客户联系电话"] = phone_no
  325. if _company in sstr_staff:
  326. company_info = sstr_staff[_company]
  327. new_dict1["付费客户"] = "是"
  328. conn = pool_conn.getConnector()
  329. try:
  330. cursor = conn.cursor()
  331. sql = " select name from bxkc.b2c_mall_staff_basic_info where userid='%s'"%(company_info.get("aftermarket",""))
  332. cursor.execute(sql)
  333. rows = cursor.fetchall()
  334. if len(rows)>0:
  335. new_dict1["归属客服"] = rows[0][0]
  336. else:
  337. new_dict1["归属客服"] = ""
  338. new_dict1["付费客户联系人"] = company_info.get("contactname","")
  339. new_dict1["付费客户电话"] = company_info.get("phone","")
  340. sql = " select date_FORMAT(etiem,\'%Y-%m-%d\') from bxkc.bxkc_member_term where userid='"+company_info.get("userid","")+"' and memberlevelid<>81 order by etiem desc limit 1"
  341. cursor.execute(sql)
  342. rows = cursor.fetchall()
  343. if len(rows)>0:
  344. etime = rows[0][0]
  345. new_dict1["付费客户到期日"] = etime
  346. if time.mktime(time.strptime(etime,"%Y-%m-%d"))>time.mktime(time.localtime()):
  347. new_dict1["付费客户到期"] = "否"
  348. else:
  349. new_dict1["付费客户到期"] = "是"
  350. else:
  351. new_dict1["付费客户到期日"] = ""
  352. new_dict1["付费客户到期"] = ""
  353. except Exception as e:
  354. traceback.print_exc()
  355. _succeed = False
  356. finally:
  357. pool_conn.putConnector(conn)
  358. else:
  359. new_dict1["付费客户"] = "否"
  360. new_dict1["归属客服"] = ""
  361. new_dict1["付费客户联系人"] = ""
  362. new_dict1["付费客户电话"] = ""
  363. new_dict1["付费客户到期日"] = ""
  364. new_dict1["付费客户到期"] = ""
  365. if _succeed:
  366. result_queue.put(new_dict1)
  367. mt = MultiThreadHandler(task_queue,_comsumer,result_queue,ots_client=ots_client,sstr_staff=sstr_staff,pool_conn=pool_conn,thread_count=30)
  368. mt.run()
  369. df_data = {}
  370. set_staff = set()
  371. while True:
  372. try:
  373. _dict = result_queue.get(timeout=1)
  374. tenderee = _dict.get("招标人","")
  375. product = _dict.get("产品","")
  376. staff = _dict.get("潜在客户","")
  377. _s = "%s-%s-%s"%(tenderee,product,staff)
  378. if _s in set_staff:
  379. continue
  380. set_staff.add(_s)
  381. for k,v in _dict.items():
  382. if k not in df_data:
  383. df_data[k] = []
  384. df_data[k].append(v)
  385. except Exception as e:
  386. break
  387. df1 = pd.DataFrame(df_data)
  388. df1.to_excel("../data/%s_周期项目.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
  389. def appendCellphones():
  390. file = "../data/"
  391. from export.exportUtils import *
  392. import jieba
  393. def export_industry_keywords_by_enterprise(list_enterprise):
  394. task_queue = Queue()
  395. result_queue = Queue()
  396. list_query = []
  397. for _enterprise in list_enterprise:
  398. _query = BoolQuery(must_queries=[
  399. TermQuery("win_tenderer",_enterprise),
  400. RangeQuery("status",201,301)
  401. ])
  402. list_query.append({"query":_query,"limit":1000})
  403. list_data = getDocument(list_query,columns=["docid","doctitles","project_name","product","win_tenderer"],
  404. table_name="project2",
  405. table_index="project2_index")
  406. dict_keywords = {}
  407. dict_keywords_product = {}
  408. dict_keywords_product_count = {}
  409. for _data in list_data:
  410. doctitles = _data.get("doctitles","")
  411. project_name = _data.get("project_name","")
  412. product = _data.get("product","")
  413. for _keyword in doctitles.split(","):
  414. for _word in jieba.cut(_keyword):
  415. if _word in dict_keywords:
  416. dict_keywords[_word] += 1
  417. else:
  418. dict_keywords[_word] = 1
  419. for _keyword in project_name.split(","):
  420. for _word in jieba.cut(_keyword):
  421. if _word in dict_keywords:
  422. dict_keywords[_word] += 1
  423. else:
  424. dict_keywords[_word] = 1
  425. for _keyword in product.split(","):
  426. if _keyword in dict_keywords_product_count:
  427. dict_keywords_product_count[_keyword] += 1
  428. else:
  429. dict_keywords_product_count[_keyword] = 1
  430. for _word in jieba.cut(_keyword):
  431. if _word in dict_keywords:
  432. dict_keywords[_word] += 1
  433. else:
  434. dict_keywords[_word] = 1
  435. if _word in dict_keywords_product:
  436. dict_keywords_product[_word] += 1
  437. else:
  438. dict_keywords_product[_word] = 1
  439. list_keywords = []
  440. for _keyword,count in dict_keywords.items():
  441. list_keywords.append([_keyword,count])
  442. list_keywords.sort(key=lambda x:x[1],reverse=True)
  443. list_keywords.insert(0,["关键词","数量"])
  444. list_keywords = list_keywords[:10000]
  445. list_keywords_product = []
  446. for _keyword,count in dict_keywords_product.items():
  447. list_keywords_product.append([_keyword,count])
  448. list_keywords_product.sort(key=lambda x:x[1],reverse=True)
  449. list_keywords_product.insert(0,["关键词","数量"])
  450. list_keywords_product = list_keywords_product[:10000]
  451. list_keywords_product_count = []
  452. for _keyword,count in dict_keywords_product_count.items():
  453. list_keywords_product_count.append([_keyword,count])
  454. list_keywords_product_count.sort(key=lambda x:x[1],reverse=True)
  455. list_keywords_product_count.insert(0,["关键词","数量"])
  456. list_keywords_product_count = list_keywords_product_count[:10000]
  457. filename = "../data/%s_行业关键词.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S"))
  458. with pd.ExcelWriter(filename) as writer:
  459. df_1 = pd.DataFrame(list_data)
  460. df_1.to_excel(writer,sheet_name="项目数据")
  461. df_data = pd.DataFrame(list_keywords)
  462. df_data.to_excel(writer,sheet_name="标题项目名称产品词频统计")
  463. df_data = pd.DataFrame(list_keywords_product)
  464. df_data.to_excel(writer,sheet_name="产品词频统计")
  465. df_data = pd.DataFrame(list_keywords_product_count)
  466. df_data.to_excel(writer,sheet_name="产品项目词频统计")
  467. def turn_structure():
  468. filename = r'G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2025-06\食堂食材行业关键词.xlsx'
  469. df = pd.read_excel(filename)
  470. list_product_exclude = df["产品排除词"]
  471. list_title_exclude = df["标题排除词"]
  472. list_keywords = df["标题+正文关键词"]
  473. list_title_exclude = [a for a in list_title_exclude if isinstance(a,str)]
  474. list_keywords = [a for a in list_keywords if isinstance(a,str)]
  475. list_product_exclude = [a for a in list_product_exclude if isinstance(a,str)]
  476. list_data = [["行业","全文关键词","全文排除词","标题排除词","产品排除词"]]
  477. list_data.append(["食堂食材","、".join(list_keywords),"","、".join(list_title_exclude),"、".join(list_product_exclude)])
  478. df1 = pd.DataFrame(list_data)
  479. df1.to_excel("../data/%s_行业关键词.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")),sheet_name="行业关键词")
# Script entry point.  Exactly one export is enabled at a time; the others are
# kept commented out so they can be switched on by hand.
if __name__=="__main__":
    # exportProject_by_pagetime()
    # exportProjectWithOneDocid()
    # exportCompanyByCycleProduct()
    turn_structure()
    # Disabled: keyword export for a fixed list of property-management
    # companies (one company name per line).
    # list_enterprise = splitIntoList('''
    # 明喆集团股份有限公司
    # 招商积余物业管理有限公司
    # 广州粤华物业有限公司
    # 广州广电城市服务集团股份有限公司
    # 绿城物业服务集团有限公司
    # 龙城城市运营服务集团有限公司
    # 深业物业运营集团股份有限公司
    # 广东宏德科技物业有限公司
    # 保利物业服务股份有限公司
    # 新大正物业集团股份有限公司
    # 山东明德物业管理集团有限公司
    # 深圳市金地物业管理有限公司
    # 上海复欣物业管理发展有限公司
    # 招商局物业管理有限公司
    # 东吴服务产业集团(江苏)有限公司
    # 碧桂园生活服务集团股份有限公司
    # 天津市金玉物业管理有限公司
    # 润加物业服务(深圳)有限公司
    # 山东宏泰物业发展有限公司
    # 爱玛客服务产业(中国)有限公司
    # 中海物业管理有限公司
    # 浙江亚太酒店物业服务有限公司
    # 深圳万物商企物业服务有限公司
    # 天津天孚物业管理有限公司
    # 上海益中亘泰(集团)股份有限公司
    # 上海申勤物业管理服务有限公司
    # 广东公诚设备资产服务有限公司
    # 上海东湖物业管理有限公司
    # 天津市赛驰物业服务有限公司
    # 安徽省长城物业管理有限公司
    # 深圳市万科物业服务有限公司
    # 上海生乐物业管理有限公司
    # 金科智慧服务集团股份有限公司
    # 山东润华物业管理有限公司
    # 国药诺达物业服务有限公司
    # 深业置地(深圳)物业管理有限公司
    # 广东华信服务集团有限公司
    # 天津玉龙源物业管理服务股份有限公司
    # 广州珠江城市管理服务集团股份有限公司
    # 中航物业管理有限公司
    # 深圳市广美隆物业清洁服务有限公司
    # 浙江浙大新宇物业集团有限公司
    # 金融街物业股份有限公司
    # 天津峥嵘物业管理有限公司
    # 新疆德泰保安服务有限公司
    # 北京住总北宇物业服务有限责任公司
    # 广东泰科物业管理有限公司
    # 德州市陵城区人才发展集团有限公司
    # 乌鲁木齐阳光管道物业服务有限公司
    # 深业物业集团有限公司
    # ''',_splitkeys="\n|\s")
    # export_industry_keywords_by_enterprise(list_enterprise)