exportDocument.py

# encoding: GBK
import sys
import os
sys.path.append("../")
import pandas as pd
from dataSource.source import *
import json
from utils.multiThread import MultiThreadHandler
import queue
from utils.Utils import *
from dataSource.pool import ConnectorPool
import re
from tablestore import *
import traceback
import time      # used by export_extract_check(); imported explicitly in case utils.Utils does not re-export it
import requests  # used by export_attachment(); same caveat as above
from utils.hashUtil import aesCipher
from export.exportEnterprice import getDictEnterprise,getOneContact

data_path = "../data/"

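# getCompanyTenderer: for each company listed in ../data/服务型客户.txt, count its
# winning-bid projects in neo4j, sum the winning prices, and tally the second/third
# tenderers of the same projects from mongo as likely competitors; the summary is
# written back to Excel next to the input file.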
def getCompanyTenderer():
    def _handle(item, result_queue):
        company = item
        dict_result = {"company": company, "count": 0, "competitor": "", "project_name": ""}
        graph = getConnect_neo4j()
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhao_biao_id,p.zhong_biao_id" % (company)
        finded = graph.run(cql)
        finded_ids = json.loads(json.dumps(finded.data()))
        dict_result["count"] = len(finded_ids)
        mongoDB = getConnect_mongodb()
        coll_zb = mongoDB.zhongbiao_extraction
        if len(finded_ids) > 0:
            cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.project_name limit 3" % (company)
            finded = graph.run(cql)
            finded_names = json.loads(json.dumps(finded.data()))
            list_names = [_i["p.project_name"] for _i in finded_names]
            dict_result["project_name"] = str(list_names)
            cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price" % (company)
            finded = graph.run(cql)
            finded_money = json.loads(json.dumps(finded.data()))
            whole_money = 0
            for item in finded_money:
                if item["r.price"] is not None:
                    whole_money += getUnifyMoney(item["r.price"])
            dict_result["whole_money"] = str(whole_money)
            dict_competitor = {}
            for item in finded_ids:
                docId = item["p.zhong_biao_id"]
                if docId is not None:
                    rows = coll_zb.find({"docId": docId})
                    for row in rows:
                        keys = ["second_tenderer", "third_tenderer"]
                        for _key in keys:
                            if _key in row:
                                if row[_key] not in dict_competitor:
                                    dict_competitor[row[_key]] = 0
                                dict_competitor[row[_key]] += 1
            list_competitor = []
            for _key in dict_competitor:
                list_competitor.append([_key, dict_competitor[_key]])
            list_competitor.sort(key=lambda x: x[1], reverse=True)
            list_competitors = [i[0] for i in list_competitor[:10]]
            dict_result["competitor"] = str(list_competitors)
        result_queue.put(dict_result)

    # filename = "成交客户匹配中标项目的需求.xlsx"
    # df = pd.read_excel(filename)
    # list_company = df["公司名字"]
    # company = list_company[0]
    list_company = []
    filename = "../data/服务型客户.txt"
    with open(filename, "r", encoding="GBK") as f:
        while True:
            line = f.readline()
            if not line:
                break
            list_company.append(line.strip())
    task_queue = queue.Queue()
    for company in list_company:
        task_queue.put(company)
    result_queue = queue.Queue()
    handler = MultiThreadHandler(task_queue, _handle, result_queue, thread_count=10)
    handler.run()
    list_company = []
    list_zb = []
    list_count = []
    list_project = []
    list_money = []
    list_competitor = []
    while True:
        try:
            _result = result_queue.get(False)
            list_company.append(_result.get("company", ""))
            list_zb.append("是" if _result.get("count", 0) > 0 else "否")
            list_count.append(_result.get("count", ""))
            list_project.append(_result.get("project_name", ""))
            list_money.append(_result.get("whole_money", ""))
            list_competitor.append(_result.get("competitor", ""))
        except queue.Empty:
            break
    df1 = pd.DataFrame({"公司名字": list_company, "是否中标": list_zb, "中标次数": list_count, "中标项目": list_project, "中标金额": list_money, "潜在竞争对手": list_competitor})
    df1.to_excel("%s_export.xls" % (filename), columns=["公司名字", "是否中标", "中标次数", "中标项目", "中标金额", "潜在竞争对手"])

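# export_count_includeKeyword: for each term in the 品目 column of ../data/jc001.xlsx,
# ask solr how many documents mention it inside the publishtime window and export the
# per-term counts (single-threaded; the *_multiThread variant below fans the same
# queries out over 20 worker threads).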
def export_count_includeKeyword():
    filename = "../data/jc001.xlsx"
    list_name = []
    list_count = []
    df = pd.read_excel(filename)
    _index = 0
    for row in df["品目"]:
        _name = row
        data = solrQuery("document", {"q": 'dochtmlcon:"%s"' % _name, "fq": '(publishtime:[2020-01-01T00:00:00Z TO 2020-08-12T23:59:59Z])', "fl": "city", "rows": 1})
        if data is not None:
            _count = data["response"]["numFound"]
        else:
            _count = 0
        list_name.append(_name)
        list_count.append(_count)
        _index += 1
        print(_index)
    df1 = pd.DataFrame({"品目": list_name, "数量": list_count})
    df1.to_excel("%s_export.xls" % filename)

def export_count_includeKeyword_multiThread():
    def _handler(item, result_queue):
        data = solrQuery("document", {"q": 'dochtmlcon:"%s"' % item, "fq": '(publishtime:[2020-01-01T00:00:00Z TO 2020-08-12T23:59:59Z])', "fl": "city", "rows": 1})
        if data is not None:
            _count = data["response"]["numFound"]
        else:
            _count = 0
        result_queue.put([item, _count])

    task_queue = queue.Queue()
    result_queue = queue.Queue()
    filename = "../data/jc001.xlsx"
    list_name = []
    list_count = []
    df = pd.read_excel(filename)
    _index = 0
    for row in df["品目"]:
        _name = row
        task_queue.put(_name)
        _index += 1
    multHandler = MultiThreadHandler(task_queue, _handler, result_queue, thread_count=20)
    multHandler.run()
    while True:
        try:
            item = result_queue.get(False)
            list_name.append(item[0])
            list_count.append(item[1])
        except queue.Empty:
            break
    df1 = pd.DataFrame({"品目": list_name, "数量": list_count})
    df1.to_excel("%s_export.xls" % filename)

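# exportKeywords: for each 品目 keyword, collect the matching document ids from solr,
# resolve each id to a project_name via mongo on 200 threads, then print both the raw
# hit count and the distinct-project count per keyword.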
def exportKeywords():
    def _handle(item, result_queue, pool_mongo):
        docId = item["docId"]
        mongo = pool_mongo.getConnector()
        zhongbiao = mongo.zhongbiao_extraction
        zhaobiao = mongo.zhaobiao_extraction
        _project = ""
        rows = zhaobiao.find({"docId": docId}, {"project_name": 1})
        find_flag = False
        for row in rows:
            find_flag = True
            _project = row.get("project_name", "")
        if not find_flag:
            rows = zhongbiao.find({"docId": docId}, {"project_name": 1})
            for row in rows:
                _project = row.get("project_name", "")
        item["project_name"] = _project
        pool_mongo.putConnector(mongo)
        result_queue.put(item)

    list_key = []
    dict_key_ids = dict()
    with open("../data/品目.txt", "r", encoding="utf8") as f:
        while True:
            row = f.readline()
            if not row:
                break
            row = row.strip()  # keep the trailing newline out of the key and the solr query
            list_key.append(row)
            dict_key_ids[row] = []
            data = solrQuery("document", {"q": 'dochtmlcon:"%s" AND dochtmlcon:"法院"' % row, "fq": '(publishtime:[2019-01-01T00:00:00Z TO 2019-12-31T23:59:59Z])', "fl": "id", "rows": 10000000})
            for item in data["response"]["docs"]:
                dict_key_ids[row].append(item["id"])
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for _key in dict_key_ids.keys():
        for item in dict_key_ids[_key]:
            task_queue.put({"docId": item, "project_name": ""})
    pool_mongo = ConnectorPool(init_num=10, max_num=200, method_init=getConnect_mongodb)
    mt = MultiThreadHandler(task_queue=task_queue, task_handler=_handle, result_queue=result_queue, thread_count=200, pool_mongo=pool_mongo)
    mt.run()
    dict_docId_projectname = {}
    while True:
        try:
            item = result_queue.get(False)
            dict_docId_projectname[item["docId"]] = item["project_name"]
        except queue.Empty:
            break
    dict_key_count = dict()
    for _key in dict_key_ids.keys():
        set_key = set()
        for docId in dict_key_ids[_key]:
            set_key.add(dict_docId_projectname.get(docId, ""))
        dict_key_count[_key] = len(set_key)
    print("==")
    for _key in list_key:
        print(len(dict_key_ids[_key]))
    print("==")
    for _key in list_key:
        print(dict_key_count[_key])
    print("==")

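# getIndustryCompany: enrich each company from ../data/同行客户匹配.xlsx with its
# enterprise profile from OTS (reg capital, legal person, industry, ...), up to ten
# same-industry peers from the enterprise index, and its winning-bid count from neo4j,
# then dump everything to Excel.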
def getIndustryCompany():
    def _handle(item, result_queue, pool_mongo, pool_neo4j, pool_mysql, pool_ots):
        # mongoDB = getConnect_mongodb()
        log(item["enterprise_name"])
        mongoDB = pool_mongo.getConnector()
        # coll_zb = mongoDB.enterprise_profile
        # rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1,"legalPersonName":1,"actualCapital":1, "regCapital":1,"estiblishTime":1,"socialStaffNum":1,"legal_person":1,"phone":1,"businessScope":1,"industry":1 })
        # for row in rows:
        #     item["regCapital"] = row.get("regCapital","")
        #     item["legal_person"] = row.get("legal_person","")
        #     item["phone"] = row.get("phone","")
        #     item["actualCapital"] = row.get("actualCapital","")
        #     item["industry"] = row.get("industry","")
        #     item["estiblishTime"] = row.get("estiblishTime","")
        #     item["socialStaffNum"] = row.get("socialStaffNum","")
        #     item["businessScope"] = row.get("businessScope","")
        # graph = getConnect_neo4j()
        ots_client = pool_ots.getConnector()
        primary_key = [('name', item["enterprise_name"])]
        columns_to_get = ["reg_capital", "legal_person", "phone", "actual_capital", "industry", "estiblishTime", "social_staff_num", "business_scope"]
        consumed, return_row, next_token = ots_client.get_row("enterprise", primary_key, columns_to_get, None, 1)
        if return_row is not None:
            for att in return_row.attribute_columns:
                item[att[0]] = att[1]
        list_same_industry_company = []
        if "industry" in item:
            bool_query = BoolQuery(must_queries=[TermQuery("industry", item["industry"])])
            col = ColumnsToGet(['enterprise_name'], ColumnReturnType.SPECIFIED)
            rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
                                                                              SearchQuery(bool_query, limit=10, get_total_count=True),
                                                                              col)
            for row in rows:
                for item1 in row[0]:
                    list_same_industry_company.append(item1[1])
        # if "industry" in item:
        #     rows = coll_zb.find({"industry":item["industry"]},{"enterprise_name":1}).limit(10)
        #     for row in rows:
        #         print(row)
        #         list_same_industry_company.append(row.get("enterprise_name",""))
        item["same_industry_company"] = list_same_industry_company
        graph = pool_neo4j.getConnector()
        company_name = item["enterprise_name"]
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c " % (company_name)
        finded = graph.run(cql)
        data = json.loads(json.dumps(finded.data()))
        _count = data[0]["_c"]
        # list_project = []
        # for _data in data:
        #     if _count<=3:
        #         if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
        #             if _data["project_name"] is not None:
        #                 list_project.append(_data["project_name"])
        #     _count += 1
        item["count"] = _count
        # item["project"] = str(list_project)
        result_queue.put(item)
        pool_mongo.putConnector(mongoDB)
        pool_neo4j.putConnector(graph)
        pool_ots.putConnector(ots_client)

    log_tofile("export.log")
    pool_mongo = ConnectorPool(init_num=10, max_num=30, method_init=getConnect_mongodb)
    pool_neo4j = ConnectorPool(init_num=10, max_num=30, method_init=getConnect_neo4j)
    pool_mysql = ConnectorPool(init_num=10, max_num=30, method_init=getConnection_mysql)
    pool_ots = ConnectorPool(init_num=10, max_num=30, method_init=getConnect_ots)
    # list_company = getCompanys()
    # filename = "".join(["环境","生态","再生","回收","环保"])
    list_company = []
    filename = "../data/同行客户匹配.xlsx"
    df = pd.read_excel(filename, sheet_name=0)
    for _com in df["公司名称"]:
        print(_com)
        if _com is not None and _com.strip() != "":
            list_company.append({"enterprise_name": _com})
    task_queue = queue.Queue()
    for item in list_company:
        task_queue.put(item)
    result_queue = queue.Queue()
    _muti = MultiThreadHandler(task_queue, _handle, result_queue, thread_count=30, pool_mongo=pool_mongo, pool_neo4j=pool_neo4j, pool_mysql=pool_mysql, pool_ots=pool_ots)
    _muti.run()
    df_company = {}
    set_key = set()
    if len(list_company) > 0:
        for item in list_company:
            for _key in item.keys():
                set_key.add(_key)
                if _key not in df_company:
                    df_company[_key] = []
        list_key = list(set_key)
        for item in list_company:
            for _key in list_key:
                df_company[_key].append(item.get(_key, ""))
    df1 = pd.DataFrame(df_company)
    df1.to_excel("%s_export.xlsx" % (filename))

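# exportWin_tenderer(time_from, time_to): page through all winning-bid announcements
# (docchannel 101) in the window, pull the winner and runner-ups out of sub_docs_json,
# enrich each winner with enterprise and contact details, and write
# ../data/{time_from}-{time_to}中标信息.xlsx; e.g. exportWin_tenderer("2021-01-14",
# "2021-01-15") produces the file that exportContact() below reads back.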
def exportWin_tenderer(time_from, time_to):
    '''
    :return:
    '''
    ost_client = getConnect_ots()
    last_docid = 0
    bool_query = BoolQuery(must_queries=[RangeQuery("page_time", time_from, time_to, include_lower=True, include_upper=True),
                                         TermQuery("docchannel", 101),
                                         RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
                                         RangeQuery('docid', last_docid, include_lower=False)])
    rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
                                                                      SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid", SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                      ColumnsToGet(["project_name", "sub_docs_json"], return_type=ColumnReturnType.SPECIFIED))
    list_project = []

    def _getRow(list_project, rows, last_docid):
        for row in rows:
            project_name = row[1][0][1]
            docid = row[0][1][1]
            last_docid = docid
            list_pack = json.loads(row[1][1][1])
            _set_tenderer = set()
            win_tenderer = ""
            for _pack in list_pack:
                if "win_tenderer" in _pack and win_tenderer == "":
                    win_tenderer = _pack["win_tenderer"]
                if "second_tenderer" in _pack:
                    _set_tenderer.add(_pack["second_tenderer"])
                if "third_tenderer" in _pack:
                    _set_tenderer.add(_pack["third_tenderer"])
            list_project.append({"docid": docid, "project_name": project_name, "win_tenderer": win_tenderer, "tenderer": list(_set_tenderer)})
        return last_docid

    _getRow(list_project, rows, last_docid)
    while next_token:
        print("%d/%d" % (len(list_project), total_count))
        rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                                                                          ColumnsToGet(["project_name", "sub_docs_json"], return_type=ColumnReturnType.SPECIFIED))
        last_docid = _getRow(list_project, rows, last_docid)
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for item in list_project:
        task_queue.put(item)
    pool_ots = ConnectorPool(init_num=10, max_num=30, method_init=getConnect_ots)

    def _handle(item, result_queue, pool_ots):
        if item["win_tenderer"] != "":
            ots_client = pool_ots.getConnector()
            consumed, return_row, next_token = ots_client.get_row("enterprise", [("name", item["win_tenderer"])], ["province", "reg_capital", "estiblish_time", "business_scope"], None, 1)
            _dict = dict()
            for _item in return_row.attribute_columns:
                _dict[_item[0]] = _item[1]
            for _key in _dict.keys():
                item[_key] = _dict[_key]
            data = solrQuery("contact", {"q": 'company_name:"%s"' % item["win_tenderer"], "fl": "contact_person,mobile_no,phone_no", "rows": 10})
            for _item in data["response"]["docs"]:
                for _key in _item.keys():
                    item[_key] = _item[_key]
                break
            pool_ots.putConnector(ots_client)

    mt = MultiThreadHandler(task_queue=task_queue, task_handler=_handle, result_queue=result_queue, thread_count=30, pool_ots=pool_ots)
    mt.run()
    keys = ["docid", "project_name", "win_tenderer", "tenderer", "province", "reg_capital", "business_scope", "estiblish_time", "contact_person", "mobile_no", "phone_no"]
    df_data = {}
    for _key in keys:
        df_data[_key] = []
    for item in list_project:
        for _key in keys:
            if _key in item:
                df_data[_key].append(item[_key])
            else:
                df_data[_key].append("")
    df = pd.DataFrame(df_data)
    df.to_excel("../data/%s-%s中标信息.xlsx" % (time_from, time_to), columns=keys)

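# exportContact: read the tenderer lists exported by exportWin_tenderer above, split
# the stringified python lists back into company names, and look up each company's
# contacts in solr; ILLEGAL_CHARACTERS_RE strips control characters that the Excel
# writer refuses to serialize.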
def exportContact():
    time_from = "2021-01-14"
    time_to = "2021-01-15"
    filename = "../data/%s-%s中标信息.xlsx" % (time_from, time_to)
    df1 = pd.read_excel(filename)
    set_company = set()
    for item in df1["tenderer"]:
        list_company = re.split(r"\['|', '|'\]|\[\]", item)
        for _company in list_company:
            if _company != "":
                set_company.add(_company)
    companys = list(set_company)
    task_queue = queue.Queue()
    list_company = []
    for _company in companys:
        item = {"company_name": _company}
        list_company.append(item)
        task_queue.put(item)
    result_queue = queue.Queue()

    def _handle(item, result_queue):
        company = item["company_name"]
        data = solrQuery("contact", {"q": 'company_name:"%s"' % company, "fl": "company_name,contact_person,mobile_no,phone_no", "rows": 10})
        for _item in data["response"]["docs"]:
            for _key in _item.keys():
                item[_key] = _item[_key]
            break

    mt = MultiThreadHandler(task_queue=task_queue, task_handler=_handle, result_queue=result_queue, thread_count=30)
    mt.run()
    keys = ["company_name", "contact_person", "mobile_no", "phone_no"]
    df_data = {}
    for _key in keys:
        df_data[_key] = []
    ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
    for item in list_company:
        for _key in keys:
            if _key in item:
                df_data[_key].append(ILLEGAL_CHARACTERS_RE.sub(r'', item[_key]))
            else:
                df_data[_key].append("")
    df = pd.DataFrame(df_data)
    df.to_excel("../data/%s-%s竞争对手信息.xlsx" % (time_from, time_to), columns=keys)

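# countKeyword / countKeyword_solr: two takes on the same statistic. The first streams
# raw html out of mysql and greps the visible text with BeautifulSoup+re; the second
# simply asks solr for numFound per keyword and channel, which is far cheaper.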
def countKeyword():
    conn = getConnection_mysql()
    cursor = conn.cursor()
    print(0)
    sql = "select dochtmlcon from sys_document_22 where docchannel=52 and page_time>='2020-09-01' and page_time<='2020-09-07'"
    cursor.execute(sql)
    print(0.1)
    df = pd.read_excel("万郡绿建细分关键词.xls")
    list_keywords = df["细分类别"]
    dict_keywords = dict()
    for _key in list_keywords:
        dict_keywords[_key] = 0
    print(1)
    from bs4 import BeautifulSoup
    while True:
        rows = cursor.fetchmany(10000)
        print("==")
        if not rows:
            break
        for row in rows:
            _html = BeautifulSoup(row[0], "lxml").getText()
            for _key in list_keywords:
                if re.search(_key, _html) is not None:
                    dict_keywords[_key] += 1
        print(dict_keywords)
    list_count = []
    for _key in list_keywords:
        list_count.append(dict_keywords[_key])
    df1 = pd.DataFrame({"关键字": list_keywords, "数量": list_count})
    df1.to_excel("关键词统计.xlsx")

def countKeyword_solr():
    def _handle(item, result_queue):
        keyword = item["keyword"]
        data = solrQuery("document", {"q": 'dochtmlcon:"%s" AND docchannel:101 AND dochtmlcon:"法院" ' % keyword, "fq": '(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])', "fl": "id", "rows": 10})
        _num = data["response"]["numFound"]
        item["zhongbiao"] = _num
        data = solrQuery("document", {"q": 'dochtmlcon:"%s" AND docchannel:52 AND dochtmlcon:"法院"' % keyword, "fq": '(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])', "fl": "id", "rows": 10})
        _num = data["response"]["numFound"]
        item["zhaobiao"] = _num
        result_queue.put(item)

    file = "../data/关键词11.xlsx"
    df = pd.read_excel(file)
    task_queue = queue.Queue()
    print(df.keys())
    for item in df["业务关键词"]:
        task_queue.put({"keyword": item, "zhaobiao": 0, "zhongbiao": 0})
    result_queue = queue.Queue()
    mt = MultiThreadHandler(task_queue=task_queue, task_handler=_handle, result_queue=result_queue, thread_count=10)
    mt.run()
    list_keyword = []
    list_zhaobiao = []
    list_zhongbiao = []
    while True:
        try:
            item = result_queue.get(False)
            list_keyword.append(item["keyword"])
            list_zhaobiao.append(item["zhaobiao"])
            list_zhongbiao.append(item["zhongbiao"])
        except queue.Empty:
            break
    df1 = pd.DataFrame({"业务关键词": list_keyword, "招标公告": list_zhaobiao, "中标信息": list_zhongbiao})
    df1.to_excel("%s_export.xlsx" % file, columns=["业务关键词", "招标公告", "中标信息"])

def query_from_solr():
    data = solrQuery("document", {"q": 'dochtmlcon:"法律" AND (docchannel:51 OR docchannel:104 OR docchannel:52 OR docchannel:102) AND province:"湖南" ', "fq": '(publishtime:[2020-01-01T00:00:00Z TO 2020-01-20T23:59:59Z])', "fl": "id", "rows": 10})
    _num = data["response"]["numFound"]
    print(_num)

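# export_province_keyword_count: per province, find winning-bid announcements whose
# titles match at least two of the logistics keywords below (while excluding tenderees
# that themselves contain a keyword), then flatten the hits into a per-document excel
# report via set_dict_item (list_df_columns is presumably maintained by set_dict_item
# elsewhere in this codebase).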
def export_province_keyword_count():
    def _handle(item, result_queue, pool_ots):
        columns = ["doctitle", "docchannel", "province", "city", "district", "page_time", "tenderee", "project_code", "project_name", "sub_docs_json", "tenderee_contact", "tenderee_phone", "agency", "agency_contact", "agency_phone"]
        ots_client = pool_ots.getConnector()
        _province = item["province"]
        print(item)
        # keywords = item["keyword"]+" "+item["key"]
        list_keyword = item["keyword"]
        # for _temp in keywords.split(" "):
        #     if len(_temp)>0:
        #         list_keyword.append(_temp)
        should_queries = []
        must_not_q = []
        for _temp in list_keyword:
            should_queries.append(MatchPhraseQuery("doctitle", "%s" % _temp))
            must_not_q.append(WildcardQuery("tenderee", "*%s*" % _temp))
        bool_query_keyword = BoolQuery(should_queries=should_queries, minimum_should_match=2)
        page_time = item["page_time"]
        bool_query = BoolQuery(must_queries=[bool_query_keyword
                                             # ,WildcardQuery("publishtime","%s*"%page_time)
                                             # ,MatchPhraseQuery("doctitle","服务")
                                             , RangeQuery("page_time", "2021-04-22", "2021-04-27", include_lower=True, include_upper=True),
                                             TermQuery("docchannel", 101),
                                             RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
                                             WildcardQuery('province', '%s*' % _province)
                                             , NestedQuery("sub_docs_json", RangeQuery("sub_docs_json.win_tenderer", 0, include_lower=True))
                                             ],
                               must_not_queries=must_not_q)
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("sub_docs_json.bidding_budget", SortOrder.DESC)]), limit=20, get_total_count=True),
                                                                          ColumnsToGet(column_names=columns, return_type=ColumnReturnType.SPECIFIED))
        item["count"] = total_count
        list_data = getRow_ots(rows)
        item["list_data"] = list_data
        print(item)
        pool_ots.putConnector(ots_client)

    df = pd.read_excel("../data/省份信息.xlsx")
    list_province = []
    for _name, _type in zip(df["cname"], df["ctype"]):
        if _type == 20:
            list_province.append(_name)
    # filename = "../data/2021-02关键词导出数据.xlsx"
    # dict_keyword = {}
    # df1 = pd.read_excel(filename,dtype=str)
    # for _key,_keyword in zip(df1["key1"],df1["keyword"]):
    #     print("===",str(_keyword))
    #     dict_keyword[_key] = "" if str(_keyword)=="nan" else _keyword
    # for _key in df1["关键词"]:
    #     dict_keyword[_key] = ""
    keyword_str = '''
    快递 物流 供应链 运输 配送
    仓储 冷链 整车 服务
    '''
    list_key = []
    for _k in re.split(r"\s", keyword_str):
        _k1 = _k.strip()
        if len(_k1) > 0:
            list_key.append(_k1)
    list_task = []
    page_time = "2020-11"
    for _province in list_province:
        list_task.append({"page_time": page_time, "province": _province, "key": list_key, "keyword": list_key, "count": 0})
    task_queue = queue.Queue()
    for item in list_task:
        task_queue.put(item)
    result_queue = queue.Queue()
    pool_ots = ConnectorPool(init_num=10, max_num=30, method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue, task_handler=_handle, result_queue=result_queue, thread_count=30, pool_ots=pool_ots)
    mt.run()
    dict_key_data = dict()
    list_data = []
    for item in list_task:
        list_data.extend(item["list_data"])
    dict_channel = getDict_docchannel()
    df_data = {}
    print(list_data)
    for row in list_data:
        item = {}
        _dict = row
        set_dict_item(item, "docid", _dict.get("docid", ""))
        set_dict_item(item, "公告标题", _dict.get("doctitle", ""))
        set_dict_item(item, "公告类别", dict_channel.get(_dict.get("docchannel", ""), ""))
        set_dict_item(item, "省份", _dict.get("province", ""))
        # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
        set_dict_item(item, "城市", _dict.get("city", ""))
        set_dict_item(item, "发布时间", _dict.get("page_time", ""))
        set_dict_item(item, "项目编号", _dict.get("project_code", ""))
        set_dict_item(item, "招标单位", _dict.get("tenderee", ""))
        set_dict_item(item, "招标联系人", _dict.get("tenderee_contact", ""))
        set_dict_item(item, "招标联系人电话", _dict.get("tenderee_phone", ""))
        set_dict_item(item, "代理单位", _dict.get("agency", ""))
        set_dict_item(item, "代理联系人", _dict.get("agency_contact", ""))
        set_dict_item(item, "代理联系人电话", _dict.get("agency_phone", ""))
        set_dict_item(item, "比地招标公告地址", "http://www.bidizhaobiao.com/excel_detail.do?code=%s" % (str(aesCipher.encrypt('{"docid":%d}' % _dict.get("docid")))))
        sub_docs_json = _dict.get("sub_docs_json")
        for _doc in json.loads(sub_docs_json):
            if "win_tenderer" in _doc:
                set_dict_item(item, "中标单位", _doc["win_tenderer"])
            if "win_tenderee_manager" in _doc:
                set_dict_item(item, "中标单位联系人", _doc["win_tenderee_manager"])
            if "win_tenderee_phone" in _doc:
                set_dict_item(item, "中标单位联系电话", _doc["win_tenderee_phone"])
            if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"] == "" else _doc["win_bid_price"]) > 0:
                set_dict_item(item, "中标金额", _doc["win_bid_price"])
            if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"] == "" else _doc["bidding_budget"]) > 0:
                set_dict_item(item, "招标金额", _doc["bidding_budget"])
        if "招标金额" not in item:
            set_dict_item(item, "招标金额", "")
        if "中标金额" not in item:
            set_dict_item(item, "中标金额", "")
        if "中标单位" not in item:
            set_dict_item(item, "中标单位", "")
        if "中标单位联系人" not in item:
            set_dict_item(item, "中标单位联系人", "")
        if "中标单位联系电话" not in item:
            set_dict_item(item, "中标单位联系电话", "")
        _line = "%s-%s-%s-%s-%s-%s" % (item["省份"], item["城市"], item["项目编号"], item["招标单位"], item["招标联系人"], str(item["招标金额"]))
        # if _line in set_line:
        #     continue
        # if item["招标金额"]=="":
        #     continue
        # set_line.add(_line)
        for k, v in item.items():
            if k not in df_data:
                df_data[k] = []
            df_data[k].append(v)
    df1 = pd.DataFrame(df_data)
    df1.to_excel("../data/%s_顺丰中标数据.xlsx" % getCurrent_date('%Y-%m-%d_%H%M%S'), columns=list_df_columns)
    # for item in list_task:
    #     dict_key_data[item["key"]][item["province"]] = item
    # dict_key_province = dict()
    # dict_key_province["关键词"] = []
    # for _province in list_province:
    #     dict_key_province[_province] = []
    # for _key in dict_keyword.keys():
    #     dict_key_province["关键词"].append(_key)
    #     for _province in list_province:
    #         dict_key_province[_province].append(dict_key_data[_key][_province]["count"])
    # columns = ["关键词"]
    # columns.extend(list_province)
    # df2 = pd.DataFrame(dict_key_province)
    # df2.to_excel("../data/%s_导出数据.xlsx"%filename,columns=columns)

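# export_keyword_count: count documents per keyword over a date range, matching either
# the title or the body depending on _type ("doctitle" or "doctextcon").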
def export_keyword_count():
    def _handle(item, result_queue, pool_ots):
        ots_client = pool_ots.getConnector()
        bool_query = BoolQuery(must_queries=[RangeQuery("publishtime", item["range_from"], item["range_to"]),
                                             RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
                                             MatchPhraseQuery(item["type"], item["keyword"])
                                             ])
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, limit=1, get_total_count=True),
                                                                          ColumnsToGet(return_type=ColumnReturnType.ALL))
        item["count"] = total_count
        pool_ots.putConnector(ots_client)

    range_from = "2019-01-01"
    range_to = "2022-12-23"
    _type = "doctextcon"
    assert _type in ["doctitle", "doctextcon"]
    list_dict_key_count = []
    filename = "../data/医院.xlsx"
    df = pd.read_excel(filename)
    for item in df["关键词"]:
        list_dict_key_count.append({"keyword": item, "count": 0, "range_from": range_from, "range_to": range_to, "type": _type})
    task_queue = queue.Queue()
    for item in list_dict_key_count:
        task_queue.put(item)
    result_queue = queue.Queue()
    pool_ots = ConnectorPool(init_num=10, max_num=30, method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue, task_handler=_handle, result_queue=result_queue, thread_count=30, pool_ots=pool_ots)
    mt.run()
    columns = ["keyword", "count", "range_from", "range_to", "type"]
    df_data = {}
    for _c in columns:
        df_data[_c] = []
    for item in list_dict_key_count:
        for _c in columns:
            if _c in item:
                df_data[_c].append(item[_c])
            else:
                df_data[_c].append("")
    df2 = pd.DataFrame(df_data)
    df2.to_excel("%s_数量导出.xlsx" % filename, columns=columns)

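# export_keyword_title: dump docid/doctitle/tenderee for every document whose title
# matches one of the hospital-related phrases, chunking the result into csv files of
# at most 100000 rows each.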
def export_keyword_title():
    ots_client = getConnect_ots()
    range_from = "2020-01-01"
    range_to = "2022-12-23"
    list_condition = [["医务室"],
                      ["医院"],
                      ["卫生院"],
                      ["卫生所"],
                      ["卫生室"],
                      ["社区卫生服务中心"]]
    list_should_query = []
    for _c in list_condition:
        if len(_c) == 1:
            list_should_query.append(MatchPhraseQuery("doctitle", _c[0]))
        else:
            _must_query = []
            for _q in _c:
                _must_query.append(MatchPhraseQuery("doctitle", _q))
            list_should_query.append(BoolQuery(must_queries=_must_query))
    keyword_query = BoolQuery(should_queries=list_should_query)
    bool_query = BoolQuery(must_queries=[RangeQuery("publishtime", range_from, range_to),
                                         RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
                                         keyword_query
                                         ])
    rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                      SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid", SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                      ColumnsToGet(["docid", "doctitle", "tenderee"], return_type=ColumnReturnType.SPECIFIED))
    df_data = {"docid": [], "doctitle": [], "tenderee": []}

    def setData(df_data, rows):
        list_dict = getRow_ots(rows)
        for _dict in list_dict:
            docid = _dict.get("docid", "")
            doctitle = _dict.get("doctitle", "")
            tenderee = _dict.get("tenderee", "")
            df_data["docid"].append(docid)
            df_data["doctitle"].append(doctitle)
            df_data["tenderee"].append(tenderee)

    setData(df_data, rows)
    _count = len(rows)
    while next_token:
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                                                                          ColumnsToGet(["docid", "doctitle", "tenderee"], return_type=ColumnReturnType.SPECIFIED))
        setData(df_data, rows)
        _count += 100
        print(_count, total_count)
    file_begin = 0
    file_length = 100000
    _count = 0
    while file_begin < len(df_data["docid"]):
        _dict = dict()
        for _key, v in df_data.items():
            _dict[_key] = v[file_begin:file_begin + file_length]
        _count += 1
        file_begin += file_length
        df = pd.DataFrame(_dict)
        df.to_csv("../data/%s-%s_tenderee_doctitle_%d.csv" % (range_from, range_to, _count))

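# exportArticle_by_websource: the commented-out first half harvested up to five 招标 and
# five 中标 documents per web_source_no into websouce_doc.csv; the live code slices rows
# 80000-100000 of that csv into a smaller document_id/document_text csv, skipping bodies
# over 100k characters.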
def exportArticle_by_websource():
    # conn = getConnection_testmysql()
    # cursor = conn.cursor()
    # sql = "select web_source_no from web_source"
    # cursor.execute(sql)
    # rows = cursor.fetchmany(10)
    # dict_websource = dict()
    # while(rows):
    #     for row in rows:
    #         web_source_no = row[0]
    #         dict_websource[web_source_no] = []
    #     rows = cursor.fetchmany(1000)
    #
    # task_queue = queue.Queue()
    # for _key in dict_websource.keys():
    #     task_queue.put({"key":_key,"list":dict_websource[_key]})
    #
    # pool_ots = ConnectorPool(init_num=100,max_num=1000,method_init=getConnect_ots)
    # result_queue = queue.Queue()
    # def _handle(item,result_queue,pool_ots):
    #     _key = item["key"]
    #     print(_key)
    #     ots_client = pool_ots.getConnector()
    #     bool_query = BoolQuery(must_queries=[RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
    #                                          TermQuery('web_source_no', '%s'%_key)
    #                                          ])
    #
    #     is_all_succeed = False
    #
    #     while(not is_all_succeed):
    #         try:
    #             rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
    #                                                                               SearchQuery(bool_query, limit=100, get_total_count=True),
    #                                                                               ColumnsToGet(["docid","docchannel","dochtmlcon"],return_type=ColumnReturnType.SPECIFIED))
    #             list_zhaobiao = []
    #             list_zhongbiao = []
    #             for row in rows:
    #                 _dict = dict()
    #                 for values in row:
    #                     for _v in values:
    #                         _dict[_v[0]] = _v[1]
    #                 if _dict["docchannel"]==52:
    #                     list_zhaobiao.append(_dict)
    #                 elif _dict["docchannel"]==101:
    #                     list_zhongbiao.append(_dict)
    #             item["list"].extend(list_zhaobiao[:5])
    #             item["list"].extend(list_zhongbiao[:5])
    #         except Exception as e:
    #             print(str(e))
    #
    #     pool_ots.putConnector(ots_client)
    #
    # mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=100,pool_ots=pool_ots)
    # mt.run()
    # df_data = {"docid":[],"web_source_no":[],"docchannel":[],"dochtmlcon":[]}
    # for k,v in dict_websource.items():
    #     for item in v:
    #         df_data["docid"].append(item["docid"])
    #         df_data["web_source_no"].append(k)
    #         df_data["docchannel"].append(item["docchannel"])
    #         df_data["dochtmlcon"].append(item["dochtmlcon"])
    # df = pd.DataFrame(df_data)
    # df.to_csv("../data/websouce_doc.csv",columns=["docid","web_source_no","docchannel","dochtmlcon"],encoding="UTF8")
    df = pd.read_csv("../data/websouce_doc.csv")
    df_2000 = {"document_id": [], "document_text": []}
    begin = 80000
    end = 100000
    _count = 0
    for _id, _text in zip(df["docid"][begin:end], df["dochtmlcon"][begin:end]):
        if len(_text) > 100000:
            continue
        df_2000["document_id"].append(_id)
        df_2000["document_text"].append(_text)
    df_2 = pd.DataFrame(df_2000)
    df_2.to_csv("../data/websouce_doc_%d-%d.csv" % (begin, end), columns=["document_id", "document_text"], encoding="utf8", index=False)
    # save(dict_websource,"../data/dict_websource.pk")

def getWinTenderer(sub_doc_json):
    if sub_doc_json is not None:
        sub_doc = json.loads(sub_doc_json)
        for _doc in sub_doc:
            if "win_tenderer" in _doc:
                return _doc["win_tenderer"]
    return ""

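# exportDocument_by_keywords: for each keyword (education/robotics vendors by default),
# page through winning-bid announcements whose title matches it and queue one row per
# document, using the getWinTenderer helper above to pick the winner out of
# sub_docs_json; note that page_time is currently unused inside _handle.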
def exportDocument_by_keywords(page_time,
                               list_keyword=["创客", "STEAM", "人工智能", "课程服务", "机器人中学", "机器人小学", "机器人幼儿园", "机器人学校", "Labplus", "盛思", "makeblock柴火", "寓乐湾", "美科科技", "STEAM", "能力风暴", "优必选", "蘑菇云", "Dfrobot", "中鸣", "飞瑞敖", "编程猫培生", "八爪鱼", "八爪鱼教育", "童心制物"]):
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for _k in list_keyword:
        task_queue.put(_k)

    def _handle(keyword, result_queue):
        should_queries = []
        for _temp in [keyword]:
            should_queries.append(MatchPhraseQuery("doctitle", _temp))
        bool_query_keyword = BoolQuery(should_queries=should_queries)
        ots_client = getConnect_ots()
        bool_query = BoolQuery(must_queries=[RangeQuery('publishtime', range_from='2017-12-20'),
                                             MatchPhraseQuery("doctitle", keyword),
                                             TermQuery("docchannel", "101")
                                             ])
        is_all_succeed = False
        _count = 0
        total_count = 1
        next_token = None
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid", SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                          ColumnsToGet(["docid", "tenderee", "sub_docs_json"], return_type=ColumnReturnType.SPECIFIED))
        for row in rows:
            _dict = dict()
            for values in row:
                for _v in values:
                    _dict[_v[0]] = _v[1]
            result_queue.put({"docid": _dict.get("docid", ""), "keyword": keyword, "tenderee": _dict.get("tenderee", ""), "win_tenderer": getWinTenderer(_dict.get("sub_docs_json", None))})
        print(keyword, next_token, total_count)
        while next_token:
            try:
                _count += len(rows)
                print("%s:%d/%d" % (keyword, _count, total_count))
                # fetch the next page first, then emit it, so the first page is not queued twice
                rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                                  SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                                                                                  ColumnsToGet(["docid", "tenderee", "sub_docs_json"], return_type=ColumnReturnType.SPECIFIED))
                for row in rows:
                    _dict = dict()
                    for values in row:
                        for _v in values:
                            _dict[_v[0]] = _v[1]
                    result_queue.put({"docid": _dict.get("docid", ""), "keyword": keyword, "tenderee": _dict.get("tenderee", ""), "win_tenderer": getWinTenderer(_dict.get("sub_docs_json", None))})
            except Exception as e:
                traceback.print_exc()

    mt = MultiThreadHandler(task_queue=task_queue, task_handler=_handle, result_queue=result_queue, thread_count=30)
    mt.run()
    df_data = {"docid": [], "keyword": [], "tenderee": [], "win_tenderer": []}
    while True:
        try:
            item = result_queue.get(block=True, timeout=1)
            for _k in df_data.keys():
                if _k in item:
                    df_data[_k].append(item[_k])
                else:
                    df_data[_k].append("")
        except queue.Empty:
            break
        except Exception as e:
            traceback.print_exc()
    df = pd.DataFrame(df_data)
    df.to_csv("../data/exportArticle1_title.csv", columns=["docid", "keyword", "tenderee", "win_tenderer"])

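# exportGovement: export tenderee contact details for December-2020 招标公告
# (docchannel 52) whose tenderee looks like a public institution in the listed
# provinces; rows without both a contact and a phone are dropped by getRow.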
def exportGovement():
    should_queries1 = []
    for _temp in ["教育局", "地化所", "税务局", "国土局", "学校", "大学", "中学", "小学", "幼儿园", "医院"]:
        should_queries1.append(WildcardQuery("tenderee", "*%s*" % _temp))
    should_queries2 = []
    for _temp in ["浙江", "江苏", "湖北", "西北", "陕西", "甘肃", "青海", "宁夏", "新疆", "重庆", "四川", "云南", "贵州"]:
        should_queries2.append(WildcardQuery("province", "*%s*" % _temp))
    ots_client = getConnect_ots()
    page_time = "2020-12"
    bool_query = BoolQuery(must_queries=[BoolQuery(should_queries=should_queries1),
                                         BoolQuery(should_queries=should_queries2),
                                         TermQuery("docchannel", "52"),
                                         RangeQuery("publishtime", page_time)])
    columns = ["tenderee", "tenderee_contact", "tenderee_phone"]
    rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                      SearchQuery(bool_query, limit=100, sort=Sort(sorters=[FieldSort("docid", SortOrder.ASC)]), get_total_count=True),
                                                                      ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
    print(total_count)

    def getRow(rows, df_data, columns):
        for row in rows:
            _dict = dict()
            for part in row:
                for item in part:
                    _dict[item[0]] = item[1]
            if "tenderee_contact" in _dict and "tenderee_phone" in _dict:
                for key in columns:
                    df_data[key].append(_dict.get(key, ""))

    all_rows = 0
    df_data = {}
    for key in columns:
        df_data[key] = []
    getRow(rows, df_data, columns)
    _count = 100
    while next_token:
        print(_count, total_count)
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
        _count += 100
        getRow(rows, df_data, columns)
    df2 = pd.DataFrame(df_data)
    df2.to_excel("../data/%s政府招标人导出数据.xlsx" % page_time, columns=columns)

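# export_attachment: for each sales keyword, scan matching documents newest-first and
# download linked attachments (zip/rar/doc/xls/xlsx/pdf/txt) into ../data/attachment/,
# stopping once roughly 20 documents with attachments are hit or 20000 documents have
# been scanned for that keyword.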
def export_attachment():
    filename = "../data/销售部-字段名.xlsx"
    df = pd.read_excel(filename)
    list_dict = []
    for _key in df["关键词"]:
        if len(_key) > 0:
            list_dict.append({"keyword": _key})

    def _handle(_dict, result_queue, pool_ots):
        _keyword = _dict["keyword"]
        ots_client = pool_ots.getConnector()
        keyword_query = BoolQuery(should_queries=[MatchPhraseQuery("doctextcon", _keyword),
                                                  MatchPhraseQuery("doctitle", _keyword)])
        bool_query = BoolQuery(must_queries=[RangeQuery("status", "201", "300", include_upper=True, include_lower=True),
                                             keyword_query])
        columns = ["dochtmlcon", "docid"]
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("page_time", SortOrder.DESC)]), limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
        print(_keyword, total_count)
        hit_count = 0

        def getData(rows, hit_count):
            list_fields = getRow_ots(rows)
            for _fields in list_fields:
                dochtmlcon = _fields["dochtmlcon"]
                docid = _fields["docid"]
                _flag = False
                for url, suffix in re.findall(r"(http://[^\'\";;\n]+?\.(zip|rar|doc|xls|xlsx|pdf|txt))", dochtmlcon):
                    try:
                        result = requests.get(url, stream=True, timeout=100)
                        if result.status_code == 200:
                            filename = get_file_name(url, result.headers)
                            with open("../data/attachment/%s_%d_%s" % (_keyword, docid, filename), "wb") as f:
                                f.write(result.content)
                            _flag = True
                    except Exception:
                        pass
                if _flag:
                    hit_count += 1
            return hit_count

        hit_count = getData(rows, hit_count)
        _count = len(rows)
        while next_token:
            rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                              SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                                                                              ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
            _count += len(rows)
            if _count > 20000:
                break
            hit_count = getData(rows, hit_count)
            if hit_count > 20:
                break
        pool_ots.putConnector(ots_client)

    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for item in list_dict:
        task_queue.put(item)
    pool_ots = ConnectorPool(init_num=10, max_num=30, method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue, task_handler=_handle, result_queue=result_queue, thread_count=30, pool_ots=pool_ots)
    mt.run()

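# exportIndustryCount: meant to count documents per industry label and save ten sample
# html pages for each; note the early return above the queue setup, which currently
# stops it after printing the industry keys.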
def exportIndustryCount():
    import codecs
    time_from = "2020-12-21"
    time_to = "2020-12-25"
    # dict_channel = {"51":{"type":"公告变更"},
    #                 "52":{"type":"招标公告"},
    #                 "101":{"type":"中标信息"},
    #                 "102":{"type":"招标预告"},
    #                 "103":{"type":"招标答疑"},
    #                 "104":{"type":"招标文件"},
    #                 "105":{"type":"资审结果"},
    #                 "103":{"type":"招标控制价"},
    #                 "100":{"type":"未知类型"}}
    dict_industry = {}
    meta_industry = load("../data/class2dalei_menlei.pkl")
    for _key in meta_industry.keys():
        dict_industry[_key] = {"type": _key}
    print(dict_industry.keys())
    return  # early exit left in by the author; the export below is currently disabled
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for _key in dict_industry.keys():
        task_queue.put(dict_industry[_key])

    def _handle(item, result_queue, pool_ots):
        ots_client = pool_ots.getConnector()
        bool_query = BoolQuery(must_queries=[TermQuery("info_type", item["type"]),
                                             RangeQuery("publishtime", time_from, time_to, include_lower=True, include_upper=True)])
        columns = ["docid"]
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, limit=1, get_total_count=True),
                                                                          ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
        item["count"] = total_count
        columns = ["dochtmlcon"]
        bool_query = BoolQuery(must_queries=[TermQuery("info_type", item["type"])])
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, limit=10, sort=Sort(sorters=[FieldSort("publishtime", SortOrder.ASC)]), get_total_count=True),
                                                                          ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
        for row in rows:
            _dict = dict()
            for part in row:
                for v in part:
                    _dict[v[0]] = v[1]
            with codecs.open("../data/industry/%s_%d.html" % (item["type"], _dict["docid"]), "w", encoding="UTF8") as f:
                f.write(_dict["dochtmlcon"])
        pool_ots.putConnector(ots_client)

    pool_ots = ConnectorPool(init_num=20, max_num=30, method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue, task_handler=_handle, result_queue=result_queue, thread_count=30, pool_ots=pool_ots)
    mt.run()
    columns = ["type", "count"]
    df_data = {}
    for _c in columns:
        df_data[_c] = []
    for _indus in dict_industry.keys():
        for _c in columns:
            df_data[_c].append(dict_industry[_indus][_c])
    df = pd.DataFrame(df_data)
    df.to_excel("../data/%s-%s_industry_count.xlsx" % (time_from, time_to), columns=columns)

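# exportDocument_By_time(time_from, time_to): generic dump of the requested columns for
# every document in a page_time window, e.g.
# exportDocument_By_time("2021-01-29", "2021-01-29") produces the file that
# processDocument() below converts to csv.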
def exportDocument_By_time(time_from, time_to, columns=["docid", "doctitle", "project_name", "dochtmlcon"]):
    '''
    :return:
    '''
    ost_client = getConnect_ots()
    last_docid = 0
    bool_query = BoolQuery(must_queries=[RangeQuery("page_time", time_from, time_to, include_lower=True, include_upper=True),
                                         RangeQuery('status', '201', '300', include_lower=True, include_upper=True)])
    rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
                                                                      SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid", SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                      ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
    _count = len(rows)
    df_data = {}

    def getData(df_data, rows):
        list_dict = getRow_ots(rows)
        for _dict in list_dict:
            for _k, _v in _dict.items():
                if _k not in df_data:
                    df_data[_k] = []
                df_data[_k].append(getLegal_str(_v))

    getData(df_data, rows)
    while next_token:
        print("%d/%d" % (_count, total_count))
        rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
        _count += len(rows)
        getData(df_data, rows)
    df = pd.DataFrame(df_data)
    df.to_excel("%s/%s-%s公告信息.xlsx" % (data_path, time_from, time_to), columns=columns)

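# processDocument: currently just converts the 2021-01-29 export to csv and returns;
# the disabled code below would instead insert each row into mysql test_extract (or,
# in the commented variant, post it to an extraction service at 192.168.2.101:15030).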
def processDocument():
    filename = "../data/2021-01-29-2021-01-29公告信息.xlsx"
    df = pd.read_excel(filename)
    df.to_csv("../data/2021-01-29-2021-01-29公告信息.csv")
    return  # early exit left in by the author; the insert path below is currently disabled
    list_dict = []
    for docid, doctitle, project_name, dochtmlcon in zip(df["docid"], df["doctitle"], df["project_name"], df["dochtmlcon"]):
        list_dict.append({"docid": docid, "doctitle": doctitle, "project_name": project_name, "dochtmlcon": dochtmlcon})
    task_queue = queue.Queue()
    for _dict in list_dict:
        task_queue.put(_dict)
    result_queue = queue.Queue()

    def _handle(_dict, result_queue, pool_mysql):
        conn = pool_mysql.getConnector()
        cursor = conn.cursor()
        # parameterized execute so the driver quotes the string values correctly
        sql = "insert into test_extract(docid,doctitle,page_time) values(%s,%s,%s)"
        cursor.execute(sql, (_dict["docid"], _dict["doctitle"], _dict["dochtmlcon"]))
        conn.commit()
        pool_mysql.putConnector(conn)
        # url = "http://192.168.2.101:15030"
        # myheaders = {'Content-Type': 'application/json'}
        # print(int(_dict["docid"]))
        # data = {"doc_id":int(_dict["docid"]),"title":_dict["doctitle"],"content":_dict["dochtmlcon"]}
        # resp = requests.post(url,json=data,headers=myheaders, verify=True)
        # result = json.loads(resp.content.decode("utf8"),"utf8")
        # _dict["product"] = result["product"]

    pool_mysql = ConnectorPool(init_num=20, max_num=30, method_init=getConnection_testmysql)
    mt = MultiThreadHandler(task_queue=task_queue, task_handler=_handle, result_queue=result_queue, thread_count=5, pool_mysql=pool_mysql)
    mt.run()
    # columns = ["docid","doctitle","project_name","product"]
    #
    # df_data = {}
    # for _c in columns:
    #     df_data[_c] = []
    # for _dict in list_dict:
    #     for _c in columns:
    #         df_data[_c].append(_dict.get(_c,""))
    # df = pd.DataFrame(df_data)
    # df.to_excel("%s.product.xlsx"%(filename),columns=columns)

def export_extract_check():
    '''
    Export the data-extraction check results and generate a report.
    :return:
    '''
    conn = getConnection_testmysql()
    cursor = conn.cursor()
    sql = " select docid,json_result from exportdb.extract_check "
    cursor.execute(sql)
    dict_global = {}
    df_global = {"key_type":[],"online_count":[],"test_count":[],"diff_count":[],"diff_percent":[]}
    df_document = {"docid":[]}
    while True:
        rows = cursor.fetchmany(10000)
        if not rows:
            break
        for docid,json_result in rows:
            df_document["docid"].append(docid)
            _result = json.loads(json_result)
            for k,v in _result.items():
                key = k.split("_")
                _key = "_".join(key[:-1])
                if "punish" in _key or "complainants" in _key or "institutions" in _key:
                    continue
                if k not in df_document:
                    df_document[k] = []
                df_document[k].append(v)
                key_type = key[-1]
                if _key not in dict_global:
                    dict_global[_key] = {}
                if key_type not in dict_global[_key]:
                    dict_global[_key][key_type] = 0
                if key_type=="diff":
                    dict_global[_key][key_type] += v
                if key_type in ("online","test"):
                    if isinstance(v,str):
                        if v!="":
                            dict_global[_key][key_type] += 1
                    elif isinstance(v,list):
                        dict_global[_key][key_type] += len(v)
    for k,v in dict_global.items():
        df_global["key_type"].append(k)
        df_global["online_count"].append(v["online"])
        df_global["test_count"].append(v["test"])
        df_global["diff_count"].append(v["diff"])
        df_global["diff_percent"].append(v["diff"]/v["online"] if v["online"]>0 else 0)
    filename = "../data/%s_extract_check.xlsx"%(time.strftime("%Y-%m-%d"))
    with pd.ExcelWriter(filename) as writer:
        df1 = pd.DataFrame(df_global)
        df1.to_excel(writer, sheet_name="global")
        for k,v in df_document.items():
            print(k, len(v))  # column lengths must match for DataFrame construction
        df2 = pd.DataFrame(df_document)
        df2.to_excel(writer, sheet_name="document")
        # The context manager saves on exit; the original's explicit
        # writer.save()/writer.close() calls were redundant and have been dropped.
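# Key layout assumed by export_extract_check: each json_result key is
# "<field>_<kind>" with <kind> in {online, test, diff}, e.g.:
#   key = "tenderee_online".split("_")   # -> ["tenderee", "online"]
#   "_".join(key[:-1])                   # -> "tenderee" (the _key aggregated above)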
def exportDocument_dump():
    # filename = "../data/重复公告.xlsx"
    # df = pd.read_excel(filename)
    ots_client = getConnect_ots()
    columns = ["docid","docchannel","page_time","web_source_no","doctitle","tenderee","agency","project_code","project_name","sub_docs_json"]
    df_keys = ["docid","docchannel","page_time","web_source_no","doctitle","doctitle_refine","tenderee","agency","project_code","project_name","bidding_budget","win_bid_price","win_tenderer","URL"]
    df_data = {}
    for _key in df_keys:
        df_data[_key] = []
    bool_query = BoolQuery(must_queries=[TermQuery("page_time","2021-03-03"),
                                         RangeQuery("status",201,300,True,True)])
    rows, next_token, total_count, is_all_succeed = ots_client.search(
        "document", "document_index",
        SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
        ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))

    def getData(df_data, rows):
        list_data = getRow_ots(rows)
        for row in list_data:
            dict_find = {}
            for _key in df_keys:
                dict_find[_key] = 0
            for _k,_v in row.items():
                if _k in df_keys:
                    dict_find[_k] = 1
                    if _k=="project_code":
                        _v = '"%s"'%_v
                    df_data[_k].append(_v)
            doctitle = row.get("doctitle","")
            df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
            df_data["URL"].append("http://www.bidizhaobiao.com/info-%d.html"%(row["docid"]))
            dict_find["URL"] = 1
            dict_find["doctitle_refine"] = 1
            sub_docs_json = row.get("sub_docs_json","[{}]")
            doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
            if sub_docs_json is not None:
                for sub_docs in json.loads(sub_docs_json):
                    for _key_sub_docs in sub_docs.keys():
                        if _key_sub_docs in doc_columns:
                            if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
                                if _key_sub_docs in ["bidding_budget","win_bid_price"]:
                                    if float(sub_docs[_key_sub_docs])>0:
                                        doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
                                else:
                                    doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
            for _k,_v in doc_columns.items():
                dict_find[_k] = 1
                df_data[_k].append(_v)
            for _k,_v in dict_find.items():
                if _v==0:
                    df_data[_k].append("")

    _count = len(rows)
    getData(df_data, rows)
    while next_token:
        print("%d/%d"%(_count,total_count))
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "document", "document_index",
            SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
            ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
        getData(df_data, rows)
        _count += len(rows)
    # Earlier per-docid variant kept for reference:
    # for docid in df["docid"]:
    #     bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))])
    #
    #     rows, next_token, total_count, is_all_succeed = ots_client.search(
    #         "document", "document_index",
    #         SearchQuery(bool_query, limit=100, get_total_count=True),
    #         ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
    #     list_data = getRow_ots(rows)
    #     if len(list_data)>0:
    #         dict_find = {}
    #         for _key in df_keys:
    #             dict_find[_key] = 0
    #         for _k,_v in list_data[0].items():
    #             if _k in df_keys:
    #                 dict_find[_k] = 1
    #                 df_data[_k].append(_v)
    #         doctitle = list_data[0].get("doctitle","")
    #         df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
    #         dict_find["doctitle_refine"] = 1
    #         sub_docs_json = list_data[0].get("sub_docs_json","[{}]")
    #         doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
    #         if sub_docs_json is not None:
    #             for sub_docs in json.loads(sub_docs_json):
    #                 for _key_sub_docs in sub_docs.keys():
    #                     if _key_sub_docs in doc_columns:
    #                         if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
    #                             if _key_sub_docs in ["bidding_budget","win_bid_price"]:
    #                                 if float(sub_docs[_key_sub_docs])>0:
    #                                     doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
    #                             else:
    #                                 doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
    #         for _k,_v in doc_columns.items():
    #             dict_find[_k] = 1
    #             df_data[_k].append(_v)
    #         for _k,_v in dict_find.items():
    #             if _v==0:
    #                 df_data[_k].append("")
    df1 = pd.DataFrame(df_data)
    df1.to_csv("../data/0303去重.csv", columns=df_keys)
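# The sub_docs_json unpacking in getData above reappears in several exports
# below; a hypothetical helper capturing the same "first non-empty, non-zero
# value wins" rule (sketch only, not wired in):
def parse_sub_docs(sub_docs_json):
    doc_columns = {"win_tenderer":"", "bidding_budget":"", "win_bid_price":""}
    if sub_docs_json is not None:
        for sub_docs in json.loads(sub_docs_json):
            for _key, _v in sub_docs.items():
                if _key in doc_columns and doc_columns[_key]=="" and str(_v) not in ["","0"]:
                    if _key in ("bidding_budget","win_bid_price"):
                        # money fields must also parse as a positive number
                        if float(_v)>0:
                            doc_columns[_key] = str(_v)
                    else:
                        doc_columns[_key] = str(_v)
    return doc_columns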
def exportDocument_dump_mysql():
    conn = getConnection_testmysql()
    cursor = conn.cursor()
    columns = ["project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer","group_id","docid"]
    df_data = {}
    for _c in columns:
        df_data[_c] = []
    sql = " select "+",".join(columns)+" from run_dumplicate_document_his where group_id in (select group_id from run_dumplicate_document_his group by group_id having count(1)>1)"
    cursor.execute(sql)
    while True:
        rows = cursor.fetchmany(100000)
        if not rows:
            break
        for row in rows:
            for _i in range(len(columns)):
                df_data[columns[_i]].append(row[_i])
    df = pd.DataFrame(df_data)
    df.to_csv("../data/0304去重.csv", columns=["group_id","docid","project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer"])
    print(cursor.description)
def getDict_docchannel():
    conn = getConnection_mysql()
    cursor = conn.cursor()
    sql = "select channel_id,chnlname from sys_channel "
    cursor.execute(sql)
    rows = cursor.fetchall()
    _dict = dict()
    for row in rows:
        _dict[row[0]] = row[1]
    return _dict
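# Usage sketch: the mapping is consumed by the exports below, e.g.
#   dict_channel = getDict_docchannel()            # {channel_id: chnlname}
#   dict_channel.get(_dict.get("docchannel",""), "")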
def exportDocument_by_doctitle():
    columns = ["docid","doctitle","docchannel","bidway","province","city","district","info_type","page_time","crtime","project_code","tenderee","project_name","agency","sub_docs_json","tenderee_contact","tenderee_phone","doctextcon","product","moneysource"]
    dict_channel = getDict_docchannel()
    str_keyword = '''
    报批技术服务 不动产 测绘 城市更新 档案整理
    房地一体 拆旧复垦 土地规划 城乡规划 村庄规划
    技术服务 技术审查 建设用地增减挂钩 勘察 垦造水田
    不动产数据建库 不动产数据整合 林权调查 土地调查 城市更新数据调查
    不动产平台 测绘系统 地理信息系统 城乡规划信息系统 一张图信息平台
    测绘信息平台 双评价 城市更新研究 垦造水田研究报告 生态修复研究
    土地规划研究 复垦咨询服务 生态修复咨询服务 城乡规划咨询服务 城市更新咨询服务
    勘测定界 多测合一 用地预审 国土规划数据治理 地名普查
    地形图 垦造水田咨询服务 评估 全域土地综合整治 生态修复
    林权数据建库 权属调查 权籍调查
    '''
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for _keyword in re.split(r"\s", str_keyword):
        if len(_keyword.strip())==0:
            continue
        task_queue.put({"keyword":_keyword})
    pool_ots = ConnectorPool(init_num=10, max_num=30, method_init=getConnect_ots)

    def _handle(item, result_queue, pool_ots):
        ots_client = pool_ots.getConnector()
        should_q1 = BoolQuery(should_queries=[MatchPhraseQuery("doctitle", item["keyword"])])
        should_q2 = BoolQuery(should_queries=[WildcardQuery('province', '%s*'%"广东"),
                                              WildcardQuery('province', '%s*'%"湖南"),
                                              WildcardQuery('province', '%s*'%"广西")])
        bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2021-01-01"),
                                             RangeQuery("status",201,300,True,True),
                                             TermQuery("docchannel",101),
                                             should_q1, should_q2])
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "document", "document_index",
            SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]), limit=100, get_total_count=True),
            ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
        list_data = getRow_ots(rows)
        for _data in list_data:
            _data["keyword"] = item["keyword"]
            result_queue.put(_data)
        _count = len(list_data)
        while next_token:
            print("%d/%d"%(_count,total_count))
            rows, next_token, total_count, is_all_succeed = ots_client.search(
                "document", "document_index",
                SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
            list_data = getRow_ots(rows)
            _count += len(list_data)
            for _data in list_data:
                _data["keyword"] = item["keyword"]
                result_queue.put(_data)
        pool_ots.putConnector(ots_client)

    mt = MultiThreadHandler(task_queue, _handle, result_queue, 30, pool_ots=pool_ots)
    mt.run()
    list_item = []
    try:
        while True:
            _dict = result_queue.get(False)
            list_item.append(_dict)
    except Exception as e:
        print(e)
    keys = list_item[0].keys()
    df_data = {}
    print(len(list_item))
    set_line = set()
    for row in list_item:
        item = {}
        _dict = row
        set_dict_item(item,"公告id",_dict.get("docid",""))
        set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
        set_dict_item(item,"公告标题",_dict.get("doctitle",""))
        set_dict_item(item,"省份",_dict.get("province",""))
        # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
        set_dict_item(item,"城市",_dict.get("city",""))
        set_dict_item(item,"区",_dict.get("district",""))
        set_dict_item(item,"关键词",_dict.get("keyword",""))
        set_dict_item(item,"发布时间",_dict.get("page_time",""))
        set_dict_item(item,"项目编号",_dict.get("project_code",""))
        set_dict_item(item,"招标单位",_dict.get("tenderee",""))
        set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
        set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
        # set_dict_item(item,"代理单位",_dict.get("agency",""))
        # set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
        # set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
        set_dict_item(item,"公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
        tenderee_in = "否"
        tenderee_keyword = "否"
        if _dict.get("tenderee","")!="":
            if _dict.get("tenderee","") in _dict.get("doctitle",""):
                tenderee_in = "是"
            if _dict.get("keyword","") in _dict.get("tenderee",""):
                tenderee_keyword = "是"
        set_dict_item(item,"标题包含招标人",tenderee_in)
        set_dict_item(item,"招标人含有关键词",tenderee_keyword)
        sub_docs_json = _dict.get("sub_docs_json")
        for _doc in json.loads(sub_docs_json):
            if "win_tenderer" in _doc:
                set_dict_item(item,"中标单位",_doc["win_tenderer"])
            # if "win_tenderee_manager" in _doc:
            #     set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
            # if "win_tenderee_phone" in _doc:
            #     set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
            if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
                set_dict_item(item,"中标金额(万元)",float(_doc["win_bid_price"])/10000)
            if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
                set_dict_item(item,"招标金额(万元)",float(_doc["bidding_budget"])/10000)
        if "中标单位" not in item:
            set_dict_item(item,"中标单位","")
        if "中标金额(万元)" not in item:
            set_dict_item(item,"中标金额(万元)","")
        if "招标金额(万元)" not in item:
            set_dict_item(item,"招标金额(万元)","")
        # if "中标单位联系人" not in item:
        #     set_dict_item(item,"中标单位联系人","")
        # if "中标单位联系电话" not in item:
        #     set_dict_item(item,"中标单位联系电话","")
        _line = "%s-%s-%s"%(item["项目编号"],item["中标单位"],item["中标金额(万元)"])
        if _line in set_line:
            continue
        # if item["招标金额"]=="":
        #     continue
        set_line.add(_line)
        for k,v in item.items():
            if k not in df_data:
                df_data[k] = []
            df_data[k].append(v)
    df = pd.DataFrame(df_data)
    df.to_excel("../data/2021-04-14_export11.xlsx", columns=list_df_columns)
set_columns = set()
list_df_columns = []
def set_dict_item(_dict, name, v):
    _dict[name] = getLegal_str(v)
    if name not in set_columns:
        set_columns.add(name)
        list_df_columns.append(getLegal_str(name))
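# Usage sketch: set_dict_item both fills the row dict and records first-seen
# column order in list_df_columns, which later drives df.to_excel(columns=list_df_columns):
#   item = {}
#   set_dict_item(item, "docid", 123)   # "docid" is appended to list_df_columns once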
def exportDocument_medicine(start_time, end_time):
    # filename = "../data/重复公告.xlsx"
    # df = pd.read_excel(filename)
    ots_client = getConnect_ots()
    columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
    dict_channel = getDict_docchannel()

    def getData(df_data, rows, set_line):
        list_data = getRow_ots(rows)
        for row in list_data:
            item = {}
            _dict = row
            set_dict_item(item,"docid",_dict.get("docid",""))
            set_dict_item(item,"公告标题",_dict.get("doctitle",""))
            set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
            set_dict_item(item,"省份",_dict.get("province",""))
            # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
            set_dict_item(item,"城市",_dict.get("city",""))
            set_dict_item(item,"发布时间",_dict.get("page_time",""))
            set_dict_item(item,"项目编号",_dict.get("project_code",""))
            set_dict_item(item,"招标单位",_dict.get("tenderee",""))
            set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
            set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
            set_dict_item(item,"代理单位",_dict.get("agency",""))
            set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
            set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
            set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
            sub_docs_json = _dict.get("sub_docs_json")
            for _doc in json.loads(sub_docs_json):
                if "win_tenderer" in _doc:
                    set_dict_item(item,"中标单位",_doc["win_tenderer"])
                if "win_tenderee_manager" in _doc:
                    set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
                if "win_tenderee_phone" in _doc:
                    set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
                if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
                    set_dict_item(item,"中标金额",_doc["win_bid_price"])
                if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
                    set_dict_item(item,"招标金额",_doc["bidding_budget"])
            if "招标金额" not in item:
                set_dict_item(item,"招标金额","")
            if "中标金额" not in item:
                set_dict_item(item,"中标金额","")
            if "中标单位" not in item:
                set_dict_item(item,"中标单位","")
            if "中标单位联系人" not in item:
                set_dict_item(item,"中标单位联系人","")
            if "中标单位联系电话" not in item:
                set_dict_item(item,"中标单位联系电话","")
            _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
            # if _line in set_line:
            #     continue
            # if item["招标金额"]=="":
            #     continue
            set_line.add(_line)
            for k,v in item.items():
                if k not in df_data:
                    df_data[k] = []
                df_data[k].append(v)

    # list_province = ["江西","湖南","四川","安徽"]
    list_province = ["全国"]
    for _province in list_province:
        df_data = {}
        str_p = '''
        智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件
        县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗
        数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化
        临床应用
        '''
        list_prov = re.split(r"\s", str_p)
        list_mu = []
        for _p in list_prov:
            if _p.strip()=="":
                continue
            print(_p)
            list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))
        s_tenderee = '医院、卫生院、疗养院、健康局、卫生局'
        list_should_ten = []
        for _p in re.split("、", s_tenderee):
            if _p.strip()=="":  # fixed: the original compared _p.split() (a list) to ""
                continue
            list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))
        list_should_chan = []
        list_should_chan.append(TermQuery("docchannel",52))
        list_should_chan.append(TermQuery("docchannel",101))
        list_should_chan.append(TermQuery("docchannel",102))
        should_q1 = BoolQuery(should_queries=list_mu)
        should_q2 = BoolQuery(should_queries=list_should_ten)
        should_q3 = BoolQuery(should_queries=list_should_chan)
        bool_query = BoolQuery(must_queries=[RangeQuery("page_time",start_time,end_time,include_lower=True,include_upper=True),
                                             RangeQuery("status",201,300,True,True),
                                             should_q1, should_q2, should_q3])
        # Earlier education-equipment query kept for reference:
        # must_q1 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"),
        #                                   MatchPhraseQuery("doctitle","教学器材")])
        # must_q2 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"),
        #                                   MatchPhraseQuery("doctitle","教育设备")])
        # must_q3 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"),
        #                                   MatchPhraseQuery("doctitle","教学器材")])
        # must_q4 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"),
        #                                   MatchPhraseQuery("doctitle","教育设备")])
        # must_q5 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校信息化")])
        # must_q6 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明")])
        # must_q7 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校弱电")])
        # must_q8 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校机电安装")])
        # must_q9 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明改造")])
        # must_q10 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教学"),
        #                                    MatchPhraseQuery("doctitle","设备")])
        #
        # all_should = BoolQuery(should_queries=[must_q1,must_q2,must_q3,must_q4,must_q5,must_q6,must_q7,must_q8,must_q9,must_q10])
        # must_not_q = BoolQuery(should_queries=[MatchPhraseQuery("doctitle","社区"),
        #                                        MatchPhraseQuery("doctitle","医院"),
        #                                        MatchPhraseQuery("doctitle","工业园"),
        #                                        MatchPhraseQuery("doctitle","营养"),
        #                                        MatchPhraseQuery("doctitle","厨房设备")])
        # bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2018-01"),
        #                                      RangeQuery("status",201,300,True,True),
        #                                      TermQuery("docchannel",101),
        #                                      WildcardQuery('province', '%s*'%_province),
        #                                      all_should],
        #                        must_not_queries=[must_not_q])
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "document", "document_index",
            SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
            ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
        set_line = set()
        _count = len(rows)
        getData(df_data, rows, set_line)
        while next_token:
            print("%d/%d"%(_count,total_count))
            rows, next_token, total_count, is_all_succeed = ots_client.search(
                "document", "document_index",
                SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
            getData(df_data, rows, set_line)
            _count += len(rows)
            if len(df_data[list(df_data.keys())[0]])>=300:
                break
        set_enterprise = set()
        for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
            set_enterprise.add(_tenderee)
            set_enterprise.add(_agency)
            set_enterprise.add(_win_tenderer)
        if "" in set_enterprise:
            set_enterprise.remove("")
        if None in set_enterprise:
            set_enterprise.remove(None)
        dict_enterprise = getDictEnterprise(list(set_enterprise))
        if len(set_enterprise)>0:
            for _i in range(len(df_data["招标单位"])):
                _enterprise_name = df_data["招标单位"][_i]
                if df_data["招标联系人电话"][_i]=="":
                    contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
                    if contacts is not None:
                        _person,_phone = getOneContact(contacts)
                        df_data["招标联系人"][_i] = _person
                        df_data["招标联系人电话"][_i] = _phone
                _enterprise_name = df_data["代理单位"][_i]
                if df_data["代理联系人电话"][_i]=="":
                    contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
                    if contacts is not None:
                        _person,_phone = getOneContact(contacts)
                        df_data["代理联系人"][_i] = _person
                        df_data["代理联系人电话"][_i] = _phone
                _enterprise_name = df_data["中标单位"][_i]
                if df_data["中标单位联系电话"][_i]=="":
                    contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
                    if contacts is not None:
                        _person,_phone = getOneContact(contacts)
                        df_data["中标单位联系人"][_i] = _person
                        df_data["中标单位联系电话"][_i] = _phone
        return df_data
        # NOTE: unreachable because of the return above; kept as in the original.
        df1 = pd.DataFrame(df_data)
        df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')), columns=list_df_columns)
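# Usage sketch (this is the call enabled in __main__ below):
# exportDocument_medicine("2021-05-02", "2021-05-08")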
def exportDocument_by_pagetime():
    # filename = "../data/重复公告.xlsx"
    # df = pd.read_excel(filename)
    ots_client = getConnect_ots()
    columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
    dict_channel = getDict_docchannel()

    def getData(df_data, rows, set_line):
        list_data = getRow_ots(rows)
        for row in list_data:
            item = {}
            _dict = row
            set_dict_item(item,"docid",_dict.get("docid",""))
            set_dict_item(item,"公告标题",_dict.get("doctitle",""))
            set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
            set_dict_item(item,"省份",_dict.get("province",""))
            # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
            set_dict_item(item,"城市",_dict.get("city",""))
            set_dict_item(item,"发布时间",_dict.get("page_time",""))
            set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
            set_dict_item(item,"项目编号",_dict.get("project_code",""))
            set_dict_item(item,"招标单位",_dict.get("tenderee",""))
            set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
            set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
            set_dict_item(item,"代理单位",_dict.get("agency",""))
            set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
            set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
            set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
            sub_docs_json = _dict.get("sub_docs_json")
            for _doc in json.loads(sub_docs_json):
                if "win_tenderer" in _doc:
                    set_dict_item(item,"中标单位",_doc["win_tenderer"])
                if "win_tenderee_manager" in _doc:
                    set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
                if "win_tenderee_phone" in _doc:
                    set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
                if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
                    set_dict_item(item,"中标金额",_doc["win_bid_price"])
                if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
                    set_dict_item(item,"招标金额",_doc["bidding_budget"])
            if "招标金额" not in item:
                set_dict_item(item,"招标金额","")
            if "中标金额" not in item:
                set_dict_item(item,"中标金额","")
            if "中标单位" not in item:
                set_dict_item(item,"中标单位","")
            if "中标单位联系人" not in item:
                set_dict_item(item,"中标单位联系人","")
            if "中标单位联系电话" not in item:
                set_dict_item(item,"中标单位联系电话","")
            _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
            # if _line in set_line:
            #     continue
            # if item["招标金额"]=="":
            #     continue
            set_line.add(_line)
            for k,v in item.items():
                if k not in df_data:
                    df_data[k] = []
                df_data[k].append(v)

    # list_province = ["江西","湖南","四川","安徽"]
    list_province = ["全国"]
    for _province in list_province:
        df_data = {}
        str_p = '''
        智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件
        县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗
        数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化
        临床应用
        '''
        list_prov = re.split(r"\s", str_p)
        list_mu = []
        for _p in list_prov:
            if _p.strip()=="":
                continue
            print(_p)
            list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))
        s_tenderee = '医院、卫生院、疗养院、健康局、卫生局'
        list_should_ten = []
        for _p in re.split("、", s_tenderee):
            if _p.strip()=="":  # fixed: the original compared _p.split() (a list) to ""
                continue
            list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))
        list_should_chan = []
        list_should_chan.append(TermQuery("docchannel",52))
        # list_should_chan.append(TermQuery("docchannel",101))
        # list_should_chan.append(TermQuery("docchannel",102))
        should_q1 = BoolQuery(should_queries=list_mu)
        should_q2 = BoolQuery(should_queries=list_should_ten)
        should_q3 = BoolQuery(should_queries=list_should_chan)
        bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2021-05-07","2021-05-07",include_lower=True,include_upper=True),
                                             RangeQuery("status",201,300,True,True)
                                             # ,should_q1
                                             # ,should_q2
                                             ,should_q3])
        # Earlier education-equipment query kept for reference:
        # must_q1 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"),
        #                                   MatchPhraseQuery("doctitle","教学器材")])
        # must_q2 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"),
        #                                   MatchPhraseQuery("doctitle","教育设备")])
        # must_q3 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"),
        #                                   MatchPhraseQuery("doctitle","教学器材")])
        # must_q4 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"),
        #                                   MatchPhraseQuery("doctitle","教育设备")])
        # must_q5 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校信息化")])
        # must_q6 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明")])
        # must_q7 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校弱电")])
        # must_q8 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校机电安装")])
        # must_q9 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明改造")])
        # must_q10 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教学"),
        #                                    MatchPhraseQuery("doctitle","设备")])
        #
        # all_should = BoolQuery(should_queries=[must_q1,must_q2,must_q3,must_q4,must_q5,must_q6,must_q7,must_q8,must_q9,must_q10])
        # must_not_q = BoolQuery(should_queries=[MatchPhraseQuery("doctitle","社区"),
        #                                        MatchPhraseQuery("doctitle","医院"),
        #                                        MatchPhraseQuery("doctitle","工业园"),
        #                                        MatchPhraseQuery("doctitle","营养"),
        #                                        MatchPhraseQuery("doctitle","厨房设备")])
        # bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2018-01"),
        #                                      RangeQuery("status",201,300,True,True),
        #                                      TermQuery("docchannel",101),
        #                                      WildcardQuery('province', '%s*'%_province),
        #                                      all_should],
        #                        must_not_queries=[must_not_q])
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "document", "document_index",
            SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
            ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
        set_line = set()
        _count = len(rows)
        getData(df_data, rows, set_line)
        while next_token:
            print("%d/%d"%(_count,total_count))
            rows, next_token, total_count, is_all_succeed = ots_client.search(
                "document", "document_index",
                SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
            getData(df_data, rows, set_line)
            _count += len(rows)
            # if len(df_data[list(df_data.keys())[0]])>=300:
            #     break
        # Contact enrichment kept for reference (disabled here):
        # set_enterprise = set()
        # for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
        #     set_enterprise.add(_tenderee)
        #     set_enterprise.add(_agency)
        #     set_enterprise.add(_win_tenderer)
        # if "" in set_enterprise:
        #     set_enterprise.remove("")
        # if None in set_enterprise:
        #     set_enterprise.remove(None)
        # dict_enterprise = getDictEnterprise(list(set_enterprise))
        # if len(set_enterprise)>0:
        #     for _i in range(len(df_data["招标单位"])):
        #         _enterprise_name = df_data["招标单位"][_i]
        #         if df_data["招标联系人电话"][_i]=="":
        #             contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
        #             if contacts is not None:
        #                 _person,_phone = getOneContact(contacts)
        #                 df_data["招标联系人"][_i] = _person
        #                 df_data["招标联系人电话"][_i] = _phone
        #
        #         _enterprise_name = df_data["代理单位"][_i]
        #         if df_data["代理联系人电话"][_i]=="":
        #             contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
        #             if contacts is not None:
        #                 _person,_phone = getOneContact(contacts)
        #                 df_data["代理联系人"][_i] = _person
        #                 df_data["代理联系人电话"][_i] = _phone
        #
        #         _enterprise_name = df_data["中标单位"][_i]
        #         if df_data["中标单位联系电话"][_i]=="":
        #             contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
        #             if contacts is not None:
        #                 _person,_phone = getOneContact(contacts)
        #                 df_data["中标单位联系人"][_i] = _person
        #                 df_data["中标单位联系电话"][_i] = _phone
        df1 = pd.DataFrame(df_data)
        df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')), columns=list_df_columns)
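# Note: exportDocument_by_pagetime is a variant of exportDocument_medicine with
# the keyword/tenderee should-clauses commented out, so only the docchannel=52
# filter and a fixed page_time day remain active.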
def attachCompanyContact():
    files = ["../data/2021-03-17_四川_关键词导出.csv",
             "../data/2021-03-17_安徽_关键词导出.csv",
             "../data/2021-03-17_江西_关键词导出.csv",
             "../data/2021-03-17_湖南_关键词导出.csv"]
    files = ["../data/欧科自然资源5w以上数据.xlsx"]

    def _handle(item, result_queue, pool_ots):
        ots_client = pool_ots.getConnector()
        primary_key = [('name',str(item["招标单位"]))]
        columns_to_get = ["province","city","district"]
        consumed, return_row, next_token = ots_client.get_row("enterprise", primary_key, columns_to_get, None, 1)
        # bool_query = BoolQuery(must_queries=[TermQuery("nicknames",item["中标单位"])])
        #
        # rows, next_token, total_count, is_all_succeed = ots_client.search(
        #     "enterprise", "enterprise_index",
        #     SearchQuery(bool_query, limit=1, get_total_count=True),
        #     ColumnsToGet(["contacts"], return_type=ColumnReturnType.SPECIFIED))
        # _docid = int(item["docid"])
        # partitionkey = _docid%500+1
        # primary_key = [('partitionkey',partitionkey),("docid",_docid)]
        # columns_to_get = ["doctitle"]
        # consumed, return_row, next_token = ots_client.get_row("document",primary_key, columns_to_get, None, 1)
        if isinstance(item["招标单位"],str) and item["招标单位"]!="":
            if return_row is not None:
                _dict = getRow_ots_primary(return_row)
                # item["doctitle"] = _dict.get("doctitle","")
                item["招标人省份"] = _dict.get("province","")
                item["招标人城市"] = _dict.get("city","")
                item["招标人区域"] = _dict.get("district","")
                province,city,district = getLocation(item["招标单位"])
                if item["招标人省份"]=="" or item["招标人省份"]=="未知":
                    item["招标人省份"] = province
                if item["招标人城市"]=="" or item["招标人城市"]=="未知":
                    item["招标人城市"] = city
                if item["招标人区域"]=="" or item["招标人区域"]=="未知":
                    item["招标人区域"] = district
            else:
                province,city,district = getLocation(item["招标单位"])
                item["招标人省份"] = province
                item["招标人城市"] = city
                item["招标人区域"] = district
        else:
            item["招标人省份"] = item["省份"]
            item["招标人城市"] = item["城市"]
            item["招标人区域"] = item["区"]
        # contacts = json.loads(_dict["contacts"])
        # contacts.sort(key=lambda x:x["level"],reverse=True)
        # phone = ""
        # phone_person = ""
        # mobile = ""
        # mobile_person = ""
        # for contact in contacts:
        #     if mobile=="" and contact.get("mobile_no","")!="":
        #         mobile = contact.get("mobile_no","")
        #         mobile_person = contact.get("contact_person","")
        #     if phone=="" and contact.get("phone_no","")!="":
        #         phone = contact.get("phone_no",'')
        #         phone_person = contact.get("contact_person","")
        # item["招标联系人"] = ""
        # item["招标联系人电话"] = ""
        # if mobile!="":
        #     item["招标联系人"] = mobile_person
        #     item["招标联系人电话"] = mobile
        # else:
        #     item["中标单位联系人"] = phone_person
        #     item["中标单位联系电话"] = phone
        pool_ots.putConnector(ots_client)

    pool_ots = ConnectorPool(init_num=10, max_num=30, method_init=getConnect_ots)
    for file in files:
        task_queue = queue.Queue()
        df = pd.read_excel(file, encoding="UTF8")
        keys = df.keys()[1:]
        list_item = []
        for row in df.itertuples():
            _dict = {}
            for _key in keys:
                # itertuples() exposes columns as attributes; fall back to '' when missing
                if _key in dir(row):
                    _v = getattr(row, _key)
                else:
                    _v = ''
                _dict[_key] = _v
            list_item.append(_dict)
        for item in list_item:
            task_queue.put(item)
        mt = MultiThreadHandler(task_queue, _handle, None, 30, pool_ots=pool_ots)
        mt.run()
        df_data = {}
        for _k in keys:
            df_data[_k] = []
        for item in list_item:
            for _k in keys:
                df_data[_k].append(getLegal_str(item.get(_k,"-")))
        df1 = pd.DataFrame(df_data)
        df1.to_excel("%s_attach.xlsx"%file, columns=keys)
def dumpWebSourceNo():
    conn_oracle = getConnection_oracle()
    cursor_oracle = conn_oracle.cursor()
    sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO "
    cursor_oracle.execute(sql)
    rows = cursor_oracle.fetchall()
    conn_mysql = getConnection_testmysql()
    cursor_mysql = conn_mysql.cursor()
    for row in rows:
        sql = " insert into webSource(web_source_no,web_source_name) values('%s','%s')"%(row[0],row[1])
        print(sql)
        cursor_mysql.execute(sql)
    conn_mysql.commit()
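# A parameterized batch variant of the insert loop above (sketch; executemany is
# standard DB-API and sidesteps manual quoting of source names):
# cursor_mysql.executemany(
#     "insert into webSource(web_source_no,web_source_name) values(%s,%s)", rows)
# conn_mysql.commit()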
def exportNzj():
    # filename = "../data/重复公告.xlsx"
    # df = pd.read_excel(filename)
    ots_client = getConnect_ots()
    columns = ["contacts","covered_area","follows","docids","page_time","progress","project_description","project_follow","project_code","project_name","project_type"]

    def getData(df_data, rows, set_line):
        list_data = getRow_ots(rows)
        for row in list_data:
            item = {}
            _dict = row
            set_dict_item(item,"docids",_dict.get("docids",""))
            set_dict_item(item,"contacts",_dict.get("contacts",""))
            set_dict_item(item,"covered_area",_dict.get("covered_area",""))
            set_dict_item(item,"follows",_dict.get("follows",""))
            set_dict_item(item,"project_type",_dict.get("project_type",""))
            # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
            set_dict_item(item,"page_time",_dict.get("page_time",""))
            set_dict_item(item,"progress",_dict.get("progress",""))
            set_dict_item(item,"project_description",_dict.get("project_description",""))
            set_dict_item(item,"project_follow",_dict.get("project_follow",""))
            set_dict_item(item,"project_code",_dict.get("project_code",""))
            set_dict_item(item,"project_name",_dict.get("project_name",""))
            for k,v in item.items():
                if k not in df_data:
                    df_data[k] = []
                df_data[k].append(v)

    df_data = {}
    bool_query = BoolQuery(must_queries=[ExistsQuery("docids")])
    rows, next_token, total_count, is_all_succeed = ots_client.search(
        "designed_project", "designed_project_index",
        SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True),
        ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
    set_line = set()
    _count = len(rows)
    getData(df_data, rows, set_line)
    while next_token:
        print("%d/%d"%(_count,total_count))
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "designed_project", "designed_project_index",
            SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
            ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
        getData(df_data, rows, set_line)
        _count += len(rows)
    df1 = pd.DataFrame(df_data)
    df1.to_excel("../data/2021-03-31_拟在建数据导出1.xlsx", columns=list_df_columns)
def turn_status():
    df = pd.read_excel("../data/欧科自然资源5w以上数据.xlsx")
    conn = getConnection_testmysql()
    cursor = conn.cursor()
    for docid in df["公告id"]:
        partitionkey = int(docid)%500+1
        sql = " insert into turn_status(partitionkey,docid) values(%d,%d)"%(partitionkey,docid)
        cursor.execute(sql)
    conn.commit()
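# Sharding rule used above (and in the commented document-table lookup earlier):
#   partitionkey = int(docid) % 500 + 1   # e.g. docid 138306357 -> 358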
def attachBidding_budget():
    conn_mysql = getConnection_testmysql()
    cursor = conn_mysql.cursor()
    sql = "select docid from analysis_r2 where bidding_budget=''"
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    cursor.execute(sql)
    rows = cursor.fetchmany(10000)
    while rows:
        for row in rows:
            task_queue.put(row[0])
        rows = cursor.fetchmany(10000)
    pool_mysql = ConnectorPool(init_num=10, max_num=30, method_init=getConnection_testmysql)
    pool_ots = ConnectorPool(init_num=10, max_num=30, method_init=getConnect_ots)

    def _handle(item, result_queue, pool_mysql, pool_ots):
        ots_client = pool_ots.getConnector()
        bool_query = BoolQuery(must_queries=[TermQuery("docids",item)])
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "project2", "project2_index",
            SearchQuery(bool_query, limit=1, get_total_count=True),
            ColumnsToGet(["bidding_budget"], return_type=ColumnReturnType.SPECIFIED))
        list_dict = getRow_ots(rows)
        if len(list_dict)>0:
            conn = pool_mysql.getConnector()
            cursor = conn.cursor()
            sql = " update analysis_r2 set bidding_budget='%s' where docid=%d"%(str(list_dict[0].get("bidding_budget","")),item)
            cursor.execute(sql)
            conn.commit()
            pool_mysql.putConnector(conn)
        pool_ots.putConnector(ots_client)

    mt = MultiThreadHandler(task_queue, _handle, result_queue, thread_count=30, pool_mysql=pool_mysql, pool_ots=pool_ots)
    mt.run()
def debug_documentMerge():
    conn = getConnection_testmysql()
    cursor = conn.cursor()
    sql = "select merge_docids from project_group_final_log "
    cursor.execute(sql)
    task_queue = queue.Queue()
    for row in cursor.fetchall():
        task_queue.put(row[0])
    print(task_queue.qsize())

    def _handle(item, result_queue, pool_ots):
        ots_client = pool_ots.getConnector()
        list_docids = item.split(",")
        must_q = []
        for _docid in list_docids:
            must_q.append(TermQuery("docids",_docid))
        bool_query = BoolQuery(must_queries=must_q)
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "project2", "project2_index",
            SearchQuery(bool_query, limit=1, get_total_count=True),
            ColumnsToGet(column_names=["docids"], return_type=ColumnReturnType.SPECIFIED))
        if total_count==0:
            print(item)
            result_queue.put(item)
        pool_ots.putConnector(ots_client)

    result_queue = queue.Queue()
    pool_ots = ConnectorPool(init_num=10, max_num=30, method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue, _handle, result_queue, 30, pool_ots=pool_ots)
    mt.run()
    while True:
        try:
            # Non-blocking get: the original get(True) would block forever once the
            # queue was drained instead of raising Empty and breaking the loop.
            item = result_queue.get(False)
            print(item)
        except Exception as e:
            print(str(e))
            break
def signDocument():
    filename = "C:\\Users\\Administrator\\Desktop\\中标信息1.xlsx"
    sign_filename = "%s_sign.xlsx"%filename
    df = pd.read_excel(filename)
    df_data = {"sign":[]}
    for item in df["segword"]:
        content = re.sub(r"\s*","",item)
        _find = re.search("(?P<key>流标|废标|中止|终止|撤销|采购失败)",content)
        if _find is not None:
            df_data["sign"].append(_find.groupdict().get("key"))
        else:
            df_data["sign"].append("无")
    df1 = pd.DataFrame(df_data)
    df1.to_excel(sign_filename)
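# Example of the sign regex on a segmented title:
#   re.search("(?P<key>流标|废标|中止|终止|撤销|采购失败)", "本项目废标处理").groupdict()["key"]  # -> "废标"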
if __name__=="__main__":
    # exportDocument_By_time(time_from="2021-01-29",time_to="2021-01-29",columns=["docid","doctitle","project_name","dochtmlcon"])
    # processDocument()
    # export_extract_check()
    # exportArticle_by_websource()
    # export_keyword_count()
    # export_province_keyword_count()
    # exportDocument_dump()
    # exportDocument_dump_mysql()
    # export_attachment()
    # exportDocument_by_doctitle()
    # exportIndustryCount()
    # exportDocument_by_pagetime()
    # attachCompanyContact()
    # dumpWebSourceNo()
    # print("http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%138306357))))
    # exportNzj()
    # turn_status()
    # attachBidding_budget()
    # debug_documentMerge()
    exportDocument_medicine("2021-05-02","2021-05-08")
    # signDocument()