- #encoding:GBK
- import sys
- import os
- sys.path.append("../")
- import pandas as pd
- from dataSource.source import *
- import json
- from utils.multiThread import MultiThreadHandler
- import queue
- from utils.Utils import *
- from dataSource.pool import ConnectorPool
- import re
- from tablestore import *
- import traceback
- import time
- import requests
- from utils.hashUtil import aesCipher
- from export.exportEnterprice import getDictEnterprise,getOneContact
- data_path = "../data/"
- def getCompanyTenderer():
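- '''For each company listed in a GBK text file: count its ZhongBiaoRelation edges in
- neo4j, sum the unified prices, collect its most frequent runner-up competitors from
- mongodb, and export everything to an Excel sheet.'''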
- def _handle(item,result_queue):
- company = item
- dict_result = {"company":company,"count":0,"competitor":"","project_name":""}
- dict_result["company"] = company
- graph = getConnect_neo4j()
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhao_biao_id,p.zhong_biao_id"%(company)
- finded = graph.run(cql)
- finded_ids = json.loads(json.dumps(finded.data()))
- dict_result["count"] = len(finded_ids)
- mongoDB = getConnect_mongodb()
- coll_zb = mongoDB.zhongbiao_extraction
- if len(finded_ids)>0:
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.project_name limit 3"%(company)
- finded = graph.run(cql)
- finded_names = json.loads(json.dumps(finded.data()))
- list_names = [_i["p.project_name"] for _i in finded_names]
- dict_result["project_name"] = str(list_names)
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company)
- finded = graph.run(cql)
- finded_money = json.loads(json.dumps(finded.data()))
- whole_money = 0
- for item in finded_money:
- if item["r.price"] is not None:
- whole_money += getUnifyMoney(item["r.price"])
- dict_result["whole_money"] = str(whole_money)
- dict_competitor = {}
- for item in finded_ids:
- docId = item["p.zhong_biao_id"]
- if docId is not None:
- rows = coll_zb.find({"docId":docId})
- for row in rows:
- keys = ["second_tenderer","third_tenderer"]
- for _key in keys:
- if _key in row:
- if row[_key] not in dict_competitor:
- dict_competitor[row[_key]] = 0
- dict_competitor[row[_key]] += 1
- list_competitor = []
- for _key in dict_competitor:
- list_competitor.append([_key,dict_competitor[_key]])
- list_competitor.sort(key=lambda x:x[1],reverse=True)
- list_competitors = [i[0] for i in list_competitor[:10]]
- dict_result["competitor"] = str(list_competitors)
- result_queue.put(dict_result)
- # filename = "成交客户匹配中标项目的需求.xlsx"
- # df = pd.read_excel(filename)
- # list_company = df["公司名字"]
- # company = list_company[0]
- list_company = []
- filename = "../data/服务型客户.txt"
- with open(filename,"r",encoding="GBK") as f:
- while(True):
- line = f.readline()
- if not line:
- break
- list_company.append(line.strip())
- task_queue = queue.Queue()
- for company in list_company:
- task_queue.put(company)
- result_queue = queue.Queue()
- handler = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=10)
- handler.run()
- list_company = []
- list_zb = []
- list_count = []
- list_project = []
- list_money = []
- list_competitor = []
- while(True):
- try:
- _result = result_queue.get(False)
- list_company.append(_result.get("company",""))
- list_zb.append("是" if _result.get("count","")>0 else "否")
- list_count.append(_result.get("count",""))
- list_project.append(_result.get("project_name",""))
- list_money.append(_result.get("whole_money",""))
- list_competitor.append(_result.get("competitor",""))
- except queue.Empty:
- break
- df1 = pd.DataFrame({"公司名字":list_company,"是否中标":list_zb,"中标次数":list_count,"中标项目":list_project,"中标金额":list_money,"潜在竞争对手":list_competitor})
- df1.to_excel("%s_export.xls"%(filename),columns=["公司名字","是否中标","中标次数","中标项目","中标金额","潜在竞争对手"])
- def export_count_includeKeyword():
- filename = "../data/jc001.xlsx"
- list_name = []
- list_count = []
- df = pd.read_excel(filename)
- _index = 0
- for row in df["品目"]:
- _name = row
- data = solrQuery("document",{"q":'dochtmlcon:"%s"'%_name,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
- if data is not None:
- _count = data["response"]["numFound"]
- else:
- _count = 0
- list_name.append(_name)
- list_count.append(_count)
- _index += 1
- print(_index)
- df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
- df1.to_excel("%s_export.xls"%filename)
- def export_count_includeKeyword_multiThread():
- def _handler(item,result_queue):
- data = solrQuery("document",{"q":'dochtmlcon:"%s"'%item,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
- if data is not None:
- _count = data["response"]["numFound"]
- else:
- _count = 0
- result_queue.put([item,_count])
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- filename = "../data/jc001.xlsx"
- list_name = []
- list_count = []
- df = pd.read_excel(filename)
- _index = 0
- for row in df["品目"]:
- _name = row
- task_queue.put(_name)
- _index += 1
- multHandler = MultiThreadHandler(task_queue,_handler,result_queue,thread_count=20)
- multHandler.run()
- while(True):
- try:
- item = result_queue.get(False)
- list_name.append(item[0])
- list_count.append(item[1])
- except queue.Empty as e:
- break
- df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
- df1.to_excel("%s_export.xls"%filename)
- def exportKeywords():
- def _handle(item,result_queue,pool_mongo):
- docId = item["docId"]
- mongo = pool_mongo.getConnector()
- zhongbiao = mongo.zhongbiao_extraction
- zhaobiao = mongo.zhaobiao_extraction
- _project = ""
- rows = zhaobiao.find({"docId":docId},{"project_name":1})
- find_flag = False
- for row in rows:
- find_flag = True
- _project = row.get("project_name","")
- if not find_flag:
- rows = zhongbiao.find({"docId":docId},{"project_name":1})
- for row in rows:
- _project = row.get("project_name","")
- item["project_name"] = _project
- pool_mongo.putConnector(mongo)
- result_queue.put(item)
- list_key = []
- dict_key_ids = dict()
- with open("../data/品目.txt", "r", encoding="utf8") as f:
- while(True):
- row = f.readline()
- if not row:
- break
- row = row.strip()
- list_key.append(row)
- dict_key_ids[row] = []
- data = solrQuery("document",{"q":'dochtmlcon:"%s" AND dochtmlcon:"法院"'%row,"fq":'(publishtime:[2019-01-01T00:00:00Z TO 2019-12-31T23:59:59Z])',"fl":"id","rows":10000000})
- for item in data["response"]["docs"]:
- dict_key_ids[row].append(item["id"])
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _key in dict_key_ids.keys():
- for item in dict_key_ids[_key]:
- task_queue.put({"docId":item,"project_name":""})
- pool_mongo = ConnectorPool(init_num=10,max_num=200,method_init=getConnect_mongodb)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=200,pool_mongo=pool_mongo)
- mt.run()
- dict_docId_projectname = {}
- while(True):
- try:
- item = result_queue.get(False)
- dict_docId_projectname[item["docId"]] = item["project_name"]
- except Exception:
- break
- dict_key_count = dict()
- for _key in dict_key_ids.keys():
- set_key = set()
- for docId in dict_key_ids[_key]:
- set_key.add(dict_docId_projectname.get(docId,""))
- dict_key_count[_key] = len(set_key)
- print("==")
- for _key in list_key:
- print(len(dict_key_ids[_key]))
- print("==")
- for _key in list_key:
- print(dict_key_count[_key])
- print("==")
- def getIndustryCompany():
- def _handle(item,result_queue,pool_mongo,pool_neo4j,pool_mysql,pool_ots):
- # mongoDB = getConnect_mongodb()
- log(item["enterprise_name"])
- mongoDB = pool_mongo.getConnector()
- # coll_zb = mongoDB.enterprise_profile
- # rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1,"legalPersonName":1,"actualCapital":1, "regCapital":1,"estiblishTime":1,"socialStaffNum":1,"legal_person":1,"phone":1,"businessScope":1,"industry":1 })
- # for row in rows:
- # item["regCapital"] = row.get("regCapital","")
- # item["legal_person"] = row.get("legal_person","")
- # item["phone"] = row.get("phone","")
- # item["actualCapital"] = row.get("actualCapital","")
- # item["industry"] = row.get("industry","")
- # item["estiblishTime"] = row.get("estiblishTime","")
- # item["socialStaffNum"] = row.get("socialStaffNum","")
- # item["businessScope"] = row.get("businessScope","")
- # graph = getConnect_neo4j()
- ots_client = pool_ots.getConnector()
- primary_key = [('name',item["enterprise_name"])]
- columns_to_get = ["reg_capital","legal_person","phone","actual_capital","industry","estiblishTime","social_staff_num","business_scope"]
- consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)
- if return_row is not None:
- for att in return_row.attribute_columns:
- item[att[0]] = att[1]
- list_same_industry_company = []
- if "industry" in item:
- bool_query = BoolQuery(must_queries=[TermQuery("industry",item["industry"])])
- col = ColumnsToGet(['enterprise_name'], ColumnReturnType.SPECIFIED)
- rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
- SearchQuery(bool_query, limit=10, get_total_count=True),
- col)
- for row in rows:
- for item1 in row[0]:
- list_same_industry_company.append(item1[1])
- # if "industry" in item:
- # rows = coll_zb.find({"industry":item["industry"]},{"enterprise_name":1}).limit(10)
- # for row in rows:
- # print(row)
- # list_same_industry_company.append(row.get("enterprise_name",""))
- item["same_industry_company"] = list_same_industry_company
- graph = pool_neo4j.getConnector()
- company_name = item["enterprise_name"]
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name)
- finded = graph.run(cql)
- data = json.loads(json.dumps(finded.data()))
- _count = data[0]["_c"]
- # list_project = []
- # for _data in data:
- # if _count<=3:
- # if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
- # if _data["project_name"] is not None:
- # list_project.append(_data["project_name"])
- # _count += 1
- item["count"] = _count
- # item["project"] = str(list_project)
- result_queue.put(item)
- pool_mongo.putConnector(mongoDB)
- pool_neo4j.putConnector(graph)
- pool_ots.putConnector(ots_client)
- log_tofile("export.log")
- pool_mongo = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_mongodb)
- pool_neo4j = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_neo4j)
- pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_mysql)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- # list_company = getCompanys()
- # filename = "".join(["环境","生态","再生","回收","环保"])
- list_company = []
- filename = "../data/同行客户匹配.xlsx"
- df = pd.read_excel(filename,sheet_name=0)
- for _com in df["公司名称"]:
- print(_com)
- if _com is not None and _com.strip()!="":
- _company = {"enterprise_name":""}
- _company["enterprise_name"] = _com
- list_company.append(_company)
- task_queue = queue.Queue()
- for item in list_company:
- task_queue.put(item)
- result_queue = queue.Queue()
- _muti = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j,pool_mysql=pool_mysql,pool_ots=pool_ots)
- _muti.run()
- df_company = {}
- set_key = set()
- if len(list_company)>0:
- for item in list_company:
- for _key in item.keys():
- set_key.add(_key)
- if _key not in df_company:
- df_company[_key] = []
- list_key = list(set_key)
- for item in list_company:
- for _key in list_key:
- df_company[_key].append(item.get(_key,""))
- df1 = pd.DataFrame(df_company)
- df1.to_excel("%s_export.xlsx"%(filename))
- def exportWin_tenderer(time_from,time_to):
- '''
- Export winning-bid announcements between time_from and time_to, enrich each
- winner with enterprise and contact info, and write the result to an Excel file.
- :return: None, writes ../data/<time_from>-<time_to>中标信息.xlsx
- '''
- ost_client = getConnect_ots()
- last_docid = 0
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
- TermQuery("docchannel",101),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- RangeQuery('docid', last_docid, include_lower=False)])
- rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
- ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- list_project = []
- def _getRow(list_project,rows,last_docid):
- for row in rows:
- project_name = row[1][0][1]
- docid = row[0][1][1]
- last_docid = docid
- list_pack = json.loads(row[1][1][1])
- _set_tenderer = set()
- win_tenderer = ""
- for _pack in list_pack:
- if "win_tenderer" in _pack and win_tenderer=="":
- win_tenderer = _pack["win_tenderer"]
- if "second_tenderer" in _pack:
- _set_tenderer.add(_pack["second_tenderer"])
- if "third_tenderer" in _pack:
- _set_tenderer.add(_pack["third_tenderer"])
- list_project.append({"docid":docid,"project_name":project_name,"win_tenderer":win_tenderer,"tenderer":list(_set_tenderer)})
- return last_docid
- _getRow(list_project,rows,last_docid)
- while(next_token):
- print("%d/%d"%(len(list_project),total_count))
- rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- last_docid = _getRow(list_project,rows,last_docid)
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for item in list_project:
- task_queue.put(item)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- def _handle(item,result_queue,pool_ots):
- if item["win_tenderer"]!="":
- ots_client = pool_ots.getConnector()
- consumed, return_row, next_token = ost_client.get_row("enterprise", [("name",item["win_tenderer"])], ["province","reg_capital","estiblish_time","business_scope"], None, 1)
- _dict = dict()
- for _item in return_row.attribute_columns:
- _dict[_item[0]] = _item[1]
- for _key in _dict.keys():
- item[_key] = _dict[_key]
- data = solrQuery("contact",{"q":'company_name:"%s"'%item["win_tenderer"],"fl":"contact_person,mobile_no,phone_no","rows":10})
- for _item in data["response"]["docs"]:
- for _key in _item.keys():
- item[_key] = _item[_key]
- break
- pool_ots.putConnector(ots_client)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- keys = ["docid","project_name","win_tenderer","tenderer","province","reg_capital","business_scope","estiblish_time","contact_person","mobile_no","phone_no"]
- df_data = {}
- for _key in keys:
- df_data[_key] = []
- for item in list_project:
- for _key in keys:
- if _key in item:
- df_data[_key].append(item[_key])
- else:
- df_data[_key].append("")
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s-%s中标信息.xlsx"%(time_from,time_to),columns=keys)
- def exportContact():
- time_from = "2021-01-14"
- time_to = "2021-01-15"
- filename = "../data/%s-%s中标信息.xlsx"%(time_from,time_to)
- df1 = pd.read_excel(filename)
- set_company = set()
- for item in df1["tenderer"]:
- list_company = re.split("\['|', '|'\]|\[\]",item)
- for _company in list_company:
- if _company!="":
- set_company.add(_company)
- companys = list(set_company)
- task_queue = queue.Queue()
- list_company = []
- for _company in companys:
- item = {"company_name":_company}
- list_company.append(item)
- task_queue.put(item)
- result_queue = queue.Queue()
- def _handle(item,result_queue):
- company = item["company_name"]
- data = solrQuery("contact",{"q":'company_name:"%s"'%company,"fl":"company_name,contact_person,mobile_no,phone_no","rows":10})
- for _item in data["response"]["docs"]:
- for _key in _item.keys():
- item[_key] = _item[_key]
- break
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
- mt.run()
- keys = ["company_name","contact_person","mobile_no","phone_no"]
- df_data = {}
- for _key in keys:
- df_data[_key] = []
- ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
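- # strip ASCII control characters that openpyxl refuses to write into xlsx cells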
- for item in list_company:
- for _key in keys:
- if _key in item:
- df_data[_key].append(ILLEGAL_CHARACTERS_RE.sub(r'', item[_key]))
- else:
- df_data[_key].append("")
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s-%s竞争对手信息.xlsx"%(time_from,time_to),columns=keys)
- def countKeyword():
- conn = getConnection_mysql()
- cursor = conn.cursor()
- print(0)
- sql = "select dochtmlcon from sys_document_22 where docchannel=52 and page_time>='2020-09-01' and page_time<='2020-09-07'"
- cursor.execute(sql)
- print(0.1)
- df = pd.read_excel("万郡绿建细分关键词.xls")
- list_keywords = df["细分类别"]
- dict_keywords = dict()
- for _key in list_keywords:
- dict_keywords[_key] = 0
- print(1)
- from bs4 import BeautifulSoup
- while(True):
- rows = cursor.fetchmany(10000)
- print("==")
- if not rows:
- break
- for row in rows:
- _html = BeautifulSoup(row[0],"lxml").getText()
- for _key in list_keywords:
- if re.search(_key,_html) is not None:
- dict_keywords[_key] += 1
- print(dict_keywords)
- list_count = []
- for _key in list_keywords:
- list_count.append(dict_keywords[_key])
- df1 = pd.DataFrame({"关键字":list_keywords,"数量":list_count})
- df1.to_excel("关键词统计.xlsx")
- def countKeyword_solr():
- def _handle(item,result_queue):
- keyword = item["keyword"]
- data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:101 AND dochtmlcon:"法院" '%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
- _num = data["response"]["numFound"]
- item["zhongbiao"] = _num
- data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:52 AND dochtmlcon:"法院"'%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
- _num = data["response"]["numFound"]
- item["zhaobiao"] = _num
- result_queue.put(item)
- file = "../data/关键词11.xlsx"
- df = pd.read_excel(file)
- task_queue = queue.Queue()
- print(df.keys())
- for item in df["业务关键词"]:
- task_queue.put({"keyword":item,"zhaobiao":0,"zhongbiao":0})
- result_queue = queue.Queue()
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=10)
- mt.run()
- list_keyword = []
- list_zhaobiao = []
- list_zhongbiao = []
- while(True):
- try:
- item = result_queue.get(False)
- list_keyword.append(item["keyword"])
- list_zhaobiao.append(item["zhaobiao"])
- list_zhongbiao.append(item["zhongbiao"])
- except Exception:
- break
- df1 = pd.DataFrame({"业务关键词":list_keyword,"招标公告":list_zhaobiao,"中标信息":list_zhongbiao})
- df1.to_excel("%s_export.xlsx"%file,columns=["业务关键词","招标公告","中标信息"])
- def query_from_solr():
- data = solrQuery("document",{"q":'dochtmlcon:"法律" AND (docchannel:51 OR docchannel:104 or docchannel:52 or docchannel:102) AND province:"湖南" ',"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-01-20T23:59:59Z])',"fl":"id","rows":10})
- _num = data["response"]["numFound"]
- print(_num)
- def export_province_keyword_count():
- def _handle(item,result_queue,pool_ots):
- columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
- ots_client = pool_ots.getConnector()
- _province = item["province"]
- print(item)
- # keywords = item["keyword"]+" "+item["key"]
- list_keyword = item["keyword"]
- # for _temp in keywords.split(" "):
- # if len(_temp)>0:
- # list_keyword.append(_temp)
- should_queries = []
- must_not_q = []
- for _temp in list_keyword:
- should_queries.append(MatchPhraseQuery("doctitle","%s"%_temp))
- must_not_q.append(WildcardQuery("tenderee","*%s*"%_temp))
- bool_query_keyword = BoolQuery(should_queries=should_queries,minimum_should_match=2)
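- # minimum_should_match=2: a title must phrase-match at least two of the keywords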
- page_time = item["page_time"]
- bool_query = BoolQuery(must_queries=[bool_query_keyword
- # ,WildcardQuery("publishtime","%s*"%page_time)
- # ,MatchPhraseQuery("doctitle","服务")
- ,RangeQuery("page_time","2021-04-22","2021-04-27",include_lower=True,include_upper=True),
- TermQuery("docchannel",101),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- WildcardQuery('province', '%s*'%_province)
- ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_tenderer",0,include_lower=True))
- ],
- must_not_queries=must_not_q)
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("sub_docs_json.bidding_budget",SortOrder.DESC)]), limit=20, get_total_count=True),
- ColumnsToGet(column_names=columns,return_type=ColumnReturnType.SPECIFIED))
- item["count"] = total_count
- list_data = getRow_ots(rows)
- item["list_data"] = list_data
- print(item)
- pool_ots.putConnector(ots_client)
- df = pd.read_excel("../data/省份信息.xlsx")
- list_province = []
- for _name,_type in zip(df["cname"],df["ctype"]):
- if _type==20:
- list_province.append(_name)
- # filename = "../data/2021-02关键词导出数据.xlsx"
- # dict_keyword = {}
- # df1 = pd.read_excel(filename,dtype=str)
- # for _key,_keyword in zip(df1["key1"],df1["keyword"]):
- # print("===",str(_keyword))
- # dict_keyword[_key] = "" if str(_keyword)=="nan" else _keyword
- # for _key in df1["关键词"]:
- # dict_keyword[_key] = ""
- keyword_str = '''
- 快递 物流 供应链 运输 配送
- 仓储 冷链 整车 服务
- '''
- list_key = []
- for _k in re.split("\s",keyword_str):
- _k1 = _k.strip()
- if len(_k1)>0:
- list_key.append(_k1)
- list_task = []
- page_time = "2020-11"
- for _province in list_province:
- list_task.append({"page_time":page_time,"province":_province,"key":list_key,"keyword":list_key,"count":0})
- task_queue = queue.Queue()
- for item in list_task:
- task_queue.put(item)
- result_queue = queue.Queue()
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- dict_key_data = dict()
- list_data = []
- for item in list_task:
- list_data.extend(item["list_data"])
- dict_channel = getDict_docchannel()
- df_data= {}
- print(list_data)
- for row in list_data:
- item = {}
- _dict = row
- set_dict_item(item,"docid",_dict.get("docid",""))
- set_dict_item(item,"公告标题",_dict.get("doctitle",""))
- set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- set_dict_item(item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"城市",_dict.get("city",""))
- set_dict_item(item,"发布时间",_dict.get("page_time",""))
- set_dict_item(item,"项目编号",_dict.get("project_code",""))
- set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- set_dict_item(item,"代理单位",_dict.get("agency",""))
- set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- sub_docs_json = _dict.get("sub_docs_json")
- for _doc in json.loads(sub_docs_json):
- if "win_tenderer" in _doc:
- set_dict_item(item,"中标单位",_doc["win_tenderer"])
- if "win_tenderee_manager" in _doc:
- set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
- if "win_tenderee_phone" in _doc:
- set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item(item,"中标金额",_doc["win_bid_price"])
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item(item,"招标金额",_doc["bidding_budget"])
- if "招标金额" not in item:
- set_dict_item(item,"招标金额","")
- if "中标金额" not in item:
- set_dict_item(item,"中标金额","")
- if "中标单位" not in item:
- set_dict_item(item,"中标单位","")
- if "中标单位联系人" not in item:
- set_dict_item(item,"中标单位联系人","")
- if "中标单位联系电话" not in item:
- set_dict_item(item,"中标单位联系电话","")
- _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
- # if _line in set_line:
- # continue
- # if item["招标金额"]=="":
- # continue
- # set_line.add(_line)
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_顺丰中标数据.xlsx"%getCurrent_date('%Y-%m-%d_%H%M%S'),columns=list_df_columns)
- # for item in list_task:
- # dict_key_data[item["key"]][item["province"]] = item
- # dict_key_province = dict()
- # dict_key_province["关键词"] = []
- # for _province in list_province:
- # dict_key_province[_province] = []
- # for _key in dict_keyword.keys():
- # dict_key_province["关键词"].append(_key)
- # for _province in list_province:
- # dict_key_province[_province].append(dict_key_data[_key][_province]["count"])
- # columns = ["关键词"]
- # columns.extend(list_province)
- # df2 = pd.DataFrame(dict_key_province)
- # df2.to_excel("../data/%s_导出数据.xlsx"%filename,columns=columns)
- def export_keyword_count():
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",item["range_from"],item["range_to"]),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- MatchPhraseQuery(item["type"], item["keyword"])
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.ALL))
- item["count"] = total_count
- pool_ots.putConnector(ots_client)
- range_from = "2019-01-01"
- range_to = "2022-12-23"
- _type = "doctextcon"
- assert _type in ["doctitle","doctextcon"]
- list_dict_key_count = []
- filename = "../data/医院.xlsx"
- df = pd.read_excel(filename)
- for item in df["关键词"]:
- list_dict_key_count.append({"keyword":item,"count":0,"range_from":range_from,"range_to":range_to,"type":_type})
- task_queue = queue.Queue()
- for item in list_dict_key_count:
- task_queue.put(item)
- result_queue = queue.Queue()
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- columns = ["keyword","count","range_from","range_to","type"]
- df_data = {}
- for _c in columns:
- df_data[_c] = []
- for item in list_dict_key_count:
- for _c in columns:
- if _c in item:
- df_data[_c].append(item[_c])
- else:
- df_data[_c].append("")
- df2 = pd.DataFrame(df_data)
- df2.to_excel("%s_数量导出.xlsx"%filename,columns=columns)
- def export_keyword_title():
- ots_client = getConnect_ots()
- range_from = "2020-01-01"
- range_to = "2022-12-23"
- list_condition = [["医务室"],
- ["医院"],
- ["卫生院"],
- ["卫生所"],
- ["卫生室"],
- ["社区卫生服务中心"]]
- list_should_query = []
- for _c in list_condition:
- if len(_c)==1:
- list_should_query.append(MatchPhraseQuery("doctitle",_c[0]))
- else:
- _must_query = []
- for _q in _c:
- _must_query.append(MatchPhraseQuery("doctitle",_q))
- list_should_query.append(BoolQuery(must_queries=_must_query))
- keyword_query = BoolQuery(should_queries=list_should_query)
- bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",range_from,range_to),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- keyword_query
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
- df_data = {"docid":[],"doctitle":[],"tenderee":[]}
- def setData(df_data,rows):
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- docid = _dict.get("docid","")
- doctitle = _dict.get("doctitle","")
- tenderee = _dict.get("tenderee","")
- df_data["docid"].append(docid)
- df_data["doctitle"].append(doctitle)
- df_data["tenderee"].append(tenderee)
- setData(df_data,rows)
- _count = len(rows)
- while next_token:
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
- setData(df_data,rows)
- _count += 100
- print(_count,total_count)
- file_begin = 0
- file_length = 100000
- _count = 0
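- # write the result out in 100k-row chunks so each csv stays a manageable size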
- while file_begin<len(df_data["docid"]):
- _dict = dict()
- for _key,v in df_data.items():
- _dict[_key] = v[file_begin:file_begin+file_length]
- _count += 1
- file_begin += file_length
- df = pd.DataFrame(_dict)
- df.to_csv("../data/%s-%s_tenderee_doctitle_%d.csv"%(range_from,range_to,_count))
- def exportArticle_by_websource():
- # conn = getConnection_testmysql()
- # cursor = conn.cursor()
- # sql = "select web_source_no from web_source"
- # cursor.execute(sql)
- # rows = cursor.fetchmany(10)
- # dict_websource = dict()
- # while(rows):
- # for row in rows:
- # web_source_no = row[0]
- # dict_websource[web_source_no] = []
- # rows = cursor.fetchmany(1000)
- #
- # task_queue = queue.Queue()
- # for _key in dict_websource.keys():
- # task_queue.put({"key":_key,"list":dict_websource[_key]})
- #
- # pool_ots = ConnectorPool(init_num=100,max_num=1000,method_init=getConnect_ots)
- # result_queue = queue.Queue()
- # def _handle(item,result_queue,pool_ots):
- # _key = item["key"]
- # print(_key)
- # ots_client = pool_ots.getConnector()
- # bool_query = BoolQuery(must_queries=[RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- # TermQuery('web_source_no', '%s'%_key)
- # ])
- #
- # is_all_succeed = False
- #
- # while(not is_all_succeed):
- # try:
- # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- # SearchQuery(bool_query, limit=100, get_total_count=True),
- # ColumnsToGet(["docid","docchannel","dochtmlcon"],return_type=ColumnReturnType.SPECIFIED))
- # list_zhaobiao = []
- # list_zhongbiao = []
- # for row in rows:
- # _dict = dict()
- # for values in row:
- # for _v in values:
- # _dict[_v[0]] = _v[1]
- # if _dict["docchannel"]==52:
- # list_zhaobiao.append(_dict)
- # elif _dict["docchannel"]==101:
- # list_zhongbiao.append(_dict)
- # item["list"].extend(list_zhaobiao[:5])
- # item["list"].extend(list_zhongbiao[:5])
- # except Exception as e:
- # print(str(e))
- #
- # pool_ots.putConnector(ots_client)
- #
- # mt = MultiThreadHandler(task_queue = task_queue,task_handler=_handle,result_queue=result_queue,thread_count=100,pool_ots=pool_ots)
- # mt.run()
- # df_data = {"docid":[],"web_source_no":[],"docchannel":[],"dochtmlcon":[]}
- # for k,v in dict_websource.items():
- # for item in v:
- # df_data["docid"].append(item["docid"])
- # df_data["web_source_no"].append(k)
- # df_data["docchannel"].append(item["docchannel"])
- # df_data["dochtmlcon"].append(item["dochtmlcon"])
- # df = pd.DataFrame(df_data)
- # df.to_csv("../data/websouce_doc.csv",columns=["docid","web_source_no","docchannel","dochtmlcon"],encoding="UTF8")
- df = pd.read_csv("../data/websouce_doc.csv")
- df_2000 = {"document_id":[],"document_text":[]}
- begin = 80000
- end = 100000
- _count = 0
- for _id,_text in zip(df["docid"][begin:end],df["dochtmlcon"][begin:end]):
- if len(_text)>100000:
- continue
- df_2000["document_id"].append(_id)
- df_2000["document_text"].append(_text)
- df_2 = pd.DataFrame(df_2000)
- df_2.to_csv("../data/websouce_doc_%d-%d.csv"%(begin,end),columns=["document_id","document_text"],encoding="utf8",index=False)
- # save(dict_websource,"../data/dict_websource.pk")
- def getWinTenderer(sub_doc_json):
- if sub_doc_json is not None:
- sub_doc = json.loads(sub_doc_json)
- for _doc in sub_doc:
- if "win_tenderer" in _doc:
- return _doc["win_tenderer"]
- return ""
- def exportDocument_by_keywords(page_time,
- list_keyword = ["创客","STEAM","人工智能","课程服务","机器人中学","机器人小学","机器人幼儿园","机器人学校","Labplus","盛思","makeblock柴火","寓乐湾","美科科技","STEAM","能力风暴","优必选","蘑菇云","Dfrobot","中鸣","飞瑞敖","编程猫培生","八爪鱼","八爪鱼教育","童心制物"]):
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _k in list_keyword:
- task_queue.put(_k)
- def _handle(keyword,result_queue):
- should_queries = []
- for _temp in [keyword]:
- should_queries.append(MatchPhraseQuery("doctitle",_temp))
- bool_query_keyword = BoolQuery(should_queries=should_queries)
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[RangeQuery('publishtime', range_from='2017-12-20'),
- MatchPhraseQuery("doctitle",keyword),
- TermQuery("docchannel","101")
- ])
- is_all_succeed = False
- _count = 0
- total_count = 1
- next_token = None
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
- ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- for row in rows:
- _dict = dict()
- for values in row:
- for _v in values:
- _dict[_v[0]] = _v[1]
- result_queue.put({"docid":_dict.get("docid",""),"keyword":keyword,"tenderee":_dict.get("tenderee",""),"win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))})
- print(keyword,next_token,total_count)
- while(next_token):
- try:
- # fetch the next page first, then process it, so each batch is handled exactly once
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- _count += len(rows)
- print("%s:%d/%d"%(keyword,_count,total_count))
- for row in rows:
- _dict = dict()
- for values in row:
- for _v in values:
- _dict[_v[0]] = _v[1]
- result_queue.put({"docid":_dict.get("docid",""),"keyword":keyword,"tenderee":_dict.get("tenderee",""),"win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))})
- except Exception as e:
- traceback.print_exc()
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
- mt.run()
- df_data = {"docid":[],"keyword":[],"tenderee":[],"win_tenderer":[]}
- while(True):
- try:
- item = result_queue.get(block=True,timeout=1)
- for _k in df_data.keys():
- if _k in item:
- df_data[_k].append(item[_k])
- else:
- df_data[_k].append("")
- except queue.Empty as e:
- break
- except Exception as e:
- traceback.print_exc()
- df = pd.DataFrame(df_data)
- df.to_csv("../data/exportArticle1_title.csv",columns=["docid","keyword","tenderee","win_tenderer"])
- def exportGovement():
- should_queries1 = []
- for _temp in ["教育局","地化所","税务局","国土局","学校","大学","中学","小学","幼儿园","医院"]:
- should_queries1.append(WildcardQuery("tenderee","*%s*"%_temp))
- should_queries2 = []
- for _temp in ["浙江","江苏","湖北","西北","陕西","甘肃","青海","宁夏","新疆","重庆","四川","云南","贵州"]:
- should_queries2.append(WildcardQuery("province","*%s*"%_temp))
- ots_client = getConnect_ots()
- page_time = "2020-12"
- bool_query = BoolQuery(must_queries=[BoolQuery(should_queries=should_queries1),
- BoolQuery(should_queries=should_queries2),
- TermQuery("docchannel","52"),
- RangeQuery("publishtime",page_time)])
- columns = ["tenderee","tenderee_contact","tenderee_phone"]
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=100, sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- print(total_count)
- def getRow(rows,df_data,columns):
- for row in rows:
- _dict = dict()
- for part in row:
- for item in part:
- _dict[item[0]] = item[1]
- if "tenderee_contact" in _dict and "tenderee_phone" in _dict:
- for key in columns:
- df_data[key].append(_dict.get(key,""))
- all_rows = 0
- df_data = {}
- for key in columns:
- df_data[key] = []
- getRow(rows,df_data,columns)
- _count = 100
- while(next_token):
- print(_count,total_count)
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count += 100
- getRow(rows,df_data,columns)
- df2 = pd.DataFrame(df_data)
- df2.to_excel("../data/%s政府招标人导出数据.xlsx"%page_time,columns=columns)
- def export_attachment():
- filename = "../data/销售部-字段名.xlsx"
- df = pd.read_excel(filename)
- list_dict = []
- for _key in df["关键词"]:
- if len(_key)>0:
- list_dict.append({"keyword":_key})
- def _handle(_dict,result_queue,pool_ots):
- _keyword = _dict["keyword"]
- ots_client = pool_ots.getConnector()
- keyword_query = BoolQuery(should_queries=[MatchPhraseQuery("doctextcon",_keyword),
- MatchPhraseQuery("doctitle",_keyword)])
- bool_query = BoolQuery(must_queries=[RangeQuery("status","201","300",include_upper=True,include_lower=True),
- keyword_query])
- columns = ["dochtmlcon","docid"]
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- print(_keyword,total_count)
- hit_count = 0
- def getData(rows,hit_count):
- list_fields = getRow_ots(rows)
- for _fields in list_fields:
- dochtmlcon = _fields["dochtmlcon"]
- docid = _fields["docid"]
- _flag = False
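- # scan the html for direct attachment links (zip/rar/doc/...) and try to download each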
- for url,suffix in re.findall("(http://[^\'\";;\n]+?\.(zip|rar|doc|xls|xlsx|pdf|txt))",dochtmlcon):
- try:
- result = requests.get(url,stream=True,timeout=100)
- if result.status_code==200:
- filename = get_file_name(url,result.headers)
- with open("../data/attachment/%s_%d_%s"%(_keyword,docid,filename),"wb") as f:
- f.write(result.content)
- _flag = True
- except Exception:
- pass
- if _flag:
- hit_count += 1
- return hit_count
- hit_count = getData(rows,hit_count)
- _count = len(rows)
- while next_token:
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count += len(rows)
- if _count>20000:
- break
- hit_count = getData(rows,hit_count)
- if hit_count>20:
- break
- pool_ots.putConnector(ots_client)
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for item in list_dict:
- task_queue.put(item)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- def exportIndustryCount():
- import codecs
- time_from = "2020-12-21"
- time_to = "2020-12-25"
- # dict_channel = {"51":{"type":"公告变更"},
- # "52":{"type":"招标公告"},
- # "101":{"type":"中标信息"},
- # "102":{"type":"招标预告"},
- # "103":{"type":"招标答疑"},
- # "104":{"type":"招标文件"},
- # "105":{"type":"资审结果"},
- # "103":{"type":"招标控制价"},
- # "100":{"type":"未知类型"}}
- dict_industry = {}
- meta_industry = load("../data/class2dalei_menlei.pkl")
- for _key in meta_industry.keys():
- dict_industry[_key] = {"type":_key}
- print(dict_industry.keys())
- return  # debug short-circuit: the export below only runs once this line is removed
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _key in dict_industry.keys():
- task_queue.put(dict_industry[_key])
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"]),
- RangeQuery("publishtime",time_from,time_to,include_lower=True,include_upper=True)])
- columns = ["docid"]
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- item["count"] = total_count
- columns = ["dochtmlcon"]
- bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"])])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=10,sort=Sort(sorters=[FieldSort("publishtime",SortOrder.ASC)]),get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- for row in rows:
- _dict = dict()
- for part in row:
- for v in part:
- _dict[v[0]] = v[1]
- with codecs.open("../data/industry/%s_%d.html"%(item["type"],_dict["docid"]),"w",encoding="UTF8") as f:
- f.write(_dict["dochtmlcon"])
- pool_ots.putConnector(ots_client)
- pool_ots = ConnectorPool(init_num=20,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- columns = ["type","count"]
- df_data = {}
- for _c in columns:
- df_data[_c] = []
- for _indus in dict_industry.keys():
- for _c in columns:
- df_data[_c].append(dict_industry[_indus][_c])
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns)
- def exportDocument_By_time(time_from,time_to,columns=["docid","doctitle","project_name","dochtmlcon"]):
- '''
- :return:
- '''
- ost_client = getConnect_ots()
- last_docid = 0
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True)])
- rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count = len(rows)
- df_data = {}
- def getData(df_data,rows):
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- for _k,_v in _dict.items():
- if _k not in df_data:
- df_data[_k] = []
- df_data[_k].append(getLegal_str(_v))
- getData(df_data,rows)
- while(next_token):
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count += len(rows)
- getData(df_data,rows)
- df = pd.DataFrame(df_data)
- df.to_excel("%s/%s-%s公告信息.xlsx"%(data_path,time_from,time_to),columns=columns)
- def processDocument():
- filename = "../data/2021-01-29-2021-01-29公告信息.xlsx"
- df = pd.read_excel(filename)
- df.to_csv("../data/2021-01-29-2021-01-29公告信息.csv")
- return  # debug short-circuit: only the xlsx-to-csv conversion above runs; the mysql insert below is disabled
- list_dict = []
- for docid,doctitle,project_name,dochtmlcon in zip(df["docid"],df["doctitle"],df["project_name"],df["dochtmlcon"]):
- list_dict.append({"docid":docid,"doctitle":doctitle,"project_name":project_name,"dochtmlcon":dochtmlcon})
- task_queue = queue.Queue()
- for _dict in list_dict:
- task_queue.put(_dict)
- result_queue = queue.Queue()
- def _handle(_dict,result_queue,pool_mysql):
- conn = pool_mysql.getConnector()
- cursor = conn.cursor()
- sql = "insert into test_extract(docid,doctitle,page_time) values(%d,%s,%s)"%(_dict["docid"],_dict["doctitle"],_dict["dochtmlcon"])
- cursor.execute(sql)
- conn.commit()
- pool_mysql.putConnector(conn)
- # url = "http://192.168.2.101:15030"
- # myheaders = {'Content-Type': 'application/json'}
- # print(int(_dict["docid"]))
- # data = {"doc_id":int(_dict["docid"]),"title":_dict["doctitle"],"content":_dict["dochtmlcon"]}
- # resp = requests.post(url,json=data,headers=myheaders, verify=True)
- # result = json.loads(resp.content.decode("utf8"),"utf8")
- # _dict["product"] = result["product"]
- pool_mysql = ConnectorPool(init_num=20,max_num=30,method_init=getConnection_testmysql)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=5,pool_mysql=pool_mysql)
- mt.run()
- # columns = ["docid","doctitle","project_name","product"]
- #
- # df_data = {}
- # for _c in columns:
- # df_data[_c] = []
- # for _dict in list_dict:
- # for _c in columns:
- # df_data[_c].append(_dict.get(_c,""))
- # df = pd.DataFrame(df_data)
- # df.to_excel("%s.product.xlsx"%(filename),columns=columns)
- def export_extract_check():
- '''
- :return: export the extraction-check results and generate a report
- '''
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- sql = " select docid,json_result from exportdb.extract_check "
- cursor.execute(sql)
- dict_global = {}
- df_global = {"key_type":[],"online_count":[],"test_count":[],"diff_count":[],"diff_percent":[]}
- df_document = {"docid":[]}
- while True:
- rows = cursor.fetchmany(10000)
- if not rows:
- break
- for docid,json_result in rows:
- df_document["docid"].append(docid)
- _result = json.loads(json_result)
- for k,v in _result.items():
- key = k.split("_")
- _key = "_".join(key[:-1])
- if "punish" in _key or "complainants" in _key or "institutions" in _key:
- continue
- if k not in df_document:
- df_document[k] = []
- df_document[k].append(v)
- key_type = key[-1]
- if _key not in dict_global:
- dict_global[_key] = {}
- if key_type not in dict_global[_key]:
- dict_global[_key][key_type] = 0
- if key_type=="diff":
- dict_global[_key][key_type] += v
- if key_type in ("online","test"):
- if isinstance(v,str):
- if v!="":
- dict_global[_key][key_type] += 1
- elif isinstance(v,list):
- dict_global[_key][key_type] += len(v)
- for k,v in dict_global.items():
- df_global["key_type"].append(k)
- df_global["online_count"].append(v["online"])
- df_global["test_count"].append(v["test"])
- df_global["diff_count"].append(v["diff"])
- df_global["diff_percent"].append(v["diff"]/v["online"] if v["online"]>0 else 0)
- filename = "../data/%s_extract_check.xlsx"%(time.strftime("%Y-%m-%d"))
- with pd.ExcelWriter(filename) as writer:
- df1 = pd.DataFrame(df_global)
- df1.to_excel(writer,sheet_name="global")
- for k,v in df_document.items():
- print(k,len(v))
- df2 = pd.DataFrame(df_document)
- df2.to_excel(writer,sheet_name="document")
- def exportDocument_dump():
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- columns = ["docid","docchannel","page_time","web_source_no","doctitle","tenderee","agency","project_code","project_name","sub_docs_json"]
- df_keys = ["docid","docchannel","page_time","web_source_no","doctitle","doctitle_refine","tenderee","agency","project_code","project_name","bidding_budget","win_bid_price","win_tenderer","URL"]
- df_data = {}
- for _key in df_keys:
- df_data[_key] = []
- bool_query = BoolQuery(must_queries=[TermQuery("page_time","2021-03-03"),
- RangeQuery("status",201,300,True,True)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- def getData(df_data,rows):
- list_data = getRow_ots(rows)
- for row in list_data:
- dict_find = {}
- for _key in df_keys:
- dict_find[_key] = 0
- for _k,_v in row.items():
- if _k in df_keys:
- dict_find[_k] = 1
- if _k=="project_code":
- _v = '"%s"'%_v
- df_data[_k].append(_v)
- doctitle = row.get("doctitle","")
- df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
- df_data["URL"].append("http://www.bidizhaobiao.com/info-%d.html"%(row["docid"]))
- dict_find["URL"] = 1
- dict_find["doctitle_refine"] = 1
- sub_docs_json = row.get("sub_docs_json","[{}]")
- doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
- if sub_docs_json is not None:
- for sub_docs in json.loads(sub_docs_json):
- for _key_sub_docs in sub_docs.keys():
- if _key_sub_docs in doc_columns:
- if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
- if _key_sub_docs in ["bidding_budget","win_bid_price"]:
- if float(sub_docs[_key_sub_docs])>0:
- doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- else:
- doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- for _k,_v in doc_columns.items():
- dict_find[_k] = 1
- df_data[_k].append(_v)
- for _k,_v in dict_find.items():
- if _v==0:
- df_data[_k].append("")
- _count = len(rows)
- getData(df_data,rows)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- getData(df_data,rows)
- _count += len(rows)
- # for docid in df["docid"]:
- # bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))])
- #
- # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- # SearchQuery(bool_query , limit=100, get_total_count=True),
- # ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- # list_data = getRow_ots(rows)
- # if len(list_data)>0:
- # dict_find = {}
- # for _key in df_keys:
- # dict_find[_key] = 0
- # for _k,_v in list_data[0].items():
- # if _k in df_keys:
- # dict_find[_k] = 1
- # df_data[_k].append(_v)
- # doctitle = list_data[0].get("doctitle","")
- # df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
- # dict_find["doctitle_refine"] = 1
- # sub_docs_json = list_data[0].get("sub_docs_json","[{}]")
- # doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
- # if sub_docs_json is not None:
- # for sub_docs in json.loads(sub_docs_json):
- # for _key_sub_docs in sub_docs.keys():
- # if _key_sub_docs in doc_columns:
- # if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
- # if _key_sub_docs in ["bidding_budget","win_bid_price"]:
- # if float(sub_docs[_key_sub_docs])>0:
- # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- # else:
- # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- # for _k,_v in doc_columns.items():
- # dict_find[_k] = 1
- # df_data[_k].append(_v)
- # for _k,_v in dict_find.items():
- # if _v==0:
- # df_data[_k].append("")
- df1 = pd.DataFrame(df_data)
- df1.to_csv("../data/0303去重.csv",columns=df_keys)
- def exportDocument_dump_mysql():
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- columns = ["project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer","group_id","docid"]
- df_data = {}
- for _c in columns:
- df_data[_c] = []
- sql = " select "+",".join(columns)+" from run_dumplicate_document_his where group_id in (select group_id from run_dumplicate_document_his group by group_id having count(1)>1)"
- cursor.execute(sql)
- while True:
- rows = cursor.fetchmany(100000)
- if not rows:
- break
- for row in rows:
- for _i in range(len(columns)):
- df_data[columns[_i]].append(row[_i])
- df = pd.DataFrame(df_data)
- df.to_csv("../data/0304去重.csv",columns=["group_id","docid","project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer"])
- print(cursor.description)
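- # Build a {channel_id: chnlname} lookup from the sys_channel table in MySQL.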
- def getDict_docchannel():
- conn = getConnection_mysql()
- cursor = conn.cursor()
- sql = "select channel_id,chnlname from sys_channel "
- cursor.execute(sql)
- rows = cursor.fetchall()
- _dict = dict()
- for row in rows:
- _dict[row[0]] = row[1]
- return _dict
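- # Multi-threaded keyword export: for each surveying/land keyword below, search
- # bid announcements (docchannel 101, Guangdong/Hunan/Guangxi, since 2021-01-01)
- # whose doctitle matches the keyword, de-duplicate by project/winner/price and
- # write the result to Excel.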
- def exportDocument_by_doctitle():
- columns = ["docid","doctitle","docchannel","bidway","province","city","district","info_type","page_time","crtime","project_code","tenderee","project_name","agency","sub_docs_json","tenderee_contact","tenderee_phone","doctextcon","product","moneysource"]
- dict_channel = getDict_docchannel()
- str_keyword = '''
- 报批技术服务 不动产 测绘 城市更新 档案整理
- 房地一体 拆旧复垦 土地规划 城乡规划 村庄规划
- 技术服务 技术审查 建设用地增减挂钩 勘察 垦造水田
- 不动产数据建库 不动产数据整合 林权调查 土地调查 城市更新数据调查
- 不动产平台 测绘系统 地理信息系统 城乡规划信息系统 一张图信息平台
- 测绘信息平台 双评价 城市更新研究 垦造水田研究报告 生态修复研究
- 土地规划研究 复垦咨询服务 生态修复咨询服务 城乡规划咨询服务 城市更新咨询服务
- 勘测定界 多测合一 用地预审 国土规划数据治理 地名普查
- 地形图 垦造水田咨询服务 评估 全域土地综合整治 生态修复
- 林权数据建库 权属调查 权籍调查
- '''
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _keyword in re.split(r"\s+",str_keyword):
- if len(_keyword.strip())==0:
- continue
- task_queue.put({"keyword":_keyword})
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- should_q1 = BoolQuery(should_queries=[MatchPhraseQuery("doctitle",item["keyword"])])
- should_q2 = BoolQuery(should_queries=[WildcardQuery('province', '%s*'%"广东"),
- WildcardQuery('province', '%s*'%"湖南"),
- WildcardQuery('province', '%s*'%"广西")])
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2021-01-01"),
- RangeQuery("status",201,300,True,True),
- TermQuery("docchannel",101),
- should_q1,should_q2])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- for _data in list_data:
- _data["keyword"] = item["keyword"]
- result_queue.put(_data)
- _count = len(list_data)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- _count += len(list_data)
- for _data in list_data:
- _data["keyword"] = item["keyword"]
- result_queue.put(_data)
- pool_ots.putConnector(ots_client)
- mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots)
- mt.run()
- list_item = []
- try:
- while True:
- _dict = result_queue.get(False)
- list_item.append(_dict)
- except Exception as e:
- print(e)
- keys = list_item[0].keys()
- df_data = {}
- print(len(list_item))
- set_line = set()
- for row in list_item:
- item = {}
- _dict = row
- set_dict_item(item,"公告id",_dict.get("docid",""))
- set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- set_dict_item(item,"公告标题",_dict.get("doctitle",""))
- set_dict_item(item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"城市",_dict.get("city",""))
- set_dict_item(item,"区",_dict.get("district",""))
- set_dict_item(item,"关键词",_dict.get("keyword",""))
- set_dict_item(item,"发布时间",_dict.get("page_time",""))
- set_dict_item(item,"项目编号",_dict.get("project_code",""))
- set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- # set_dict_item(item,"代理单位",_dict.get("agency",""))
- # set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- # set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- set_dict_item(item,"公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- tenderee_in = "否"
- tenderee_keyword = "否"
- if _dict.get("tenderee","")!="":
- if _dict.get("tenderee","") in _dict.get("doctitle",""):
- tenderee_in = "是"
- if _dict.get("keyword","") in _dict.get("tenderee",""):
- tenderee_keyword = "是"
- set_dict_item(item,"标题包含招标人",tenderee_in)
- set_dict_item(item,"招标人含有关键词",tenderee_keyword)
- sub_docs_json = _dict.get("sub_docs_json","[{}]")
- for _doc in json.loads(sub_docs_json):
- if "win_tenderer" in _doc:
- set_dict_item(item,"中标单位",_doc["win_tenderer"])
- # if "win_tenderee_manager" in _doc:
- # set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
- # if "win_tenderee_phone" in _doc:
- # set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item(item,"中标金额(万元)",float(_doc["win_bid_price"])/10000)
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item(item,"招标金额(万元)",float(_doc["bidding_budget"])/10000)
- if "中标单位" not in item:
- set_dict_item(item,"中标单位","")
- if "中标金额(万元)" not in item:
- set_dict_item(item,"中标金额(万元)","")
- if "招标金额(万元)" not in item:
- set_dict_item(item,"招标金额(万元)","")
- # if "中标单位联系人" not in item:
- # set_dict_item(item,"中标单位联系人","")
- # if "中标单位联系电话" not in item:
- # set_dict_item(item,"中标单位联系电话","")
- _line = "%s-%s-%s"%(item["项目编号"],item["中标单位"],item["中标金额(万元)"])
- if _line in set_line:
- continue
- # if item["招标金额"]=="":
- # continue
- set_line.add(_line)
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df = pd.DataFrame(df_data)
- df.to_excel("../data/2021-04-14_export11.xlsx",columns=list_df_columns)
- set_columns = set()
- list_df_columns = []
- def set_dict_item(_dict,name,v):
- _dict[name] = getLegal_str(v)
- if name not in set_columns:
- set_columns.add(name)
- list_df_columns.append(getLegal_str(name))
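- # Medical-sector export between start_time and end_time: matches medical-IT
- # keywords in doctextcon, hospital-like tenderees and channels 52/101/102,
- # then backfills missing contact person/phone from the enterprise table.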
- def exportDocument_medicine(start_time,end_time):
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
- dict_channel = getDict_docchannel()
- def getData(df_data,rows,set_line):
- list_data = getRow_ots(rows)
- for row in list_data:
- item = {}
- _dict = row
- set_dict_item(item,"docid",_dict.get("docid",""))
- set_dict_item(item,"公告标题",_dict.get("doctitle",""))
- set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- set_dict_item(item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"城市",_dict.get("city",""))
- set_dict_item(item,"发布时间",_dict.get("page_time",""))
- set_dict_item(item,"项目编号",_dict.get("project_code",""))
- set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- set_dict_item(item,"代理单位",_dict.get("agency",""))
- set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- sub_docs_json = _dict.get("sub_docs_json","[{}]")
- for _doc in json.loads(sub_docs_json):
- if "win_tenderer" in _doc:
- set_dict_item(item,"中标单位",_doc["win_tenderer"])
- if "win_tenderee_manager" in _doc:
- set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
- if "win_tenderee_phone" in _doc:
- set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item(item,"中标金额",_doc["win_bid_price"])
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item(item,"招标金额",_doc["bidding_budget"])
- if "招标金额" not in item:
- set_dict_item(item,"招标金额","")
- if "中标金额" not in item:
- set_dict_item(item,"中标金额","")
- if "中标单位" not in item:
- set_dict_item(item,"中标单位","")
- if "中标单位联系人" not in item:
- set_dict_item(item,"中标单位联系人","")
- if "中标单位联系电话" not in item:
- set_dict_item(item,"中标单位联系电话","")
- _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
- # if _line in set_line:
- # continue
- # if item["招标金额"]=="":
- # continue
- set_line.add(_line)
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- # list_province = ["江西","湖南","四川","安徽"]
- list_province = ["全国"]
- for _province in list_province:
- df_data = {}
- str_p = '''
- 智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件
- 县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗
- 数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化
- 临床应用
- '''
- list_prov = re.split(r"\s+",str_p)
- list_mu = []
- for _p in list_prov:
- if _p.strip()=="":
- continue
- print(_p)
- list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))
- s_tenderee = '医院、卫生院、疗养院、健康局、卫生局'
- list_should_ten = []
- for _p in re.split("、",s_tenderee):
- if _p.split()=="":
- continue
- list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))
- list_should_chan = []
- list_should_chan.append(TermQuery("docchannel",52))
- list_should_chan.append(TermQuery("docchannel",101))
- list_should_chan.append(TermQuery("docchannel",102))
- should_q1 = BoolQuery(should_queries=list_mu)
- should_q2 = BoolQuery(should_queries=list_should_ten)
- should_q3 = BoolQuery(should_queries=list_should_chan)
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",start_time,end_time,include_lower=True,include_upper=True),
- RangeQuery("status",201,300,True,True),
- should_q1,should_q2,should_q3])
- # must_q1 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"),
- # MatchPhraseQuery("doctitle","教学器材")])
- # must_q2 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教育局"),
- # MatchPhraseQuery("doctitle","教育设备")])
- # must_q3 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"),
- # MatchPhraseQuery("doctitle","教学器材")])
- # must_q4 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","体育局"),
- # MatchPhraseQuery("doctitle","教育设备")])
- # must_q5 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校信息化")])
- # must_q6 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明")])
- # must_q7 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校弱电")])
- # must_q8 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校机电安装")])
- # must_q9 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","学校照明改造")])
- # must_q10 = BoolQuery(must_queries=[MatchPhraseQuery("doctitle","教学"),
- # MatchPhraseQuery("doctitle","设备")])
- #
- # all_should = BoolQuery(should_queries=[must_q1,must_q2,must_q3,must_q4,must_q5,must_q6,must_q7,must_q8,must_q9,must_q10])
- # must_not_q = BoolQuery(should_queries=[MatchPhraseQuery("doctitle","社区"),
- # MatchPhraseQuery("doctitle","医院"),
- # MatchPhraseQuery("doctitle","工业园"),
- # MatchPhraseQuery("doctitle","营养"),
- # MatchPhraseQuery("doctitle","厨房设备")])
- # bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2018-01"),
- # RangeQuery("status",201,300,True,True),
- # TermQuery("docchannel",101),
- # WildcardQuery('province', '%s*'%_province),
- # all_should],
- # must_not_queries=[must_not_q])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- set_line = set()
- _count = len(rows)
- getData(df_data,rows,set_line)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- getData(df_data,rows,set_line)
- _count += len(rows)
- if len(df_data[list(df_data.keys())[0]])>=300:
- break
- set_enterprise = set()
- for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
- set_enterprise.add(_tenderee)
- set_enterprise.add(_agency)
- set_enterprise.add(_win_tenderer)
- if "" in set_enterprise:
- set_enterprise.remove("")
- if None in set_enterprise:
- set_enterprise.remove(None)
- dict_enterprise = getDictEnterprise(list(set_enterprise))
- if len(set_enterprise)>0:
- for _i in range(len(df_data["招标单位"])):
- _enterprise_name = df_data["招标单位"][_i]
- if df_data["招标联系人电话"][_i]=="":
- contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- if contacts is not None:
- _person,_phone = getOneContact(contacts)
- df_data["招标联系人"][_i] = _person
- df_data["招标联系人电话"][_i] = _phone
- _enterprise_name = df_data["代理单位"][_i]
- if df_data["代理联系人电话"][_i]=="":
- contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- if contacts is not None:
- _person,_phone = getOneContact(contacts)
- df_data["代理联系人"][_i] = _person
- df_data["代理联系人电话"][_i] = _phone
- _enterprise_name = df_data["中标单位"][_i]
- if df_data["中标单位联系电话"][_i]=="":
- contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- if contacts is not None:
- _person,_phone = getOneContact(contacts)
- df_data["中标单位联系人"][_i] = _person
- df_data["中标单位联系电话"][_i] = _phone
- # export to Excel, then hand the collected data back to the caller
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
- return df_data
- def exportDocument_by_pagetime():
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
- dict_channel = getDict_docchannel()
- def getData(df_data,rows,set_line):
- list_data = getRow_ots(rows)
- for row in list_data:
- item = {}
- _dict = row
- set_dict_item(item,"docid",_dict.get("docid",""))
- set_dict_item(item,"公告标题",_dict.get("doctitle",""))
- set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- set_dict_item(item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"城市",_dict.get("city",""))
- set_dict_item(item,"发布时间",_dict.get("page_time",""))
- set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
- set_dict_item(item,"项目编号",_dict.get("project_code",""))
- set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- set_dict_item(item,"代理单位",_dict.get("agency",""))
- set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- sub_docs_json = _dict.get("sub_docs_json","[{}]")
- for _doc in json.loads(sub_docs_json):
- if "win_tenderer" in _doc:
- set_dict_item(item,"中标单位",_doc["win_tenderer"])
- if "win_tenderee_manager" in _doc:
- set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
- if "win_tenderee_phone" in _doc:
- set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item(item,"中标金额",_doc["win_bid_price"])
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item(item,"招标金额",_doc["bidding_budget"])
- if "招标金额" not in item:
- set_dict_item(item,"招标金额","")
- if "中标金额" not in item:
- set_dict_item(item,"中标金额","")
- if "中标单位" not in item:
- set_dict_item(item,"中标单位","")
- if "中标单位联系人" not in item:
- set_dict_item(item,"中标单位联系人","")
- if "中标单位联系电话" not in item:
- set_dict_item(item,"中标单位联系电话","")
- _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
- # if _line in set_line:
- # continue
- # if item["招标金额"]=="":
- # continue
- set_line.add(_line)
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- # list_province = ["江西","湖南","四川","安徽"]
- list_province = ["全国"]
- for _province in list_province:
- df_data = {}
- str_p = '''
- 智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件
- 县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗
- 数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化
- 临床应用
- '''
- list_prov = re.split(r"\s+",str_p)
- list_mu = []
- for _p in list_prov:
- if _p.strip()=="":
- continue
- print(_p)
- list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))
- s_tenderee = '医院、卫生院、疗养院、健康局、卫生局'
- list_should_ten = []
- for _p in re.split("、",s_tenderee):
- if _p.split()=="":
- continue
- list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))
- list_should_chan = []
- list_should_chan.append(TermQuery("docchannel",52))
- # list_should_chan.append(TermQuery("docchannel",101))
- # list_should_chan.append(TermQuery("docchannel",102))
- should_q1 = BoolQuery(should_queries=list_mu)
- should_q2 = BoolQuery(should_queries=list_should_ten)
- should_q3 = BoolQuery(should_queries=list_should_chan)
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2021-05-07","2021-05-07",include_lower=True,include_upper=True),
- RangeQuery("status",201,300,True,True)
- # ,should_q1
- # ,should_q2
- ,should_q3])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- set_line = set()
- _count = len(rows)
- getData(df_data,rows,set_line)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- getData(df_data,rows,set_line)
- _count += len(rows)
- # if len(df_data[list(df_data.keys())[0]])>=300:
- # break
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
- def attachCompanyContact():
- files = ["../data/2021-03-17_四川_关键词导出.csv",
- "../data/2021-03-17_安徽_关键词导出.csv",
- "../data/2021-03-17_江西_关键词导出.csv",
- "../data/2021-03-17_湖南_关键词导出.csv"]
- files = ["../data/欧科自然资源5w以上数据.xlsx"]
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- primary_key = [('name',str(item["招标单位"]))]
- columns_to_get = ["province","city","district"]
- consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)
- # bool_query = BoolQuery(must_queries=[TermQuery("nicknames",item["中标单位"])])
- #
- # rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
- # SearchQuery(bool_query , limit=1, get_total_count=True),
- # ColumnsToGet(["contacts"],return_type=ColumnReturnType.SPECIFIED))
- # _docid = int(item["docid"])
- # partitionkey = _docid%500+1
- # primary_key = [('partitionkey',partitionkey),("docid",_docid)]
- # columns_to_get = ["doctitle"]
- # consumed, return_row, next_token = ots_client.get_row("document",primary_key, columns_to_get, None, 1)
- if isinstance(item["招标单位"],str) and item["招标单位"]!="":
- if return_row is not None:
- _dict = getRow_ots_primary(return_row)
- # item["doctitle"] = _dict.get("doctitle","")
- item["招标人省份"] = _dict.get("province","")
- item["招标人城市"] = _dict.get("city","")
- item["招标人区域"] = _dict.get("district","")
- province,city,district = getLocation(item["招标单位"])
- if item["招标人省份"]=="" or item["招标人省份"]=="未知":
- item["招标人省份"] = province
- if item["招标人城市"]=="" or item["招标人城市"]=="未知":
- item["招标人城市"] = city
- if item["招标人区域"]=="" or item["招标人区域"]=="未知":
- item["招标人区域"] = district
- else:
- province,city,district = getLocation(item["招标单位"])
- item["招标人省份"] = province
- item["招标人城市"] = city
- item["招标人区域"] = district
- else:
- item["招标人省份"] = item["省份"]
- item["招标人城市"] = item["城市"]
- item["招标人区域"] = item["区"]
- # contacts = json.loads(_dict["contacts"])
- # contacts.sort(key=lambda x:x["level"],reverse=True)
- # phone = ""
- # phone_person = ""
- # mobile = ""
- # mobile_person = ""
- # for contact in contacts:
- # if mobile=="" and contact.get("mobile_no","")!="":
- # mobile = contact.get("mobile_no","")
- # mobile_person = contact.get("contact_person","")
- # if phone=="" and contact.get("phone_no","")!="":
- # phone = contact.get("phone_no",'')
- # phone_person = contact.get("contact_person","")
- # item["招标联系人"] = ""
- # item["招标联系人电话"] = ""
- # if mobile!="":
- # item["招标联系人"] = mobile_person
- # item["招标联系人电话"] = mobile
- # else:
- # item["中标单位联系人"] = phone_person
- # item["中标单位联系电话"] = phone
- pool_ots.putConnector(ots_client)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- for file in files:
- task_queue = queue.Queue()
- df = pd.read_excel(file)  # read_excel takes no encoding argument; the engine handles it
- keys = df.keys()[1:]
- list_item = []
- # itertuples() renames non-identifier (e.g. Chinese) column names, so read the
- # rows as plain dicts instead of relying on attribute access
- for row in df.to_dict(orient="records"):
- _dict = {}
- for _key in keys:
- _dict[_key] = row.get(_key,'')
- list_item.append(_dict)
- for item in list_item:
- task_queue.put(item)
- mt = MultiThreadHandler(task_queue,_handle,None,30,pool_ots=pool_ots)
- mt.run()
- df_data = {}
- for _k in keys:
- df_data[_k] = []
- for item in list_item:
- for _k in keys:
- df_data[_k].append(getLegal_str(item.get(_k,"-")))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("%s_attach.xlsx"%file,columns=keys)
- def dumpWebSourceNo():
- conn_oracle = getConnection_oracle()
- cursor_oracle = conn_oracle.cursor()
- sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO "
- cursor_oracle.execute(sql)
- rows = cursor_oracle.fetchall()
- conn_mysql = getConnection_testmysql()
- cursor_mysql = conn_mysql.cursor()
- for row in rows:
- sql = " insert into webSource(web_source_no,web_source_name) values('%s','%s')"%(row[0],row[1])
- print(sql)
- cursor_mysql.execute(sql)
- conn_mysql.commit()
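- # Export planned/under-preparation ("拟在建") projects that already carry
- # docids from the OTS designed_project table to Excel.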
- def exportNzj():
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- columns = ["contacts","covered_area","follows","docids","page_time","progress","project_description","project_follow","project_code","project_name","project_type"]
- def getData(df_data,rows,set_line):
- list_data = getRow_ots(rows)
- for row in list_data:
- item = {}
- _dict = row
- set_dict_item(item,"docids",_dict.get("docids",""))
- set_dict_item(item,"contacts",_dict.get("contacts",""))
- set_dict_item(item,"covered_area",_dict.get("covered_area",""))
- set_dict_item(item,"follows",_dict.get("follows",""))
- set_dict_item(item,"project_type",_dict.get("project_type",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"page_time",_dict.get("page_time",""))
- set_dict_item(item,"progress",_dict.get("progress",""))
- set_dict_item(item,"project_description",_dict.get("project_description",""))
- set_dict_item(item,"project_follow",_dict.get("project_follow",""))
- set_dict_item(item,"project_code",_dict.get("project_code",""))
- set_dict_item(item,"project_name",_dict.get("project_name",""))
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df_data = {}
- bool_query = BoolQuery(must_queries=[ExistsQuery("docids")])
- rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- set_line = set()
- _count = len(rows)
- getData(df_data,rows,set_line)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- getData(df_data,rows,set_line)
- _count += len(rows)
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/2021-03-31_拟在建数据导出1.xlsx",columns=list_df_columns)
- def turn_status():
- df = pd.read_excel("../data/欧科自然资源5w以上数据.xlsx")
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- for docid in df["公告id"]:
- partitionkey = int(docid)%500+1
- sql = " insert into turn_status(partitionkey,docid) values(%d,%d)"%(partitionkey,docid)
- cursor.execute(sql)
- conn.commit()
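- # Backfill empty bidding_budget values in analysis_r2 by looking each docid up
- # in the OTS project2 index, using pooled MySQL and OTS connections.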
- def attachBidding_budget():
- conn_mysql = getConnection_testmysql()
- cursor = conn_mysql.cursor()
- sql = "select docid from analysis_r2 where bidding_budget=''"
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- cursor.execute(sql)
- rows = cursor.fetchmany(10000)
- while rows:
- for row in rows:
- task_queue.put(row[0])
- rows = cursor.fetchmany(10000)
- pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_testmysql)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- def _handle(item,result_queue,pool_mysql,pool_ots):
- ots_client = pool_ots.getConnector()
- bool_query = BoolQuery(must_queries=[TermQuery("docids",item)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("project2", "project2_index",
- SearchQuery(bool_query , limit=1, get_total_count=True),
- ColumnsToGet(["bidding_budget"],return_type=ColumnReturnType.SPECIFIED))
- list_dict = getRow_ots(rows)
- if len(list_dict)>0:
- conn = pool_mysql.getConnector()
- cursor = conn.cursor()
- sql = " update analysis_r2 set bidding_budget='%s' where docid=%d"%(str(list_dict[0].get("bidding_budget","")),item)
- cursor.execute(sql)
- conn.commit()
- pool_mysql.putConnector(conn)
- pool_ots.putConnector(ots_client)
- mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mysql=pool_mysql,pool_ots=pool_ots)
- mt.run()
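- # Consistency check for document merging: for every merged docid group logged
- # in project_group_final_log, verify that a project2 row containing all of the
- # docids exists, and print the groups that were never merged.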
- def debug_documentMerge():
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- sql = "select merge_docids from project_group_final_log "
- cursor.execute(sql)
- task_queue = queue.Queue()
- for row in cursor.fetchall():
- task_queue.put(row[0])
- print(task_queue.qsize())
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- list_docids = item.split(",")
- must_q = []
- for _docid in list_docids:
- must_q.append(TermQuery("docids",_docid))
- bool_query = BoolQuery(must_queries=must_q)
- rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
- SearchQuery(bool_query,limit=1,get_total_count=True),
- ColumnsToGet(column_names=["docids"],return_type=ColumnReturnType.SPECIFIED))
- if total_count==0:
- print(item)
- result_queue.put(item)
- pool_ots.putConnector(ots_client)
- result_queue = queue.Queue()
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots)
- mt.run()
- while True:
- try:
- item = result_queue.get(False)
- print(item)
- except Exception as e:
- print(str(e))
- break
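- # Tag each row's segword text with the first cancellation/failure keyword it
- # contains (流标/废标/中止/终止/撤销/采购失败) and write a "sign" column beside
- # the source file.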
- def signDocument():
- filename = "C:\\Users\\Administrator\\Desktop\\中标信息1.xlsx"
- sign_filename = "%s_sign.xlsx"%filename
- df = pd.read_excel(filename)
- df_data = {"sign":[]}
- for item in df["segword"]:
- content = re.sub(r"\s","",item)
- _find = re.search("(?P<key>流标|废标|中止|终止|撤销|采购失败)",content)
- if _find is not None:
- df_data["sign"].append(_find.groupdict().get("key"))
- else:
- df_data["sign"].append("无")
- df1 = pd.DataFrame(df_data)
- df1.to_excel(sign_filename)
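- # Entry point: individual export jobs are toggled by (un)commenting the calls
- # below; as committed, only the medical export for 2021-05-02..2021-05-08 runs.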
- if __name__=="__main__":
- # exportDocument_By_time(time_from="2021-01-29",time_to="2021-01-29",columns=["docid","doctitle","project_name","dochtmlcon"])
- # processDocument()
- # export_extract_check()
- # exportArticle_by_websource()
- # export_keyword_count()
- # export_province_keyword_count()
- # exportDocument_dump()
- # exportDocument_dump_mysql()
- # export_attachment()
- # exportDocument_by_doctitle()
- # exportIndustryCount()
- # exportDocument_by_pagetime()
- # attachCompanyContact()
- # dumpWebSourceNo()
- # print("http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%138306357))))
- # exportNzj()
- # turn_status()
- # attachBidding_budget()
- # debug_documentMerge()
- exportDocument_medicine("2021-05-02","2021-05-08")
- # signDocument()
|