exportEnterprise.py 53 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207
  1. #coding:UTF8
  2. import sys
  3. import os
  4. sys.path.append("../")
  5. import pandas as pd
  6. from dataSource.source import *
  7. import json
  8. from utils.multiThread import MultiThreadHandler
  9. import queue
  10. from utils.Utils import *
  11. from dataSource.pool import ConnectorPool
  12. import re
  13. from tablestore import *
  14. import traceback
  15. from export.exportUtils import generateBoolShouldQuery,splitIntoList
  16. data_path = "../data/"
  17. def getCompanys():
  18. list_company = []
  19. keywords = ["环境","生态","再生","回收","环保"]
  20. provinces = ["广东"]
  21. for _name in keywords:
  22. for _prov in provinces:
  23. data = make_elasticSearch({
  24. "query": {
  25. "bool": {
  26. "must": [
  27. {
  28. "wildcard": {
  29. "name.keyword": "*%s*"%_name
  30. }
  31. }
  32. # ,
  33. # {
  34. # "term": {
  35. # "province.keyword": "%s"%_prov
  36. # }
  37. # }
  38. # ,
  39. # {
  40. # "range": {
  41. # "zhongBiaoNumber": {
  42. # "gt": "0"
  43. # }
  44. # }
  45. # }
  46. ],
  47. "must_not": [ ],
  48. "should": [ ]
  49. }
  50. },
  51. "from": 0,
  52. "size": 1000000,
  53. "sort": [ ],
  54. "aggs": { }
  55. })
  56. print("--",data["hits"]["total"])
  57. for item in data["hits"]["hits"]:
  58. _company = {"enterprise_name":"","regCapital":"","legal_person":"","phone":"","industry":"","province":""}
  59. _company["enterprise_name"] = item["_source"].get("name","")
  60. _company["regCapital"] = item["_source"].get("regCapital","")
  61. _company["zhongBiaoNumber"] = item["_source"].get("zhongBiaoNumber","0")
  62. list_company.append(_company)
  63. # data = make_elasticSearch({
  64. # "query": {
  65. # "bool": {
  66. # "must": [
  67. # {
  68. # "wildcard": {
  69. # "name.keyword": "*电商*"
  70. # }
  71. # }
  72. # ,
  73. # {
  74. # "term": {
  75. # "province.keyword": "北京"
  76. # }
  77. # }
  78. # ,
  79. # {
  80. # "range": {
  81. # "zhongBiaoNumber": {
  82. # "gt": "0"
  83. # }
  84. # }
  85. # }
  86. # ],
  87. # "must_not": [ ],
  88. # "should": [ ]
  89. # }
  90. # },
  91. # "from": 0,
  92. # "size": 10000,
  93. # "sort": [ ],
  94. # "aggs": { }
  95. # })
  96. #
  97. # for item in data["hits"]["hits"]:
  98. # _company = {"enterprise_name":"","regCapital":"","legal_person":"","phone":"","industry":"","province":""}
  99. # _company["enterprise_name"] = item["_source"].get("name","")
  100. # _company["regCapital"] = item["_source"].get("regCapital","")
  101. # list_company.append(_company)
  102. print(len(list_company))
  103. return list_company
  104. def exportFactory():
  105. def _handle(item,result_queue,pool_mongo,pool_neo4j):
  106. company_name = item["enterprise_name"]
  107. mongo = pool_mongo.getConnector()
  108. coll_zb = mongo.enterprise_profile
  109. rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1, "actualCapital":1,"estiblishTime":1,"legal_person":1,"phone":1 })
  110. _flag = False
  111. for row in rows:
  112. actualCapital = row.get("actualCapital","0")
  113. estiblishTime = row.get("estiblishTime","2020-01-01")
  114. _captial = re.match("\d+[亿万]+",actualCapital)
  115. # if _captial is not None:
  116. # if getUnifyMoney(_captial.group())>getUnifyMoney("5000万"):
  117. # if estiblishTime<="2015-10-09":
  118. item["legal_person"] = row.get("legal_person","")
  119. item["phone"] = row.get("phone","")
  120. item["actualCapital"] = actualCapital
  121. item["estiblishTime"] = row.get("estiblishTime","")
  122. _flag = True
  123. break
  124. if _flag:
  125. result_queue.put(item)
  126. cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name)
  127. graph = pool_neo4j.getConnector()
  128. finded = graph.run(cql)
  129. data = json.loads(json.dumps(finded.data()))
  130. _count = data[0]["_c"]
  131. # list_project = []
  132. # for _data in data:
  133. # if _count<=3:
  134. # if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
  135. # if _data["project_name"] is not None:
  136. # list_project.append(_data["project_name"])
  137. # _count += 1
  138. item["count"] = _count
  139. pool_mongo.putConnector(mongo)
  140. pool_neo4j.putConnector(graph)
  141. # list_company = getCompanys()
  142. list_company = []
  143. filename = "../data/天眼查1(1).xlsx"
  144. df1 = pd.read_excel(filename)
  145. for item in df1["公司名称"]:
  146. list_company.append({"enterprise_name":item,"regCapital":"","legal_person":"","phone":"","industry":"","province":""})
  147. task_queue = queue.Queue()
  148. result_queue = queue.Queue()
  149. for item in list_company:
  150. task_queue.put(item)
  151. pool_mongo = ConnectorPool(init_num=10,max_num=50,method_init=getConnect_mongodb)
  152. pool_neo4j = ConnectorPool(init_num=10,max_num=50,method_init=getConnect_neo4j)
  153. _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=70,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j)
  154. _mult.run()
  155. list_name = []
  156. list_actualCapital = []
  157. list_estiblishTime = []
  158. list_legal_person = []
  159. list_phone = []
  160. list_zb = []
  161. while(True):
  162. try:
  163. item = result_queue.get(False)
  164. list_name.append(item["enterprise_name"])
  165. list_actualCapital.append(item["actualCapital"])
  166. list_estiblishTime.append(item["estiblishTime"])
  167. list_legal_person.append(item["legal_person"])
  168. list_phone.append(item["phone"])
  169. list_zb.append(item["count"])
  170. except:
  171. break
  172. df = pd.DataFrame({"公司":list_name,"实缴":list_actualCapital,
  173. "注册时间":list_estiblishTime,"联系人":list_legal_person,"联系电话":list_phone,
  174. "中标次数":list_zb})
  175. df.to_excel("%s"%filename+"_export.xlsx",columns=["公司","实缴","注册时间","联系人","联系电话","中标次数"])
  176. def deal():
  177. def _handle(item,result_queue):
  178. graph = getConnect_neo4j()
  179. company_name = item["enterprise_name"]
  180. cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhong_biao_page_time as zhong_biao_page_time,p.project_name as project_name order by p.zhong_biao_page_time desc limit 3"%(company_name)
  181. finded = graph.run(cql)
  182. data = json.loads(json.dumps(finded.data()))
  183. _count = 1
  184. list_project = []
  185. for _data in data:
  186. if _count<=3:
  187. if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
  188. list_project.append(_data["project_name"])
  189. _count += 1
  190. item["project"] = str(list_project)
  191. result_queue.put(item)
  192. file = "../data/北京行业_export.xls"
  193. df = pd.read_excel(file)
  194. list_company = []
  195. for _company,rep,industry,project,count,person,phone in zip(df["公司名字"],df["注册资金"],df["行业"],df["中标项目"],df["中标次数"],df["联系人"],df["联系电话"]):
  196. list_company.append({"enterprise_name":_company,"regCapital":rep,"legal_person":person,"phone":phone,"industry":industry,"province":"","count":count})
  197. task_queue = queue.Queue()
  198. result_queue = queue.Queue()
  199. for item in list_company:
  200. task_queue.put(item)
  201. _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
  202. _mult.run()
  203. list_name = []
  204. list_regCapital = []
  205. list_industry = []
  206. list_count = []
  207. list_person = []
  208. list_phone = []
  209. list_project = []
  210. while(True):
  211. try:
  212. _result = result_queue.get(False)
  213. list_name.append(_result["enterprise_name"])
  214. list_regCapital.append(_result["regCapital"])
  215. list_industry.append(_result["industry"])
  216. list_count.append(_result["count"])
  217. list_person.append(_result["legal_person"])
  218. list_phone.append(_result["phone"])
  219. list_project.append(_result["project"])
  220. except Exception as e:
  221. print(e)
  222. break
  223. df1 = pd.DataFrame({"公司名字":list_name,"注册资金":list_regCapital,"行业":list_industry,"中标项目":list_project,"中标次数":list_count,"联系人":list_person,"联系电话":list_phone})
  224. df1.to_excel("%s_export1.xls"%("北京行业"),columns=["公司名字","注册资金","行业","中标项目","中标次数","联系人","联系电话"])
  225. def deal1():
  226. def _handle(item,result_queue):
  227. graph = getConnect_neo4j()
  228. company_name = item["enterprise_name"]
  229. cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhong_biao_page_time as zhong_biao_page_time,p.project_name as project_name order by p.zhong_biao_page_time desc "%(company_name)
  230. finded = graph.run(cql)
  231. data = json.loads(json.dumps(finded.data()))
  232. _count = 0
  233. list_project = []
  234. for _data in data:
  235. if _count<=2:
  236. if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
  237. list_project.append(_data["project_name"])
  238. _count += 1
  239. item["count"] = _count
  240. item["project"] = str(list_project)
  241. cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company_name)
  242. print(cql)
  243. finded = graph.run(cql)
  244. finded_money = json.loads(json.dumps(finded.data()))
  245. whole_money = 0
  246. for _item in finded_money:
  247. if _item["r.price"] is not None:
  248. whole_money += getUnifyMoney(_item["r.price"])
  249. item["whole_money"] = str(whole_money)
  250. result_queue.put(item)
  251. # filename = "数据导出需求9.11(1)(1).xlsx"
  252. filename = "../data/新建 XLSX 工作表(1).xlsx"
  253. df = pd.read_excel(filename)
  254. list_company = []
  255. for _key in df.keys():
  256. print(_key,len(df[_key]))
  257. for _company in df["公司名称"]:
  258. list_company.append({"enterprise_name":_company,"regCapital":"","legal_person":"","phone":"","industry":"","province":"","count":0})
  259. task_queue = queue.Queue()
  260. result_queue = queue.Queue()
  261. for item in list_company:
  262. task_queue.put(item)
  263. _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
  264. _mult.run()
  265. _dict_item = {}
  266. while(True):
  267. try:
  268. item = result_queue.get(False)
  269. if item["enterprise_name"]!="":
  270. _dict_item[item["enterprise_name"]] = item
  271. except Exception as e:
  272. print(str(e))
  273. break
  274. list_count = []
  275. list_project = []
  276. list_money = []
  277. list_zb = []
  278. for _company in df["公司名称"]:
  279. if _company in _dict_item:
  280. list_count.append(_dict_item[_company]["count"])
  281. list_project.append(_dict_item[_company]["project"])
  282. list_money.append(_dict_item[_company]["whole_money"])
  283. list_zb.append("是" if _dict_item[_company]["count"]>0 else "否")
  284. else:
  285. print(_company)
  286. list_count.append(0)
  287. list_project.append("")
  288. list_money.append("0")
  289. list_zb.append("否")
  290. print(len(list_count),len(list_project),len(list_money),len(list_zb))
  291. df2 = pd.DataFrame({"公司名称":df["公司名称"],"次数":list_count})
  292. df2.to_excel("%s_export.xls"%filename)
  293. # df1 = pd.DataFrame({"月份":df["月份"],"电话":df["电话"],"公司名字":df["公司名字"],"开通时间":df["开通时间"],
  294. # "到期时间":df["到期时间"],"客户公司注册时间":df["客户公司注册时间"],"客户公司注册资金":df["客户公司注册资金"],
  295. # "实际缴费资金":df["实际缴费资金"],"天眼查行业分类":df["天眼查行业分类"],"是否中标":list_zb,
  296. # "中标次数":list_count,"中标项目|3个":list_project,"中标金额":list_money,"客户设置关键词":df["客户设置关键词"],"客户搜索词":df["客户搜索词"].xls})
  297. # df1.to_excel("%s_补充.xls"%filename,columns=["月份","电话","公司名字", "开通时间" ,"到期时间" ,"客户公司注册时间" ,"客户公司注册资金" ,"实际缴费资金" ,"天眼查行业分类" ,"是否中标" ,"中标次数" ,"中标项目|3个" ,"中标金额" ,"客户设置关键词" ,"客户搜索词"])
  298. def deal3():
  299. filename = "../data/导出工厂.xlsx"
  300. df = pd.DataFrame(filename)
  301. count = 0
  302. for item in df["实缴"]:
  303. if getUnifyMoney(item)>getUnifyMoney("5000万"):
  304. count += 1
  305. print(count)
  306. def exportEnterpriseByName():
  307. df = pd.read_csv("../data/中标家具公司.csv",encoding="GBK")
  308. def _handle(item,result_queue,pool_ots):
  309. ots_client = pool_ots.getConnector()
  310. primary_key = [('name',str(item["name"]))]
  311. columns_to_get = ["reg_capital","actual_capital","contacts","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]
  312. consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)
  313. print(return_row)
  314. for _item in return_row.attribute_columns:
  315. if _item[0]=="contacts":
  316. a = json.loads(_item[1])
  317. for i in a:
  318. if i.get("mobile_no","")==item["phone"] or i.get("phone_no","")==item["phone"]:
  319. item["contact_person"] = i.get("contact_person","")
  320. else:
  321. item[_item[0]] = _item[1]
  322. list_dict = []
  323. for name,phone in zip(df["name"],df["phone"]):
  324. list_dict.append({"name":name,"phone":phone})
  325. task_queue = queue.Queue()
  326. for item in list_dict:
  327. task_queue.put(item)
  328. result_queue = queue.Queue()
  329. pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
  330. mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=70,pool_ots=pool_ots)
  331. mt.run()
  332. columns = ["name","contact_person","phone","reg_capital","actual_capital","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]
  333. df_data = {}
  334. for _c in columns:
  335. df_data[_c] = []
  336. for item in list_dict:
  337. for _key in columns:
  338. df_data[_key].append(item.get(_key,""))
  339. df1 = pd.DataFrame(df_data)
  340. df1.to_csv("中标家具公司1.csv")
  341. def getCompanys():
  342. conn = getConnection_mysql()
  343. cursor = conn.cursor()
  344. sql = '''select C.login_id as 登陆名,B.company ,B.contactname as 联系人,B.phone as 联系电话 ,(select MLEVELNAME from sys_memberlevel where id =A.memberlevelid) as 会员等级,( select name from b2c_mall_staff_basic_info where userid=B.aftermarket) as 售后客服 from bxkc.bxkc_member_term A,bxkc.b2c_mall_staff_basic_info B,bxkc.b2c_user_login_info C
  345. where A.USERID=B.USERID and B.USERID=C.USERID and B.innerOrg like '广州%'
  346. and A.memberlevelid!=81 and A.status='01' and str_to_date('2020-11-20','%Y-%m-%d') between A.stime and A.etiem ;
  347. '''
  348. cursor.execute(sql)
  349. vol = cursor.description
  350. list_company = []
  351. rows = cursor.fetchall()
  352. for row in rows:
  353. _company = {}
  354. for _vol,_value in zip(vol,row):
  355. _name = _vol[0]
  356. _company[_name] = _value
  357. list_company.append(_company)
  358. return list_company
  359. def exportEnterprise_byindustry(page_time,
  360. columns = ["name","address","business_scope","province","city","district","reg_capital","phone","estiblish_time"],
  361. keywords = ["钢材","水泥","五金","水电","暖通","暖气","电缆"]):
  362. list_should_q = []
  363. for _key in keywords:
  364. list_should_q.append(WildcardQuery("industry","*%s*"%_key))
  365. list_should_q.append(WildcardQuery("nicknames","*%s*"%_key))
  366. key_query = BoolQuery(should_queries=list_should_q)
  367. #WildcardQuery("industry","*建筑*")
  368. ots_client = getConnect_ots()
  369. bool_query = BoolQuery(must_queries=[RangeQuery("bidi_id",0,include_lower=True),
  370. key_query,
  371. RangeQuery("estiblish_time",range_to="2017-01-01")])
  372. rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
  373. SearchQuery(bool_query, limit=100, get_total_count=True),
  374. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  375. all_rows = 0
  376. df_data = {}
  377. for key in columns:
  378. df_data[key] = []
  379. for row in rows:
  380. _dict = dict()
  381. for part in row:
  382. for item in part:
  383. _dict[item[0]] = item[1]
  384. for key in columns:
  385. df_data[key].append(_dict.get(key,""))
  386. # if "reg_capital" in _dict:
  387. # _money = re.match("\d+[万亿千百十]",_dict["reg_capital"])
  388. # if _money is not None:
  389. # if getUnifyMoney(_money.group())>2000000:
  390. # for key in columns:
  391. # df_data[key].append(_dict.get(key,""))
  392. all_rows += len(rows)
  393. # print(next_token)
  394. while(next_token):
  395. rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
  396. SearchQuery(bool_query, next_token=next_token,limit=100, get_total_count=True),
  397. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  398. for row in rows:
  399. _dict = dict()
  400. for part in row:
  401. for item in part:
  402. _dict[item[0]] = item[1]
  403. for key in columns:
  404. df_data[key].append(_dict.get(key,""))
  405. # if "reg_capital" in _dict:
  406. # _money = re.match("\d+[万亿千百十]",_dict["reg_capital"])
  407. # if _money is not None:
  408. # if getUnifyMoney(_money.group())>2000000:
  409. # for key in columns:
  410. # df_data[key].append(_dict.get(key,""))
  411. all_rows += len(rows)
  412. print(all_rows,total_count,len(df_data[columns[0]]))
  413. df = pd.DataFrame(df_data)
  414. df.to_csv("../data/enterprise_2017_a.csv",columns=columns)
  415. def getTyc_company():
  416. root_path = ["G:/文档/tyc国企","G:/文档/tyc机构"]
  417. list_files = []
  418. for _path in root_path:
  419. for file in os.listdir(_path):
  420. list_files.append(os.path.join(_path,file))
  421. list_files = ["G:/文档/tyc机构\\高级搜索导出数据结果—自定义条件—天眼查(W20011656561610789770227).xlsx"]
  422. pool_mysql = ConnectorPool(method_init=getConnection_testmysql,init_num=10,max_num=30)
  423. task_queue = queue.Queue()
  424. result_queue = queue.Queue()
  425. for _file in list_files:
  426. task_queue.put(_file)
  427. def _handle(_file,task_queue,pool_mysql):
  428. print("handle",_file)
  429. conn = pool_mysql.getConnector()
  430. cursor = conn.cursor()
  431. df = pd.read_excel(_file,header=2)
  432. for name,social_credit,identification,regist_num,organization_code in zip(df["公司名称"],df["统一社会信用代码"],df["纳税人识别号"],df["注册号"],df["组织机构代码"]):
  433. try:
  434. sql = " insert into Enterprise(name,social_credit,identification,regist_num,organization_code) values ('%s','%s','%s','%s','%s')"%(name,social_credit,identification,regist_num,organization_code)
  435. cursor.execute(sql)
  436. except Exception as e:
  437. print("error")
  438. conn.commit()
  439. pool_mysql.putConnector(conn)
  440. mt = MultiThreadHandler(task_queue,_handle,result_queue,20,pool_mysql=pool_mysql)
  441. mt.run()
  442. set_columns = set()
  443. list_df_columns = []
  444. def set_dict_item(_dict,name,v):
  445. _dict[name] = getLegal_str(v)
  446. if name not in set_columns:
  447. set_columns.add(name)
  448. list_df_columns.append(getLegal_str(name))
  449. def exportEnterprise_by_bidNum():
  450. columns = ["name","contacts","province","city","address","reg_location"]
  451. list_data = []
  452. ots_client = getConnect_ots()
  453. bool_query = BoolQuery(must_not_queries=[
  454. ExistsQuery("tyc_id"),
  455. RangeQuery("bid_number",1),
  456. RangeQuery("status",401,451),
  457. BoolQuery(should_queries=[NestedQuery("contacts",ExistsQuery("contacts.phone_no")),
  458. NestedQuery("contacts",ExistsQuery("contacts.mobile_no"))])
  459. ])
  460. for _prov in ["北京","天津"]:
  461. bool_query = BoolQuery(must_queries=[BoolQuery(should_queries=[TermQuery("province",_prov)]),
  462. BoolQuery(should_queries=[MatchPhraseQuery("nicknames","地产"),MatchPhraseQuery("nicknames","酒店")]),
  463. NestedQuery("contacts",WildcardQuery("contacts.mobile_no","1*"))])
  464. #
  465. # bool_query = BoolQuery(must_queries=[MatchPhraseQuery("nicknames","物资回收"),
  466. # TermQuery("province","贵州")]
  467. # ,must_not_queries=[ExistsQuery("tyc_id"),NestedQuery("contacts",ExistsQuery("contacts"))]
  468. # )
  469. rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
  470. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("tyc_id",SortOrder.ASC)]), limit=100, get_total_count=True),
  471. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  472. def getData(df_data,rows):
  473. list_dict = getRow_ots(rows)
  474. for _dict in list_dict:
  475. print(_dict)
  476. for mobile_person,mobile_no in getMobiles(_dict.get("contacts","[]")):
  477. # for contact_person,mobile_no in getMobiles(_dict.get("contacts","[{}]")):
  478. _d = {}
  479. set_dict_item(_d,"名称",_dict.get("name",""))
  480. set_dict_item(_d,"省份",_dict.get("province",""))
  481. set_dict_item(_d,"城市",_dict.get("city",""))
  482. set_dict_item(_d,"联系人",mobile_person)
  483. set_dict_item(_d,"手机",mobile_no)
  484. list_data.append(_d)
  485. # _d = {}
  486. # set_dict_item(_d,"名称",_dict.get("name",""))
  487. # set_dict_item(_d,"省份",_dict.get("province",""))
  488. # set_dict_item(_d,"城市",_dict.get("city",""))
  489. # list_data.append(_d)
  490. # mobile_person,mobile_no = getOneContact(_dict.get("contacts"))
  491. # if mobile_no!="":
  492. # set_dict_item(_d,"联系人",mobile_person)
  493. # set_dict_item(_d,"手机",mobile_no)
  494. # # _address = _dict.get("address","")
  495. # # reg_location = _dict.get("reg_location","")
  496. # # if _address=="":
  497. # # _address = reg_location
  498. # # set_dict_item(_d,"地址",_address)
  499. # list_data.append(_d)
  500. getData(df_data,rows)
  501. _count = len(rows)
  502. while(next_token):
  503. print("%d/%d"%(_count,total_count))
  504. rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
  505. SearchQuery(bool_query, next_token=next_token,limit=100, get_total_count=True),
  506. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  507. getData(df_data,rows)
  508. _count += len(rows)
  509. if _count>=300:
  510. break
  511. df_data = {}
  512. for item in list_data:
  513. for k in list_df_columns:
  514. if k not in df_data:
  515. df_data[k] = []
  516. df_data[k].append(item.get(k))
  517. df = pd.DataFrame(df_data)
  518. df.to_excel("../data/%s_enterprise_bidinum.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=list_df_columns)
  519. def make_Legal_enterprise():
  520. import codecs
  521. def format(_e):
  522. if _e is None:
  523. return None
  524. if not isinstance(_e,str):
  525. return None
  526. if re.search("^[a-zA-Z0-9]+$",_e) is not None:
  527. return None
  528. if re.search("[<《]>-。\-\.\?]",_e) is not None:
  529. return None
  530. _e1 = re.sub("\s+","",_e.replace("(","(").replace(")",")"))
  531. if re.search("[省市区县乡镇]$",_e) is not None:
  532. return None
  533. if len(_e1)>=4:
  534. return _e1
  535. return None
  536. set_enterprise = set()
  537. df = pd.read_csv("../data/other/enterprise_bidinum.csv", encoding="GBK")
  538. _count = 0
  539. for _e in df["name"]:
  540. _count += 1
  541. if _count%10000==0:
  542. print(_count)
  543. _e1 = format(_e)
  544. if _e1 is not None:
  545. set_enterprise.add(_e1)
  546. conn = getConnection_testmysql()
  547. cursor = conn.cursor()
  548. sql = " select name from Enterprise "
  549. cursor.execute(sql)
  550. rows = cursor.fetchmany(10000)
  551. while rows:
  552. for row in rows:
  553. _count += 1
  554. if _count%10000==0:
  555. print(_count)
  556. _e = row[0]
  557. _e1 = format(_e)
  558. if _e1 is not None:
  559. set_enterprise.add(_e1)
  560. rows = cursor.fetchmany(10000)
  561. with codecs.open("../data/other/LEGAL_ENTERPRISE.txt", "w", encoding="UTF8") as f:
  562. for _e in list(set_enterprise):
  563. f.write(_e+"\n")
  564. def getDictEnterprise(list_enterprise,columns_to_get = ["reg_capital","actual_capital","industry","estiblish_time","social_staff_num","zhong_biao_number","tou_biao_number","credit_code"]):
  565. task_queue = queue.Queue()
  566. result_queue= queue.Queue()
  567. for _enterprise in list_enterprise:
  568. task_queue.put(str(_enterprise))
  569. def _handle(item,result_queue,pool_ots):
  570. ots_client = pool_ots.getConnector()
  571. try:
  572. primary_key = [("name",item)]
  573. consumed,return_row,next_token = ots_client.get_row("enterprise",primary_key,columns_to_get,None,1)
  574. dict_data = getRow_ots_primary(return_row)
  575. if dict_data is not None:
  576. result_queue.put({item:dict_data})
  577. except Exception as e:
  578. traceback.print_exc()
  579. pool_ots.putConnector(ots_client)
  580. pool_ots = ConnectorPool(init_num=10,max_num=50,method_init=getConnect_ots)
  581. mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=50,pool_ots=pool_ots)
  582. mt.run()
  583. dict_enterprise = {}
  584. while True:
  585. try:
  586. _dict = result_queue.get(False)
  587. for k,v in _dict.items():
  588. dict_enterprise[k] = v
  589. except Exception as e:
  590. break
  591. return dict_enterprise
  592. def getOneContact(contacts,tojson=True,mobile_first=True,mobile_only=True):
  593. mobile_person = ""
  594. mobile_no = ''
  595. phone_person = ""
  596. phone_no = ''
  597. if contacts is None:
  598. return "",""
  599. try:
  600. if tojson:
  601. list_contacts = json.loads(contacts)
  602. else:
  603. list_contacts = contacts
  604. for _contact in list_contacts:
  605. if _contact.get("mobile_no","")!="":
  606. mobile_person = _contact.get("contact_person","")
  607. mobile_no = _contact.get("mobile_no","")
  608. if _contact.get("phone_no","")!="":
  609. phone_person = _contact.get("contact_person","")
  610. phone_no = _contact.get("phone_no","")
  611. if mobile_first:
  612. if mobile_no!="":
  613. return mobile_person,mobile_no
  614. else:
  615. if mobile_only:
  616. return mobile_person,mobile_no
  617. except Exception as e:
  618. pass
  619. return phone_person,phone_no
  620. def getMobiles(contacts,to_json=True):
  621. if to_json:
  622. list_contacts = json.loads(contacts)
  623. else:
  624. list_contacts = contacts
  625. list_result = []
  626. for _c in list_contacts:
  627. if _c.get("mobile_no","")!="":
  628. list_result.append([_c.get("contact_person",""),_c.get("mobile_no")])
  629. return list_result
  630. def getEnterpriseData(list_enterprise,df_data):
  631. def getEnterpriseData(list_enterprise,df_data):
  632. for _e in list_enterprise:
  633. _dict = {}
  634. set_dict_item(_dict,"公司名称",_e.get("name"))
  635. set_dict_item(_dict,"省份",_e.get("province"))
  636. set_dict_item(_dict,"城市",_e.get("city"))
  637. set_dict_item(_dict,"法人",_e.get("legal_person"))
  638. set_dict_item(_dict,"法人电话",_e.get("phone"))
  639. _match = re.search("^1\d{10}",_e.get("phone",""))
  640. set_dict_item(_dict,"是否手机","是" if _match is not None else "否")
  641. # set_dict_item(_dict,"企业属性",v.get("business_scope",""))
  642. # set_dict_item(_dict,"行业",v.get("industry",""))
  643. # contact_person,mobile_no = getOneContact(v.get("contacts",'[]'))
  644. # set_dict_item(_dict,"所有联系方式",v.get("contacts"))
  645. # set_dict_item(_dict,"联系人",contact_person)
  646. # set_dict_item(_dict,"手机号",mobile_no)
  647. # set_dict_item(_dict,"注册时间",v.get("estiblish_time",""))
  648. # set_dict_item(_dict,"注册资金",v.get("reg_capital",""))
  649. # set_dict_item(_dict,"bid_number",v.get("bid_number",0))
  650. # set_dict_item(_dict,"招标次数",v.get("zhao_biao_number",0))
  651. # set_dict_item(_dict,"投标次数",v.get("tou_biao_number",0))
  652. # set_dict_item(_dict,"中标次数",v.get("zhong_biao_number",0))
  653. # set_dict_item(_dict,"主营产品",v.get("products",""))
  654. for k,v in _dict.items():
  655. if k not in df_data:
  656. df_data[k] = []
  657. df_data[k].append(v)
  658. def exportEnterprise():
  659. def getEnterpriseData(list_enterprise,df_data):
  660. for _e in list_enterprise:
  661. _dict = {}
  662. set_dict_item(_dict,"公司名称",_e.get("name"))
  663. bool_query = BoolQuery(must_queries=[
  664. TermQuery("enterprise_name",_e.get("name"))
  665. ])
  666. rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
  667. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("score",SortOrder.DESC)]),limit=5,get_total_count=False),
  668. ColumnsToGet(["contact_person","position","phone_no"],ColumnReturnType.SPECIFIED))
  669. list_row = getRow_ots(rows)
  670. for _i in range(1,6):
  671. if _i-1<len(list_row):
  672. set_dict_item(_dict,"企业联系人%d"%_i,"%s(%s)\n%s"%(list_row[_i-1].get("contact_person",""),list_row[_i-1].get("position",""),list_row[_i-1].get("phone_no","")))
  673. else:
  674. set_dict_item(_dict,"企业联系人%d"%_i,"")
  675. # 企业状态 行业 机构类型 注册资本 成立时间 企业注册地 企业地址
  676. set_dict_item(_dict,"企业状态",_e.get("reg_status"))
  677. set_dict_item(_dict,"行业",_e.get("industry"))
  678. set_dict_item(_dict,"机构类型","公司")
  679. set_dict_item(_dict,"注册资本",_e.get("reg_capital"))
  680. set_dict_item(_dict,"成立时间",_e.get("found_date"))
  681. set_dict_item(_dict,"企业注册地","%s-%s"%(_e.get("province",""),_e.get("city","")))
  682. set_dict_item(_dict,"企业地址",_e.get("reg_location"))
  683. # _match = re.search("^1\d{10}",_e.get("phone",""))
  684. # set_dict_item(_dict,"是否手机","是" if _match is not None else "否")
  685. # set_dict_item(_dict,"企业属性",v.get("business_scope",""))
  686. # set_dict_item(_dict,"行业",v.get("industry",""))
  687. # contact_person,mobile_no = getOneContact(v.get("contacts",'[]'))
  688. # set_dict_item(_dict,"所有联系方式",v.get("contacts"))
  689. # set_dict_item(_dict,"联系人",contact_person)
  690. # set_dict_item(_dict,"手机号",mobile_no)
  691. # set_dict_item(_dict,"注册时间",v.get("estiblish_time",""))
  692. # set_dict_item(_dict,"注册资金",v.get("reg_capital",""))
  693. # set_dict_item(_dict,"bid_number",v.get("bid_number",0))
  694. # set_dict_item(_dict,"招标次数",v.get("zhao_biao_number",0))
  695. # set_dict_item(_dict,"投标次数",v.get("tou_biao_number",0))
  696. # set_dict_item(_dict,"中标次数",v.get("zhong_biao_number",0))
  697. # set_dict_item(_dict,"主营产品",v.get("products",""))
  698. for k,v in _dict.items():
  699. if k not in df_data:
  700. df_data[k] = []
  701. df_data[k].append(v)
  702. a = '''
  703. '''
  704. sys_keys = splitIntoList(a,"\s")
  705. # data = pd.read_excel("../data/用户投标情况导出.xlsx")
  706. _name_c = "公司名称"
  707. list_enterprise = []
  708. columns = ["province","city","legal_person","phone","reg_status","industry","reg_capital","found_date","reg_location"]
  709. ots_client = getConnect_ots()
  710. bool_query = BoolQuery(must_queries=[
  711. # RangeQuery("zhong_biao_number",1000)
  712. # TermQuery("qualifications_number",0),
  713. # MatchPhraseQuery("nicknames","公司"),
  714. generateBoolShouldQuery(["province"],["四川"],WildcardQuery),
  715. generateBoolShouldQuery([""])
  716. # generateBoolShouldQuery(["province"],["上海","江苏","浙江","安徽","福建","江西","山东"],WildcardQuery),
  717. # generateBoolShouldQuery(["nicknames"],["工程","建筑","建设"],MatchPhraseQuery)
  718. ],
  719. must_not_queries=[RangeQuery("status",401,451)])
  720. rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
  721. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("zhong_biao_number",SortOrder.ASC)]),limit=100,get_total_count=True),
  722. columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
  723. print("total_count",total_count)
  724. list_data = getRow_ots(rows)
  725. list_enterprise.extend(list_data)
  726. while next_token:
  727. rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
  728. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  729. columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
  730. print("%d/%d"%(len(list_enterprise),total_count))
  731. list_data = getRow_ots(rows)
  732. list_enterprise.extend(list_data)
  733. if len(list_enterprise)>=200:
  734. break
  735. # dict_enterprise = getDictEnterprise(data[_name_c][:1050000],
  736. df_data = {}
  737. getEnterpriseData(list_enterprise,df_data)
  738. df = pd.DataFrame(df_data)
  739. df.to_excel("../data/%s企业导出.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=list_df_columns)
  740. import numpy as np
  741. def exportEnterprise_by_phone():
  742. ots_client = getConnect_ots()
  743. filename = "C:\\Users\\Administrator\\Desktop\\用户数据0910.xlsx"
  744. df = pd.read_excel(filename)
  745. astr_phone = df["手机"]
  746. all_count = 0
  747. _begin = 0
  748. int_count = 0
  749. while _begin<5582:
  750. # should_q = []
  751. # print("-=")
  752. # for str_phone,str_enter,int_throw,int_search in zip(astr_phone[_begin:_begin+100],df["公司名称"][_begin:_begin+100],df["浏览条数"][_begin:_begin+100],df["搜索次数"][_begin:_begin+100]):
  753. # if str(str_phone) !="nan" and str(str_enter)!="nan" and str(int_search)=="nan" and str(int_throw)!="nan":
  754. # int_count += 1
  755. # print(str_phone,str_enter,int_throw,int_search)
  756. # _phone = str(int(str_phone))
  757. # # should_q.append(NestedQuery("contacts",TermQuery("contacts.mobile_no",_phone)))
  758. # should_q.append(MatchPhraseQuery("nicknames",str(str_enter)))
  759. # _begin += 100
  760. # if should_q:
  761. # bool_query = BoolQuery(should_queries=should_q)
  762. # rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
  763. # SearchQuery(bool_query,get_total_count=True),
  764. # columns_to_get=ColumnsToGet(["nicknames"],ColumnReturnType.SPECIFIED))
  765. try:
  766. str_enter = str(df["公司名称"][_begin])
  767. consumed, return_row, next_token = ots_client.get_row("enterprise",[('name',str_enter)], ["nicknames"], None, 1)
  768. rows = getRow_ots_primary(return_row)
  769. total_count = len(rows)
  770. _begin += 1
  771. int_count += 1
  772. if total_count>0:
  773. all_count += total_count
  774. print("===",str_enter,int_count,all_count)
  775. except Exception as e:
  776. pass
  777. print("===",int_count,all_count)
  778. def attachColumn():
  779. filename = "../data/中标单位.xlsx"
  780. list_data = {}
  781. list_enterprise = []
  782. df1 = pd.read_excel(filename)
  783. for _name in df1["中标单位"]:
  784. list_enterprise.append(_name)
  785. d_e = getDictEnterprise(list_enterprise,["legal_person","phone"])
  786. df_data = {}
  787. columns = ["name","legal_person","phone"]
  788. for _name in list_enterprise:
  789. for _c in columns:
  790. if _c not in df_data:
  791. df_data[_c] = []
  792. df_data[_c].append(d_e.get(_name).get(_c))
  793. df = pd.DataFrame(df_data)
  794. df.to_excel("%s.attach.xlsx"%(filename))
  795. def transform_enterprise():
  796. conn_source = getConnection_testmysql()
  797. conn_target = getConnection_oracle()
  798. cursor_source = conn_source.cursor()
  799. cursor_target = conn_target.cursor()
  800. sql = " select name,province,city,credit_code,org_number,tax_number from enterprise_build "
  801. cursor_source.execute(sql)
  802. rows_source = cursor_source.fetchmany(10)
  803. excepted = False
  804. _index = 0
  805. while True:
  806. try:
  807. if excepted:
  808. print("==")
  809. for _r in rows_source:
  810. _sql = " insert into BXKC.COMPANY_NAME_INFO(COMPANY_NAME,PROVINCE,CITY,TAX_NUM,ORG_NUM,CREDIT_CODE) values ('%s','%s','%s','%s','%s','%s')"%(_r[0],_r[1],_r[2],_r[5],_r[4],_r[3])
  811. _sql = _sql.replace("None","")
  812. cursor_target.execute(_sql)
  813. conn_target.commit()
  814. excepted = False
  815. else:
  816. _sql = " INSERT ALL"
  817. for _r in rows_source:
  818. _sql += " into BXKC.COMPANY_NAME_INFO(COMPANY_NAME,PROVINCE,CITY,TAX_NUM,ORG_NUM,CREDIT_CODE) values ('%s','%s','%s','%s','%s','%s') "%(_r[0],_r[1],_r[2],_r[5],_r[4],_r[3])
  819. _sql = _sql +" select 1 from dual "
  820. _sql = _sql.replace("None","")
  821. cursor_target.execute(_sql)
  822. conn_target.commit()
  823. excepted = False
  824. except Exception as e:
  825. excepted = True
  826. traceback.print_exc()
  827. rows_source = cursor_source.fetchmany(1000)
  828. _index += 1
  829. print(_index,excepted)
  830. if not rows_source or len(rows_source)==0:
  831. break
  832. def exportEnterprise_GMV():
  833. task_queue = queue.Queue()
  834. ots_client = getConnect_ots()
  835. bool_query = BoolQuery(must_queries=[
  836. RangeQuery("zhong_biao_number",20,100)
  837. ])
  838. rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
  839. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("zhong_biao_number")]),limit=100,get_total_count=True),
  840. ColumnsToGet(["zhao_biao_number"],ColumnReturnType.SPECIFIED))
  841. list_dict = getRow_ots(rows)
  842. for _dict in list_dict:
  843. task_queue.put(_dict)
  844. while next_token:
  845. rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
  846. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  847. ColumnsToGet(["zhao_biao_number"],ColumnReturnType.SPECIFIED))
  848. list_dict = getRow_ots(rows)
  849. for _dict in list_dict:
  850. task_queue.put(_dict)
  851. if task_queue.qsize()>=10000:
  852. break
  853. def _handle(_dict,result_queue,ots_client):
  854. name = _dict.get("name")
  855. bool_query = BoolQuery(must_queries=[
  856. RangeQuery("page_time","2020-01-01","2021-12-31",True,True),
  857. TermQuery("win_tenderer",name)
  858. ])
  859. rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
  860. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=100,get_total_count=True),
  861. ColumnsToGet(["page_time","win_bid_price"],ColumnReturnType.SPECIFIED))
  862. list_rows = getRow_ots(rows)
  863. _dict["c3"] = 0
  864. _dict["c6"] = 0
  865. _dict["c12"] = 0
  866. _dict["c24"] = 0
  867. for _row in list_rows:
  868. page_time = _row.get("page_time")
  869. win_bid_price = _row.get("win_bid_price",0)
  870. if page_time>="2021-10-01":
  871. _dict["c3"] += win_bid_price
  872. _dict["c6"] += win_bid_price
  873. _dict["c12"] += win_bid_price
  874. _dict["c24"] += win_bid_price
  875. elif page_time>="2021-07-01":
  876. _dict["c6"] += win_bid_price
  877. _dict["c12"] += win_bid_price
  878. _dict["c24"] += win_bid_price
  879. elif page_time>="2021-01-01":
  880. _dict["c12"] += win_bid_price
  881. _dict["c24"] += win_bid_price
  882. else:
  883. _dict["c24"] += win_bid_price
  884. while next_token:
  885. ows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
  886. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=100,get_total_count=True),
  887. ColumnsToGet(["page_time","win_bid_price"],ColumnReturnType.SPECIFIED))
  888. list_rows = getRow_ots(rows)
  889. for _row in list_rows:
  890. page_time = _row.get("page_time")
  891. win_bid_price = _row.get("win_bid_price",0)
  892. if page_time>="2021-10-01":
  893. _dict["c3"] += win_bid_price
  894. _dict["c6"] += win_bid_price
  895. _dict["c12"] += win_bid_price
  896. _dict["c24"] += win_bid_price
  897. elif page_time>="2021-07-01":
  898. _dict["c6"] += win_bid_price
  899. _dict["c12"] += win_bid_price
  900. _dict["c24"] += win_bid_price
  901. elif page_time>="2021-01-01":
  902. _dict["c12"] += win_bid_price
  903. _dict["c24"] += win_bid_price
  904. else:
  905. _dict["c24"] += win_bid_price
  906. result_queue.put(_dict)
  907. result_queue = queue.Queue()
  908. mt = MultiThreadHandler(task_queue,_handle,result_queue,30,ots_client=ots_client)
  909. mt.run()
  910. list_item = []
  911. while True:
  912. try:
  913. _dict = result_queue.get(False)
  914. list_item.append(_dict)
  915. except Exception as e:
  916. break
  917. df_data = {"公司名称":[],
  918. "近3个月营收":[],
  919. "近6个月营收":[],
  920. "近12个月营收":[],
  921. "近24个月营收":[]}
  922. for _dict in list_item:
  923. df_data["公司名称"].append(_dict.get("name"))
  924. df_data["近3个月营收"].append(_dict.get("c3"))
  925. df_data["近6个月营收"].append(_dict.get("c6"))
  926. df_data["近12个月营收"].append(_dict.get("c12"))
  927. df_data["近24个月营收"].append(_dict.get("c24"))
  928. df = pd.DataFrame(df_data)
  929. df.to_excel("蚂蚁测试数据.xlsx",columns=["公司名称","近3个月营收","近6个月营收","近12个月营收","近24个月营收"])
  930. def attachColumn1():
  931. filename = "全国剩下数据16570-1(2).xlsx"
  932. df = pd.read_excel(filename)
  933. list_enter = list(set(df["公司名"]))
  934. dict_en = getDictEnterprise(list_enter)
  935. list_zhongbiao = []
  936. for company in df["公司名"]:
  937. _zb = dict_en.get(company,{}).get("zhong_biao_number",0)
  938. if _zb>0:
  939. _c = "是"
  940. else:
  941. _c = "否"
  942. list_zhongbiao.append(_c)
  943. df["是否中标"] = list_zhongbiao
  944. df.to_excel("全国剩下数据16570-1(2)11.xlsx")
  945. def exportContact():
  946. filename = "../data/2023-03-06_190109_to_excel.xlsx"
  947. df = pd.read_excel(filename)
  948. list_ename = df["_id"]
  949. list_dict = []
  950. for _en in list_ename:
  951. if isinstance(_en,(str)) and _en!="":
  952. _dict = {"enterprise_name":_en}
  953. list_dict.append(_dict)
  954. task_queue = queue.Queue()
  955. for _d in list_dict:
  956. task_queue.put(_d)
  957. ots_client = getConnect_ots()
  958. def _handle(_d,result_queue):
  959. _name = _d["enterprise_name"]
  960. bool_query = BoolQuery(must_queries=[TermQuery("name",_name)])
  961. rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
  962. SearchQuery(bool_query,limit=1),
  963. columns_to_get=ColumnsToGet(["reg_location"],return_type=ColumnReturnType.SPECIFIED))
  964. l_data = getRow_ots(rows)
  965. if len(l_data)>0:
  966. _d.update(l_data[0])
  967. bool_query = BoolQuery(must_queries=[TermQuery("enterprise_name",_name),
  968. BoolQuery(should_queries=[TermQuery("is_legal_person",1),
  969. TermQuery("is_mobile",1)])])
  970. rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
  971. SearchQuery(bool_query,limit=5),
  972. columns_to_get=ColumnsToGet(["enterprise_name","contact_person","phone_no","position"],return_type=ColumnReturnType.SPECIFIED))
  973. l_data = getRow_ots(rows)
  974. if len(l_data)>0:
  975. _d.update(l_data[0])
  976. mt = MultiThreadHandler(task_queue,_handle,None,60)
  977. mt.run()
  978. df_data= {}
  979. columns = ["name","contact_person","phone_no","reg_location"]
  980. for _d in list_dict:
  981. if "phone_no" in _d:
  982. for c in columns:
  983. if c not in df_data:
  984. df_data[c] = []
  985. df_data[c].append(getLegal_str(_d.get(c,"")))
  986. df = pd.DataFrame(df_data)
  987. df.to_excel("../data/%s_export_enterprise.xlsx"%(getCurrent_date(format="%Y-%m-%d_%H%M%S")),encoding="utf",columns=columns)
  988. def getTycCompany():
  989. filename = "公司地址(1).xlsx"
  990. df = pd.read_excel(filename)
  991. list_name = df["name"]
  992. task_queue = queue.Queue()
  993. list_data = []
  994. for _i in range(len(list_name)):
  995. _name = list_name[_i]
  996. _d = {"企业名称":_name,
  997. "地址":df["address"][_i],
  998. "注册地址":df["reg_location"][_i]}
  999. task_queue.put(_d)
  1000. list_data.append(_d)
  1001. ots_client = getConnect_ots()
  1002. columns = ["legal_person","phone_number"]
  1003. def _handle(item,result_queue):
  1004. try:
  1005. bool_query = BoolQuery(must_queries=[TermQuery("name",item.get("企业名称"))])
  1006. rows, next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
  1007. SearchQuery(bool_query,limit=1),columns_to_get=ColumnsToGet(column_names=columns,return_type=ColumnReturnType.SPECIFIED))
  1008. item["count"] = len(getRow_ots(rows))
  1009. if item["count"]==1:
  1010. _d = getRow_ots(rows)[0]
  1011. item["法人"] = _d.get("legal_person")
  1012. item["法人电话"] = _d.get("phone_number")
  1013. # item["简称"] = _d.get("alias")
  1014. item["营业状态"] = _d.get("reg_status")
  1015. bool_query = BoolQuery(must_queries=[
  1016. RangeQuery("status",201,301),
  1017. generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[item.get("企业名称")],MatchPhraseQuery)
  1018. ])
  1019. rows, next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
  1020. SearchQuery(bool_query,limit=1,get_total_count=True),columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
  1021. item["公告数量"] = total_count
  1022. bool_query = BoolQuery(must_queries=[
  1023. TermQuery("status",1),
  1024. TermQuery("enterprise_name",item.get("企业名称")),
  1025. ])
  1026. rows, next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
  1027. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("score",SortOrder.DESC)]),limit=10,get_total_count=True),
  1028. columns_to_get=ColumnsToGet(["contact_person","phone_no"],return_type=ColumnReturnType.SPECIFIED))
  1029. list_concat = getRow_ots(rows)
  1030. concat = ""
  1031. for data_i in range(len(list_concat)):
  1032. data = list_concat[data_i]
  1033. concat += "联系人%d%s(%s)\n"%(data_i+1,data.get("contact_person",""),data.get("phone_no",""))
  1034. item["联系人"] = concat
  1035. except Exception:
  1036. traceback.print_exc()
  1037. mt = MultiThreadHandler(task_queue,_handle,None,30)
  1038. mt.run()
  1039. columns = ["企业名称","法人","法人电话","地址","注册地址","公告数量","联系人"]
  1040. df_data = {}
  1041. for data in list_data:
  1042. for c in columns:
  1043. if c not in df_data:
  1044. df_data[c] = []
  1045. df_data[c].append(data.get(c))
  1046. df = pd.DataFrame(df_data)
  1047. df.to_excel("%s.xlsx"%filename,columns=columns)
  1048. if __name__=="__main__":
  1049. # getTyc_company()
  1050. getTycCompany()
  1051. # exportEnterprise_by_bidNum()
  1052. # print(getDictEnterprise(["南宁宏基建筑工程有限责任公司"],["phone"]))
  1053. # exportEnterprise_by_phone()
  1054. # make_Legal_enterprise()
  1055. # transform_enterprise()
  1056. # exportEnterprise()
  1057. # exportContact()
  1058. # attachColumn()
  1059. # attachColumn()
  1060. # ots_client = getConnect_ots()
  1061. # bool_query = BoolQuery(must_queries=[RangeQuery("tyc_id",1,include_lower=True),
  1062. # RangeQuery("bid_number",4,include_lower=True)
  1063. # ])
  1064. # bool_query = BoolQuery(must_queries=[TermQuery("bid_number",0)],
  1065. # must_not_queries=[ExistsQuery("tyc_id"),NestedQuery("contacts",ExistsQuery("contacts"))])
  1066. #
  1067. #
  1068. # columns = ["name","contacts","province","city","address","reg_location"]
  1069. # rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
  1070. # SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("tyc_id",SortOrder.ASC)]), limit=100, get_total_count=True),
  1071. # ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1072. # print(total_count)
  1073. # exportEnterprise_GMV()