# exportEnterprice.py
  1. #coding:UTF8
  2. import sys
  3. import os
  4. sys.path.append("../")
  5. import pandas as pd
  6. from dataSource.source import *
  7. import json
  8. from utils.multiThread import MultiThreadHandler
  9. import queue
  10. from utils.Utils import *
  11. from dataSource.pool import ConnectorPool
  12. import re
  13. from tablestore import *
  14. import traceback
data_path = "../data/"  # base directory for the input/output spreadsheets used below
  16. def getCompanys():
  17. list_company = []
  18. keywords = ["环境","生态","再生","回收","环保"]
  19. provinces = ["广东"]
  20. for _name in keywords:
  21. for _prov in provinces:
  22. data = make_elasticSearch({
  23. "query": {
  24. "bool": {
  25. "must": [
  26. {
  27. "wildcard": {
  28. "name.keyword": "*%s*"%_name
  29. }
  30. }
  31. # ,
  32. # {
  33. # "term": {
  34. # "province.keyword": "%s"%_prov
  35. # }
  36. # }
  37. # ,
  38. # {
  39. # "range": {
  40. # "zhongBiaoNumber": {
  41. # "gt": "0"
  42. # }
  43. # }
  44. # }
  45. ],
  46. "must_not": [ ],
  47. "should": [ ]
  48. }
  49. },
  50. "from": 0,
  51. "size": 1000000,
  52. "sort": [ ],
  53. "aggs": { }
  54. })
  55. print("--",data["hits"]["total"])
  56. for item in data["hits"]["hits"]:
  57. _company = {"enterprise_name":"","regCapital":"","legal_person":"","phone":"","industry":"","province":""}
  58. _company["enterprise_name"] = item["_source"].get("name","")
  59. _company["regCapital"] = item["_source"].get("regCapital","")
  60. _company["zhongBiaoNumber"] = item["_source"].get("zhongBiaoNumber","0")
  61. list_company.append(_company)
  62. # data = make_elasticSearch({
  63. # "query": {
  64. # "bool": {
  65. # "must": [
  66. # {
  67. # "wildcard": {
  68. # "name.keyword": "*电商*"
  69. # }
  70. # }
  71. # ,
  72. # {
  73. # "term": {
  74. # "province.keyword": "北京"
  75. # }
  76. # }
  77. # ,
  78. # {
  79. # "range": {
  80. # "zhongBiaoNumber": {
  81. # "gt": "0"
  82. # }
  83. # }
  84. # }
  85. # ],
  86. # "must_not": [ ],
  87. # "should": [ ]
  88. # }
  89. # },
  90. # "from": 0,
  91. # "size": 10000,
  92. # "sort": [ ],
  93. # "aggs": { }
  94. # })
  95. #
  96. # for item in data["hits"]["hits"]:
  97. # _company = {"enterprise_name":"","regCapital":"","legal_person":"","phone":"","industry":"","province":""}
  98. # _company["enterprise_name"] = item["_source"].get("name","")
  99. # _company["regCapital"] = item["_source"].get("regCapital","")
  100. # list_company.append(_company)
  101. print(len(list_company))
  102. return list_company
def exportFactory():
    """Read company names from ../data/天眼查1(1).xlsx, enrich each with profile
    fields from MongoDB (enterprise_profile) and a bid-win count from neo4j,
    then write <filename>_export.xlsx.
    """
    def _handle(item,result_queue,pool_mongo,pool_neo4j):
        # Worker executed by MultiThreadHandler for one company dict.
        company_name = item["enterprise_name"]
        mongo = pool_mongo.getConnector()
        coll_zb = mongo.enterprise_profile
        rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1, "actualCapital":1,"estiblishTime":1,"legal_person":1,"phone":1 })
        _flag = False
        for row in rows:
            actualCapital = row.get("actualCapital","0")
            # estiblishTime/_captial fed now-disabled filters on capital size
            # and registration date; they are currently unused.
            estiblishTime = row.get("estiblishTime","2020-01-01")
            _captial = re.match("\d+[亿万]+",actualCapital)
            item["legal_person"] = row.get("legal_person","")
            item["phone"] = row.get("phone","")
            item["actualCapital"] = actualCapital
            item["estiblishTime"] = row.get("estiblishTime","")
            _flag = True
            break  # only the first matching profile document is used
        if _flag:
            # NOTE(review): the dict is enqueued before "count" is assigned
            # below; this works only because results are drained after
            # _mult.run() returns, when every worker has finished mutating it.
            result_queue.put(item)
        # NOTE(review): company_name is interpolated into the CQL string; a
        # name containing a quote would break the query.
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name)
        graph = pool_neo4j.getConnector()
        finded = graph.run(cql)
        data = json.loads(json.dumps(finded.data()))
        _count = data[0]["_c"]
        item["count"] = _count
        pool_mongo.putConnector(mongo)
        pool_neo4j.putConnector(graph)
    # list_company = getCompanys()  # alternative source: the ES query above
    list_company = []
    filename = "../data/天眼查1(1).xlsx"
    df1 = pd.read_excel(filename)
    for item in df1["公司名称"]:
        list_company.append({"enterprise_name":item,"regCapital":"","legal_person":"","phone":"","industry":"","province":""})
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for item in list_company:
        task_queue.put(item)
    pool_mongo = ConnectorPool(init_num=10,max_num=50,method_init=getConnect_mongodb)
    pool_neo4j = ConnectorPool(init_num=10,max_num=50,method_init=getConnect_neo4j)
    _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=70,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j)
    _mult.run()
    # Drain the result queue into per-column lists for the DataFrame.
    list_name = []
    list_actualCapital = []
    list_estiblishTime = []
    list_legal_person = []
    list_phone = []
    list_zb = []
    while(True):
        try:
            item = result_queue.get(False)
            list_name.append(item["enterprise_name"])
            list_actualCapital.append(item["actualCapital"])
            list_estiblishTime.append(item["estiblishTime"])
            list_legal_person.append(item["legal_person"])
            list_phone.append(item["phone"])
            list_zb.append(item["count"])
        except:
            # queue.Empty ends the drain; any other error is swallowed too
            break
    df = pd.DataFrame({"公司":list_name,"实缴":list_actualCapital,
                       "注册时间":list_estiblishTime,"联系人":list_legal_person,"联系电话":list_phone,
                       "中标次数":list_zb})
    df.to_excel("%s"%filename+"_export.xlsx",columns=["公司","实缴","注册时间","联系人","联系电话","中标次数"])
  175. def deal():
  176. def _handle(item,result_queue):
  177. graph = getConnect_neo4j()
  178. company_name = item["enterprise_name"]
  179. cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhong_biao_page_time as zhong_biao_page_time,p.project_name as project_name order by p.zhong_biao_page_time desc limit 3"%(company_name)
  180. finded = graph.run(cql)
  181. data = json.loads(json.dumps(finded.data()))
  182. _count = 1
  183. list_project = []
  184. for _data in data:
  185. if _count<=3:
  186. if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
  187. list_project.append(_data["project_name"])
  188. _count += 1
  189. item["project"] = str(list_project)
  190. result_queue.put(item)
  191. file = "../data/北京行业_export.xls"
  192. df = pd.read_excel(file)
  193. list_company = []
  194. for _company,rep,industry,project,count,person,phone in zip(df["公司名字"],df["注册资金"],df["行业"],df["中标项目"],df["中标次数"],df["联系人"],df["联系电话"]):
  195. list_company.append({"enterprise_name":_company,"regCapital":rep,"legal_person":person,"phone":phone,"industry":industry,"province":"","count":count})
  196. task_queue = queue.Queue()
  197. result_queue = queue.Queue()
  198. for item in list_company:
  199. task_queue.put(item)
  200. _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
  201. _mult.run()
  202. list_name = []
  203. list_regCapital = []
  204. list_industry = []
  205. list_count = []
  206. list_person = []
  207. list_phone = []
  208. list_project = []
  209. while(True):
  210. try:
  211. _result = result_queue.get(False)
  212. list_name.append(_result["enterprise_name"])
  213. list_regCapital.append(_result["regCapital"])
  214. list_industry.append(_result["industry"])
  215. list_count.append(_result["count"])
  216. list_person.append(_result["legal_person"])
  217. list_phone.append(_result["phone"])
  218. list_project.append(_result["project"])
  219. except Exception as e:
  220. print(e)
  221. break
  222. df1 = pd.DataFrame({"公司名字":list_name,"注册资金":list_regCapital,"行业":list_industry,"中标项目":list_project,"中标次数":list_count,"联系人":list_person,"联系电话":list_phone})
  223. df1.to_excel("%s_export1.xls"%("北京行业"),columns=["公司名字","注册资金","行业","中标项目","中标次数","联系人","联系电话"])
def deal1():
    """For every company in ../data/新建 XLSX 工作表(1).xlsx, count its winning
    bids in neo4j, collect up to three recent project names and the summed win
    amount, then export the per-company count to <filename>_export.xls.
    """
    def _handle(item,result_queue):
        graph = getConnect_neo4j()
        company_name = item["enterprise_name"]
        # All winning projects for the company, newest first (no limit).
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhong_biao_page_time as zhong_biao_page_time,p.project_name as project_name order by p.zhong_biao_page_time desc "%(company_name)
        finded = graph.run(cql)
        data = json.loads(json.dumps(finded.data()))
        _count = 0
        list_project = []
        for _data in data:
            # NOTE(review): indentation reconstructed — _count increments once
            # per record (total number of wins), while only the first records
            # can contribute project names. Confirm against original intent.
            if _count<=2:
                if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
                    list_project.append(_data["project_name"])
            _count += 1
        item["count"] = _count
        item["project"] = str(list_project)
        # Sum every win amount attached to the ZhongBiaoRelation edges.
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company_name)
        print(cql)
        finded = graph.run(cql)
        finded_money = json.loads(json.dumps(finded.data()))
        whole_money = 0
        for _item in finded_money:
            if _item["r.price"] is not None:
                whole_money += getUnifyMoney(_item["r.price"])
        item["whole_money"] = str(whole_money)
        result_queue.put(item)
    filename = "../data/新建 XLSX 工作表(1).xlsx"
    df = pd.read_excel(filename)
    list_company = []
    for _key in df.keys():
        print(_key,len(df[_key]))  # quick column-length sanity dump
    for _company in df["公司名称"]:
        list_company.append({"enterprise_name":_company,"regCapital":"","legal_person":"","phone":"","industry":"","province":"","count":0})
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for item in list_company:
        task_queue.put(item)
    _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
    _mult.run()
    # Index results by company name so output can follow the sheet's order.
    _dict_item = {}
    while(True):
        try:
            item = result_queue.get(False)
            if item["enterprise_name"]!="":
                _dict_item[item["enterprise_name"]] = item
        except Exception as e:
            print(str(e))
            break
    list_count = []
    list_project = []
    list_money = []
    list_zb = []
    for _company in df["公司名称"]:
        if _company in _dict_item:
            list_count.append(_dict_item[_company]["count"])
            list_project.append(_dict_item[_company]["project"])
            list_money.append(_dict_item[_company]["whole_money"])
            list_zb.append("是" if _dict_item[_company]["count"]>0 else "否")
        else:
            print(_company)  # company missing from results: fill zeros
            list_count.append(0)
            list_project.append("")
            list_money.append("0")
            list_zb.append("否")
    print(len(list_count),len(list_project),len(list_money),len(list_zb))
    # Only the count column is exported; project/money/zb lists fed a fuller
    # export that is currently disabled.
    df2 = pd.DataFrame({"公司名称":df["公司名称"],"次数":list_count})
    df2.to_excel("%s_export.xls"%filename)
  297. def deal3():
  298. filename = "../data/导出工厂.xlsx"
  299. df = pd.DataFrame(filename)
  300. count = 0
  301. for item in df["实缴"]:
  302. if getUnifyMoney(item)>getUnifyMoney("5000万"):
  303. count += 1
  304. print(count)
  305. def exportEnterpriseByName():
  306. df = pd.read_csv("../data/中标家具公司.csv",encoding="GBK")
  307. def _handle(item,result_queue,pool_ots):
  308. ots_client = pool_ots.getConnector()
  309. primary_key = [('name',str(item["name"]))]
  310. columns_to_get = ["reg_capital","actual_capital","contacts","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]
  311. consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)
  312. print(return_row)
  313. for _item in return_row.attribute_columns:
  314. if _item[0]=="contacts":
  315. a = json.loads(_item[1])
  316. for i in a:
  317. if i.get("mobile_no","")==item["phone"] or i.get("phone_no","")==item["phone"]:
  318. item["contact_person"] = i.get("contact_person","")
  319. else:
  320. item[_item[0]] = _item[1]
  321. list_dict = []
  322. for name,phone in zip(df["name"],df["phone"]):
  323. list_dict.append({"name":name,"phone":phone})
  324. task_queue = queue.Queue()
  325. for item in list_dict:
  326. task_queue.put(item)
  327. result_queue = queue.Queue()
  328. pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
  329. mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=70,pool_ots=pool_ots)
  330. mt.run()
  331. columns = ["name","contact_person","phone","reg_capital","actual_capital","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]
  332. df_data = {}
  333. for _c in columns:
  334. df_data[_c] = []
  335. for item in list_dict:
  336. for _key in columns:
  337. df_data[_key].append(item.get(_key,""))
  338. df1 = pd.DataFrame(df_data)
  339. df1.to_csv("中标家具公司1.csv")
def getCompanys():
    """Load active Guangzhou member companies from MySQL.

    NOTE(review): this redefines the module-level getCompanys declared earlier
    in the file; only this MySQL version is visible after import.

    :return: list of dicts keyed by the SQL result column aliases
        (登陆名 / company / 联系人 / 联系电话 / 会员等级 / 售后客服).
    """
    conn = getConnection_mysql()
    cursor = conn.cursor()
    # NOTE(review): "A.etiem" looks like a typo for an end-time column —
    # confirm against the bxkc_member_term schema before relying on it.
    sql = '''select C.login_id as 登陆名,B.company ,B.contactname as 联系人,B.phone as 联系电话 ,(select MLEVELNAME from sys_memberlevel where id =A.memberlevelid) as 会员等级,( select name from b2c_mall_staff_basic_info where userid=B.aftermarket) as 售后客服 from bxkc.bxkc_member_term A,bxkc.b2c_mall_staff_basic_info B,bxkc.b2c_user_login_info C
    where A.USERID=B.USERID and B.USERID=C.USERID and B.innerOrg like '广州%'
    and A.memberlevelid!=81 and A.status='01' and str_to_date('2020-11-20','%Y-%m-%d') between A.stime and A.etiem ;
    '''
    cursor.execute(sql)
    vol = cursor.description  # DB-API column metadata; entry[0] is the alias
    list_company = []
    rows = cursor.fetchall()
    for row in rows:
        _company = {}
        for _vol,_value in zip(vol,row):
            _name = _vol[0]
            _company[_name] = _value
        list_company.append(_company)
    # NOTE(review): cursor/conn are never closed here.
    return list_company
  358. def exportEnterprise_byindustry(page_time,
  359. columns = ["name","address","business_scope","province","city","district","reg_capital","phone","estiblish_time"],
  360. keywords = ["钢材","水泥","五金","水电","暖通","暖气","电缆"]):
  361. list_should_q = []
  362. for _key in keywords:
  363. list_should_q.append(WildcardQuery("industry","*%s*"%_key))
  364. list_should_q.append(WildcardQuery("nicknames","*%s*"%_key))
  365. key_query = BoolQuery(should_queries=list_should_q)
  366. #WildcardQuery("industry","*建筑*")
  367. ots_client = getConnect_ots()
  368. bool_query = BoolQuery(must_queries=[RangeQuery("bidi_id",0,include_lower=True),
  369. key_query,
  370. RangeQuery("estiblish_time",range_to="2017-01-01")])
  371. rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
  372. SearchQuery(bool_query, limit=100, get_total_count=True),
  373. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  374. all_rows = 0
  375. df_data = {}
  376. for key in columns:
  377. df_data[key] = []
  378. for row in rows:
  379. _dict = dict()
  380. for part in row:
  381. for item in part:
  382. _dict[item[0]] = item[1]
  383. for key in columns:
  384. df_data[key].append(_dict.get(key,""))
  385. # if "reg_capital" in _dict:
  386. # _money = re.match("\d+[万亿千百十]",_dict["reg_capital"])
  387. # if _money is not None:
  388. # if getUnifyMoney(_money.group())>2000000:
  389. # for key in columns:
  390. # df_data[key].append(_dict.get(key,""))
  391. all_rows += len(rows)
  392. # print(next_token)
  393. while(next_token):
  394. rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
  395. SearchQuery(bool_query, next_token=next_token,limit=100, get_total_count=True),
  396. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  397. for row in rows:
  398. _dict = dict()
  399. for part in row:
  400. for item in part:
  401. _dict[item[0]] = item[1]
  402. for key in columns:
  403. df_data[key].append(_dict.get(key,""))
  404. # if "reg_capital" in _dict:
  405. # _money = re.match("\d+[万亿千百十]",_dict["reg_capital"])
  406. # if _money is not None:
  407. # if getUnifyMoney(_money.group())>2000000:
  408. # for key in columns:
  409. # df_data[key].append(_dict.get(key,""))
  410. all_rows += len(rows)
  411. print(all_rows,total_count,len(df_data[columns[0]]))
  412. df = pd.DataFrame(df_data)
  413. df.to_csv("../data/enterprise_2017_a.csv",columns=columns)
  414. def getTyc_company():
  415. root_path = ["G:/文档/tyc国企","G:/文档/tyc机构"]
  416. list_files = []
  417. for _path in root_path:
  418. for file in os.listdir(_path):
  419. list_files.append(os.path.join(_path,file))
  420. list_files = ["G:/文档/tyc机构\\高级搜索导出数据结果—自定义条件—天眼查(W20011656561610789770227).xlsx"]
  421. pool_mysql = ConnectorPool(method_init=getConnection_testmysql,init_num=10,max_num=30)
  422. task_queue = queue.Queue()
  423. result_queue = queue.Queue()
  424. for _file in list_files:
  425. task_queue.put(_file)
  426. def _handle(_file,task_queue,pool_mysql):
  427. print("handle",_file)
  428. conn = pool_mysql.getConnector()
  429. cursor = conn.cursor()
  430. df = pd.read_excel(_file,header=2)
  431. for name,social_credit,identification,regist_num,organization_code in zip(df["公司名称"],df["统一社会信用代码"],df["纳税人识别号"],df["注册号"],df["组织机构代码"]):
  432. try:
  433. sql = " insert into Enterprise(name,social_credit,identification,regist_num,organization_code) values ('%s','%s','%s','%s','%s')"%(name,social_credit,identification,regist_num,organization_code)
  434. cursor.execute(sql)
  435. except Exception as e:
  436. print("error")
  437. conn.commit()
  438. pool_mysql.putConnector(conn)
  439. mt = MultiThreadHandler(task_queue,_handle,result_queue,20,pool_mysql=pool_mysql)
  440. mt.run()
  441. def exportEnterprise_by_bidNum():
  442. ots_client = getConnect_ots()
  443. bool_query = BoolQuery(must_queries=[RangeQuery("tyc_id",1,include_lower=True),
  444. RangeQuery("bid_number",4,include_lower=True)
  445. ])
  446. columns = ["name"]
  447. rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
  448. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("tyc_id",SortOrder.ASC)]), limit=100, get_total_count=True),
  449. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  450. df_data = {}
  451. for _key in columns:
  452. df_data[_key] = []
  453. def getData(df_data,rows):
  454. list_dict = getRow_ots(rows)
  455. for _dict in list_dict:
  456. for _key in columns:
  457. _v = _dict.get(_key,"")
  458. if len(_v)>4:
  459. df_data[_key].append(_v)
  460. getData(df_data,rows)
  461. _count = len(rows)
  462. while(next_token):
  463. print("%d/%d"%(_count,total_count))
  464. rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
  465. SearchQuery(bool_query, next_token=next_token,limit=100, get_total_count=True),
  466. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  467. getData(df_data,rows)
  468. _count += len(rows)
  469. df = pd.DataFrame(df_data)
  470. df.to_csv("../data/enterprise_bidinum.csv",columns=columns)
def make_Legal_enterprise():
    """Merge enterprise names from the bid-number CSV export and the MySQL
    Enterprise table into ../data/LEGAL_ENTERPRISE.txt (one cleaned, unique
    name per line, minimum length 4).
    """
    import codecs
    def format(_e):
        # Normalize one candidate name; return None to reject it.
        # NOTE(review): shadows the builtin format() inside this function.
        if _e is None:
            return None
        if not isinstance(_e,str):
            return None
        # purely alphanumeric strings are not company names
        if re.search("^[a-zA-Z0-9]+$",_e) is not None:
            return None
        # NOTE(review): this pattern looks garbled — "[<《]" is a complete
        # character class and the trailing ">-。\-\.\?]" is matched literally;
        # it was probably meant as one punctuation class. Confirm before fixing.
        if re.search("[<《]>-。\-\.\?]",_e) is not None:
            return None
        # normalize fullwidth parentheses and strip all whitespace
        _e1 = re.sub("\s+","",_e.replace("(","(").replace(")",")"))
        # reject names ending in an administrative-region suffix (省市区县乡镇)
        if re.search("[省市区县乡镇]$",_e) is not None:
            return None
        if len(_e1)>=4:
            return _e1
        return None
    set_enterprise = set()
    # names produced by exportEnterprise_by_bidNum()
    df = pd.read_csv("../data/enterprise_bidinum.csv",encoding="GBK")
    _count = 0
    for _e in df["name"]:
        _count += 1
        if _count%10000==0:
            print(_count)  # progress indicator
        _e1 = format(_e)
        if _e1 is not None:
            set_enterprise.add(_e1)
    conn = getConnection_testmysql()
    cursor = conn.cursor()
    sql = " select name from Enterprise "
    cursor.execute(sql)
    # stream the table in 10k-row batches; _count keeps counting from above
    rows = cursor.fetchmany(10000)
    while rows:
        for row in rows:
            _count += 1
            if _count%10000==0:
                print(_count)
            _e = row[0]
            _e1 = format(_e)
            if _e1 is not None:
                set_enterprise.add(_e1)
        rows = cursor.fetchmany(10000)
    with codecs.open("../data/LEGAL_ENTERPRISE.txt","w",encoding="UTF8") as f:
        for _e in list(set_enterprise):
            f.write(_e+"\n")
  516. def getDictEnterprise(list_enterprise,columns_to_get = ["reg_capital","actual_capital","contacts","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]):
  517. task_queue = queue.Queue()
  518. result_queue= queue.Queue()
  519. for _enterprise in list_enterprise:
  520. task_queue.put(_enterprise)
  521. def _handle(item,result_queue,pool_ots):
  522. ots_client = pool_ots.getConnector()
  523. primary_key = [("name",item)]
  524. consumed,return_row,next_token = ots_client.get_row("enterprise",primary_key,columns_to_get,None,1)
  525. dict_data = getRow_ots_primary(return_row)
  526. if dict_data is not None:
  527. result_queue.put({item:dict_data})
  528. pool_ots.putConnector(ots_client)
  529. pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
  530. mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
  531. mt.run()
  532. dict_enterprise = {}
  533. while True:
  534. try:
  535. _dict = result_queue.get(False)
  536. for k,v in _dict.items():
  537. dict_enterprise[k] = v
  538. except Exception as e:
  539. break
  540. return dict_enterprise
  541. def getOneContact(contacts,tojson=True,mobile_first=True):
  542. if tojson:
  543. list_contacts = json.loads(contacts)
  544. else:
  545. list_contacts = contacts
  546. mobile_person = ""
  547. mobile_no = ''
  548. phone_person = ""
  549. phone_no = ''
  550. for _contact in list_contacts:
  551. if _contact.get("mobile_no","")!="":
  552. mobile_person = _contact.get("contact_person","")
  553. mobile_no = _contact.get("mobile_no","")
  554. if _contact.get("phone_no","")!="":
  555. phone_person = _contact.get("phone_no","")
  556. phone_no = _contact.get("phone_no","")
  557. if mobile_first:
  558. return mobile_person,mobile_no
  559. return phone_person,phone_no
if __name__=="__main__":
    # getTyc_company()
    # Entry point: dump high-bid-count enterprise names to CSV, then distill
    # them into the legal-enterprise dictionary file.
    exportEnterprise_by_bidNum()
    make_Legal_enterprise()