exportUtils.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479
  1. #coding:UTF8
  2. from utils.Utils import *
  3. from dataSource.source import getConnect_ots
  4. from utils.multiThread import MultiThreadHandler
  5. from queue import Queue
  6. import json
  7. from utils.hashUtil import aesCipher
  8. import inspect
  9. from tablestore import *
  10. set_columns = set()
  11. list_df_columns = []
  12. from Crypto.Cipher import PKCS1_v1_5 as Cipher_pksc1_v1_5
  13. from Crypto.PublicKey import RSA
  14. def rsa_encrpt(string, public_key):
  15. rsakey = RSA.importKey(public_key) # 读取公钥
  16. cipher = Cipher_pksc1_v1_5.new(rsakey)
  17. # 因为encryptor.encrypt方法其内部就实现了加密再次Base64加密的过程,所以这里实际是通过下面的1和2完成了JSEncrypt的加密方法
  18. encrypt_text = cipher.encrypt(string.encode()) # 1.对账号密码组成的字符串加密
  19. return encrypt_text
  20. def getOneContact(contacts,tojson=True,mobile_first=True,mobile_only=True,high_level_first=True):
  21. mobile_person = ""
  22. mobile_no = ''
  23. phone_person = ""
  24. phone_no = ''
  25. if contacts is None:
  26. return "",""
  27. try:
  28. if isinstance(contacts,str):
  29. list_contacts = json.loads(contacts)
  30. else:
  31. list_contacts = contacts
  32. list_contacts.sort(key=lambda x:x.get("level",0),reverse=True)
  33. for _contact in list_contacts:
  34. _level = _contact.get("level")
  35. is_mobile = _contact.get("is_mobile",0)
  36. if is_mobile==1:
  37. _contact["mobile_no"] = _contact.get("phone_no")
  38. if _contact.get("mobile_no","")!="":
  39. mobile_person = _contact.get("contact_person","")
  40. mobile_no = _contact.get("mobile_no","")
  41. if _level==40:
  42. if mobile_person!="":
  43. mobile_person += "(法人)"
  44. if _contact.get("phone_no","")!="":
  45. phone_person = _contact.get("contact_person","")
  46. phone_no = _contact.get("phone_no","")
  47. if _level==40:
  48. if phone_person!="":
  49. phone_person += "(法人)"
  50. if mobile_first:
  51. if mobile_no!="" and mobile_person!="":
  52. return mobile_person,mobile_no
  53. else:
  54. if mobile_only:
  55. return mobile_person,mobile_no
  56. except Exception as e:
  57. pass
  58. return phone_person,phone_no
  59. def getMobiles(contacts,to_json=True):
  60. if to_json:
  61. list_contacts = json.loads(contacts)
  62. else:
  63. list_contacts = contacts
  64. list_result = []
  65. for _c in list_contacts:
  66. if _c.get("mobile_no","")!="":
  67. list_result.append([_c.get("contact_person",""),_c.get("mobile_no")])
  68. return list_result
  69. def set_dict_item(_dict,name,v):
  70. _dict[name] = getLegal_str(v)
  71. if name not in set_columns:
  72. set_columns.add(name)
  73. list_df_columns.append(getLegal_str(name))
  74. def set_dict_item_columns(set_columns1,list_df_columns1,_dict,name,v):
  75. _dict[name] = getLegal_str(v)
  76. if name not in set_columns1:
  77. set_columns1.add(name)
  78. list_df_columns1.append(getLegal_str(name))
  79. def getRowData_document(df_data,rows,set_line,list_keyword,dict_channel):
  80. # list_data = getRow_ots(rows)
  81. for row in rows:
  82. item = {}
  83. _dict = row
  84. set_dict_item(item,"docid",_dict.get("docid",""))
  85. set_dict_item(item,"公告标题",_dict.get("doctitle",""))
  86. set_dict_item(item,"公告内容",_dict.get("doctextcon",""))
  87. set_dict_item(item,"附件内容",_dict.get("attachmenttextcon",""))
  88. set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
  89. set_dict_item(item,"关键词",",".join(list(set(re.findall("|".join([re.escape(str(a).replace("(","(").replace(")",")")) for a in list_keyword]),re.sub("\s","",str(row.get("doctitle","")+row.get("doctextcon","")+row.get("attachmenttextcon","")).replace("(","(").replace(")",")")))))))
  90. set_dict_item(item,"产品",_dict.get("product",""))
  91. set_dict_item(item,"省份",_dict.get("province",""))
  92. # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
  93. set_dict_item(item,"城市",_dict.get("city",""))
  94. set_dict_item(item,"区县",_dict.get("district",""))
  95. set_dict_item(item,"发布时间",_dict.get("page_time",""))
  96. set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
  97. set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
  98. set_dict_item(item,"创建时间",_dict.get("crtime",""))
  99. set_dict_item(item,"招标方式",_dict.get("bidway",""))
  100. set_dict_item(item,"行业一级分类",_dict.get("industry",""))
  101. set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
  102. set_dict_item(item,"uuid",_dict.get("uuid"))
  103. set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
  104. set_dict_item(item,"项目编号",_dict.get("project_code",""))
  105. set_dict_item(item,"招标单位",_dict.get("tenderee",""))
  106. set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
  107. set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
  108. set_dict_item(item,"代理单位",_dict.get("agency",""))
  109. set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
  110. set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
  111. set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
  112. set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
  113. set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
  114. sub_docs_json = _dict.get("sub_docs_json")
  115. set_tenderer = set()
  116. if sub_docs_json is not None:
  117. for _doc in json.loads(sub_docs_json):
  118. if "win_tenderer" in _doc:
  119. set_dict_item(item,"中标单位",_doc["win_tenderer"])
  120. if "second_tenderer" in _doc:
  121. set_tenderer.add(_doc.get("second_tenderer"))
  122. if "third_tenderer" in _doc:
  123. set_tenderer.add(_doc.get("third_tenderer"))
  124. if "win_tenderee_manager" in _doc:
  125. set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
  126. if "win_tenderee_phone" in _doc:
  127. set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
  128. if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
  129. set_dict_item(item,"中标金额",_doc["win_bid_price"])
  130. if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
  131. set_dict_item(item,"招标金额",_doc["bidding_budget"])
  132. set_dict_item(item,"入围供应商",",".join(list(set_tenderer)))
  133. if "招标金额" not in item:
  134. set_dict_item(item,"招标金额","")
  135. if "中标金额" not in item:
  136. set_dict_item(item,"中标金额","")
  137. if "中标单位" not in item:
  138. set_dict_item(item,"中标单位","")
  139. if "中标单位联系人" not in item:
  140. set_dict_item(item,"中标单位联系人","")
  141. if "中标单位联系电话" not in item:
  142. set_dict_item(item,"中标单位联系电话","")
  143. # if item["中标单位"] not in set_enter:
  144. # continue
  145. _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["中标单位"],str(item["中标金额"]))
  146. # if re.search("[大中小]学|幼儿园|医院|公司",item["招标单位"]) is not None:
  147. # continue
  148. # if _line in set_line:
  149. # continue
  150. # if _dict.get("docid","") in set_ig_docid:
  151. # continue
  152. # if item["招标金额"]=="":
  153. # continue
  154. # set_line.add(_line)
  155. for k,v in item.items():
  156. if k not in df_data:
  157. df_data[k] = []
  158. df_data[k].append(v)
  159. def getDictEnterprise(list_enterprise,columns_to_get = ["reg_capital","actual_capital","industry","estiblish_time","social_staff_num","zhong_biao_number","tou_biao_number","credit_code"]):
  160. task_queue = Queue()
  161. result_queue= Queue()
  162. for _enterprise in list_enterprise:
  163. task_queue.put(str(_enterprise))
  164. def _handle(item,result_queue,ots_client):
  165. try:
  166. primary_key = [("name",item)]
  167. consumed,return_row,next_token = ots_client.get_row("enterprise",primary_key,columns_to_get,None,1)
  168. dict_data = getRow_ots_primary(return_row)
  169. if dict_data is not None:
  170. bool_q = BoolQuery(must_queries=[TermQuery("enterprise_name",item),
  171. TermQuery("status",1)])
  172. rows,next_token,total_count,is_allowed = ots_client.search("enterprise_contact","enterprise_contact_index",
  173. SearchQuery(bool_q,limit=10),
  174. columns_to_get=ColumnsToGet(["contact_person","phone_no","is_mobile","level","is_legal_person","is_manager","is_shareholder"],ColumnReturnType.SPECIFIED))
  175. list_contact = getRow_ots(rows)
  176. dict_data["contacts"] = list_contact
  177. result_queue.put({item:dict_data})
  178. except Exception as e:
  179. traceback.print_exc()
  180. ots_client = getConnect_ots()
  181. mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=50,ots_client=ots_client)
  182. mt.run()
  183. dict_enterprise = {}
  184. while True:
  185. try:
  186. _dict = result_queue.get(False)
  187. for k,v in _dict.items():
  188. dict_enterprise[k] = v
  189. except Exception as e:
  190. break
  191. return dict_enterprise
  192. def splitIntoList(_str,_splitkeys):
  193. list_words = []
  194. for _word in re.split(_splitkeys,_str):
  195. if _word.strip()=="":
  196. continue
  197. list_words.append(_word)
  198. return list_words
  199. def fixContactPerson(df_data,list_df_columns,get_legal_person=False):
  200. set_enterprise = set()
  201. if len(df_data.keys())>0:
  202. for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
  203. set_enterprise.add(_tenderee)
  204. set_enterprise.add(_agency)
  205. set_enterprise.add(_win_tenderer)
  206. if "" in set_enterprise:
  207. set_enterprise.remove("")
  208. if None in set_enterprise:
  209. set_enterprise.remove(None)
  210. dict_enterprise = getDictEnterprise(list(set_enterprise),columns_to_get = ["procurement_system","company_org_type","reg_capital","actual_capital","contacts","estiblish_time","social_staff_num","zhong_biao_number","tou_biao_number","credit_code","legal_person_name","phone_number"])
  211. # conn = getConnection_oracle()
  212. # cursor = conn.cursor()
  213. if len(set_enterprise)>0:
  214. for _i in range(len(df_data["招标单位"])):
  215. _enterprise_name = df_data["招标单位"][_i]
  216. if df_data["招标联系人电话"][_i]=="":
  217. contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
  218. if contacts is not None:
  219. _person,_phone = getOneContact(contacts)
  220. df_data["招标联系人"][_i] = _person
  221. df_data["招标联系人电话"][_i] = _phone
  222. if "信用代码" not in df_data:
  223. df_data["信用代码"] = []
  224. df_data["信用代码"].append(dict_enterprise.get(_enterprise_name,{}).get("credit_code",""))
  225. if "招标人采购系统" not in df_data:
  226. df_data["招标人采购系统"] = []
  227. df_data["招标人采购系统"].append(dict_enterprise.get(_enterprise_name,{}).get("procurement_system",""))
  228. if "招标人类型" not in df_data:
  229. df_data["招标人类型"] = []
  230. df_data["招标人类型"].append(dict_enterprise.get(_enterprise_name,{}).get("company_org_type",""))
  231. # if "原网地址" not in df_data:
  232. # df_data["原网地址"] = []
  233. # if df_data["公告类别"][_i]=="招标公告":
  234. # table_name = "bxkc.T_ZHAO_BIAO_GONG_GAO"
  235. # else:
  236. # table_name = "bxkc.T_ZHONG_BIAO_XIN_XI"
  237. # sql = "select detail_link from %s where id='%s' "%(table_name,df_data["uuid"][_i])
  238. # cursor.execute(sql)
  239. # rows = cursor.fetchall()
  240. # if len(rows)>0:
  241. # df_data["原网地址"].append(rows[0][0])
  242. # else:
  243. # df_data["原网地址"].append("")
  244. _enterprise_name = df_data["代理单位"][_i]
  245. if df_data["代理联系人电话"][_i]=="":
  246. contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
  247. if contacts is not None:
  248. _person,_phone = getOneContact(contacts)
  249. df_data["代理联系人"][_i] = _person
  250. df_data["代理联系人电话"][_i] = _phone
  251. _enterprise_name = df_data["中标单位"][_i]
  252. if get_legal_person:
  253. _person = dict_enterprise.get(_enterprise_name,{}).get("legal_person_name","")
  254. _phone = dict_enterprise.get(_enterprise_name,{}).get("phone_number","")
  255. if len(_phone)==11 and _phone[0]=="1":
  256. df_data["中标单位联系人"][_i] = _person
  257. df_data["中标单位联系电话"][_i] = _phone
  258. else:
  259. if df_data["中标单位联系电话"][_i]=="":
  260. contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
  261. if contacts is not None:
  262. _person,_phone = getOneContact(contacts,mobile_only=True)
  263. df_data["中标单位联系人"][_i] = _person
  264. df_data["中标单位联系电话"][_i] = _phone
  265. list_df_columns.extend(['信用代码','招标人采购系统','招标人类型'])
  266. def generateBoolShouldQuery(list_field,list_should_words,cls):
  267. list_should_q = []
  268. assert isinstance(list_field,(list))
  269. assert isinstance(list_should_words,(list))
  270. assert cls in (TermQuery,MatchPhraseQuery,WildcardQuery,RangeQuery)
  271. for word in list_should_words:
  272. for field in list_field:
  273. if cls in (RangeQuery,):
  274. list_should_q.append(cls(field,*field))
  275. if cls in (WildcardQuery,):
  276. list_should_q.append(cls(field,"*%s*"%word))
  277. else:
  278. list_should_q.append(cls(field,word))
  279. return BoolQuery(should_queries=list_should_q)
  280. # excel 数据处理库
  281. import openpyxl
  282. # excel 数据样式设置类
  283. from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
  284. from openpyxl.styles import Border, Side, colors
  285. import pandas as pd
  286. from openpyxl.utils import get_column_letter
  287. from openpyxl.styles import Font, Alignment
  288. import os
  289. def my_border(t_border, b_border, l_border, r_border):
  290. border = Border(top=Side(border_style=t_border, color=colors.BLACK),
  291. bottom=Side(border_style=b_border, color=colors.BLACK),
  292. left=Side(border_style=l_border, color=colors.BLACK),
  293. right=Side(border_style=r_border, color=colors.BLACK))
  294. return border
  295. #初始化制定区域边框为所有框线
  296. def format_border(ws,s_column, s_index, e_column , e_index):
  297. for row in tuple(ws[s_column + str(s_index):e_column + str(e_index)]):
  298. for cell in row:
  299. cell.border = my_border('thin', 'thin', 'thin', 'thin')
  300. def adjust_excel(source,target,columns=["A","B","C","D"]):
  301. wb = openpyxl.load_workbook(source)
  302. for sheet in wb.sheetnames:
  303. ws = wb[sheet]
  304. df = pd.read_excel(source,sheet_name=sheet)
  305. # 把表头改到最后一行
  306. df.loc[len(df)]=list(df.columns)
  307. list_row_height = []
  308. for col in df.columns:
  309. # 获取列序号
  310. index = list(df.columns).index(col)
  311. # 获取行字母表头
  312. letter = get_column_letter(index+1)
  313. # 获取当前列最大宽度
  314. collen = df[col].apply(lambda x :len(str(x).encode())).max()
  315. # 设置列宽为最大长度比例
  316. _width = min(30,collen*0.9)
  317. ws.column_dimensions[letter].width = _width
  318. for i in df.index:
  319. # 设置单元格对齐方式 Alignment(horizontal=水平对齐模式,vertical=垂直对齐模式,text_rotation=旋转角度,wrap_text=是否自动换行)
  320. alignment = Alignment(horizontal='center', vertical='center', text_rotation=0, wrap_text=True)
  321. format_border(ws,columns[0], 0, columns[-1], len(df)) # 根据实际列数量修改
  322. for j in columns:
  323. ws[j+str(i+1)].alignment = alignment
  324. list_width = []
  325. for col in df.columns:
  326. # 获取列序号
  327. index = list(df.columns).index(col)
  328. # 获取行字母表头
  329. letter = get_column_letter(index+1)
  330. # 获取当前列最大宽度
  331. list_width.append(len(str(df[col][i]).encode()))
  332. ws.row_dimensions[index].height = 20+max(list_width)//30*5
  333. wb.save(target)
  334. def getDocument(list_query,columns,table_name="document",table_index="document_index",thread_count=30,sort_column="page_time"):
  335. task_queue = Queue()
  336. for _q in list_query:
  337. task_queue.put(_q)
  338. print("task_queue_size",task_queue.qsize())
  339. result_queue = Queue()
  340. list_row = []
  341. ots_client = getConnect_ots()
  342. def _handle(_dict,result_queue,ots_client):
  343. try:
  344. item = _dict.get("query")
  345. _limit = _dict.get("limit")
  346. keyword = _dict.get("keyword")
  347. l_rows = []
  348. rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
  349. SearchQuery(item,sort=Sort(sorters=[FieldSort(sort_column,SortOrder.DESC)]),limit=100,get_total_count=True),
  350. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  351. dict_row = getRow_ots(rows)
  352. if keyword is not None:
  353. for _row in dict_row:
  354. _row["keyword"] = keyword
  355. l_rows.extend(dict_row)
  356. log("total count:%d"%total_count)
  357. _count = len(dict_row)
  358. while next_token:
  359. if _limit and len(l_rows)>=_limit:
  360. break
  361. rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
  362. SearchQuery(item,next_token=next_token,limit=100,get_total_count=True),
  363. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  364. dict_row = getRow_ots(rows)
  365. if keyword is not None:
  366. for _row in dict_row:
  367. _row["keyword"] = keyword
  368. l_rows.extend(dict_row)
  369. _count += len(dict_row)
  370. print("%d/%d"%(_count,total_count))
  371. result_queue.put(l_rows)
  372. except Exception as e:
  373. traceback.print_exc()
  374. mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count,ots_client=ots_client)
  375. mt.run()
  376. while 1:
  377. try:
  378. dict_row = result_queue.get(False)
  379. list_row.extend(dict_row)
  380. except Exception as e:
  381. break
  382. return list_row
  383. class ExportEntity():
  384. def __init__(self,table,table_index,list_query,columns,just_get_totol_count=False):
  385. self.table = table
  386. self.table_index = table_index
  387. self.ots_client = getConnect_ots()
  388. self.task_queue = Queue()
  389. self.result_queue = Queue()
  390. for _q in list_query:
  391. self.task_queue.put(_q)
  392. def _handle(self,item,task_queue):
  393. self.ots_client.search(self.table,self.table_index,
  394. SearchQuery())
  395. def export(self):
  396. pass
  397. class A():
  398. pass
  399. def test(a,bc={"a":2}):
  400. pass
  401. if __name__ == '__main__':
  402. # print(dir(test))
  403. # _sign = inspect.signature(test)
  404. # print(_sign.parameters)
  405. # for _o in _sign.parameters:
  406. # print(_sign.parameters[_o].default)
  407. a = A()
  408. print(str(a))