extract_check.py 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651
  1. #coding=utf-8
  2. # evaluate为该方法的入口函数,必须用这个名字
  3. from odps.udf import annotate
  4. from odps.distcache import get_cache_archive
  5. from odps.distcache import get_cache_file
  6. from odps.udf import BaseUDTF
  7. from odps.udf import BaseUDAF
  8. from odps.distcache import get_cache_archive
  9. from odps.distcache import get_cache_file
  10. # 配置pandas依赖包
  11. def include_package_path(res_name):
  12. import os, sys
  13. archive_files = get_cache_archive(res_name)
  14. dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
  15. if '.dist_info' not in f.name], key=lambda v: len(v))
  16. sys.path.append(dir_names[0])
  17. return os.path.dirname(dir_names[0])
  18. # 可能出现类似RuntimeError: xxx has been blocked by sandbox
  19. # 这是因为包含C的库,会被沙盘block,可设置set odps.isolation.session.enable = true
  20. def include_file(file_name):
  21. import os, sys
  22. so_file = get_cache_file(file_name)
  23. sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
  24. def include_so(file_name):
  25. import os, sys
  26. so_file = get_cache_file(file_name)
  27. with open(so_file.name, 'rb') as fp:
  28. content=fp.read()
  29. so = open(file_name, "wb")
  30. so.write(content)
  31. so.flush()
  32. so.close()
  33. #初始化业务数据包,由于上传限制,python版本以及archive解压包不统一等各种问题,需要手动导入
  34. def init_env(list_files,package_name):
  35. import os,sys
  36. if len(list_files)==1:
  37. so_file = get_cache_file(list_files[0])
  38. cmd_line = os.path.abspath(so_file.name)
  39. os.system("unzip -o %s -d %s"%(cmd_line,package_name))
  40. elif len(list_files)>1:
  41. cmd_line = "cat"
  42. for _file in list_files:
  43. so_file = get_cache_file(_file)
  44. cmd_line += " "+os.path.abspath(so_file.name)
  45. cmd_line += " > temp.zip"
  46. os.system(cmd_line)
  47. os.system("unzip -o temp.zip -d %s"%(package_name))
  48. # os.system("rm -rf %s/*.dist-info"%(package_name))
  49. # return os.listdir(os.path.abspath("local_package"))
  50. # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
  51. # os.system("source ~/.bashrc")
  52. sys.path.insert(0,os.path.abspath(package_name))
  53. # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
  54. def load_project():
  55. start_time = time.time()
  56. init_env(["BiddingKG.zip.env.baseline"],str(uuid.uuid4()))
  57. # init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
  58. logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
  59. def load_vector():
  60. start_time = time.time()
  61. init_env(["wiki_128_word_embedding_new.vector.env"],".")
  62. logging.info("init wiki_128_word_embedding_new cost %d"%(time.time()-start_time))
  63. start_time = time.time()
  64. init_env(["enterprise.zip.env"],".")
  65. # init_env(["LEGAL_ENTERPRISE.zip.env"],".")
  66. logging.info("init legal_enterprise.zip.env cost %d"%(time.time()-start_time))
  67. start_time = time.time()
  68. init_env(["so.env"],".")
  69. logging.info("init so.env cost %d"%(time.time()-start_time))
  70. def load_py():
  71. start_time = time.time()
  72. # self.out = init_env(["envs_py37.zip.env"],str(uuid.uuid4()))
  73. include_package_path("envs_py37.env.zip")
  74. logging.info("init envs_py37 cost %d"%(time.time()-start_time))
  75. def multiLoadEnv():
  76. load_project()
  77. load_vector()
  78. load_py()
  79. import json
  80. class MyEncoder(json.JSONEncoder):
  81. def default(self, obj):
  82. if isinstance(obj, np.ndarray):
  83. return obj.tolist()
  84. elif isinstance(obj, bytes):
  85. return str(obj, encoding='utf-8')
  86. elif isinstance(obj, (np.float_, np.float16, np.float32,
  87. np.float64)):
  88. return float(obj)
  89. elif isinstance(obj,(np.int64)):
  90. return int(obj)
  91. return json.JSONEncoder.default(self, obj)
@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string")
class f_json_extract_online(BaseUDTF):
    """UDTF: flatten one document's online extraction columns into a single
    normalized JSON string, shaped like the test-pipeline output so that
    f_compair_extract can diff the two."""
    def __init__(self):
        # Heavy modules are bound via `global` so process() can reach them.
        # NOTE(review): `json`, `time` and `log` appear in the global statement
        # but are not imported here — `json` comes from module level; `time`/`log`
        # look unused/dead, and `MyEncoder` is listed twice. Harmless but untidy.
        import uuid
        global uuid
        import logging
        import datetime
        import numpy as np
        global json,MyEncoder,time,log,MyEncoder,np
    def process(self,page_time,doctitle,
        tenderee,tenderee_contact,tenderee_phone,agency,
        agency_contact,agency_phone,sub_docs_json,project_code,
        project_name,product,time_bidclose,time_bidopen,time_release,
        moneysource,person_review,bidway,punish,serviceTime):
        """Emit one JSON row per input document.

        Scalar columns are normalized to "" when NULL; comma-separated columns
        become lists; `punish` (a JSON object) is merged into the top level;
        `sub_docs_json` (a JSON list of per-package sub-docs) is folded into
        three comparison sets: company-contact, package-company, package-money.
        """
        _dict = {}
        # scalar fields: None -> ""
        _dict["code"] = project_code if project_code is not None else ""
        _dict["name"] = project_name if project_name is not None else ""
        # comma-separated string -> list of products
        if product is not None and product!="":
            _dict["product"] = product.split(",")
        else:
            _dict["product"] = []
        _dict["time_bidclose"] = time_bidclose if time_bidclose is not None else ""
        _dict["time_bidopen"] = time_bidopen if time_bidopen is not None else ""
        _dict["time_release"] = time_release if time_release is not None else ""
        _dict["moneysource"] = moneysource if moneysource is not None else ""
        # comma-separated string -> list of reviewers
        if person_review not in (None,''):
            _dict["person_review"] = person_review.split(",")
        else:
            _dict["person_review"] = []
        _dict["bidway"] = bidway if bidway is not None else ""
        _dict["serviceTime"] = serviceTime if serviceTime is not None else ""
        # punish is itself a JSON object; its keys are merged into the top level
        if punish not in (None,''):
            _punish = json.loads(punish)
        else:
            _punish = {}
        for k,v in _punish.items():
            _dict[k] = v
        # sub_docs_json: JSON list of per-package dicts; [{}] sentinel when absent
        if sub_docs_json not in (None,''):
            _docs = json.loads(sub_docs_json)
        else:
            _docs = [{}]
        # "role-company-contact-phone" strings; only added when a contact exists
        set_comp_contact = set()
        if tenderee not in (None,"") and tenderee_contact not in (None,""):
            set_comp_contact.add("%s-%s-%s-%s"%("tenderee",tenderee,tenderee_contact,tenderee_phone))
        if agency not in (None,"") and agency_contact not in (None,""):
            set_comp_contact.add("%s-%s-%s-%s"%("agency",agency,agency_contact,agency_phone))
        # "package-role-company" strings; tenderee/agency live in the pseudo
        # package "Project"
        set_pack_comp = set()
        if tenderee not in (None,""):
            set_pack_comp.add("%s-%s-%s"%("Project","tenderee",tenderee))
        if agency not in (None,""):
            set_pack_comp.add("%s-%s-%s"%("Project","agency",agency))
        # "package-role-money" strings
        set_pack_money = set()
        for _d in _docs:
            if len(_d.keys())>0:
                sub_project_name = _d.get("sub_project_name","Project")
                bidding_budget = float(_d.get("bidding_budget",0))
                win_tenderer = _d.get("win_tenderer","")
                win_bid_price = float(_d.get("win_bid_price",0))
                win_tenderer_manager = _d.get("win_tenderer_manager","")
                win_tenderer_phone = _d.get("win_tenderer_phone","")
                second_tenderer = _d.get("second_tenderer","")
                second_bid_price = float(_d.get("second_bid_price",0))
                second_tenderer_manager = _d.get("second_tenderer_manager","")
                second_tenderer_phone = _d.get("second_tenderer_phone","")
                third_tenderer = _d.get("third_tenderer","")
                third_bid_price = float(_d.get("third_bid_price",0))
                third_tenderer_manager = _d.get("third_tenderer_manager","")
                third_tenderer_phone = _d.get("third_tenderer_phone","")
                # NOTE(review): the label "win_tenderee" (not "win_tenderer") is
                # kept as-is — the test side must emit the identical string for
                # the diff to line up.
                if win_tenderer not in (None,"") and win_tenderer_manager not in (None,""):
                    set_comp_contact.add("%s-%s-%s-%s"%("win_tenderee",win_tenderer,win_tenderer_manager,win_tenderer_phone))
                if second_tenderer not in (None,"") and second_tenderer_manager not in (None,""):
                    set_comp_contact.add("%s-%s-%s-%s"%("second_tenderer",second_tenderer,second_tenderer_manager,second_tenderer_phone))
                if third_tenderer not in (None,"") and third_tenderer_manager not in (None,""):
                    set_comp_contact.add("%s-%s-%s-%s"%("third_tenderer",third_tenderer,third_tenderer_manager,third_tenderer_phone))
                if win_tenderer not in (None,""):
                    set_pack_comp.add("%s-%s-%s"%(sub_project_name,"win_tenderer",win_tenderer))
                if second_tenderer not in (None,""):
                    set_pack_comp.add("%s-%s-%s"%(sub_project_name,"second_tenderer",second_tenderer))
                if third_tenderer not in (None,""):
                    set_pack_comp.add("%s-%s-%s"%(sub_project_name,"third_tenderer",third_tenderer))
                # NOTE(review): "%2f" is "%f" with min-width 2 (six decimals),
                # not "%.2f"; both pipelines use the same format so diffs align.
                if bidding_budget>0:
                    set_pack_money.add("%s-%s-%2f"%(sub_project_name,"bidding_budget",bidding_budget))
                if win_bid_price>0:
                    set_pack_money.add("%s-%s-%2f"%(sub_project_name,"win_tenderer",win_bid_price))
                if second_bid_price>0:
                    set_pack_money.add("%s-%s-%2f"%(sub_project_name,"second_tenderer",second_bid_price))
                if third_bid_price>0:
                    set_pack_money.add("%s-%s-%2f"%(sub_project_name,"third_tenderer",third_bid_price))
        _dict["set_comp_contact"] = list(set_comp_contact)
        _dict["set_pack_comp"] = list(set_pack_comp)
        _dict["set_pack_money"] = list(set_pack_money)
        self.forward(json.dumps(_dict,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False))
@annotate("string,string->string")
class f_compair_extract(object):
    """UDF: diff the online extraction JSON (f_json_extract_online output)
    against the test-pipeline extraction JSON; returns a JSON report with
    <field>_diff / <field>_online / <field>_test entries per field."""
    def __init__(self):
        import logging
        import re
        import json
        global logging,re,json
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def evaluate(self, json_online,json_result):
        dict_online = json.loads(json_online)
        dict_result = json.loads(json_result)
        logging.info(json_online)
        # dict_test: the test-pipeline result reshaped to the online schema
        dict_test = {}
        set_comp_contact = set()
        set_pack_comp = set()
        set_pack_money = set()
        logging.info("1")
        for k,v in dict_result.items():
            if k in ["bidway","moneysource","time_bidclose","serviceTime","time_bidopen","time_release","name"]:
                # scalar fields copied verbatim
                dict_test[k] = v
            elif k in ["code"]:
                # test side emits a list of codes; keep only the first
                if len(v)>0:
                    dict_test["code"] = v[0]
                else:
                    dict_test["code"] = ""
            elif k in ["person_review","product"]:
                # sort for order-independent comparison
                # NOTE(review): sorts v in place, mutating dict_result
                list_temp = v
                list_temp.sort(key=lambda x:x)
                dict_test[k] = list_temp
            elif k in ["punish"]:
                # punish object is flattened into the top level, as online does
                for k1,v1 in v.items():
                    dict_test[k1] = v1
            elif k in ["prem"]:
                # prem: {package -> {tendereeMoney, roleList}}; fold into the
                # same three comparison sets the online side builds
                for _pack,_prem in v.items():
                    bidding_budget = float(_prem.get("tendereeMoney",0))
                    role_lists = _prem.get("roleList",[])
                    if bidding_budget>0:
                        set_pack_money.add("%s-%s-%2f"%(_pack,"bidding_budget",bidding_budget))
                    for _role in role_lists:
                        # roleList rows look like [type, name, money, contacts];
                        # TODO confirm schema against the test pipeline
                        role_type = _role[0]
                        role_name = _role[1]
                        role_money = 0 if _role[2]=="" else float(_role[2])
                        contact_list = _role[3]
                        for _person,_phone in contact_list:
                            set_comp_contact.add("%s-%s-%s-%s"%(role_type,role_name,_person,_phone))
                        set_pack_comp.add("%s-%s-%s"%(_pack,role_type,role_name))
                        if role_money >0:
                            set_pack_money.add("%s-%s-%2f"%(_pack,role_type,role_money))
        dict_test["set_comp_contact"] = list(set_comp_contact)
        dict_test["set_pack_comp"] = list(set_pack_comp)
        dict_test["set_pack_money"] = list(set_pack_money)
        logging.info(dict_test)
        logging.info("2")
        # compare field by field over the union of keys
        dict_compair = {}
        set_keys_online = set(dict_online.keys())
        set_keys_test = set(dict_test.keys())
        union_keys = list(set_keys_online|set_keys_test)
        logging.info(str(union_keys))
        for _key in union_keys:
            logging.info(_key)
            v_online = dict_online.get(_key,"")
            v_test = dict_test.get(_key,"")
            logging.info(v_online)
            logging.info(v_test)
            if isinstance(v_online,list) or isinstance(v_test,list):
                logging.info("3")
                # a missing side defaults to "" above; treat it as empty list
                if v_online=="":
                    v_online = []
                if v_test=="":
                    v_test = []
                v_online.sort(key=lambda x:x)
                v_test.sort(key=lambda x:x)
                # diff = symmetric difference size
                s_online = set(v_online)
                s_test = set(v_test)
                diff_count = len(s_online-s_test)+len(s_test-s_online)
                dict_compair[_key+"_diff"] = diff_count
                dict_compair[_key+"_online"] = v_online
                dict_compair[_key+"_test"] = v_test
            elif isinstance(v_online,str):
                logging.info("4")
                if v_online==v_test:
                    diff_count = 0
                else:
                    diff_count = 1
                dict_compair[_key+"_diff"] = diff_count
                dict_compair[_key+"_online"] = v_online
                dict_compair[_key+"_test"] = v_test
            # NOTE(review): values that are neither list nor str (e.g. numbers)
            # are silently skipped here
        return json.dumps(dict_compair,sort_keys=True,indent=4,ensure_ascii=False)
  272. import hashlib
  273. def getMD5(sourceHtml):
  274. if sourceHtml is not None and len(sourceHtml)>0:
  275. if isinstance(sourceHtml,str):
  276. bs = sourceHtml.encode()
  277. elif isinstance(sourceHtml,bytes):
  278. bs = sourceHtml
  279. else:
  280. return ""
  281. md5 = hashlib.md5()
  282. md5.update(bs)
  283. return md5.hexdigest()
  284. return ""
  285. def getFingerprint(sourceHtml):
  286. md5 = getMD5(sourceHtml)
  287. if md5!="":
  288. _fingerprint = "md5=%s"%(md5)
  289. else:
  290. _fingerprint = ""
  291. return _fingerprint
  292. @annotate("string,string->string")
  293. class f_getFingerprint(object):
  294. def __init__(self):
  295. import logging
  296. import re
  297. import json
  298. global logging,re,json
  299. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  300. def evaluate(self, doctitle,dochtmlcon):
  301. fingerprint = getFingerprint(doctitle+dochtmlcon)
  302. return fingerprint
  303. @annotate('bigint,string,string,string,string,string,string,string,string->string')
  304. class f_check_dumplicate(BaseUDAF):
  305. '''
  306. 去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
  307. 组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
  308. '''
  309. def __init__(self):
  310. import logging
  311. import json,re
  312. global json,logging,re
  313. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  314. def new_buffer(self):
  315. return [list()]
  316. def iterate(self, buffer,docid,doctitle,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price):
  317. buffer[0].append({"docid":docid,"doctitle":doctitle,"project_code":project_code,"project_name":project_name,
  318. "tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price})
  319. def merge(self, buffer, pbuffer):
  320. buffer[0].extend(pbuffer[0])
  321. def terminate(self, buffer):
  322. list_group = []
  323. list_group.append(buffer[0])
  324. return json.dumps(list_group,ensure_ascii=False)
  325. @annotate('string -> bigint,bigint,string,string,string,string,string,string,string,string')
  326. class f_check_dumplicate_group(BaseUDTF):
  327. '''
  328. 从最后的结果中获取组
  329. '''
  330. def __init__(self):
  331. import logging
  332. import json
  333. global json,logging
  334. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  335. def process(self,list_group):
  336. if list_group is not None:
  337. final_group = json.loads(list_group)
  338. logging.info(list_group)
  339. for _groups in final_group:
  340. for _group in _groups:
  341. self.forward(_groups[0]["docid"],_group["docid"],_group["doctitle"],_group["project_code"],_group["project_name"],_group["tenderee"],_group["agency"],_group["win_tenderer"],_group["bidding_budget"],_group["win_bid_price"])
  342. @annotate('string->bigint')
  343. class f_is_contain(BaseUDAF):
  344. '''
  345. 去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
  346. 组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
  347. '''
  348. def __init__(self):
  349. import logging
  350. import json,re
  351. global json,logging,re
  352. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  353. def new_buffer(self):
  354. return [list()]
  355. def iterate(self, buffer,doctitle):
  356. buffer[0].append(doctitle)
  357. def merge(self, buffer, pbuffer):
  358. buffer[0].extend(pbuffer[0])
  359. def terminate(self, buffer):
  360. is_contain = 1
  361. list_doctitle = buffer[0]
  362. main_doctitle = ""
  363. for _doctitle in list_doctitle:
  364. if _doctitle in main_doctitle or main_doctitle in _doctitle:
  365. if len(_doctitle)>len(main_doctitle):
  366. main_doctitle = _doctitle
  367. else:
  368. is_contain = 0
  369. break
  370. return is_contain
  371. def getSet(list_dict,key):
  372. _set = set()
  373. for item in list_dict:
  374. if key in item:
  375. if item[key]!='' and item[key] is not None:
  376. if re.search("^[\d\.]+$",item[key]) is not None:
  377. _set.add(str(float(item[key])))
  378. else:
  379. _set.add(str(item[key]))
  380. return _set
  381. def split_with_time(list_dict,sort_key,timedelta=86400*2):
  382. if len(list_dict)>0:
  383. if sort_key in list_dict[0]:
  384. list_dict.sort(key=lambda x:x[sort_key])
  385. list_group = []
  386. _begin = 0
  387. for i in range(len(list_dict)-1):
  388. if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<timedelta:
  389. continue
  390. else:
  391. _group = []
  392. for j in range(_begin,i+1):
  393. _group.append(list_dict[j])
  394. if len(_group)>1:
  395. list_group.append(_group)
  396. _begin = i + 1
  397. if len(list_dict)>1:
  398. _group = []
  399. for j in range(_begin,len(list_dict)):
  400. _group.append(list_dict[j])
  401. if len(_group)>1:
  402. list_group.append(_group)
  403. return list_group
  404. return [list_dict]
@annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string->string')
class f_check_dumplicate_1(BaseUDAF):
    '''
    Duplicate check: within each time-window split, the four limit columns
    (project code, winning bidder, etc. — code must be >7 chars, bidder not
    empty, fewer than 2 distinct tenderees, same-announcement-type non-empty
    amounts equal) may each take at most one distinct value, and every
    non-empty contain_column must form one substring chain.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        # buffer[0]: list of row-dicts accumulated for the group
        return [list()]
    def iterate(self, buffer,docid,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column,doctitle,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
            "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
            "contain_column":contain_column,"doctitle":doctitle,"project_code":project_code,"project_name":project_name,
            "tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        """Return the surviving duplicate groups as JSON (list of row lists)."""
        # first split the rows into clusters of nearby publication times
        list_split = split_with_time(buffer[0],"page_time_stamp")
        list_group = []
        for _split in list_split:
            flag = True
            # each limit column must hold at most one distinct non-empty value
            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
            for _key in keys:
                logging.info(_key+str(getSet(_split,_key)))
                if len(getSet(_split,_key))>1:
                    flag = False
                    break
            MAX_CONTAIN_COLUMN = None
            # check that every announcement's contain_column is substring-related
            # to the longest one seen so far (one containment chain)
            if flag:
                for _d in _split:
                    contain_column = _d["contain_column"]
                    if contain_column is not None and contain_column !="":
                        if MAX_CONTAIN_COLUMN is None:
                            MAX_CONTAIN_COLUMN = contain_column
                        else:
                            if len(MAX_CONTAIN_COLUMN)<len(contain_column):
                                if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
                                    flag = False
                                    break
                                MAX_CONTAIN_COLUMN = contain_column
                            else:
                                if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
                                    flag = False
                                    break
            if flag:
                # only real groups (2+ rows) are duplicates worth reporting
                if len(_split)>1:
                    list_group.append(_split)
        return json.dumps(list_group)
  457. @annotate('string->string,string')
  458. class f_splitAttach(BaseUDTF):
  459. def __init__(self):
  460. import logging
  461. import time
  462. global time,logging
  463. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  464. logging.info("start init env")
  465. load_py()
  466. logging.info("init env done")
  467. from bs4 import BeautifulSoup
  468. global BeautifulSoup
  469. def process(self,dochtmlcon):
  470. doctextcon = ""
  471. attachmenttextcon = ""
  472. if dochtmlcon is not None:
  473. _soup = BeautifulSoup(dochtmlcon,"lxml")
  474. _find = _soup.find("div",attrs={"class":"richTextFetch"})
  475. if _find is not None:
  476. attachmenttextcon = _find.get_text()
  477. _find.decompose()
  478. doctextcon = _soup.get_text()
  479. self.forward(doctextcon,attachmenttextcon)
  480. def getTitleFromHtml(filemd5,_html):
  481. _soup = BeautifulSoup(_html,"lxml")
  482. _find = _soup.find("a",attrs={"data":filemd5})
  483. _title = ""
  484. if _find is not None:
  485. _title = _find.get_text()
  486. return _title
  487. def getSourceLinkFromHtml(filemd5,_html):
  488. _soup = BeautifulSoup(_html,"lxml")
  489. _find = _soup.find("a",attrs={"filelink":filemd5})
  490. filelink = ""
  491. if _find is None:
  492. _find = _soup.find("img",attrs={"filelink":filemd5})
  493. if _find is not None:
  494. filelink = _find.attrs.get("src","")
  495. else:
  496. filelink = _find.attrs.get("href","")
  497. return filelink
  498. def turnAttachmentsFromHtml(dochtmlcon,page_attachments):
  499. new_attachments = json.loads(page_attachments)
  500. for _atta in new_attachments:
  501. fileMd5 = _atta.get("fileMd5")
  502. if fileMd5 is not None:
  503. fileTitle = getTitleFromHtml(fileMd5,dochtmlcon)
  504. fileLink = getSourceLinkFromHtml(fileMd5,dochtmlcon)
  505. _atta["fileTitle"] = fileTitle
  506. _atta["fileLink"] = fileLink
  507. print(new_attachments)
  508. return json.dumps(new_attachments,ensure_ascii=False)
  509. @annotate('string,string->string')
  510. class f_turnPageattachments(object):
  511. def evaluate(self,dochtmlcon,page_attachments):
  512. new_page_attachments = None
  513. if page_attachments is not None:
  514. if "fileMd5" in page_attachments:
  515. new_page_attachments = turnAttachmentsFromHtml(dochtmlcon,page_attachments)
  516. return new_page_attachments
  517. @annotate("string->string")
  518. class f_getRoles(BaseUDTF):
  519. def __init__(self):
  520. self.columns = ["win_tenderer","second_tenderer","third_tenderer"]
  521. pass
  522. # bidway名称统一规范
  523. def bidway_integrate(self,sub_docs_json):
  524. if sub_docs_json is not None:
  525. _docs = json.loads(sub_docs_json)
  526. for _doc in _docs:
  527. for _c in self.columns:
  528. if _doc.get(_c) is not None:
  529. self.forward(_doc.get(_c))
  530. def process(self,sub_docs_json):
  531. self.bidway_integrate(sub_docs_json)
  532. @annotate("string->string")
  533. class turn_bidway(BaseUDTF):
  534. def __init__(self):
  535. self.bidway_dict = {'询价': '询价', '竞争性谈判': '竞争性谈判',
  536. '公开比选': '其他', '国内竞争性磋商': '竞争性磋商',
  537. '招标方式:t公开': '公开招标', '竞价': '竞价',
  538. '竞标': '竞价', '电子竞价': '竞价',
  539. '电子书面竞投': '竞价', '单一来源': '单一来源',
  540. '网上竞价': '竞价', '公开招标': '公开招标',
  541. '询比': '询价', '定点采购': '其他',
  542. '招标方式:■公开': '公开招标', '交易其他,付款其他': '其他',
  543. '竞争性评审': '竞争性磋商', '公开招租': '其他', '\\N': '',
  544. '比选': '其他', '比质比价': '其他', '分散采购': '其他',
  545. '内部邀标': '邀请招标', '邀请招标': '邀请招标',
  546. '网上招标': '公开招标', '非定向询价': '询价',
  547. '网络竞价': '竞价', '公开询价': '询价',
  548. '定点采购议价': '其他', '询单': '询价',
  549. '网上挂牌': '其他', '网上直购': '其他',
  550. '定向询价': '询价', '采购方式:公开': '公开招标',
  551. '磋商': '竞争性磋商', '公开招投标': '公开招标',
  552. '招标方式:√公开': '公开招标', '公开选取': '公开招标',
  553. '网上电子投标': '公开招标', '公开竞谈': '竞争性谈判',
  554. '竞争性磋商': '竞争性磋商', '采购方式:邀请': '邀请招标',
  555. '公开竞价': '竞价', '其他': '其他', '公开招募': '其他',
  556. '网上询价': '询价'}
  557. # bidway名称统一规范
  558. def bidway_integrate(self,bidway):
  559. integrate_name = self.bidway_dict.get(bidway,"其他")
  560. return integrate_name
  561. def process(self,bidway):
  562. new_bidway =self.bidway_integrate(bidway)
  563. self.forward(new_bidway)