documentDumplicate.py 91 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165
  1. #coding:UTF8
  2. from odps.udf import annotate
  3. from odps.udf import BaseUDTF
  4. from odps.udf import BaseUDAF
  5. import re
  6. @annotate('string,string -> string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string')
  7. class f_decode_extract(BaseUDTF):
  8. def __init__(self):
  9. import logging
  10. import json
  11. import time,re
  12. global json,logging,time,re
  13. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  14. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  15. self.dict_channel = {"公告变更":51,
  16. "招标公告":52,
  17. "中标信息":101,
  18. "招标预告":102,
  19. "招标答疑":103,
  20. "资审结果":105,
  21. "法律法规":106,
  22. "新闻资讯":107,
  23. "采购意向":114,
  24. "拍卖出让":115,
  25. "土地矿产":116,
  26. "产权交易":117,
  27. "废标公告":118,
  28. "候选人公示":119,
  29. "合同公告":120}
  30. def process(self, extractjson,otherjson):
  31. if extractjson is not None:
  32. _extract = json.loads(extractjson)
  33. else:
  34. _extract = {}
  35. if otherjson is not None:
  36. _other = json.loads(otherjson)
  37. else:
  38. _other = {}
  39. project_code = ""
  40. project_name = ""
  41. tenderee = ""
  42. agency = ""
  43. win_tenderer = ""
  44. bidding_budget = ""
  45. win_bid_price = ""
  46. fingerprint = ""
  47. page_time_stamp = 0
  48. docchannel = 0
  49. extract_count = 0
  50. page_time = _other.get("pageTime",time.strftime('%Y-%m-%d',time.localtime()))
  51. doctitle = _other.get("doctitle","")
  52. doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', doctitle)
  53. area = _other.get("area","")
  54. province = _other.get("province","")
  55. city = _other.get("city","")
  56. district = _other.get("district","")
  57. web_source_no = _other.get("webSourceNo","")
  58. time_bidclose = _extract.get("time_bidclose")
  59. time_bidopen = _extract.get("time_bidopen")
  60. time_bidstart = _extract.get("time_bidstart")
  61. time_commencement = _extract.get("time_commencement")
  62. time_completion = _extract.get("time_completion")
  63. time_earnest_money_end = _extract.get("time_earnestMoneyEnd")
  64. time_earnest_money_start = _extract.get("time_earnestMoneyStart")
  65. time_get_file_end = _extract.get("time_getFileEnd")
  66. time_get_file_start = _extract.get("time_getFileStart")
  67. time_publicity_end = _extract.get("time_publicityEnd")
  68. time_publicity_start = _extract.get("time_publicityStart")
  69. time_registration_end = _extract.get("time_registrationEnd")
  70. time_registration_start = _extract.get("time_registrationStart")
  71. time_release = _extract.get("time_release")
  72. # docchannel = _other.get("docchannel",0)
  73. docchannel_name = _extract.get("docchannel",{}).get("docchannel")
  74. doctype_name = _extract.get("docchannel",{}).get("doctype")
  75. if doctype_name in ["法律法规","新闻资讯","拍卖出让","土地矿产"]:
  76. docchannel_name = doctype_name
  77. docchannel = self.dict_channel.get(docchannel_name,0)
  78. if re.search(self.time_pattern,page_time) is not None:
  79. try:
  80. timeArray = time.strptime(page_time[:11], "%Y-%m-%d")
  81. page_time_stamp = int(time.mktime(timeArray))
  82. except Exception as e:
  83. pass
  84. list_code = _extract.get("code",[])
  85. if len(list_code)>0:
  86. project_code = list_code[0]
  87. project_name = _extract.get("name","")
  88. fingerprint = _extract.get("fingerprint","")
  89. dict_pack = _extract.get("prem",{})
  90. logging.info(dict_pack)
  91. for _key in dict_pack.keys():
  92. if dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
  93. extract_count += 1
  94. if bidding_budget=="":
  95. bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
  96. for _role in dict_pack[_key]["roleList"]:
  97. if isinstance(_role,list):
  98. extract_count += 1
  99. if _role[2]!='' and float(_role[2])>0:
  100. extract_count += 1
  101. if _role[0]=="tenderee":
  102. tenderee = _role[1]
  103. if _role[0]=="win_tenderer":
  104. if win_tenderer=="":
  105. win_tenderer = _role[1]
  106. if _role[2]!='' and float(_role[2])>0:
  107. extract_count += 1
  108. if win_bid_price=="":
  109. win_bid_price = str(float(_role[2]))
  110. if _role[0]=="agency":
  111. agency = _role[1]
  112. if isinstance(_role,dict):
  113. extract_count += 1
  114. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  115. extract_count += 1
  116. if _role["role_name"]=="tenderee":
  117. tenderee = _role["role_text"]
  118. if _role["role_name"]=="win_tenderer":
  119. if win_tenderer=="":
  120. win_tenderer = _role["role_text"]
  121. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  122. extract_count += 1
  123. if win_bid_price=="":
  124. win_bid_price = str(float(_role["role_money"]["money"]))
  125. if _role["role_name"]=="agency":
  126. agency = _role["role_text"]
  127. if project_code!="":
  128. extract_count += 1
  129. if project_name!="":
  130. extract_count += 1
  131. logging.info(page_time+doctitle+doctitle_refine+area+province+city+
  132. district+web_source_no+project_code+project_name+tenderee+agency+win_tenderer+bidding_budget+win_bid_price)
  133. self.forward(page_time,page_time_stamp,docchannel,doctitle,doctitle_refine,area,province,city,
  134. district,web_source_no,fingerprint,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,
  135. time_bidclose,time_bidopen,time_bidstart,time_commencement,time_completion,time_earnest_money_end,time_earnest_money_start,
  136. time_get_file_end,time_get_file_start,time_publicity_end,time_publicity_start,time_registration_end,time_registration_start,time_release)
  137. @annotate("string->string")
  138. class f_get_product(object):
  139. def __init__(self):
  140. import time
  141. global time
  142. import logging
  143. import json
  144. import re
  145. global json,logging,re
  146. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  147. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  148. def evaluate(self, extractjson):
  149. if extractjson is None or extractjson=="":
  150. extractjson = "{}"
  151. _extract = json.loads(extractjson)
  152. return ",".join(_extract.get("product",[]))
  153. @annotate("string->string")
  154. class f_get_package(object):
  155. def __init__(self):
  156. import time
  157. global time
  158. import logging
  159. import json
  160. import re
  161. global json,logging,re
  162. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  163. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  164. def evaluate(self, extractjson):
  165. if extractjson is None or extractjson=="":
  166. extractjson = "{}"
  167. _extract = json.loads(extractjson)
  168. prem = _extract.get("prem",{})
  169. list_pack = []
  170. for k,v in prem.items():
  171. if k!="Project":
  172. list_pack.append(k)
  173. return ",".join(list_pack)
  174. @annotate("string,string->string")
  175. class f_get_nlp_enterprise(object):
  176. def __init__(self):
  177. import time
  178. global time
  179. import logging
  180. import json
  181. import re
  182. global json,logging,re
  183. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  184. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  185. def evaluate(self, doctextcon,extractjson):
  186. if extractjson is None or extractjson=="":
  187. extractjson = "{}"
  188. _extract = json.loads(extractjson)
  189. nlp_enterprise = _extract.get("nlp_enterprise",[])
  190. nlp_enterprise_attachment = _extract.get("nlp_enterprise_attachment",[])
  191. if len(nlp_enterprise)==0 and len(nlp_enterprise_attachment)==0:
  192. dict_pack = _extract.get("prem",{})
  193. for _key in dict_pack.keys():
  194. for _role in dict_pack[_key]["roleList"]:
  195. if isinstance(_role,list):
  196. _entity = _role[1]
  197. nlp_enterprise.append(_entity)
  198. if isinstance(_role,dict):
  199. _entity = _role["role_text"]
  200. nlp_enterprise.append(_entity)
  201. nlp_enterprise = list(set(nlp_enterprise))
  202. dict_entity = {"indoctextcon":nlp_enterprise,
  203. "notindoctextcon":nlp_enterprise_attachment}
  204. return json.dumps(dict_entity,ensure_ascii=False)
  205. @annotate("string->bigint")
  206. class f_get_extractCount(object):
  207. def __init__(self):
  208. import time
  209. global time
  210. import logging
  211. import json
  212. import re
  213. global json,logging,re
  214. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  215. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  216. def evaluate(self, extractjson):
  217. if extractjson is not None:
  218. _extract = json.loads(extractjson)
  219. else:
  220. _extract = {}
  221. dict_pack = _extract.get("prem",{})
  222. extract_count = 0
  223. list_code = _extract.get("code",[])
  224. if len(list_code)>0:
  225. project_code = list_code[0]
  226. else:
  227. project_code = ""
  228. project_name = _extract.get("name","")
  229. bidding_budget = ""
  230. win_tenderer = ""
  231. win_bid_price = ""
  232. for _key in dict_pack.keys():
  233. if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
  234. extract_count += 1
  235. if bidding_budget=="":
  236. bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
  237. for _role in dict_pack[_key]["roleList"]:
  238. if isinstance(_role,list):
  239. extract_count += 1
  240. if _role[2]!='' and float(_role[2])>0:
  241. extract_count += 1
  242. if _role[0]=="tenderee":
  243. tenderee = _role[1]
  244. if _role[0]=="win_tenderer":
  245. if win_tenderer=="":
  246. win_tenderer = _role[1]
  247. if _role[2]!='' and float(_role[2])>0:
  248. extract_count += 1
  249. if win_bid_price=="":
  250. win_bid_price = str(float(_role[2]))
  251. if _role[0]=="agency":
  252. agency = _role[1]
  253. if isinstance(_role,dict):
  254. extract_count += 1
  255. if "role_money" in _role:
  256. if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
  257. extract_count += 1
  258. if _role.get("role_name")=="tenderee":
  259. tenderee = _role["role_text"]
  260. if _role.get("role_name")=="win_tenderer":
  261. if win_tenderer=="":
  262. win_tenderer = _role["role_text"]
  263. if "role_money" in _role:
  264. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  265. extract_count += 1
  266. if win_bid_price=="":
  267. win_bid_price = str(float(_role["role_money"]["money"]))
  268. if _role["role_name"]=="agency":
  269. agency = _role["role_text"]
  270. if project_code!="":
  271. extract_count += 1
  272. if project_name!="":
  273. extract_count += 1
  274. return extract_count
@annotate('string,string,string,string,string -> string,string,string,bigint')
class f_decode_sub_docs_json(BaseUDTF):
    # UDTF: pull win_tenderer / bidding_budget / win_bid_price out of
    # sub_docs_json and count how many extracted fields are present overall.
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def process(self, project_code,project_name,tenderee,agency,sub_docs_json):
        """Forward (win_tenderer, bidding_budget, win_bid_price, extract_count)."""
        columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
        extract_count = 0
        # one point for each non-empty scalar input field
        if project_code is not None and project_code!="":
            extract_count += 1
        if project_name is not None and project_name!="":
            extract_count += 1
        if tenderee is not None and tenderee!="":
            extract_count += 1
        if agency is not None and agency!="":
            extract_count += 1
        if sub_docs_json is not None:
            # sub_docs_json is a JSON list of per-package dicts
            for sub_docs in json.loads(sub_docs_json):
                for _key_sub_docs in sub_docs.keys():
                    # every key present in a sub-doc counts toward extract_count
                    extract_count += 1
                    if _key_sub_docs in columns:
                        # keep the first non-empty, non-"0" value per column
                        if columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
                            if _key_sub_docs in ["bidding_budget","win_bid_price"]:
                                # money columns: accept only positive values,
                                # normalized through float()
                                if float(sub_docs[_key_sub_docs])>0:
                                    columns[_key_sub_docs] = str(float(sub_docs[_key_sub_docs]))
                            else:
                                columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
        self.forward(columns["win_tenderer"],columns["bidding_budget"],columns["win_bid_price"],extract_count)
  305. @annotate("string->bigint")
  306. class totimestamp(object):
  307. def __init__(self):
  308. import time
  309. global time
  310. import logging
  311. import json
  312. import re
  313. global json,logging,re
  314. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  315. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  316. def evaluate(self, str_time):
  317. try:
  318. logging.info(str_time)
  319. if str_time is not None and re.search(self.time_pattern,str_time) is not None:
  320. timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
  321. timeStamp = int(time.mktime(timeArray))
  322. return timeStamp
  323. else:
  324. return 0
  325. except Exception as e:
  326. return 0
  327. @annotate("string->string")
  328. class refind_name(object):
  329. def __init__(self):
  330. import logging
  331. import re
  332. global logging,re
  333. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  334. def evaluate(self, title):
  335. if title is not None:
  336. return re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|\[|\]|【|】', '', title)
  337. return ""
@annotate('bigint,bigint,bigint,string,bigint,string->string')
class f_set_docid(BaseUDAF):
    '''
    Group likely-duplicate documents: rows are bucketed wherever consecutive
    page times are more than 2 days apart, then a bucket is kept depending on
    defind_count and how many distinct defind_column values it contains.
    (Original note: project code, winning bidder, len(code)>7, bidder != "")
    '''
    def __init__(self):
        import json
        global json
    def new_buffer(self):
        # single list accumulating one dict per input row
        return [[]]
    def iterate(self, buffer,docid, page_time_stamp,extract_count,defind_column,defind_count,tenderee):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
                          "defind_column":defind_column,"defind_count":defind_count,"tenderee":tenderee})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        # sort by time so a gap > 2 days between neighbours splits the buckets
        list_docs = buffer[0]
        list_docs.sort(key=lambda x:x["page_time_stamp"])
        list_group = []
        _begin = 0
        defind_count = 0
        if len(list_docs)>0:
            # defind_count is assumed constant within a group; take the first row's
            defind_count = list_docs[0]["defind_count"]
            print(defind_count)
        for i in range(len(list_docs)-1):
            if abs(list_docs[i]["page_time_stamp"]-list_docs[i+1]["page_time_stamp"])<=86400*2:
                continue
            else:
                # a gap of more than 2 days closes the bucket [_begin, i]
                _group = []
                _set_column = set()
                _set_tenderee = set()
                for j in range(_begin,i+1):
                    if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
                        _set_tenderee.add(list_docs[j]["tenderee"])
                    _set_column.add(list_docs[j]["defind_column"])
                    _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
                if len(_group)>=3 and len(_set_tenderee)>1:
                    # 3+ docs naming different tenderees: too risky, drop the bucket
                    pass
                else:
                    print(defind_count,len(_set_column))
                    if len(_group)>1:
                        # defind_count==2 needs >=2 distinct defind_column values,
                        # ==1 needs exactly one, ==0 accepts unconditionally
                        if defind_count==2:
                            if len(_set_column)>=2:
                                list_group.append(_group)
                        elif defind_count==1:
                            if len(_set_column)==1:
                                list_group.append(_group)
                        elif defind_count==0:
                            list_group.append(_group)
                _begin = i+1
        if len(list_docs)>1:
            # flush the trailing bucket [_begin, end) with the same rules
            _set_column = set()
            _set_tenderee = set()
            _group = []
            for j in range(_begin,len(list_docs)):
                if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
                    _set_tenderee.add(list_docs[j]["tenderee"])
                _set_column.add(list_docs[j]["defind_column"])
                _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
            if len(_group)>=3 and len(_set_tenderee)>1:
                pass
            else:
                if len(_group)>1:
                    if defind_count==2:
                        if len(_set_column)>=2:
                            list_group.append(_group)
                    elif defind_count==1:
                        if len(_set_column)==1:
                            list_group.append(_group)
                    elif defind_count==0:
                        list_group.append(_group)
        return json.dumps(list_group)
    # NOTE(review): a commented-out alternative terminate() based on
    # split_with_time(...) was removed here as dead code.
  452. def isEmpty(_str):
  453. if _str is None or _str=="":
  454. return True
  455. return False
  456. @annotate('bigint->string')
  457. class f_group_fingerprint(BaseUDAF):
  458. def __init__(self):
  459. import json
  460. global json
  461. def new_buffer(self):
  462. return [[]]
  463. def iterate(self, buffer,docid):
  464. buffer[0].append(docid)
  465. def merge(self, buffer, pbuffer):
  466. buffer[0].extend(pbuffer[0])
  467. def terminate(self, buffer):
  468. list_docid = buffer[0]
  469. list_docid.sort(key=lambda x:x)
  470. return ",".join([str(a) for a in list_docid])
  471. @annotate('string->bigint,string')
  472. class f_ungroup_fingerprint(BaseUDTF):
  473. def process(self,dumplicates):
  474. list_docid = dumplicates.split(",")
  475. self.forward(int(list_docid[0]),",".join(list_docid[1:]))
  476. @annotate('bigint,bigint,string->string')
  477. class f_dump_probability(BaseUDAF):
  478. '''
  479. 合并组为一条记录
  480. '''
  481. def __init__(self):
  482. import json
  483. global json
  484. def new_buffer(self):
  485. return [[]]
  486. def iterate(self, buffer,docid,page_time_stamp,_type):
  487. buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"type":_type})
  488. def merge(self, buffer, pbuffer):
  489. buffer[0].extend(pbuffer[0])
  490. def terminate(self, buffer):
  491. list_dict = buffer[0]
  492. list_dict = list_dict[:10000]
  493. list_group = split_with_time(list_dict,sort_key="page_time_stamp",timedelta=86400*2)
  494. return json.dumps(list_group)
  495. @annotate('string -> bigint,bigint,bigint,bigint,string')
  496. class f_split_dumplicate_probability(BaseUDTF):
  497. def __init__(self):
  498. import logging
  499. import json
  500. global logging,json
  501. logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  502. def process(self,list_group_str):
  503. logging.info("0")
  504. logging.info(list_group_str)
  505. if list_group_str is not None:
  506. logging.info("1")
  507. try:
  508. list_group = json.loads(list_group_str)
  509. logging.info("2")
  510. for _group in list_group:
  511. if len(_group)>0:
  512. _type = _group[0].get("type","")
  513. logging.info("3%d"%len(list_group))
  514. # _group.sort(key=lambda x:x["page_time_stamp"])
  515. _len = min(100,len(_group))
  516. for _index_i in range(_len):
  517. _count = 0
  518. for _index_j in range(_index_i+1,_len):
  519. if abs(_group[_index_j]["page_time_stamp"]-_group[_index_i]["page_time_stamp"])>86400*120:
  520. break
  521. _count += 1
  522. _docid1 = _group[_index_i]["docid"]
  523. _docid2 = _group[_index_j]["docid"]
  524. if _docid1<_docid2:
  525. self.forward(_docid1,_docid2,1,_len,_type)
  526. else:
  527. self.forward(_docid2,_docid1,1,_len,_type)
  528. except Exception as e:
  529. logging(str(e))
  530. @annotate('bigint,bigint,string->string')
  531. class f_dumplicate_groupPairs(BaseUDAF):
  532. '''
  533. 合并组为一条记录
  534. '''
  535. def __init__(self):
  536. import json
  537. global json
  538. def new_buffer(self):
  539. return [[]]
  540. def iterate(self, buffer,is_exists,counts,_type):
  541. buffer[0].append({"is_exists":is_exists,"counts":counts,"_type":_type})
  542. def merge(self, buffer, pbuffer):
  543. buffer[0].extend(pbuffer[0])
  544. def terminate(self, buffer):
  545. list_dict = buffer[0]
  546. list_dict = list_dict[:10000]
  547. return json.dumps(list_dict)
  548. def check_columns(tenderee_less,tenderee_greater,
  549. agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
  550. win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
  551. bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
  552. flag = True
  553. _set_tenderee = set()
  554. if tenderee_less is not None and tenderee_less!="":
  555. _set_tenderee.add(tenderee_less)
  556. if tenderee_greater is not None and tenderee_greater!="":
  557. _set_tenderee.add(tenderee_greater)
  558. if len(_set_tenderee)>1:
  559. return False
  560. code_sim = getSimilarityOfString(project_code_less,project_code_greater)
  561. if code_sim>0.6 and code_sim<1:
  562. return False
  563. #同批次不同编号
  564. if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
  565. _split_code_less = project_code_less.split("-")
  566. _split_code_greater = project_code_greater.split("-")
  567. if len(_split_code_less)>1 and len(_split_code_greater)>1:
  568. if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
  569. return False
  570. _set_win_tenderer = set()
  571. if win_tenderer_less is not None and win_tenderer_less!="":
  572. _set_win_tenderer.add(win_tenderer_less)
  573. if win_tenderer_greater is not None and win_tenderer_greater!="":
  574. _set_win_tenderer.add(win_tenderer_greater)
  575. if len(_set_win_tenderer)>1:
  576. return False
  577. _set_win_bid_price = set()
  578. if win_bid_price_less is not None and win_bid_price_less!="":
  579. _set_win_bid_price.add(float(win_bid_price_less))
  580. if win_bid_price_greater is not None and win_bid_price_greater!="":
  581. _set_win_bid_price.add(float(win_bid_price_greater))
  582. if len(_set_win_bid_price)>1:
  583. return False
  584. _set_bidding_budget = set()
  585. if bidding_budget_less is not None and bidding_budget_less!="":
  586. _set_bidding_budget.add(float(bidding_budget_less))
  587. if bidding_budget_greater is not None and bidding_budget_greater!="":
  588. _set_bidding_budget.add(float(bidding_budget_greater))
  589. if len(_set_bidding_budget)>1:
  590. return False
  591. return True
  592. import math
  593. def featurnCount(_count,max_count=100):
  594. return max(0,min(1,_count))*(1/math.sqrt(max(1,_count-1)))
  595. def getSimLevel(str1,str2):
  596. str1_null = False
  597. str2_null = False
  598. _v = 0
  599. if str1 is None or str1=="":
  600. str1_null = True
  601. if str2 is None or str2=="":
  602. str2_null = True
  603. if str1_null and str2_null:
  604. _v = 2
  605. elif str1_null and not str2_null:
  606. _v = 4
  607. elif not str1_null and str2_null:
  608. _v = 6
  609. elif not str1_null and not str2_null:
  610. if str1==str2:
  611. _v = 10
  612. else:
  613. _v = 0
  614. return _v
  615. def getLength(_str):
  616. return len(_str if _str is not None else "")
  617. def check_money(bidding_budget_less,bidding_budget_greater,
  618. win_bid_price_less,win_bid_price_greater):
  619. #check saming
  620. budget_is_same = ""
  621. price_is_same = ""
  622. if getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  623. budget_less = float(bidding_budget_less)
  624. budget_greater = float(bidding_budget_greater)
  625. if budget_less!=budget_greater:
  626. if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
  627. budget_is_same = True
  628. if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
  629. budget_is_same = True
  630. if budget_is_same=="":
  631. return False
  632. if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  633. price_less = float(win_bid_price_less)
  634. price_greater = float(win_bid_price_greater)
  635. if price_less!=price_greater:
  636. if max(price_less,price_greater)/min(price_less,price_greater)==10000:
  637. price_is_same = True
  638. if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
  639. price_is_same = True
  640. if price_is_same=="":
  641. return False
  642. return True
  643. def check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  644. tenderee_less,tenderee_greater,
  645. agency_less,agency_greater,
  646. win_tenderer_less,win_tenderer_greater,
  647. similarity=0.85):
  648. def get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,entity_less,entity_greater,similarity):
  649. if getLength(entity_less)>0 and getLength(entity_greater)>0:
  650. if entity_less!=entity_greater:
  651. is_same = ''
  652. _sim = jaccard_score(entity_less,entity_greater)
  653. if _sim>similarity:
  654. is_same = True
  655. if is_same=='':
  656. if str(nlp_enterprise_less).find(entity_greater)>0 or str(nlp_enterprise_greater).find(entity_less)>0:
  657. is_same = True
  658. if is_same=='':
  659. return False
  660. return True
  661. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,similarity):
  662. return False
  663. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,agency_less,agency_greater,similarity):
  664. return False
  665. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,win_tenderer_less,win_tenderer_greater,similarity):
  666. return False
  667. return True
  668. def check_codes(project_codes_less,project_codes_greater):
  669. #check the similarity
  670. is_same = False
  671. is_sim = False
  672. for project_code_less in project_codes_less:
  673. for project_code_greater in project_codes_greater:
  674. code_sim = getSimilarityOfString(project_code_less,project_code_greater)
  675. if code_sim>0.6 and code_sim<1:
  676. is_sim = True
  677. if code_sim==1:
  678. is_same = True
  679. if is_same:
  680. return True
  681. if is_sim:
  682. return False
  683. return True
  684. def check_demand():
  685. return True
# Package / lot number inside a title, e.g. "包1", "标段二", "第Ⅲ包".
# (The optional 第 lost its "?" so titles like 纯木浆8包/箱复印 are no longer
# mistaken for a package number — translated from the original note.)
package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型|项目)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))")
# Project-code-like runs of letters/digits/dashes/brackets.
code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.-]+")
# A bare integer or decimal number (full-string match via re.search + anchors).
num_pattern = re.compile("^\d+(?:\.\d+)?$")
# Runs of Chinese numerals 一..九.
num1_pattern = re.compile("[一二三四五六七八九]+")
# Short location suffix: up to two chars followed by 市/区/镇/县/村/路.
location_pattern = re.compile(".{1,2}[市区镇县村路]")
# Construction / procurement stage keywords; a plain string, passed directly
# to re.findall in check_doctitle.
building_pattern = "工程招标代理|工程设计|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
# Dates like 2021-3-5, 2021/03/05 or 2021年3月5日 (day suffix not required).
date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
def check_doctitle(doctitle_refind_less,doctitle_refind_greater,codes_less=[],code_greater=[]):
    """Heuristically decide whether two refined titles can belong to the same notice.

    Strips the given project codes and any dates from both titles, then:
      1. passes immediately if one title contains the other;
      2. fails if both titles carry a package number and the numbers differ;
      3. fails if the code-like numeric tokens of the titles do not coincide;
      4. fails if Chinese numerals / location suffixes / building keywords
         appear in equal quantity but with different values.
    Returns True when no rule rejects the pair.

    NOTE(review): the mutable [] defaults are never mutated here, so they are
    harmless, but they are a known Python pitfall — confirm before changing.
    """
    # normalize full-width parentheses so the code/package regexes match
    doctitle_refind_less = str(doctitle_refind_less).replace("(","(").replace(")",")")
    doctitle_refind_greater = str(doctitle_refind_greater).replace("(","(").replace(")",")")
    # remove the known project codes from each title
    for _c in codes_less:
        doctitle_refind_less = str(doctitle_refind_less).replace(_c,"")
    for _c in code_greater:
        doctitle_refind_greater = str(doctitle_refind_greater).replace(_c,"")
    # remove dates, which differ between re-publications of the same notice
    doctitle_refind_less = re.sub(date_pattern,"",doctitle_refind_less)
    doctitle_refind_greater = re.sub(date_pattern,"",doctitle_refind_greater)
    #check the package
    # NOTE(review): str(...) above never yields None, so these two guards are
    # effectively dead code — kept as-is.
    if doctitle_refind_less is None:
        doctitle_refind_less = ""
    if doctitle_refind_greater is None:
        doctitle_refind_greater = ""
    _pack1 = None
    _pack2 = None
    #if contain then pass
    if doctitle_refind_less.find(doctitle_refind_greater)>=0 or doctitle_refind_greater.find(doctitle_refind_less)>=0:
        return True
    #check the package in title
    _match = re.search(package_number_pattern,doctitle_refind_less)
    if _match is not None:
        _pack1 = _match.groupdict()["name"]
    _match = re.search(package_number_pattern,doctitle_refind_greater)
    if _match is not None:
        _pack2 = _match.groupdict()["name"]
    # different explicit package numbers -> different lots of the same project
    if _pack1 is not None and _pack2 is not None:
        if _pack1!=_pack2:
            return False
    #check the nums in title
    doctitle_refind_less = re.sub(package_number_pattern,"",doctitle_refind_less)
    doctitle_refind_greater = re.sub(package_number_pattern,"",doctitle_refind_greater)
    #check the nums,location,building in title
    for _p in [code_pattern]:
        num_all_l = re.findall(_p,doctitle_refind_less)
        num_all_g = re.findall(_p,doctitle_refind_greater)
        set_num_l = set()
        set_num_g = set()
        # keep decimals and short (<4 digit) integers; long digit runs are
        # likely residual codes and are ignored
        for _l in num_all_l:
            if re.search(num_pattern,_l) is not None:
                if _l.find(".")>0:
                    set_num_l.add(_l)
                elif len(_l)<4:
                    set_num_l.add(_l)
        for _g in num_all_g:
            if re.search(num_pattern,_g) is not None:
                if _g.find(".")>0:
                    set_num_g.add(_g)
                elif len(_g)<4:
                    set_num_g.add(_g)
        # both titles carry numbers: they must be exactly the same set
        if len(set_num_l)>0 and len(set_num_g)>0:
            if len(set_num_l&set_num_g)!=len(set_num_l):
                return False
    #check location and keywords
    for _p in [num1_pattern,location_pattern,building_pattern]:
        num_all_l = re.findall(_p,doctitle_refind_less)
        num_all_g = re.findall(_p,doctitle_refind_greater)
        set_num_l = set(num_all_l)
        set_num_g = set(num_all_g)
        # only equally-sized token sets are compared; a size difference is
        # tolerated (one title may simply carry more context)
        if len(set_num_l)==len(set_num_g):
            if len(set_num_l&set_num_g)!=len(set_num_l):
                return False
    return True
  756. def check_product(product_less,product_greater,split_char=","):
  757. if getLength(product_less)>0 and getLength(product_greater)>0:
  758. _product_l = product_less.split(split_char)
  759. _product_g = product_greater.split(split_char)
  760. for _l in _product_l:
  761. for _g in _product_g:
  762. if getSimilarityOfString(_l,_g)>=0.8:
  763. return True
  764. return False
  765. return True
  766. def check_package(package_less,package_greater,split_char=","):
  767. if getLength(package_less)>0 and getLength(package_greater)>0:
  768. _product_l = package_less.split(split_char)
  769. _product_g = package_greater.split(split_char)
  770. for _l in _product_l:
  771. for _g in _product_g:
  772. if _l==_g:
  773. return True
  774. return False
  775. return True
  776. def check_time(json_time_less,json_time_greater):
  777. if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
  778. if isinstance(json_time_less,dict):
  779. time_less = json_time_less
  780. else:
  781. time_less = json.loads(json_time_less)
  782. if isinstance(json_time_greater,dict):
  783. time_greater = json_time_greater
  784. else:
  785. time_greater = json.loads(json_time_greater)
  786. for k,v in time_less.items():
  787. if getLength(v)>0:
  788. v1 = time_greater.get(k,"")
  789. if getLength(v1)>0:
  790. if v!=v1:
  791. return False
  792. return True
@annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
class f_dumplicate_featureMatrix(BaseUDTF):
    """UDTF: run the rule checks on a candidate duplicate pair and emit
    (debug_string, probability).  A failed rule emits probability 0 with a
    "[<rule-no>-...]" marker; otherwise the probability is base_prob (from the
    rarity of the context counts) scaled by the fraction of equal fields."""
    def __init__(self):
        import logging
        import json
        global logging,json
    def process(self,json_context,docchannel_less,docchannel_greater,page_time_less,page_time_greater,nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,
                agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
                win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
                bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater,product_less,product_greater):
        #check the page_time by special docchannel
        # for these channels, differing title + differing date => not duplicates
        if docchannel_less in (51,102,103,104,115,116,117):
            if doctitle_refine_less!=doctitle_refine_greater:
                if page_time_less!=page_time_greater:
                    self.forward("[1-%s]"%(str(docchannel_less)),0)
                    return
        if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
            self.forward("[2-%s]"%(str(doctitle_refine_less)+"=="+str(doctitle_refine_greater)),0)
            return
        # if not check_codes([project_code_less],[project_code_greater]):
        #     self.forward("[3-%s]"%(str(project_code_less)+"=="+str(project_code_greater)),0)
        #     return
        if not check_demand():
            self.forward("[4-]",0)
            return
        if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                            tenderee_less,tenderee_greater,
                            agency_less,agency_greater,
                            win_tenderer_less,win_tenderer_greater):
            _error = ""
            for a in [nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater]:
                _error += str(a)
            self.forward("[5-%s]"%_error,0)
            return
        if not check_money(bidding_budget_less,bidding_budget_greater,
                           win_bid_price_less,win_bid_price_greater):
            _error = ""
            for a in [bidding_budget_less,bidding_budget_greater,
                      win_bid_price_less,win_bid_price_greater]:
                _error += str(a)
            self.forward("[6-%s]"%_error,0)
            return
        if not check_product(product_less,product_greater):
            _error = "%s=%s"%(str(product_less),str(product_greater))
            self.forward("7-%s"%_error,0)
            return
        # min_counts = rarity of the rarest shared context value in this pair
        _context = json.loads(json_context)
        min_counts = 100
        dict_context = {}
        for item in _context:
            if item["counts"]<min_counts:
                min_counts = item["counts"]
            dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
        context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
        list_matrix = []
        #get the featurn of the context into matrix
        # (feature-matrix construction kept disabled; replaced by the simple
        #  same_count scoring below)
        # for index_i in range(len(context_key)):
        #     for index_j in range(index_i+1,len(context_key)):
        #         _key = "%s&%s"%(context_key[index_i],context_key[index_j])
        #         _v = featurnCount(dict_context.get(_key,[0,0])[1])
        #         list_matrix.append(_v)
        # context3_key = ["tenderee","agency","win_tenderer","win_bid_price","bidding_budget"]
        # for index_i in range(len(context3_key)):
        #     for index_j in range(index_i+1,len(context3_key)):
        #         for index_k in range(index_j+1,len(context3_key)):
        #             _key = "%s&%s&%s"%(context3_key[index_i],context3_key[index_j],context3_key[index_k])
        #             _v = featurnCount(dict_context.get(_key,[0,0])[1])
        #             list_matrix.append(_v)
        # list_matrix.append(getSimLevel(tenderee_less,tenderee_greater)/10)
        # list_matrix.append(getSimLevel(agency_less,agency_greater)/10)
        # list_matrix.append(getSimilarityOfString(project_code_less,project_code_greater))
        # list_matrix.append(getSimilarityOfString(project_name_less,project_name_greater))
        # list_matrix.append(getSimLevel(win_tenderer_less,win_tenderer_greater)/10)
        # list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
        # list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
        # list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
        json_matrix = json.dumps(list_matrix)
        # count how many of the 8 key fields are identical in the pair
        same_count = 0
        all_count = 8
        if getSimilarityOfString(project_code_less,project_code_greater)==1:
            same_count += 1
        if getSimilarityOfString(tenderee_less,tenderee_greater)==1:
            same_count += 1
        if getSimilarityOfString(agency_less,agency_greater)==1:
            same_count += 1
        if getSimilarityOfString(win_tenderer_less,win_tenderer_greater)==1:
            same_count += 1
        if getSimilarityOfString(bidding_budget_less,bidding_budget_greater)==1:
            same_count += 1
        if getSimilarityOfString(win_bid_price_less,win_bid_price_greater)==1:
            same_count += 1
        if getSimilarityOfString(project_name_less,project_name_greater)==1:
            same_count += 1
        if getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater)==1:
            same_count += 1
        # rarer context values make the match more trustworthy
        base_prob = 0
        if min_counts<3:
            base_prob = 0.9
        elif min_counts<5:
            base_prob = 0.8
        elif min_counts<8:
            base_prob = 0.7
        else:
            base_prob = 0.6
        _prob = base_prob*same_count/all_count
        # json_matrix is overwritten with a human-readable debug string
        json_matrix = "[==%s]"%(str(base_prob)+"="+str(same_count)+"="+str(all_count)+str(product_less)+str(product_greater))
        self.forward(json_matrix,_prob)
        return
@annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double->string')
class f_redump_probability_final_check(BaseUDAF):
    '''
    Re-validate a merged duplicate group after dedup.  (Translated from the
    original note:) when the group has more than 5 members, doctitle /
    tenderee / win_tenderer / bidding_budget may each take only one value
    inside the group; with 5 or fewer members, tenderee / win_tenderer /
    bidding_budget may each take only one value.
    NOTE(review): the implementation below enforces this via pairwise rule
    checks rather than literal value counting — confirm against the intent.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_code,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence):
        # NOTE(review): the `newly` argument is accepted but not stored.
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
                          "project_code":project_code,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
                          "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        """Grow a prefix of the confidence-sorted group for as long as every new
        document is compatible with every already accepted one; return the
        accepted docids as a comma-separated string ("" when none)."""
        list_group = []
        the_group = buffer[0]
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        _index = 0
        if len(the_group)>0:
            _index = 1
        while _index<len(the_group):
            document_greater = the_group[_index]
            docchannel_greater = document_greater["docchannel"]
            page_time_greater = document_greater["page_time"]
            doctitle_refine_greater = document_greater["doctitle_refine"]
            project_code_greater = document_greater["project_code"]
            nlp_enterprise_greater = document_greater["nlp_enterprise"]
            tenderee_greater = document_greater["tenderee"]
            agency_greater = document_greater["agency"]
            win_tenderer_greater = document_greater["win_tenderer"]
            bidding_budget_greater = document_greater["bidding_budget"]
            win_bid_price_greater = document_greater["win_bid_price"]
            product_greater = document_greater["product"]
            package_greater = document_greater["package"]
            json_time_greater = document_greater["json_dicttime"]
            _less_index = 0
            # compare the candidate against every already-accepted document
            while _less_index<_index:
                document_less = the_group[_less_index]
                docchannel_less = document_less["docchannel"]
                page_time_less = document_less["page_time"]
                doctitle_refine_less = document_less["doctitle_refine"]
                project_code_less = document_less["project_code"]
                nlp_enterprise_less = document_less["nlp_enterprise"]
                tenderee_less = document_less["tenderee"]
                agency_less = document_less["agency"]
                win_tenderer_less = document_less["win_tenderer"]
                bidding_budget_less = document_less["bidding_budget"]
                win_bid_price_less = document_less["win_bid_price"]
                product_less = document_less["product"]
                package_less = document_less["package"]
                json_time_less = document_less["json_dicttime"]
                # each rule writes 0 (fail) / 1 (neutral) / 2 (strong match)
                check_result = {"pass":1}
                if docchannel_less in (51,102,103,104,115,116,117):
                    if doctitle_refine_less!=doctitle_refine_greater:
                        if page_time_less!=page_time_greater:
                            check_result["docchannel"] = 0
                            check_result["pass"] = 0
                    else:
                        check_result["docchannel"] = 2
                if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
                    check_result["doctitle"] = 0
                    check_result["pass"] = 0
                    logging.info("check_doctitle_failed:%s==%s"%(str(doctitle_refine_less),str(doctitle_refine_greater)))
                else:
                    check_result["doctitle"] = 2
                #added check
                if not check_codes([project_code_less],[project_code_greater]):
                    check_result["code"] = 0
                    check_result["pass"] = 0
                    logging.info("check_code_failed:%s==%s"%(str(project_code_less),str(project_code_greater)))
                else:
                    if getLength(project_code_less)>0 and getLength(project_code_greater)>0 and project_code_less==project_code_greater:
                        check_result["code"] = 2
                    else:
                        check_result["code"] = 1
                if not check_product(product_less,product_greater):
                    check_result["product"] = 0
                    check_result["pass"] = 0
                    logging.info("check_product_failed:%s==%s"%(str(product_less),str(product_greater)))
                else:
                    if getLength(product_less)>0 and getLength(product_greater)>0:
                        check_result["product"] = 2
                    else:
                        check_result["product"] = 1
                if not check_demand():
                    check_result["pass"] = 0
                if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                                    tenderee_less,tenderee_greater,
                                    agency_less,agency_greater,
                                    win_tenderer_less,win_tenderer_greater):
                    check_result["entity"] = 0
                    check_result["pass"] = 0
                    logging.info("check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
                else:
                    # "2" only when the decisive party for this channel type is
                    # present on both sides
                    if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
                        check_result["entity"] = 2
                    elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
                        check_result["entity"] = 2
                    else:
                        check_result["entity"] = 1
                if not check_money(bidding_budget_less,bidding_budget_greater,
                                   win_bid_price_less,win_bid_price_greater):
                    logging.info("check_money_failed:%s==%s==%s==%s"%(str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
                    check_result["money"] = 0
                    check_result["pass"] = 0
                else:
                    if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
                        check_result["money"] = 2
                    elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
                        check_result["money"] = 2
                    else:
                        check_result["money"] = 1
                #added check
                if not check_package(package_less,package_greater):
                    logging.info("check_package_failed:%s==%s"%(str(package_less),str(package_greater)))
                    check_result["package"] = 0
                    check_result["pass"] = 0
                else:
                    if getLength(package_less)>0 and getLength(package_greater)>0:
                        check_result["package"] = 2
                    else:
                        check_result["package"] = 1
                #added check
                if not check_time(json_time_less,json_time_greater):
                    logging.info("check_time_failed:%s==%s"%(str(json_time_less),str(json_time_greater)))
                    check_result["time"] = 0
                    check_result["pass"] = 0
                else:
                    if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
                        check_result["time"] = 2
                    else:
                        check_result["time"] = 1
                if check_result.get("pass",0)==0:
                    logging.info(str(check_result))
                    # hard failures (time/money) always reject; other failures
                    # are overridden when entity+code+doctitle+product all
                    # match strongly
                    if check_result.get("time",1)==0:
                        break
                    if check_result.get("money",1)==0:
                        break
                    if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2:
                        pass
                    else:
                        break
                _less_index += 1
            # candidate rejected against some accepted doc: stop growing
            if _less_index!=_index:
                break
            _index += 1
        dumplicates = ""
        if _index>1:
            logging.info("index/whole:%d/%d"%(_index,len(the_group)))
            final_group = the_group[:_index]
            # order by extract_count desc, then docid asc (stable sorts)
            final_group.sort(key=lambda x:x["docid"])
            final_group.sort(key=lambda x:x["extract_count"],reverse=True)
            _set = set()
            for _d in final_group:
                _docid = _d["docid"]
                if _docid in _set:
                    continue
                dumplicates += "%d,"%_docid
                _set.add(_docid)
            # drop the trailing comma
            dumplicates = dumplicates[:-1]
        return dumplicates
@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,string->string')
class f_set_docid_binaryChart(BaseUDAF):
    '''
    Within 2-day time windows, attach documents whose key fields
    (project_code / bidding_budget / win_tenderer / win_bid_price / agency)
    are all empty to one compatible non-empty document from a different
    web source; emits the resulting docid groups as JSON.
    '''
    def __init__(self):
        import json
        global json
    def new_buffer(self):
        return [[]]
    def iterate(self, buffer,docid, page_time_stamp,extract_count,project_code,project_name,tenderee,bidding_budget,win_tenderer,win_bid_price,agency,web_source_no):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,
                          "bidding_budget":bidding_budget,"win_tenderer":win_tenderer,"win_bid_price":win_bid_price,
                          "agency":agency,"web_source_no":web_source_no})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        list_docs = buffer[0]
        list_timeGroups = split_with_time(list_docs,"page_time_stamp",86400*2)
        list_group = []
        empty_key = ["project_code","bidding_budget","win_tenderer","win_bid_price","agency"]
        for _timeGroups in list_timeGroups:
            # partition the window into all-empty vs non-empty documents
            list_empty = []
            list_notEmpty = []
            for _item in _timeGroups:
                empty_flag = True
                for _key in empty_key:
                    if not isEmpty(_item[_key]):
                        empty_flag = False
                        break
                if empty_flag:
                    list_empty.append(_item)
                else:
                    list_notEmpty.append(_item)
            for _e in list_empty:
                _group = [{"docid":_e["docid"],"extract_count":_e["docid"] and _e["extract_count"]}]
                _e_tenderee = _e["tenderee"]
                for _ne in list_notEmpty:
                    # lazily track which web sources already feed this doc
                    if "set_webSource" not in _ne:
                        _ne["set_webSource"] = set()
                        _ne["set_webSource"].add(_ne["web_source_no"])
                    _suit = False
                    if not isEmpty(_e_tenderee) and _e_tenderee==_ne["tenderee"]:
                        _suit = True
                    elif isEmpty(_e_tenderee):
                        _suit = True
                    if _suit:
                        # pair only across different web sources, once per source
                        if _e["web_source_no"] not in _ne["set_webSource"]:
                            _ne["set_webSource"].add(_e["web_source_no"])
                            _group.append({"docid":_ne["docid"],"extract_count":_ne["extract_count"]})
                            break
                if len(_group)>1:
                    list_group.append(_group)
        return json.dumps(list_group)
  1123. def split_with_time(list_dict,sort_key,timedelta=86400*2):
  1124. if len(list_dict)>0:
  1125. if sort_key in list_dict[0]:
  1126. list_dict.sort(key=lambda x:x[sort_key])
  1127. list_group = []
  1128. _begin = 0
  1129. for i in range(len(list_dict)-1):
  1130. if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<=timedelta:
  1131. continue
  1132. else:
  1133. _group = []
  1134. for j in range(_begin,i+1):
  1135. _group.append(list_dict[j])
  1136. if len(_group)>1:
  1137. list_group.append(_group)
  1138. _begin = i + 1
  1139. if len(list_dict)>1:
  1140. _group = []
  1141. for j in range(_begin,len(list_dict)):
  1142. _group.append(list_dict[j])
  1143. if len(_group)>1:
  1144. list_group.append(_group)
  1145. return list_group
  1146. return [list_dict]
@annotate('bigint,bigint,bigint,string,string,string,string,string->string')
class f_set_docid_limitNum_contain(BaseUDAF):
    '''
    Group documents within 2-day time windows when all four limit columns are
    single-valued across the window and the contain_column texts form a
    containment chain.  (Original note, translated: project code / winning
    tenderer constraints, <2 distinct non-empty tenderees after merge, equal
    non-empty amounts for the same announcement type.)
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer,docid,page_time_stamp,extract_count,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,"set_limit_column1":set_limit_column1,
                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
                          "contain_column":contain_column})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        list_split = split_with_time(buffer[0],"page_time_stamp")
        list_group = []
        for _split in list_split:
            flag = True
            # every limit column must hold a single distinct value in the window
            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
            for _key in keys:
                logging.info(_key+str(getSet(_split,_key)))
                if len(getSet(_split,_key))>1:
                    flag = False
                    break
            MAX_CONTAIN_COLUMN = None
            # check whether every announcement text in the group is contained
            # in the longest one (translated from the original comment)
            if flag:
                for _d in _split:
                    contain_column = _d["contain_column"]
                    if contain_column is not None and contain_column !="":
                        if MAX_CONTAIN_COLUMN is None:
                            MAX_CONTAIN_COLUMN = contain_column
                        else:
                            if len(MAX_CONTAIN_COLUMN)<len(contain_column):
                                if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
                                    flag = False
                                    break
                                MAX_CONTAIN_COLUMN = contain_column
                            else:
                                if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
                                    flag = False
                                    break
            if flag:
                if len(_split)>1:
                    _group = []
                    for _item in _split:
                        _group.append({"docid":_item["docid"],"extract_count":_item["extract_count"]})
                    list_group.append(_group)
        return json.dumps(list_group)
  1201. @annotate('bigint->string')
  1202. class f_stamp_squence(BaseUDAF):
  1203. '''
  1204. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  1205. '''
  1206. def __init__(self):
  1207. import json
  1208. global json
  1209. import logging
  1210. global logging
  1211. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1212. def new_buffer(self):
  1213. return [set()]
  1214. def iterate(self, buffer,page_time_stamp):
  1215. buffer[0].add(page_time_stamp)
  1216. def merge(self, buffer, pbuffer):
  1217. buffer[0] |= pbuffer[0]
  1218. def terminate(self, buffer):
  1219. if 0 in buffer[0]:
  1220. buffer[0].remove(0)
  1221. list_stamp = list(buffer[0])
  1222. list_stamp.sort(key=lambda x:x)
  1223. list_stamp_final = []
  1224. _begin = 0
  1225. _time_decase = 86400*2
  1226. logging.info(str(list_stamp))
  1227. for _index in range(len(list_stamp)-1):
  1228. if list_stamp[_index+1]-list_stamp[_index]<_time_decase:
  1229. continue
  1230. else:
  1231. list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[_index]+_time_decase])
  1232. _begin = _index+1
  1233. if len(list_stamp)>0:
  1234. list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[-1]+_time_decase])
  1235. return json.dumps(list_stamp_final)
  1236. @annotate("bigint,string->bigint")
  1237. class in_stamp(object):
  1238. def __init__(self):
  1239. import logging
  1240. import re
  1241. import json
  1242. global logging,re,json
  1243. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1244. def evaluate(self, page_time_stamp,json_stamp):
  1245. list_stamp = json.loads(json_stamp)
  1246. int_flag = 0
  1247. for item in list_stamp:
  1248. if page_time_stamp <item[0]:
  1249. break
  1250. if page_time_stamp>item[0] and page_time_stamp<item[1]:
  1251. int_flag = 1
  1252. break
  1253. return int_flag
  1254. def getConfidence(rule_id):
  1255. if rule_id ==0:
  1256. return 30
  1257. elif rule_id >=1 and rule_id <30:
  1258. return 20
  1259. else:
  1260. return 10
  1261. @annotate('string,string -> string')
  1262. class f_splitStr(BaseUDTF):
  1263. '''
  1264. 将多个组拆解成多条记录
  1265. '''
  1266. def __init__(self):
  1267. import logging
  1268. import json
  1269. global json,logging
  1270. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1271. def process(self, str_split,_split):
  1272. try:
  1273. for _s in str_split.split(_split):
  1274. self.forward(_s)
  1275. except Exception as e:
  1276. pass
@annotate('string,bigint -> bigint,bigint,bigint,bigint,bigint')
class f_split_group_single(BaseUDTF):
    '''
    Split JSON-encoded docid groups into pairwise rows
    (docid1, docid2, extract_count1, extract_count2, confidence).
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def process(self, json_set_docid,rule_id):
        list_group = json.loads(json_set_docid)
        for item in list_group:
            # large groups: avoid the quadratic blow-up by pairing every doc
            # only with the max-docid element (after the reverse sort)
            if len(item)>100:
                item.sort(key=lambda x:x["docid"],reverse=True)
                index_i = 0
                for index_j in range(1,len(item)):
                    if item[index_i]["docid"]!=item[index_j]["docid"]:
                        self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
            else:
                # small groups: emit the full ordered cartesian product
                for index_i in range(len(item)):
                    for index_j in range(len(item)):
                        if index_i!=index_j and item[index_i]["docid"]!=item[index_j]["docid"]:
                            self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
  1301. @annotate('bigint,string->string')
  1302. class group_document(BaseUDAF):
  1303. '''
  1304. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  1305. '''
  1306. def __init__(self):
  1307. import json
  1308. global json
  1309. def new_buffer(self):
  1310. return [[]]
  1311. def iterate(self, buffer,id,json_set_docid):
  1312. buffer[0].append({"id":id,"json_set_docid":json.loads(json_set_docid)})
  1313. def merge(self, buffer, pbuffer):
  1314. buffer[0].extend(pbuffer[0])
  1315. def terminate(self, buffer):
  1316. return json.dumps(buffer[0])
  1317. @annotate('bigint,string,bigint,string -> bigint,bigint,string')
  1318. class decare_document(BaseUDTF):
  1319. '''
  1320. 将多个组拆解成多条记录
  1321. '''
  1322. def __init__(self):
  1323. import logging
  1324. import json
  1325. global json,logging
  1326. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1327. def process(self,group_id1, json_list_doc1,group_id2,json_list_doc2):
  1328. #y=x,少掉近一半的数据
  1329. if group_id1>=group_id2:
  1330. list_doc1 = json.loads(json_list_doc1)
  1331. list_doc2 = json.loads(json_list_doc2)
  1332. for _doc1 in list_doc1:
  1333. for _doc2 in list_doc2:
  1334. #同一个重复group不做判断
  1335. if _doc1["id"]!=_doc2["id"]:
  1336. #判断两个group是否有重复
  1337. _set1 = set()
  1338. for _item1 in _doc1["json_set_docid"]:
  1339. _set1.add(_item1["docid"])
  1340. _set2 = set()
  1341. for _item2 in _doc2["json_set_docid"]:
  1342. _set2.add(_item2["docid"])
  1343. if len(_set1&_set2)>0:
  1344. new_json_set_docid = _doc1["json_set_docid"]
  1345. for _item2 in _doc2["json_set_docid"]:
  1346. if _item2["docid"] not in _set1:
  1347. new_json_set_docid.append(_item2)
  1348. self.forward(_doc1["id"],_doc2["id"],json.dumps(new_json_set_docid))
  1349. def getBestDocid(list_pair):
  1350. # [docid1,extract_count1,docid2,extract_count2]
  1351. # list_pair.sort(key=lambda x:x[3],reverse=True)
  1352. # _max_count = max(list_pair[0][3],list_pair[0][1])
  1353. # set_candidate = set()
  1354. # if list_pair[0][1]==_max_count:
  1355. # set_candidate.add(list_pair[0][0])
  1356. # for item in list_pair:
  1357. # if item[3]==_max_count:
  1358. # set_candidate.add(item[2])
  1359. # else:
  1360. # break
  1361. # list_candidate = list(set_candidate)
  1362. # list_candidate.sort(key=lambda x:x)
  1363. new_pair = []
  1364. new_pair.append([list_pair[0][0],list_pair[0][0],list_pair[0][1]])
  1365. for item in list_pair:
  1366. new_pair.append([item[0],item[2],item[3]])
  1367. new_pair.sort(key=lambda x:x[1])
  1368. new_pair.sort(key=lambda x:x[2],reverse=True)
  1369. return new_pair[0][1]
  1370. @annotate('bigint,bigint,bigint,bigint->string')
  1371. class choose_document(BaseUDAF):
  1372. '''
  1373. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  1374. '''
  1375. def __init__(self):
  1376. import json
  1377. global json
  1378. def new_buffer(self):
  1379. return [[]]
  1380. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  1381. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  1382. def merge(self, buffer, pbuffer):
  1383. buffer[0].extend(pbuffer[0])
  1384. def terminate(self, buffer):
  1385. list_pair = buffer[0]
  1386. _set = set()
  1387. for item in buffer[0]:
  1388. _set.add(str(item[2]))
  1389. list_dumplicate = list(_set)
  1390. best_docid = getBestDocid(list_pair)
  1391. if best_docid==list_pair[0][0]:
  1392. save_flag = 1
  1393. else:
  1394. save_flag = 0
  1395. return json.dumps({"save_flag":save_flag,"dumplicates":list_dumplicate})
  1396. @annotate('string -> bigint,string')
  1397. class f_get_choose_document(BaseUDTF):
  1398. '''
  1399. 将多个组拆解成多条记录
  1400. '''
  1401. def __init__(self):
  1402. import logging
  1403. import json
  1404. global json,logging
  1405. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1406. def process(self,json_choose):
  1407. if json_choose is None:
  1408. self.forward(1,None)
  1409. else:
  1410. _choose = json.loads(json_choose)
  1411. self.forward(_choose["save_flag"],",".join(_choose["dumplicates"]))
  1412. @annotate('string->bigint')
  1413. class f_get_codes_count(object):
  1414. def evaluate(self,extract_json):
  1415. if extract_json is None or extract_json=="":
  1416. extract_json = "{}"
  1417. _extract = json.loads(extract_json)
  1418. _codes = _extract.get("code",[])
  1419. return len(_codes)
  1420. @annotate('string->string')
  1421. class f_get_codes(object):
  1422. def evaluate(self,extract_json):
  1423. if extract_json is None or extract_json=="":
  1424. extract_json = "{}"
  1425. _extract = json.loads(extract_json)
  1426. _codes = _extract.get("code",[])
  1427. return ",".join(_codes)
  1428. @annotate('bigint,bigint,bigint,bigint->string')
  1429. class group_document_bestFirst(BaseUDAF):
  1430. '''
  1431. 将组里面最优的放在前面
  1432. '''
  1433. def __init__(self):
  1434. import json
  1435. global json
  1436. def new_buffer(self):
  1437. return [[]]
  1438. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  1439. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  1440. def merge(self, buffer, pbuffer):
  1441. buffer[0].extend(pbuffer[0])
  1442. def terminate(self, buffer):
  1443. list_pair = buffer[0]
  1444. _set = set()
  1445. for item in buffer[0]:
  1446. _set.add(item[2])
  1447. _set.add(list_pair[0][0])
  1448. best_docid = getBestDocid(list_pair)
  1449. _set.remove(best_docid)
  1450. list_dumplicate = list(_set)
  1451. list_dumplicate.sort(key=lambda x:x)
  1452. list_dumplicate.insert(0,best_docid)
  1453. list_dumplicate_str = []
  1454. for item in list_dumplicate:
  1455. list_dumplicate_str.append(str(item))
  1456. return ",".join(list_dumplicate_str)
  1457. @annotate('string -> bigint,string')
  1458. class f_get_best_dumplicates(BaseUDTF):
  1459. '''
  1460. 得到每个分组中最优的那一条及其重复记录
  1461. '''
  1462. def __init__(self):
  1463. import logging
  1464. import json
  1465. global json,logging
  1466. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1467. def process(self,list_dumplicate_str):
  1468. if list_dumplicate_str is None or list_dumplicate_str=='':
  1469. pass
  1470. else:
  1471. list_dumplicate = list_dumplicate_str.split(",")
  1472. if len(list_dumplicate)>0:
  1473. self.forward(int(list_dumplicate[0]),",".join(list_dumplicate[1:]))
  1474. else:
  1475. pass
  1476. @annotate('bigint,bigint->string')
  1477. class bridge2group(BaseUDAF):
  1478. '''
  1479. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  1480. '''
  1481. def __init__(self):
  1482. import json
  1483. global json
  1484. def new_buffer(self):
  1485. return [set()]
  1486. def iterate(self, buffer,docid1,docid2):
  1487. buffer[0].add(docid1)
  1488. buffer[0].add(docid2)
  1489. def merge(self, buffer, pbuffer):
  1490. buffer[0] |= pbuffer[0]
  1491. def terminate(self, buffer):
  1492. list_pair = list(buffer[0])
  1493. list_pair.sort(key=lambda x:x,reverse=True)
  1494. return json.dumps(list_pair)
  1495. @annotate('string -> bigint,bigint')
  1496. class group2bridge(BaseUDTF):
  1497. '''
  1498. 将多个组拆解成多条记录
  1499. '''
  1500. def __init__(self):
  1501. import logging
  1502. import json
  1503. global json,logging
  1504. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1505. def process(self,json_list_docid):
  1506. list_docid = json.loads(json_list_docid)
  1507. for _docid in list_docid:
  1508. self.forward(list_docid[-1],_docid)
  1509. @annotate('string->string')
  1510. class to_url(object):
  1511. def evaluate(self,_s):
  1512. if _s is None or _s=="":
  1513. return
  1514. else:
  1515. list_l = []
  1516. for l in _s.split(","):
  1517. list_l.append("http://www.bidizhaobiao.com/info-%s.html"%l)
  1518. return ",".join(list_l)
  1519. @annotate('bigint,bigint,string -> bigint')
  1520. class f_get_dump_docid(BaseUDTF):
  1521. '''
  1522. 将多个组拆解成多条记录
  1523. '''
  1524. def __init__(self):
  1525. import logging
  1526. import json
  1527. global json,logging
  1528. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1529. def process(self,docid,save_flag,dumplicates):
  1530. if save_flag==0:
  1531. self.forward(docid)
  1532. if dumplicates is not None:
  1533. list_docid = dumplicates.split(",")
  1534. if len(list_docid)>0:
  1535. for _docid in list_docid[1:]:
  1536. self.forward(int(_docid))
  1537. else:
  1538. if dumplicates is not None:
  1539. list_docid = dumplicates.split(",")
  1540. if len(list_docid)>0:
  1541. for _docid in list_docid:
  1542. self.forward(int(_docid))
  1543. @annotate('string -> bigint,bigint')
  1544. class f_get_docid(BaseUDTF):
  1545. '''
  1546. 将多个组拆解成多条记录
  1547. '''
  1548. def __init__(self):
  1549. import logging
  1550. import json
  1551. global json,logging
  1552. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1553. def process(self,json_set_docid):
  1554. team_id = 0
  1555. if json_set_docid is not None:
  1556. list_docses = json.loads(json_set_docid)
  1557. for list_docs in list_docses:
  1558. team_id += 1
  1559. for item in list_docs:
  1560. self.forward(team_id,item["docid"])
  1561. @annotate("string->bigint")
  1562. class get_count_dump(object):
  1563. def __init__(self):
  1564. import logging
  1565. import re
  1566. global logging,re
  1567. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1568. def evaluate(self, title):
  1569. _count = 0
  1570. if title is not None:
  1571. _count = len(title.split(","))
  1572. return _count
  1573. def getSet(list_dict,key):
  1574. _set = set()
  1575. for item in list_dict:
  1576. if key in item:
  1577. if item[key]!='' and item[key] is not None:
  1578. if re.search("^\d[\d\.]*$",item[key]) is not None:
  1579. _set.add(str(float(item[key])))
  1580. else:
  1581. _set.add(str(item[key]))
  1582. return _set
  1583. def getDiffIndex(list_dict,key,confidence=100):
  1584. '''
  1585. 优化为相似度判断
  1586. :param list_dict:
  1587. :param key:
  1588. :param confidence:
  1589. :return:
  1590. '''
  1591. # _set = set()
  1592. # for _i in range(len(list_dict)):
  1593. # item = list_dict[_i]
  1594. # if item["confidence"]>=confidence:
  1595. # continue
  1596. # if key in item:
  1597. # if item[key]!='' and item[key] is not None:
  1598. # if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  1599. # _set.add(str(float(item[key])))
  1600. # else:
  1601. # _set.add(str(item[key]))
  1602. # if len(_set)>1:
  1603. # return _i
  1604. # ==============================
  1605. _set = set()
  1606. _set_m = set()
  1607. base_s = ""
  1608. for _i in range(len(list_dict)):
  1609. item = list_dict[_i]
  1610. if item["confidence"]>=confidence:
  1611. continue
  1612. if key in item:
  1613. if item[key]!='' and item[key] is not None:
  1614. if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  1615. _m = float(item[key])
  1616. if _m>100000:
  1617. _m = _m//10000*10000
  1618. _set_m.add(str(_m))
  1619. else:
  1620. _s = str(item[key])
  1621. if base_s=="":
  1622. base_s = _s
  1623. else:
  1624. simi = getSimilarityOfString(base_s,_s)
  1625. if simi<0.8:
  1626. return _i
  1627. if len(_set_m)>1:
  1628. return _i
  1629. return len(list_dict)
  1630. @annotate('bigint,string -> bigint,bigint')
  1631. class f_getGroup_dumpFinal(BaseUDTF):
  1632. '''
  1633. 从最后的结果中获取组
  1634. '''
  1635. def __init__(self):
  1636. import logging
  1637. import json
  1638. global json,logging
  1639. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1640. def process(self,docid,dumplicates):
  1641. self.forward(int(docid),int(docid))
  1642. if dumplicates is not None:
  1643. list_docids = dumplicates.split(",")
  1644. for _docid in list_docids:
  1645. self.forward(int(docid),int(_docid))
  1646. @annotate('bigint,bigint,string,string,string,string,bigint,bigint,bigint->string')
  1647. class f_redump_limit_num(BaseUDAF):
  1648. '''
  1649. 去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1650. 组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1651. '''
  1652. def __init__(self):
  1653. import logging
  1654. import json,re
  1655. global json,logging,re
  1656. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1657. def new_buffer(self):
  1658. return [list()]
  1659. def iterate(self, buffer,main_docid,docid,doctitle,set_limit_column2,set_limit_column3,set_limit_column4,extract_count1,extract_count2,confidence):
  1660. buffer[0].append({"main_docid":main_docid,"docid":docid,"doctitle":doctitle,"set_limit_column2":set_limit_column2,
  1661. "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,
  1662. "extract_count2":extract_count2,"confidence":confidence})
  1663. def merge(self, buffer, pbuffer):
  1664. buffer[0].extend(pbuffer[0])
  1665. def terminate(self, buffer):
  1666. list_group = []
  1667. the_group = buffer[0]
  1668. the_group.sort(key=lambda x:x["confidence"],reverse=True)
  1669. if len(the_group)>5:
  1670. keys = ["doctitle","set_limit_column2","set_limit_column3","set_limit_column4"]
  1671. else:
  1672. keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
  1673. final_group = []
  1674. #置信度
  1675. list_key_index = []
  1676. for _k in keys:
  1677. if _k=="doctitle":
  1678. list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
  1679. else:
  1680. list_key_index.append(getDiffIndex(the_group,_k))
  1681. _index = min(list_key_index)
  1682. if _index>1:
  1683. main_docid = the_group[0]["main_docid"]
  1684. for item in the_group[:_index]:
  1685. if item["docid"]!=main_docid:
  1686. final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"],"confidence":item["confidence"]})
  1687. # stay = True
  1688. # for _key in keys:
  1689. # if len(getSet(the_group,_key))>1:
  1690. # stay = False
  1691. # break
  1692. #
  1693. # if stay:
  1694. # main_docid = the_group[0]["main_docid"]
  1695. # for item in the_group:
  1696. # if item["docid"]!=main_docid:
  1697. # final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"]})
  1698. return json.dumps(final_group)
  1699. @annotate('string -> bigint,bigint,bigint,bigint,bigint')
  1700. class f_get_dumpFinal_checked(BaseUDTF):
  1701. '''
  1702. 从最后的结果中获取组
  1703. '''
  1704. def __init__(self):
  1705. import logging
  1706. import json
  1707. global json,logging
  1708. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1709. def process(self,list_group):
  1710. if list_group is not None:
  1711. final_group = json.loads(list_group)
  1712. for _group in final_group:
  1713. self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"],_group["confidence"])
  1714. @annotate('string -> bigint')
  1715. class f_getDumplicateDocids(BaseUDTF):
  1716. '''
  1717. 从最后的结果中获取组
  1718. '''
  1719. def __init__(self):
  1720. import logging
  1721. import json
  1722. global json,logging
  1723. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1724. def process(self,dumplicates):
  1725. list_docids = dumplicates.split(",")
  1726. for _d in list_docids:
  1727. self.forward(int(_d))
  1728. def jaccard_score(source,target):
  1729. source_set = set([s for s in source])
  1730. target_set = set([s for s in target])
  1731. if len(source_set)==0 or len(target_set)==0:
  1732. return 0
  1733. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  1734. def getSimilarityOfString(str1,str2):
  1735. _set1 = set()
  1736. _set2 = set()
  1737. if str1 is not None:
  1738. for i in range(1,len(str1)):
  1739. _set1.add(str1[i-1:i+1])
  1740. for i in range(2,len(str1)):
  1741. _set1.add(str1[i-2:i+1])
  1742. if str2 is not None:
  1743. for i in range(1,len(str2)):
  1744. _set2.add(str2[i-1:i+1])
  1745. for i in range(2,len(str2)):
  1746. _set2.add(str2[i-2:i+1])
  1747. _len = max(1,min(len(_set1),len(_set2)))
  1748. return len(_set1&_set2)/_len
  1749. @annotate("string,string,string,string,string,string,string,string,string,string->bigint")
  1750. class f_is_legal(object):
  1751. def __init__(self):
  1752. import logging
  1753. import re
  1754. global logging,re
  1755. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1756. def evaluate(self, tenderee1,tenderee2,bidding_budget1,budding_budget2,win_tenderee1,win_tenderee2,win_bid_price1,win_bid_price2,project_code1,project_code2):
  1757. if tenderee1 is not None and tenderee1!="" and tenderee2 is not None and tenderee2!="" and tenderee1!=tenderee2:
  1758. return 0
  1759. if bidding_budget1 is not None and bidding_budget1!="" and budding_budget2 is not None and budding_budget2!="" and bidding_budget1!=budding_budget2:
  1760. return 0
  1761. if win_tenderee1 is not None and win_tenderee1!="" and win_tenderee2 is not None and win_tenderee2!="" and win_tenderee1!=win_tenderee2:
  1762. return 0
  1763. if win_bid_price1 is not None and win_bid_price1!="" and win_bid_price2 is not None and win_bid_price2!="" and win_bid_price1!=win_bid_price2:
  1764. return 0
  1765. _sim = getSimilarityOfString(project_code1,project_code2)
  1766. if _sim>0.7 and _sim<1:
  1767. return 0
  1768. return 1
@annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint->string')
class f_autorule_group(BaseUDAF):
    '''
    Re-check after merging duplicates: when the group holds more than 5
    records, doctitle/tenderee/win_tenderer/bidding_budget may each take
    only one value inside the group; with 5 or fewer records the
    doctitle restriction is dropped.  For every surviving pair of
    documents, emits the set of fields on which the two agree
    (an "auto rule") together with both docids, as a JSON list.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer,main_docid,docid,docchannel,doctitle,doctitle_refine,area,province,city,district,web_source_no,fingerprint,
                project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count1,extract_count2,confidence):
        # one dict per document; the compared fields are stored verbatim
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"doctitle":doctitle,
                          "doctitle_refine":doctitle_refine,"area":area,"province":province,
                          "city":city,"district":district,"web_source_no":web_source_no,"fingerprint":fingerprint,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,"agency":agency,
                          "win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price,
                          "extract_count1":extract_count1,"extract_count2":extract_count2,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        # cap the buffer at 100 documents to bound memory and the
        # quadratic pair loop in terminate()
        buffer[0].extend(pbuffer[0][:100])
        buffer[0] = buffer[0][:100]
    def getSameKeys(self,_dict1,_dict2):
        # Collect the field names on which both documents carry the same
        # non-empty value; location / count / meta fields are excluded.
        # Returns the sorted names joined with "=".
        list_keys = []
        for k,v in _dict1.items():
            if k in ["area","city","confidence","district","extract_count1","extract_count2","main_docid","province"]:
                continue
            v2 = _dict2.get(k,"")
            if v is not None and v!="" and v2 is not None and v2!="" and v==v2:
                list_keys.append(k)
        list_keys.sort(key=lambda x:x)
        return "=".join(list_keys)
    def terminate(self, buffer):
        list_group = []
        the_group = buffer[0]
        # highest-confidence documents first
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        if len(the_group)>5:
            keys = ["doctitle","tenderee","win_tenderer","bidding_budget","win_bid_price"]
        else:
            keys = ["tenderee","win_tenderer","bidding_budget","win_bid_price"]
        # confidence thresholds: index of first conflicting record per key
        # (doctitle uses a lower exemption threshold of 30)
        list_key_index = []
        for _k in keys:
            if _k=="doctitle":
                list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
            else:
                list_key_index.append(getDiffIndex(the_group,_k))
        final_group = []
        _index = min(list_key_index)
        # keep only the leading consistent slice; a slice of length <= 1
        # yields no pairs
        if _index>1:
            for item in the_group[:_index]:
                final_group.append(item)
        list_rules = []
        for i in range(len(final_group)):
            for j in range(i+1,len(final_group)):
                _dict1 = final_group[i]
                _dict2 = final_group[j]
                _rule = self.getSameKeys(_dict1,_dict2)
                list_rules.append([_rule,_dict1.get("docid"),_dict2.get("docid")])
        return json.dumps(list_rules)
  1831. @annotate('string -> string,bigint,bigint')
  1832. class f_autorule_group_extract(BaseUDTF):
  1833. '''
  1834. 从最后的结果中获取组
  1835. '''
  1836. def __init__(self):
  1837. import logging
  1838. import json
  1839. global json,logging
  1840. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1841. def process(self,rules_json):
  1842. list_rules = json.loads(rules_json)
  1843. for _rule in list_rules:
  1844. self.forward(_rule[0],_rule[1],_rule[2])
if __name__ == '__main__':
    # Ad-hoc manual smoke test; the commented lines here and below are
    # scratch examples kept for future debugging.  Only the
    # check_doctitle() call is active.
    # _str1 = "SXXY-ZBP-GG-2020002"
    # _str2 = "SXXY-ZBP-GG-2020002"
    # print(getSimilarityOfString(_str1,_str2))
    print(check_doctitle("强化桂城街道工地扬尘防控监管巡查第三方(二次)","广东省强化桂城街道工地扬尘防控监管巡查第三方(二次)"))
  1850. # print(check_codes(["F-2022-027(MASCG-2-F-F-2022-0462)"],["F-2022-027(MASCG-2-F-F-2022-0462)"]))
  1851. # print(check_product(None,None))
  1852. # print(check_code("4451020073383382206021325","4451020073383382206021322"))
  1853. # print(check_money("550.0","440.0","",""))
  1854. # for i in range(0,2):
  1855. # print(i)
  1856. # location_pattern = re.compile(".{1,2}市|.{1,2}区|.{1,2}镇|.{1,2}县|.{1,2}村")
  1857. # print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇前路富代都村示范点农用塑料薄膜棚)"))
  1858. # print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇天湖村粮蔬基地农用塑料薄膜棚)"))
  1859. # package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))") # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
  1860. # _match = re.search(package_number_pattern,"2021年盘山县高标准农田建设项目三标段(高升街道)开标记录")
  1861. # if _match is not None:
  1862. # print(_match.groupdict()["name"])
  1863. # print(re.findall("((标[段号的包])[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4})","[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ"))
  1864. # print(check_doctitle("[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ","桂林银行南宁办公大楼装修工程标段ⅡGXYLG20182005-N中标公告"))
  1865. # c = f_get_extractCount()
  1866. # _json = '''
  1867. # { "attachmentTypes": "", "bidway": "", "code": [ "LCQTCG-2022-313" ], "cost_time": { "attrs": 0.02, "codename": 0.16, "deposit": 0.0, "nerToken": 0.8400000000000001, "person": 0.01, "prem": 0.02, "preprocess": 0.96, "product": 0.12, "product_attrs": 0.01, "punish": 0.11, "roleRuleFinal": 0.0, "rule": 0.0, "rule_channel": 0.0, "tableToText": 0.09000381469726562, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "docchannel": { "docchannel": "招标公告", "doctype": "采招数据" }, "docid": "", "doctitle_refine": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设(一标段)膨胀剂(暂估价)项目", "exist_table": 1, "extract_count": 5, "fail_reason": "", "fingerprint": "md5=b1ab0ee9cf9e1c5acc17477b9c0433cc", "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设工程(一标段)膨胀剂(暂估价)采购项目", "nlp_enterprise": [ "中建八局第一建设有限公司", "山东东岳项目管理有限公司", "聊城市公共资源交易中心", "江苏国泰新点软件有限公司" ], "person_review": [], "prem": { "Project": { "code": "", "roleList": [ { "linklist": [ [ "", "15540110649" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "tenderee", "role_text": "中建八局第一建设有限公司", "serviceTime": "" }, { "linklist": [ [ "武工", "0635-2992305" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "山东东岳项目管理有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" }, "一": { "code": "", "roleList": [], "tendereeMoney": 3267000.0, "tendereeMoneyUnit": "万元" } }, "process_time": "2022-05-30 14:31:13", "product": [ "枢纽功能区建设工程", "膨胀剂", "配套基础设施建设" ], "product_attrs": { "data": [], "header": [], "header_col": [] }, "serviceTime": "", "success": true, "time_bidclose": "2022-06-16", "time_bidopen": "2022-06-16", "time_bidstart": "", "time_commencement": "", "time_completion": 
"", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "2022-06-01", "time_getFileStart": "2022-05-26", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2022-05-25", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "" }
  1868. # '''
  1869. # c = f_get_nlp_enterprise()
  1870. # print(c.evaluate("山东东岳项目管理有限公司",_json))
  1871. # print(c.evaluate(_json))
  1872. # c = f_set_docid()f_get_single_merged_bychannel
  1873. # _s = '''
  1874. # 154064190 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  1875. # 154064188 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  1876. # 154064175 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  1877. # 30201228 1512489600 4 04111-1 1 大连市妇女儿童医疗中心
  1878. # 154064160 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  1879. # 154064168 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  1880. # '''
  1881. # buffer = c.new_buffer()
  1882. # for _line in _s.split("\n"):
  1883. # _line = _line.strip()
  1884. # if _line=="":
  1885. # continue
  1886. # l_column = _line.split("\t")
  1887. # print(l_column)
  1888. # docid,page_time_stamp,extract_count,web_source_no,num,tenderee = l_column
  1889. # page_time_stamp = int(page_time_stamp)
  1890. # extract_count = int(extract_count)
  1891. # num = 1
  1892. # c.iterate(buffer,docid,page_time_stamp,extract_count,web_source_no,num,tenderee)
  1893. # print(c.terminate(buffer))