# documentMerge.py
#coding:UTF8
from odps.udf import annotate
from odps.distcache import get_cache_archive
from odps.distcache import get_cache_file
from odps.udf import BaseUDTF,BaseUDAF

import threading
import logging
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

import time
import json
from uuid import uuid4
import traceback
import re
project_uuid = "uuid"
project_docids = "docids"
project_zhao_biao_page_time = "zhao_biao_page_time"
project_zhong_biao_page_time = "zhong_biao_page_time"
project_page_time = "page_time"
project_doctextcon = "doctextcon"
project_area = "area"
project_province = "province"
project_city = "city"
project_district = "district"
project_info_type = "info_type"
project_industry = "industry"
project_qcodes = "qcodes"
project_project_name = "project_name"
project_project_code = "project_code"
project_project_codes = "project_codes"
project_project_addr = "project_addr"
project_tenderee = "tenderee"
project_tenderee_addr = "tenderee_addr"
project_tenderee_phone = "tenderee_phone"
project_tenderee_contact = "tenderee_contact"
project_agency = "agency"
project_agency_phone = "agency_phone"
project_agency_contact = "agency_contact"
project_sub_project_name = "sub_project_name"
project_sub_project_code = "sub_project_code"
project_bidding_budget = "bidding_budget"
project_win_tenderer = "win_tenderer"
project_win_bid_price = "win_bid_price"
project_win_tenderer_manager = "win_tenderer_manager"
project_win_tenderer_phone = "win_tenderer_phone"
project_second_tenderer = "second_tenderer"
project_second_bid_price = "second_bid_price"
project_second_tenderer_manager = "second_tenderer_manager"
project_second_tenderer_phone = "second_tenderer_phone"
project_third_tenderer = "third_tenderer"
project_third_bid_price = "third_bid_price"
project_third_tenderer_manager = "third_tenderer_manager"
project_third_tenderer_phone = "third_tenderer_phone"
project_procurement_system = "procurement_system"
project_bidway = "bidway"
project_dup_data = "dup_data"
project_docid_number = "docid_number"
project_project_dynamics = "project_dynamic"
project_product = "product"
project_moneysource = "moneysource"
project_service_time = "service_time"
project_time_bidclose = "time_bidclose"
project_time_bidopen = "time_bidopen"
project_time_bidstart = "time_bidstart"
project_time_commencement = "time_commencement"
project_time_completion = "time_completion"
project_time_earnest_money_start = "time_earnest_money_start"
project_time_earnest_money_end = "time_earnest_money_end"
project_time_get_file_end = "time_get_file_end"
project_time_get_file_start = "time_get_file_start"
project_time_publicity_end = "time_publicity_end"
project_time_publicity_start = "time_publicity_start"
project_time_registration_end = "time_registration_end"
project_time_registration_start = "time_registration_start"
project_time_release = "time_release"
project_dup_docid = "dup_docid"
project_info_source = "info_source"
project_delete_uuid = "delete_uuid"
project_nlp_enterprise = "nlp_enterprise"
project_nlp_enterprise_attachment = "nlp_enterprise_attachment"
project_update_time = "update_time"
project_tmp_attrs = "tmp_attrs"

document_partitionkey = "partitionkey"
document_docid = "docid"
document_dochtmlcon = "dochtmlcon"
document_doctextcon = "doctextcon"
document_doctitle = "doctitle"
document_attachmenttextcon = "attachmenttextcon"
document_attachment_path = "page_attachments"
document_attachment_path_filemd5 = "fileMd5"
document_attachment_path_fileTitle = "fileTitle"
document_attachment_path_fileLink = "fileLink"
document_crtime = "crtime"
document_status = "status"
document_page_time = "page_time"
document_attachment_extract_status = "attachment_extract_status"
document_web_source_no = "web_source_no"
document_fingerprint = "fingerprint"
document_opertime = "opertime"
document_docchannel = "docchannel"
document_original_docchannel = "original_docchannel"
document_life_docchannel = "life_docchannel"
document_area = "area"
document_province = "province"
document_city = "city"
document_district = "district"
document_extract_json = "extract_json"
document_bidway = "bidway"
document_industry = "industry"
document_info_type = "info_type"
document_qcodes = "qcodes"
document_project_name = "project_name"
document_project_code = "project_code"
document_project_codes = "project_codes"
document_tenderee = "tenderee"
document_tenderee_addr = "tenderee_addr"
document_tenderee_phone = "tenderee_phone"
document_tenderee_contact = "tenderee_contact"
document_agency = "agency"
document_agency_phone = "agency_phone"
document_agency_contact = "agency_contact"
document_product = "product"
document_moneysource = "moneysource"
document_service_time = "service_time"
document_time_bidclose = "time_bidclose"
document_time_bidopen = "time_bidopen"
document_time_bidstart = "time_bidstart"
document_time_commencement = "time_commencement"
document_time_completion = "time_completion"
document_time_earnest_money_start = "time_earnest_money_start"
document_time_earnest_money_end = "time_earnest_money_end"
document_time_get_file_end = "time_get_file_end"
document_time_get_file_start = "time_get_file_start"
document_time_publicity_end = "time_publicity_end"
document_time_publicity_start = "time_publicity_start"
document_time_registration_end = "time_registration_end"
document_time_registration_start = "time_registration_start"
document_time_release = "time_release"
document_info_source = "info_source"
document_nlp_enterprise = "nlp_enterprise"
document_nlp_enterprise_attachment = "nlp_enterprise_attachment"

document_tmp_partitionkey = "partitionkey"
document_tmp_docid = "docid"
document_tmp_dochtmlcon = "dochtmlcon"
document_tmp_doctextcon = "doctextcon"
document_tmp_doctitle = "doctitle"
document_tmp_attachmenttextcon = "attachmenttextcon"
document_tmp_attachment_path = "page_attachments"
document_tmp_attachment_path_filemd5 = "fileMd5"
document_tmp_attachment_path_fileTitle = "fileTitle"
document_tmp_attachment_path_fileLink = "fileLink"
document_tmp_uuid = "uuid"
document_tmp_crtime = "crtime"
document_tmp_status = "status"
document_tmp_tenderee = "tenderee"
document_tmp_agency = "agency"
document_tmp_project_code = "project_code"
document_tmp_product = "product"
document_tmp_project_name = "project_name"
document_tmp_doctitle_refine = "doctitle_refine"
document_tmp_extract_count = "extract_count"
document_tmp_sub_docs_json = "sub_docs_json"
document_tmp_save = "save"
document_tmp_dup_docid = "dup_docid"
document_tmp_merge_uuid = "merge_uuid"
document_tmp_projects = "projects"
document_tmp_page_time = "page_time"
document_tmp_attachment_extract_status = "attachment_extract_status"
document_tmp_web_source_no = "web_source_no"
document_tmp_web_source_name = "web_source_name"
document_tmp_fingerprint = "fingerprint"
document_tmp_opertime = "opertime"
document_tmp_docchannel = "docchannel"
document_tmp_original_docchannel = "original_docchannel"
document_tmp_extract_json = "extract_json"
document_tmp_industry_json = "industry_json"
document_tmp_other_json = "other_json"
document_tmp_time_bidclose = "time_bidclose"
document_tmp_time_bidopen = "time_bidopen"
document_tmp_time_completion = "time_completion"
document_tmp_time_earnest_money_end = "time_earnest_money_end"
document_tmp_time_earnest_money_start = "time_earnest_money_start"
document_tmp_time_get_file_end = "time_get_file_end"
document_tmp_time_get_file_start = "time_get_file_start"
document_tmp_time_publicity_end = "time_publicity_end"
document_tmp_time_publicity_start = "time_publicity_start"
document_tmp_time_registration_end = "time_registration_end"
document_tmp_time_registration_start = "time_registration_start"
document_tmp_time_release = "time_release"
def log(msg):
    logging.info(msg)

# configure the pandas dependency package
def include_package_path(res_name):
    import os, sys
    archive_files = get_cache_archive(res_name)
    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
                        if '.dist_info' not in f.name], key=lambda v: len(v))
    _path = dir_names[0].split(".zip/files")[0]+".zip/files"
    log("add path:%s"%(_path))
    sys.path.append(_path)
    return os.path.dirname(dir_names[0])

# A "RuntimeError: xxx has been blocked by sandbox" may occur here: libraries
# containing C extensions are blocked by the sandbox; work around it with
# "set odps.isolation.session.enable = true".
def include_file(file_name):
    import os, sys
    so_file = get_cache_file(file_name)
    sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))

def include_so(file_name):
    import os, sys
    so_file = get_cache_file(file_name)
    with open(so_file.name, 'rb') as fp:
        content = fp.read()
        so = open(file_name, "wb")
        so.write(content)
        so.flush()
        so.close()

# Initialize the business data package. Because of upload size limits and
# mismatches between Python versions and archive unpacking, the package has
# to be imported manually.
def init_env(list_files, package_name):
    import os, sys
    if len(list_files)==1:
        so_file = get_cache_file(list_files[0])
        cmd_line = os.path.abspath(so_file.name)
        os.system("unzip -o %s -d %s"%(cmd_line,package_name))
    elif len(list_files)>1:
        cmd_line = "cat"
        for _file in list_files:
            so_file = get_cache_file(_file)
            cmd_line += " "+os.path.abspath(so_file.name)
        cmd_line += " > temp.zip"
        os.system(cmd_line)
        os.system("unzip -o temp.zip -d %s"%(package_name))
    # os.system("rm -rf %s/*.dist-info"%(package_name))
    # return os.listdir(os.path.abspath("local_package"))
    # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
    # os.system("source ~/.bashrc")
    sys.path.insert(0,os.path.abspath(package_name))
    # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))

import platform
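# Usage sketch (not part of the original module): the resource names below are
# hypothetical and must match archive/file resources uploaded to the
# MaxCompute project.
#
#   include_package_path("numpy-1.16.zip")                  # put archive on sys.path
#   init_env(["model.z01", "model.z02"], "local_package")   # re-join a split zip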
def getSet(list_dict,key):
    _set = set()
    for item in list_dict:
        if key in item:
            if item[key]!='' and item[key] is not None:
                if re.search("^[\d\.]+$",item[key]) is not None:
                    _set.add(str(float(item[key])))
                else:
                    _set.add(str(item[key]))
    return _set

def popNoneFromDict(_dict):
    list_pop = []
    for k,v in _dict.items():
        if v is None or v=="":
            list_pop.append(k)
    for k in list_pop:
        _dict.pop(k)
    return _dict
def split_with_time(list_dict,sort_key,timedelta=86400*120,more_than_one=True):
    group_num = 1
    if more_than_one:
        group_num = 2
    if len(list_dict)>0:
        if (isinstance(list_dict[0],dict) and sort_key in list_dict[0]) or (isinstance(list_dict[0],list) and isinstance(sort_key,int) and sort_key<len(list_dict[0])):
            list_dict.sort(key=lambda x:x[sort_key])
            list_group = []
            _begin = 0
            for i in range(len(list_dict)-1):
                if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<=timedelta:
                    continue
                else:
                    _group = []
                    for j in range(_begin,i+1):
                        _group.append(list_dict[j])
                    if len(_group)>1:
                        list_group.append(_group)
                    _begin = i + 1
            if len(list_dict)>=group_num:
                _group = []
                for j in range(_begin,len(list_dict)):
                    _group.append(list_dict[j])
                if len(_group)>0:
                    list_group.append(_group)
            return list_group
    return [list_dict]
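# Minimal sketch of split_with_time behavior (hypothetical data): records are
# sorted by timestamp and split wherever adjacent records are more than
# `timedelta` seconds apart.
def _demo_split_with_time():
    day = 86400
    docs = [{"docid": 1, "page_time_stamp": 0},
            {"docid": 2, "page_time_stamp": 10 * day},
            {"docid": 3, "page_time_stamp": 300 * day}]
    # docids 1 and 2 fall within the 120-day window and form one group;
    # docid 3 ends up as the trailing singleton group: [[d1, d2], [d3]]
    return split_with_time(docs, "page_time_stamp")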
@annotate('bigint,bigint,string,string,string,string,string,string,bigint->string')
class f_merge_rule_limit_num_contain_greater(BaseUDAF):
    '''
    Merge rule: project code, winning bidder, len(project code)>7, winning bidder <> "",
    fewer than 2 distinct non-empty tenderees after merging, and identical non-empty
    amounts for the same announcement type after merging.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def new_buffer(self):
        return [list()]

    def iterate(self, buffer,docid,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column,greater_column,MAX_NUM):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
                          "contain_column":contain_column,"greater_column":greater_column,"MAX_NUM":MAX_NUM})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        MAX_NUM = 5
        if len(buffer[0])>0:
            MAX_NUM = buffer[0][0]["MAX_NUM"]
        list_split = split_with_time(buffer[0],"page_time_stamp")
        list_group = []
        for _split in list_split:
            flag = True
            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
            dict_set = {}
            for _key in keys:
                dict_set[_key] = set()
            if len(_split)>MAX_NUM:
                flag = False
            else:
                for _key in keys:
                    logging.info(_key+str(getSet(_split,_key)))
                    if len(getSet(_split,_key))>1:
                        flag = False
                        break
            MAX_CONTAIN_COLUMN = None
            # check containment between the announcements in the group
            if flag:
                for _d in _split:
                    contain_column = _d["contain_column"]
                    if contain_column is not None and contain_column !="":
                        if MAX_CONTAIN_COLUMN is None:
                            MAX_CONTAIN_COLUMN = contain_column
                        else:
                            if len(MAX_CONTAIN_COLUMN)<len(contain_column):
                                if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
                                    flag = False
                                    break
                                MAX_CONTAIN_COLUMN = contain_column
                            else:
                                if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
                                    flag = False
                                    break
                    if len(getSet(_split,"greater_column"))==1:
                        flag = False
                        break
            if flag:
                _set_docid = set()
                for item in _split:
                    _set_docid.add(item["docid"])
                if len(_set_docid)>1:
                    list_group.append(list(_set_docid))
        return json.dumps(list_group)
def getDiffIndex(list_dict,key):
    _set = set()
    for _i in range(len(list_dict)):
        item = list_dict[_i]
        if key in item:
            if item[key]!='' and item[key] is not None:
                if re.search("^\d[\d\.]*$",item[key]) is not None:
                    _set.add(str(float(item[key])))
                else:
                    _set.add(str(item[key]))
        if len(_set)>1:
            return _i
    return len(list_dict)
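# Worked example for getDiffIndex (hypothetical data): it returns the index of
# the first record at which `key` takes a second distinct value, or
# len(list_dict) when all records agree.
def _demo_getDiffIndex():
    rows = [{"v": "100"}, {"v": "100.0"}, {"v": "200"}]
    # "100" and "100.0" normalize to the same float; "200" introduces a second
    # value at index 2, so the result is 2
    return getDiffIndex(rows, "v")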
@annotate('bigint,bigint,string,string,string,string,string,string,string,bigint->string')
class f_remege_limit_num_contain(BaseUDAF):
    '''
    Merge rule: project code, winning bidder, len(project code)>7, winning bidder <> "",
    fewer than 2 distinct non-empty tenderees after merging, and identical non-empty
    amounts for the same announcement type after merging.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def new_buffer(self):
        return [list()]

    def iterate(self, buffer,docid,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column1,contain_column2,notLike_column,confidence):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
                          "contain_column1":contain_column1,"contain_column2":contain_column2,"notLike_column":notLike_column,"confidence":confidence})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def getNotLikeSet(self,_dict,column_name):
        column_value = _dict.get(column_name,None)
        _set = set()
        if column_value is not None:
            for _i in range(1,len(column_value)):
                _set.add(column_value[_i-1:_i+1])
        _dict["notLike_set"] = _set

    def getSimilarity(self,_set1,_set2):
        _sum = max([1,min([len(_set1),len(_set2)])])
        return len(_set1&_set2)/_sum

    def terminate(self, buffer):
        list_group = []
        the_group = buffer[0]
        SIM_PROB = 0.6
        for _d in the_group:
            self.getNotLikeSet(_d,"notLike_column")
        # check whether any restricted column has more than one distinct value
        keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
        re_merge = False
        for _key in keys:
            if len(getSet(the_group,_key))>1:
                re_merge = True
                break
        # check whether any pair is similar but not identical
        re_merge_sim = False
        for _i1 in range(0,len(the_group)):
            for _j1 in range(_i1+1,len(the_group)):
                _set1 = the_group[_i1]["notLike_set"]
                _set2 = the_group[_j1]["notLike_set"]
                _sim = self.getSimilarity(_set1,_set2)
                if _sim>SIM_PROB and _sim<1:
                    re_merge_sim = True
                    break
        contain_keys = ["contain_column1","contain_column2"]
        logging.info(the_group)
        logging.info(str(re_merge)+str(re_merge_sim))
        if re_merge or re_merge_sim:
            the_group.sort(key=lambda x:x["confidence"],reverse=True)
            the_group.sort(key=lambda x:x["page_time_stamp"])
            # regroup
            dict_docid_doc = {}
            for _doc in the_group:
                dict_docid_doc[_doc["docid"]] = _doc
            for _doc in the_group:
                merge_flag = False
                for _index in range(len(list_group)):
                    _g = list_group[_index]
                    hit_count = 0
                    dict_temp = dict()
                    # the multiple-value case: contained columns must match up
                    if re_merge:
                        for _c_key in contain_keys:
                            dict_temp[_c_key] = _g[_c_key]
                            if _g[_c_key] is not None and _doc[_c_key] is not None:
                                if len(_g[_c_key])>len(_doc[_c_key]):
                                    if str(_g[_c_key]).find(str(_doc[_c_key]))>=0:
                                        dict_temp[_c_key] = _g[_c_key]
                                        hit_count += 1
                                else:
                                    if str(_doc[_c_key]).find(str(_g[_c_key]))>=0:
                                        dict_temp[_c_key] = _doc[_c_key]
                                        _g[_c_key] = _doc[_c_key]
                                        hit_count += 1
                    else:
                        hit_count = 1
                    # if hit_count==len(contain_keys):
                    if hit_count>0:
                        _flag_sim = False
                        # the similar-but-not-identical case
                        if re_merge_sim:
                            for _docid in _g["docid"]:
                                tmp_d = dict_docid_doc[_docid]
                                _sim = self.getSimilarity(tmp_d["notLike_set"],_doc["notLike_set"])
                                if _sim>SIM_PROB and _sim<1:
                                    _flag_sim = True
                        if not _flag_sim:
                            for _c_key in dict_temp.keys():
                                _g[_c_key] = dict_temp[_c_key]
                            _g["docid"].append(_doc["docid"])
                            merge_flag = True
                            break
                if not merge_flag:
                    _dict = dict()
                    _dict["docid"] = [_doc["docid"]]
                    for _c_key in contain_keys:
                        _dict[_c_key] = _doc[_c_key]
                    list_group.append(_dict)
            final_group = []
            # check that each restricted column collapses to a single value
            for _group in list_group:
                _split = []
                for _docid in _group["docid"]:
                    _split.append(dict_docid_doc[_docid])
                # sort by confidence to keep as much of each group as possible
                _split.sort(key=lambda x:x["confidence"],reverse=True)
                # cut at the first index where a second distinct value appears
                list_key_index = []
                for _k in keys:
                    list_key_index.append(getDiffIndex(_split,_k))
                _index = min(list_key_index)
                final_group.append([_c["docid"] for _c in _split[:_index]])
                for _c in _split[_index:]:
                    final_group.append([_c["docid"]])
                # if more than two distinct values are found, split every docid
                # into its own group; otherwise keep them as one group
                # _flag = True
                # for _key in keys:
                #     if len(getSet(_split,_key))>1:
                #         _flag = False
                #         break
                # if not _flag:
                #     for _docid in _group["docid"]:
                #         final_group.append([_docid])
                # else:
                #     final_group.append(list(set(_group["docid"])))
        else:
            final_group = [list(set([item["docid"] for item in the_group]))]
        log(str(final_group))
        return json.dumps(final_group)
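# Sketch of the "similar but not identical" test used above (hypothetical
# strings): getNotLikeSet stores the character bigrams of notLike_column, and
# getSimilarity divides the bigram overlap by the size of the smaller set; a
# score strictly between SIM_PROB (0.6) and 1 marks the pair as conflicting.
def _demo_notlike_similarity():
    def bigrams(s):
        return set(s[i-1:i+1] for i in range(1, len(s)))
    s1 = bigrams("第一中学设备采购项目")
    s2 = bigrams("第二中学设备采购项目")
    # 7 shared bigrams over the smaller set of 9 -> ~0.78, which lies in
    # (0.6, 1) and therefore counts as "similar but not identical"
    return len(s1 & s2) / max(1, min(len(s1), len(s2)))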
def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
    _time = time.strftime(format,time.localtime())
    return _time

@annotate('bigint->string')
class f_get_single_merged_bychannel(BaseUDTF):
    def process(self,docid):
        _d = {"data":{str(docid):[]},"process_time":getCurrent_date()}
        self.forward(json.dumps(_d))

@annotate('string->string')
class f_get_single_merged_docids(object):
    def evaluate(self,_json):
        if _json!="" and _json is not None:
            _d = json.loads(_json)
            _keys = _d.get("data",{}).keys()
            return ",".join(list(_keys))
        return ""
@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,bigint,bigint,string->string')
class f_remege_limit_num_contain_bychannel(BaseUDAF):
    '''f_remege_limit_num_contain_bychannel
    Merge rule: project code, winning bidder, len(project code)>7, winning bidder <> "",
    fewer than 2 distinct non-empty tenderees after merging, and identical non-empty
    amounts for the same announcement type after merging.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def new_buffer(self):
        return [list()]

    def iterate(self, buffer,docid,docchannel,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column1,contain_column2,notLike_column,confidence,extract_count,json_dicttime):
        _dict = {"docid":docid,"docchannel":docchannel,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
                 "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
                 "contain_column1":contain_column1,"contain_column2":contain_column2,"notLike_column":notLike_column,"confidence":confidence,
                 "extract_count":extract_count,"json_dicttime":json_dicttime}
        buffer[0].append(_dict)

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def getNotLikeSet(self,_dict,column_name):
        column_value = _dict.get(column_name,None)
        _set = set()
        if column_value is not None:
            for _i in range(1,len(column_value)):
                _set.add(column_value[_i-1:_i+1])
        _dict["notLike_set"] = _set

    def getSimilarity(self,_set1,_set2):
        _sum = max([1,min([len(_set1),len(_set2)])])
        return len(_set1&_set2)/_sum

    def difftimecount(self,_dict1,_dict2):
        _count = 0
        for k,v in _dict1.items():
            if v is not None and v!="":
                v1 = _dict2.get(k)
                if v1 is not None and v1!="":
                    if v!=v1:
                        _count += 1
        return _count

    def splitByTimezone(self,list_dict,_key):
        cluster_docid = []
        dict_docid_key = {}
        dict_docid = {}
        for _dict in list_dict:
            if _dict.get(_key,"") is None or _dict.get(_key,"")=="":
                dict_docid_key[_dict.get("docid")] = {}
            else:
                dict_docid_key[_dict.get("docid")] = json.loads(_dict.get(_key))
            dict_docid[_dict.get("docid")] = _dict
        for _dict in list_dict:
            _find = False
            for _cl in cluster_docid:
                _legal = True
                for _c in _cl:
                    if self.difftimecount(dict_docid_key.get(_c),dict_docid_key.get(_dict.get("docid")))>0:
                        _legal = False
                        break
                if _legal:
                    _cl.append(_dict.get("docid"))
                    _find = True
            if not _find:
                cluster_docid.append([_dict.get("docid")])
        _result = []
        for _cl in cluster_docid:
            _r = []
            for _c in _cl:
                _r.append(dict_docid.get(_c))
            _result.append(_r)
        return _result

    def terminate(self, buffer):
        list_group = []
        the_group = buffer[0]
        SIM_PROB = 0.6
        for _d in the_group:
            self.getNotLikeSet(_d,"notLike_column")
        # check whether any restricted column has more than one distinct value
        keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
        re_merge = False
        for _key in keys:
            if len(getSet(the_group,_key))>1:
                log("has_more_than_one:%s"%str(getSet(the_group,_key)))
                re_merge = True
                break
        # check whether any pair is similar but not identical
        re_merge_sim = False
        for _i1 in range(0,len(the_group)):
            for _j1 in range(_i1+1,len(the_group)):
                _set1 = the_group[_i1]["notLike_set"]
                _set2 = the_group[_j1]["notLike_set"]
                _sim = self.getSimilarity(_set1,_set2)
                if _sim>SIM_PROB and _sim<1:
                    re_merge_sim = True
                    break
        contain_keys = ["contain_column1","contain_column2"]
        logging.info(the_group)
        logging.info(str(re_merge)+str(re_merge_sim))
        # regroup
        dict_docid_doc = {}
        for _doc in the_group:
            dict_docid_doc[_doc["docid"]] = _doc
        if re_merge or re_merge_sim:
            the_group.sort(key=lambda x:x["confidence"],reverse=True)
            the_group.sort(key=lambda x:x["page_time_stamp"])
            for _doc in the_group:
                merge_flag = False
                for _index in range(len(list_group)):
                    _g = list_group[_index]
                    hit_count = 0
                    dict_temp = dict()
                    # the multiple-value case: contained columns must match up
                    if re_merge:
                        for _c_key in contain_keys:
                            dict_temp[_c_key] = _g[_c_key]
                            if _g[_c_key] is not None and _doc[_c_key] is not None:
                                if len(_g[_c_key])>len(_doc[_c_key]):
                                    if str(_g[_c_key]).find(str(_doc[_c_key]))>=0:
                                        dict_temp[_c_key] = _g[_c_key]
                                        hit_count += 1
                                else:
                                    if str(_doc[_c_key]).find(str(_g[_c_key]))>=0:
                                        dict_temp[_c_key] = _doc[_c_key]
                                        _g[_c_key] = _doc[_c_key]
                                        hit_count += 1
                    else:
                        hit_count = 1
                    # if hit_count==len(contain_keys):
                    if hit_count>0:
                        _flag_sim = False
                        # the similar-but-not-identical case
                        if re_merge_sim:
                            for _docid in _g["docid"]:
                                tmp_d = dict_docid_doc[_docid]
                                _sim = self.getSimilarity(tmp_d["notLike_set"],_doc["notLike_set"])
                                if _sim>SIM_PROB and _sim<1:
                                    _flag_sim = True
                        if not _flag_sim:
                            for _c_key in dict_temp.keys():
                                _g[_c_key] = dict_temp[_c_key]
                            _g["docid"].append(_doc["docid"])
                            merge_flag = True
                            break
                if not merge_flag:
                    _dict = dict()
                    _dict["docid"] = [_doc["docid"]]
                    for _c_key in contain_keys:
                        _dict[_c_key] = _doc[_c_key]
                    list_group.append(_dict)
            final_group = []
            # check that each restricted column collapses to a single value
            for _group in list_group:
                _split = []
                for _docid in _group["docid"]:
                    _split.append(dict_docid_doc[_docid])
                # sort by confidence to keep as much of each group as possible
                _split.sort(key=lambda x:x["confidence"],reverse=True)
                # cut at the first index where a second distinct value appears
                list_key_index = []
                for _k in keys:
                    list_key_index.append(getDiffIndex(_split,_k))
                _index = min(list_key_index)
                final_group.append([_c["docid"] for _c in _split[:_index]])
                for _c in _split[_index:]:
                    final_group.append([_c["docid"]])
                # if more than two distinct values are found, split every docid
                # into its own group; otherwise keep them as one group
                # _flag = True
                # for _key in keys:
                #     if len(getSet(_split,_key))>1:
                #         _flag = False
                #         break
                # if not _flag:
                #     for _docid in _group["docid"]:
                #         final_group.append([_docid])
                # else:
                #     final_group.append(list(set(_group["docid"])))
        else:
            final_group = [list(set([item["docid"] for item in the_group]))]
        log("%s--%s"%("final_group",str(final_group)))
        # pick one announcement per channel
        final_group_channel = []
        for _group in final_group:
            dict_channel_id = {}
            otherChannel = 10000
            for _docid in _group:
                _channel = dict_docid_doc[_docid].get("docchannel")
                if _channel in [114,115,116,117]:
                    otherChannel += 1
                    _channel = otherChannel
                if _channel not in dict_channel_id:
                    dict_channel_id[_channel] = []
                dict_channel_id[_channel].append({"docid":_docid,"page_time_stamp":dict_docid_doc[_docid].get("page_time_stamp"),
                                                  "extract_count":dict_docid_doc[_docid].get("extract_count"),
                                                  "json_dicttime":dict_docid_doc[_docid].get("json_dicttime")})
            # split each channel's docs by date
            new_dict_channel_id = {}
            log("%s:%s"%("dict_channel_id",str(dict_channel_id)))
            for k,v in dict_channel_id.items():
                list_time_docids = split_with_time(v,"page_time_stamp",86400*6,more_than_one=False)
                log(list_time_docids)
                for _l in list_time_docids:
                    list_t = self.splitByTimezone(_l,"json_dicttime")
                    for _t in list_t:
                        otherChannel += 1
                        new_dict_channel_id[otherChannel] = _t
            log("%s:%s"%("new_dict_channel_id",str(new_dict_channel_id)))
            channel_dict = {}
            for k,v in new_dict_channel_id.items():
                v.sort(key=lambda x:x["docid"])
                v.sort(key=lambda x:x["extract_count"],reverse=True)
                channel_dict[v[0]["docid"]] = []
                for _docs in v[1:]:
                    channel_dict[v[0]["docid"]].append(_docs["docid"])
            _d = {"data":channel_dict,"process_time":getCurrent_date()}
            final_group_channel.append(_d)
        return json.dumps(final_group_channel)
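# Shape sketch of the value returned by the UDAF above (all ids hypothetical):
# one entry per merged group, mapping the docid kept for each
# (channel, 6-day window, time-fields) bucket to the docids it absorbs:
#   [{"data": {"101": [102, 103], "205": []}, "process_time": "2023-01-01 00:00:00"}]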
@annotate('string -> string')
class f_get_remerge_group_channel(BaseUDTF):
    '''
    unpack multiple groups into one record each
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,json_remerge):
        if json_remerge is not None:
            list_group = json.loads(json_remerge)
            for _group in list_group:
                self.forward(json.dumps(_group))

@annotate('string -> string')
class f_get_remerge_group(BaseUDTF):
    '''
    unpack multiple groups into one record each
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,json_remerge):
        if json_remerge is not None:
            list_group = json.loads(json_remerge)
            for _group in list_group:
                l_g = list(set(_group))
                l_g.sort(key=lambda x:x)
                list_docid = [str(_docid) for _docid in l_g]
                self.forward(",".join(list_docid))
@annotate('bigint,bigint,string->string')
class f_merge_probability(BaseUDAF):
    '''
    merge the group into a single record
    '''
    def __init__(self):
        import json
        global json

    def new_buffer(self):
        return [[]]

    def iterate(self, buffer,docid,page_time_stamp,_type):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"type":_type})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        list_dict = buffer[0]
        list_dict = list_dict[:10000]
        list_group = split_with_time(list_dict,sort_key="page_time_stamp",timedelta=86400*120)
        return json.dumps(list_group)
@annotate('string -> bigint,bigint,bigint,bigint,string')
class f_split_merge_probability(BaseUDTF):
    def __init__(self):
        import logging
        import json
        global logging,json
        logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,list_group_str):
        logging.info("0")
        logging.info(list_group_str)
        if list_group_str is not None:
            logging.info("1")
            try:
                list_group = json.loads(list_group_str)
                logging.info("2")
                for _group in list_group:
                    if len(_group)>0:
                        _type = _group[0].get("type","")
                        logging.info("3%d"%len(list_group))
                        # _group.sort(key=lambda x:x["page_time_stamp"])
                        _len = min(100,len(_group))
                        for _index_i in range(_len):
                            _count = 0
                            for _index_j in range(_index_i+1,_len):
                                if abs(_group[_index_j]["page_time_stamp"]-_group[_index_i]["page_time_stamp"])>86400*120:
                                    break
                                _count += 1
                                _docid1 = _group[_index_i]["docid"]
                                _docid2 = _group[_index_j]["docid"]
                                if _docid1<_docid2:
                                    self.forward(_docid1,_docid2,1,_len,_type)
                                else:
                                    self.forward(_docid2,_docid1,1,_len,_type)
            except Exception as e:
                logging.info(str(e))
@annotate('bigint,bigint,string->string')
class f_merge_groupPairs(BaseUDAF):
    '''
    merge the group into a single record
    '''
    def __init__(self):
        import json
        global json

    def new_buffer(self):
        return [[]]

    def iterate(self, buffer,is_exists,counts,_type):
        buffer[0].append({"is_exists":is_exists,"counts":counts,"_type":_type})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        list_dict = buffer[0]
        list_dict = list_dict[:10000]
        return json.dumps(list_dict)
@annotate("string -> bigint,bigint,bigint")
class f_merge_getLabel(BaseUDTF):
    def __init__(self):
        import logging
        import json
        global logging,json

    def process(self,str_docids):
        if str_docids is not None:
            list_docids = [int(i) for i in str_docids.split(",")]
            list_docids.sort(key=lambda x:x)
            _len = min(100,len(list_docids))
            for index_i in range(_len):
                docid_less = list_docids[index_i]
                for index_j in range(index_i+1,_len):
                    docid_greater = list_docids[index_j]
                    self.forward(docid_less,docid_greater,1)
def getSimilarityOfString(str1,str2,nums=2):
    _set1 = set()
    _set2 = set()
    if str1 is None:
        str1 = ""
    if str2 is None:
        str2 = ""
    if len(str1)<=nums or len(str2)<=nums:
        if str1!=str2:
            return 0.8
        else:
            return 1
    if str1 is not None:
        for i in range(nums,min(1000,len(str1))):
            _set1.add(str1[i-nums:i+1])
    if str2 is not None:
        for i in range(nums,min(1000,len(str2))):
            _set2.add(str2[i-nums:i+1])
    _len = max(1,min(len(_set1),len(_set2)))
    return len(_set1&_set2)/_len
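# Worked example (hypothetical codes): with nums=2 the comparison uses
# 3-character shingles, and the overlap is divided by the smaller shingle set.
def _demo_getSimilarityOfString():
    # "ABC-001" and "ABC-002" each yield 5 shingles and share 4 of them
    # ("ABC", "BC-", "C-0", "-00"), giving 4/5 = 0.8; identical strings give 1,
    # and different strings no longer than 2 characters give 0.8
    return getSimilarityOfString("ABC-001", "ABC-002")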
def check_columns(tenderee_less,tenderee_greater,
                  agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
                  win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
                  bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
    flag = True
    _set_tenderee = set()
    if tenderee_less is not None and tenderee_less!="":
        _set_tenderee.add(tenderee_less)
    if tenderee_greater is not None and tenderee_greater!="":
        _set_tenderee.add(tenderee_greater)
    if len(_set_tenderee)>1:
        return False
    code_sim = getSimilarityOfString(project_code_less,project_code_greater)
    if code_sim>0.6 and code_sim<1:
        return False
    # same batch but different project codes
    if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
        _split_code_less = project_code_less.split("-")
        _split_code_greater = project_code_greater.split("-")
        if len(_split_code_less)>1 and len(_split_code_greater)>1:
            if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
                return False
    _set_win_tenderer = set()
    if win_tenderer_less is not None and win_tenderer_less!="":
        _set_win_tenderer.add(win_tenderer_less)
    if win_tenderer_greater is not None and win_tenderer_greater!="":
        _set_win_tenderer.add(win_tenderer_greater)
    if len(_set_win_tenderer)>1:
        return False
    _set_win_bid_price = set()
    if win_bid_price_less is not None and win_bid_price_less!="":
        _set_win_bid_price.add(float(win_bid_price_less))
    if win_bid_price_greater is not None and win_bid_price_greater!="":
        _set_win_bid_price.add(float(win_bid_price_greater))
    if len(_set_win_bid_price)>1:
        return False
    _set_bidding_budget = set()
    if bidding_budget_less is not None and bidding_budget_less!="":
        _set_bidding_budget.add(float(bidding_budget_less))
    if bidding_budget_greater is not None and bidding_budget_greater!="":
        _set_bidding_budget.add(float(bidding_budget_greater))
    if len(_set_bidding_budget)>1:
        return False
    return True
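# Sketch of the veto logic above (hypothetical values): two documents from the
# same batch, i.e. the same project-code prefix before "-", but with different
# full codes are refused even when everything else is compatible.
def _demo_check_columns_veto():
    # "AB-1" vs "AB-20999": shingle similarity is 0.5 (<=0.6), so the code-
    # similarity veto does not fire, but the shared "AB" batch prefix with
    # different full codes does -> returns False
    return check_columns("甲单位", "甲单位",
                         None, None, "AB-1", "AB-20999", None, None,
                         None, None, None, None,
                         None, None, None, None)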
def getSimLevel(str1,str2):
    str1_null = False
    str2_null = False
    _v = 0
    if str1 is None or str1=="":
        str1_null = True
    if str2 is None or str2=="":
        str2_null = True
    if str1_null and str2_null:
        _v = 2
    elif str1_null and not str2_null:
        _v = 4
    elif not str1_null and str2_null:
        _v = 6
    elif not str1_null and not str2_null:
        if str1==str2:
            _v = 10
        else:
            _v = 0
    return _v
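# getSimLevel maps the null/equality pattern of a field pair to a small code
# (scaled by /10 where it is used below): both empty -> 2, only str1 empty -> 4,
# only str2 empty -> 6, equal and non-empty -> 10, different and non-empty -> 0.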
import math

def featurnCount(_count,max_count=100):
    return max(0,min(1,_count))*(1/math.sqrt(max(1,_count-1)))
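# featurnCount dampens a raw co-occurrence count c into roughly 1/sqrt(c-1):
# featurnCount(0)=0, featurnCount(1)=featurnCount(2)=1, featurnCount(5)=0.5,
# featurnCount(101)=0.1. The max_count parameter is currently unused.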
def getLength(_str):
    return len(_str if _str is not None else "")

@annotate("string->bigint")
class f_get_min_counts(object):
    def evaluate(self,json_context):
        _context = json.loads(json_context)
        min_counts = 100
        for item in _context:
            if item["counts"]<min_counts:
                min_counts = item["counts"]
        return min_counts
@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
class f_merge_featureMatrix(BaseUDTF):
    def __init__(self):
        import logging
        import json
        global logging,json

    def process(self,json_context,tenderee_less,tenderee_greater,
                agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
                win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
                bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
        if not check_columns(tenderee_less,tenderee_greater,
                             agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
                             win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
                             bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
            return
        _context = json.loads(json_context)
        min_counts = 100
        dict_context = {}
        for item in _context:
            if item["counts"]<min_counts:
                min_counts = item["counts"]
            dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
        context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
        list_matrix = []
        for index_i in range(len(context_key)):
            for index_j in range(index_i+1,len(context_key)):
                _key = "%s&%s"%(context_key[index_i],context_key[index_j])
                _v = featurnCount(dict_context.get(_key,[0,0])[1])
                list_matrix.append(_v)
        context3_key = ["tenderee","agency","win_tenderer","win_bid_price","bidding_budget"]
        for index_i in range(len(context3_key)):
            for index_j in range(index_i+1,len(context3_key)):
                for index_k in range(index_j+1,len(context3_key)):
                    _key = "%s&%s&%s"%(context3_key[index_i],context3_key[index_j],context3_key[index_k])
                    _v = featurnCount(dict_context.get(_key,[0,0])[1])
                    list_matrix.append(_v)
        list_matrix.append(getSimLevel(tenderee_less,tenderee_greater)/10)
        list_matrix.append(getSimLevel(agency_less,agency_greater)/10)
        list_matrix.append(getSimilarityOfString(project_code_less,project_code_greater))
        list_matrix.append(getSimilarityOfString(project_name_less,project_name_greater))
        list_matrix.append(getSimLevel(win_tenderer_less,win_tenderer_greater)/10)
        list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
        list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
        list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
        # set_tenderer = set()
        # if tenderee_less is not None and tenderee_less!="":
        #     set_tenderer.add(tenderee_less)
        # if tenderee_greater is not None and tenderee_greater!="":
        #     set_tenderer.add(tenderee_greater)
        #
        # set_win_tenderer = set()
        # if win_tenderer_less is not None and win_tenderer_less!="":
        #     set_win_tenderer.add(win_tenderer_less)
        # if win_tenderer_greater is not None and win_tenderer_greater!="":
        #     set_win_tenderer.add(win_tenderer_greater)
        #
        # set_bidding_budget = set()
        # if bidding_budget_less is not None and bidding_budget_less!="":
        #     set_bidding_budget.add(bidding_budget_less)
        # if bidding_budget_greater is not None and bidding_budget_greater!="":
        #     set_bidding_budget.add(bidding_budget_greater)
        #
        # set_win_bid_price = set()
        # if win_bid_price_less is not None and win_bid_price_less!="":
        #     set_win_bid_price.add(win_bid_price_less)
        # if win_bid_price_greater is not None and win_bid_price_greater!="":
        #     set_win_bid_price.add(win_bid_price_greater)
        json_matrix = json.dumps(list_matrix)
        same_project_code = False
        if project_code_less==project_code_greater and getLength(project_code_less)>0:
            same_project_code = True
        same_project_name = False
        if project_name_less==project_name_greater and getLength(project_name_less)>0:
            same_project_name = True
        same_doctitle_refine = False
        if doctitle_refine_less==doctitle_refine_greater and getLength(doctitle_refine_less)>0:
            same_doctitle_refine = True
        same_tenderee = False
        if tenderee_less==tenderee_greater and getLength(tenderee_less)>0:
            same_tenderee = True
        same_agency = False
        if agency_less==agency_greater and getLength(agency_less)>0:
            same_agency = True
        same_bidding_budget = False
        if bidding_budget_less==bidding_budget_greater and getLength(bidding_budget_less)>0:
            same_bidding_budget = True
        same_win_tenderer = False
        if win_tenderer_less==win_tenderer_greater and getLength(win_tenderer_less)>0:
            same_win_tenderer = True
        same_win_bid_price = False
        if win_bid_price_less==win_bid_price_greater and getLength(win_bid_price_less)>0:
            same_win_bid_price = True
        contain_doctitle = False
        if getLength(doctitle_refine_less)>0 and getLength(doctitle_refine_greater)>0 and (doctitle_refine_less in doctitle_refine_greater or doctitle_refine_greater in doctitle_refine_less):
            contain_doctitle = True
        contain_project_name = False
        if getLength(project_name_less)>0 and getLength(project_name_greater)>0 and (project_name_less in project_name_greater or project_name_greater in project_name_less):
            contain_project_name = True
        total_money_less = (0 if getLength(bidding_budget_less)==0 else float(bidding_budget_less))+(0 if getLength(win_bid_price_less)==0 else float(win_bid_price_less))
        total_money_greater = (0 if getLength(bidding_budget_greater)==0 else float(bidding_budget_greater))+(0 if getLength(win_bid_price_greater)==0 else float(win_bid_price_greater))
        if min_counts<10:
            _prob = 0.9
            if same_project_code and same_win_tenderer and same_tenderee:
                self.forward(json_matrix,_prob)
                return
            if same_tenderee and same_project_name and same_win_tenderer:
                self.forward(json_matrix,_prob)
                return
            if same_tenderee and same_doctitle_refine and same_win_tenderer:
                self.forward(json_matrix,_prob)
                return
            if same_tenderee and same_win_bid_price and same_win_tenderer:
                self.forward(json_matrix,_prob)
                return
            if same_project_code and same_win_bid_price and same_win_tenderer:
                self.forward(json_matrix,_prob)
                return
            if same_project_name and same_win_bid_price and same_win_tenderer:
                self.forward(json_matrix,_prob)
                return
            if same_doctitle_refine and same_win_bid_price and same_win_tenderer:
                self.forward(json_matrix,_prob)
                return
            if same_doctitle_refine and same_bidding_budget and same_win_tenderer:
                self.forward(json_matrix,_prob)
                return
            if same_tenderee and same_doctitle_refine and same_win_tenderer:
                self.forward(json_matrix,_prob)
                return
            if same_tenderee and same_project_code and same_project_name:
                self.forward(json_matrix,_prob)
                return
            if same_tenderee and same_project_code and same_doctitle_refine:
                self.forward(json_matrix,_prob)
                return
            if same_tenderee and same_bidding_budget and same_project_code:
                self.forward(json_matrix,_prob)
                return
            if same_tenderee and same_bidding_budget and same_doctitle_refine:
                self.forward(json_matrix,_prob)
                return
            if same_tenderee and same_bidding_budget and same_project_name:
                self.forward(json_matrix,_prob)
                return
            if same_doctitle_refine and same_project_code and same_project_name:
                self.forward(json_matrix,_prob)
                return
        if min_counts<=5:
            _prob = 0.8
            if same_project_code and same_tenderee:
                self.forward(json_matrix,_prob)
                return
            if same_project_code and same_win_tenderer:
                self.forward(json_matrix,_prob)
                return
            if same_project_name and same_project_code:
                self.forward(json_matrix,_prob)
                return
            if same_project_code and same_doctitle_refine:
                self.forward(json_matrix,_prob)
                return
            if total_money_less==total_money_greater and total_money_less>100000:
                if same_win_tenderer and (same_win_bid_price or same_bidding_budget):
                    self.forward(json_matrix,_prob)
                    return
            if same_project_code and same_bidding_budget:
                self.forward(json_matrix,_prob)
                return
            if same_project_code and same_win_bid_price:
                self.forward(json_matrix,_prob)
                return
            if same_bidding_budget and same_win_bid_price and (contain_project_name or contain_doctitle):
                self.forward(json_matrix,_prob)
                return
        if min_counts<=3:
            _prob = 0.7
            if same_project_name or same_project_code or same_doctitle_refine or contain_doctitle or contain_project_name:
                self.forward(json_matrix,_prob)
                return
        self.forward(json_matrix,0)
class MergePredictor():

    def __init__(self):
        self.input_size = 46
        self.output_size = 2
        self.matrix = np.array([[-5.817399024963379, 3.367797374725342], [-18.3098201751709, 17.649206161499023], [-7.115952014923096, 9.236002922058105], [-5.054129123687744, 1.8316771984100342], [6.391637325286865, -7.57396125793457], [-2.8721542358398438, 6.826520919799805], [-5.426159858703613, 10.235260009765625], [-4.240962982177734, -0.32092899084091187], [-0.6378090381622314, 0.4834124445915222], [-1.7574478387832642, -0.17846578359603882], [4.325063228607178, -2.345501661300659], [0.6086963415145874, 0.8325914740562439], [2.5674285888671875, 1.8432368040084839], [-11.195490837097168, 17.4630184173584], [-11.334247589111328, 10.294097900390625], [2.639320135116577, -8.072785377502441], [-2.2689898014068604, -3.6194612979888916], [-11.129570960998535, 18.907018661499023], [4.526485919952393, 4.57423210144043], [-3.170452356338501, -1.3847776651382446], [-0.03280467540025711, -3.0471489429473877], [-6.601675510406494, -10.05613899230957], [-2.9116673469543457, 4.819308280944824], [1.4398306608200073, -0.6549674272537231], [7.091512203216553, -0.142232745885849], [-0.14478975534439087, 0.06628061085939407], [-6.775437831878662, 9.279582023620605], [-0.006781991105526686, 1.6472798585891724], [3.83730149269104, 1.4072834253311157], [1.2229349613189697, -2.1653425693511963], [1.445560336112976, -0.8397432565689087], [-11.325132369995117, 11.231744766235352], [2.3229124546051025, -4.623719215393066], [0.38562265038490295, -1.2645516395568848], [-1.3670002222061157, 2.4323790073394775], [-3.6994268894195557, 0.7515658736228943], [-0.11617227643728256, -0.820703387260437], [4.089913368225098, -4.693605422973633], [-0.4959050714969635, 1.5272167921066284], [-2.7135870456695557, -0.5120691657066345], [0.573157548904419, -1.9375460147857666], [-4.262857437133789, 0.6375582814216614], [-1.8825865983963013, 2.427532911300659], [-4.565115451812744, 4.0269083976745605], [-4.339804649353027, 6.754288196563721], [-4.31907320022583, 0.28193211555480957]])
        self.bias = np.array([16.79706382751465, -13.713337898254395])
        # self.model = load_model("model/merge.h5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})

    def activation(self,vec,_type):
        if _type=="relu":
            _vec = np.array(vec)
            return _vec*(_vec>0)
        if _type=="tanh":
            return np.tanh(vec)
        if _type=="softmax":
            _vec = np.array(vec)
            _exp = np.exp(_vec)
            # normalize per row so batched inputs also yield valid probabilities
            return _exp/np.sum(_exp,axis=-1,keepdims=True)

    def predict(self,input):
        _out = self.activation(self.activation(np.matmul(np.array(input).reshape(-1,self.input_size),self.matrix)+self.bias,"tanh"),"softmax")
        # print(self.model.predict(np.array(input).reshape(-1,46)))
        return _out
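# Usage sketch (illustrative only; assumes numpy is importable as np at module
# scope, as the UDTF below arranges):
#   mp = MergePredictor()
#   probs = mp.predict([0.0]*46)   # shape (1, 2)
#   merge_prob = probs[0][1]       # column 1 is the "merge" class
# predict() is a hand-rolled one-layer net, softmax(tanh(x @ matrix + bias)),
# standing in for the commented-out keras model above.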
@annotate('string,double -> double')
class f_getMergeProb(BaseUDTF):

    def __init__(self):
        import json
        include_package_path("numpy-1.18.zip")
        import numpy as np
        global json,np
        self.mp = MergePredictor()

    def process(self,json_matrix,pre_prob):
        if not pre_prob>0.5:
            _matrix = json.loads(json_matrix)
            _prob = self.mp.predict(_matrix)[0][1]
        else:
            _prob = pre_prob
        if _prob>0.5:
            self.forward(float(_prob))
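# Usage sketch (hypothetical SQL; assumes the UDTF is registered under this name):
#   SELECT f_getMergeProb(json_matrix, pre_prob) FROM candidate_pairs;
# Pairs already accepted by the rules (pre_prob > 0.5) pass through unchanged;
# the rest are scored by the model, and a row is emitted only above 0.5.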
@annotate('string -> bigint,bigint')
class f_check_remerge_channel(BaseUDTF):
    '''
    Split each group into one record per document.
    '''

    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,json_remerge):
        if json_remerge is not None:
            list_group = json.loads(json_remerge)
            for _group in list_group:
                _keys = _group.get("data").keys()
                if len(_keys)>0:
                    main_docid = int(list(_keys)[0])
                    for k,v in _group.get("data",{}).items():
                        self.forward(main_docid,int(k))
                        for _v in v:
                            self.forward(main_docid,int(_v))
@annotate('string -> bigint,bigint')
class f_check_remerge(BaseUDTF):
    '''
    Split each group into one record per document.
    '''

    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,json_remerge):
        if json_remerge is not None:
            list_group = json.loads(json_remerge)
            for _group in list_group:
                for _docid in _group:
                    self.forward(_group[-1],_docid)
def getConfidence(rule_id):
    if rule_id>=1 and rule_id<=20:
        return 30
    elif rule_id>=31 and rule_id<=50:
        return 20
    else:
        return 10
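# e.g. getConfidence(5) -> 30, getConfidence(40) -> 20; anything else,
# including the 21-30 gap, falls through to 10.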
@annotate('string,bigint -> bigint,bigint,bigint')
class f_arrange_group_single(BaseUDTF):
    '''
    Split each group into multiple records (one per ordered pair of documents).
    '''

    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,json_set_docid,rule_id):
        if json_set_docid is not None:
            list_group = json.loads(json_set_docid)
            for _group in list_group:
                for index_i in range(len(_group)):
                    for index_j in range(len(_group)):
                        # if index_i!=index_j and _group[index_i]!=_group[index_j]:
                        if index_i!=index_j:
                            self.forward(_group[index_i],_group[index_j],getConfidence(rule_id))
@annotate('bigint,bigint->string')
class f_get_merge_docids(BaseUDAF):
    '''
    Merge a group into a single record.
    '''

    def __init__(self):
        import json
        global json

    def new_buffer(self):
        return [set()]

    def iterate(self, buffer,docid1,docid2):
        buffer[0].add(docid1)
        buffer[0].add(docid2)

    def merge(self, buffer, pbuffer):
        buffer[0] |= pbuffer[0]

    def terminate(self, buffer):
        set_docid = buffer[0]
        list_docid = list(set_docid)
        list_docid.sort()
        list_docid_str = []
        for _docid in list_docid:
            list_docid_str.append(str(_docid))
        return ",".join(list_docid_str)
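# Usage sketch: aggregating the pairs (1,3) and (3,2) buffers {1,2,3} and
# terminate() returns "1,2,3" (numeric sort, comma-joined).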
@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string->string")
class f_encode_time(object):

    def evaluate(self,time_bidclose,time_bidopen,time_bidstart,time_commencement,time_completion,time_earnest_money_end,time_earnest_money_start,time_get_file_end,time_get_file_start,time_publicity_end,time_publicity_start,time_registration_end,time_registration_start,time_release):
        _dict = {"time_bidclose":time_bidclose,"time_bidopen":time_bidopen,"time_bidstart":time_bidstart,
                 "time_commencement":time_commencement,"time_completion":time_completion,"time_earnest_money_end":time_earnest_money_end,
                 "time_earnest_money_start":time_earnest_money_start,"time_get_file_end":time_get_file_end,"time_get_file_start":time_get_file_start,
                 "time_publicity_end":time_publicity_end,"time_publicity_start":time_publicity_start,"time_registration_end":time_registration_end,
                 "time_registration_start":time_registration_start,"time_release":time_release}
        _encode = json.dumps(_dict)
        return _encode
@annotate('string,string -> string,string')
class f_decode_ruwei(BaseUDTF):

    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self, page_time,sub_docs_json):
        if sub_docs_json is not None:
            for sub_docs in json.loads(sub_docs_json):
                if sub_docs.get("win_tenderer","")!="":
                    self.forward(page_time,sub_docs.get("win_tenderer",""))
                if sub_docs.get("second_tenderer","")!="":
                    self.forward(page_time,sub_docs.get("second_tenderer",""))
                if sub_docs.get("third_tenderer","")!="":
                    self.forward(page_time,sub_docs.get("third_tenderer",""))
@annotate('string,string -> bigint,string')
class f_get_docid_uuid(BaseUDTF):

    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self, uuid,docids):
        log("%s-%s"%(str(uuid),str(docids)))
        if docids is not None and docids!="":
            l_docid = docids.split(",")
            for _docid in l_docid:
                try:
                    self.forward(int(_docid),uuid)
                except Exception as e:
                    pass
@annotate('string,string->string')
class f_concat_str(BaseUDAF):
    '''
    Merge a group into a single concatenated string.
    '''

    def __init__(self):
        import json
        global json

    def new_buffer(self):
        return [[]]

    def iterate(self, buffer,_str,concat_str):
        buffer[0].append([_str,concat_str])

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        list_str_concat = buffer[0]
        list_str = [a[0] for a in list_str_concat]
        concat_str = ","
        if len(list_str_concat)>0:
            concat_str = list_str_concat[0][1]
        return concat_str.join(list_str)
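# Usage sketch: rows ("a",";") and ("b",";") buffer to [["a",";"],["b",";"]]
# and terminate() returns "a;b"; the first buffered row's separator wins and
# "," is the fallback for an empty group.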
def generate_common_properties(list_docs):
    '''
    Generate the common (project-level) properties.
    :param list_docs:
    :return:
    '''
    # pick each field's value by majority count
    choose_dict = {}
    project_dict = {}
    for _key in [document_bidway,document_industry,document_info_type,document_info_source,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count]:
        for _doc in list_docs:
            _value = _doc.get(_key,"")
            if _value!="":
                if _key not in choose_dict:
                    choose_dict[_key] = {}
                if _value not in choose_dict[_key]:
                    choose_dict[_key][_value] = 0
                choose_dict[_key][_value] += 1
    _find = False
    dict_count = {}
    for _doc in list_docs:
        for _key in [document_district,document_city,document_province,document_area]:
            loc = _doc.get(_key,"未知")
            if loc not in ('全国','未知',"0"):
                if loc not in dict_count:
                    dict_count[loc] = 0
                dict_count[loc] += 1
    list_loc = []
    for _doc in list_docs:
        _d = {"count":0}
        for _key in [document_district,document_city,document_province,document_area]:
            loc = _doc.get(_key,"未知")
            _d[_key] = loc
            _d["count"] += dict_count.get(loc,0)
            if _key==document_district and loc not in ("全国","未知",""):
                _d["count"] += 1
            if _key==document_city and loc not in ("全国","未知",""):
                _d["count"] += 1
            if _key==document_province and loc not in ("全国","未知",""):
                _d["count"] += 1
            if _key==document_area and loc not in ("全国","未知",""):
                _d["count"] += 1
        list_loc.append(_d)
    list_loc.sort(key=lambda x:x.get("count",0),reverse=True)
    if len(list_loc)>0:
        # take the location of the highest-scoring entry; reading the loop
        # variable here would ignore the sort and pick the last document
        project_dict[document_district] = list_loc[0].get(document_district)
        project_dict[document_city] = list_loc[0].get(document_city)
        project_dict[document_province] = list_loc[0].get(document_province)
        project_dict[document_area] = list_loc[0].get(document_area)
        _find = True
    # print(dict_count)
    # print(len(list_docs))
    # print("list_loc",list_loc,project_dict)
    # the per-key majority vote below would mix provinces and cities, so it
    # stays disabled
    # for _key in [document_district,document_city,document_province,document_area]:
    #     area_dict = {}
    #     for _doc in list_docs:
    #         loc = _doc.get(_key,"未知")
    #         if loc not in ('全国','未知',"0"):
    #             if loc not in area_dict:
    #                 area_dict[loc] = 0
    #             area_dict[loc] += 1
    #     list_loc = []
    #     for k,v in area_dict.items():
    #         list_loc.append([k,v])
    #     list_loc.sort(key=lambda x:x[1],reverse=True)
    #     if len(list_loc)>0:
    #         project_dict[document_district] = _doc.get(document_district)
    #         project_dict[document_city] = _doc.get(document_city)
    #         project_dict[document_province] = _doc.get(document_province)
    #         project_dict[document_area] = _doc.get(document_area)
    #         _find = True
    #         break
    # if not _find:
    #     if len(list_docs)>0:
    #         project_dict[document_district] = list_docs[0].get(document_district)
    #         project_dict[document_city] = list_docs[0].get(document_city)
    #         project_dict[document_province] = list_docs[0].get(document_province)
    #         project_dict[document_area] = list_docs[0].get(document_area)
    for _key,_value in choose_dict.items():
        _l = []
        for k,v in _value.items():
            _l.append([k,v])
        _l.sort(key=lambda x:x[1],reverse=True)
        if len(_l)>0:
            _v = _l[0][0]
            if _v in ('全国','未知'):
                if len(_l)>1:
                    _v = _l[1][0]
            project_dict[_key] = _v
    list_dynamics = []
    docid_number = 0
    visuable_docids = []
    zhao_biao_page_time = ""
    zhong_biao_page_time = ""
    list_codes = []
    list_product = []
    p_page_time = ""
    remove_docids = set()
    set_nlp_enterprise = set()
    set_nlp_enterprise_attachment = set()
    for _doc in list_docs:
        table_name = _doc.get("table_name")
        status = _doc.get(document_status,0)
        _save = _doc.get(document_tmp_save,1)
        doctitle = _doc.get(document_doctitle,"")
        docchannel = _doc.get(document_docchannel)
        page_time = _doc.get(document_page_time,"")
        _docid = _doc.get(document_docid)
        _bidway = _doc.get(document_bidway,"")
        _docchannel = _doc.get(document_life_docchannel,0)
        project_codes = _doc.get(document_project_codes)
        product = _doc.get(document_product)
        sub_docs = _doc.get("sub_docs",[])
        is_multipack = True if len(sub_docs)>1 else False
        extract_count = _doc.get(document_tmp_extract_count,0)
        try:
            set_nlp_enterprise |= set(json.loads(_doc.get(document_nlp_enterprise,"[]")))
            set_nlp_enterprise_attachment |= set(json.loads(_doc.get(document_nlp_enterprise_attachment,"[]")))
        except Exception as e:
            traceback.print_exc()
        if product is not None:
            list_product.extend(product.split(","))
        if project_codes is not None:
            _c = project_codes.split(",")
            list_codes.extend(_c)
        if p_page_time=="":
            p_page_time = page_time
        if zhao_biao_page_time=="" and _docchannel in (51,52,102,103,114):
            zhao_biao_page_time = page_time
        if zhong_biao_page_time=="" and _docchannel in (101,118,119,120):
            zhong_biao_page_time = page_time
        is_visuable = 0
        if table_name=="document":
            if status>=201 and status<=300:
                docid_number += 1
                visuable_docids.append(str(_docid))
                is_visuable = 1
            else:
                remove_docids.add(str(_docid))
        else:
            if _save==1:
                docid_number += 1
                visuable_docids.append(str(_docid))
                is_visuable = 1
            else:
                remove_docids.add(str(_docid))
        list_dynamics.append({document_docid:_docid,
                              document_doctitle:doctitle,
                              document_docchannel:_docchannel,
                              document_bidway:_bidway,
                              document_page_time:page_time,
                              document_status:201 if is_visuable==1 else 401,
                              "is_multipack":is_multipack,
                              document_tmp_extract_count:extract_count
                              })
    project_dict[project_project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
    project_dict[project_docid_number] = docid_number
    project_dict[project_docids] = ",".join(list(set(visuable_docids)-remove_docids))
    if zhao_biao_page_time!="":
        project_dict[project_zhao_biao_page_time] = zhao_biao_page_time
    if zhong_biao_page_time!="":
        project_dict[project_zhong_biao_page_time] = zhong_biao_page_time
    project_dict[project_project_codes] = ",".join(list(set(list_codes)))
    project_dict[project_page_time] = p_page_time
    project_dict[project_product] = ",".join(list(set(list_product)))
    project_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
    project_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
    return project_dict
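# Usage sketch (hedged; keys are the module-level document_*/project_*
# constants):
#   docs = [{"table_name":"document", document_status:201, document_docid:1,
#            document_page_time:"2022-01-01"}]
#   proj = generate_common_properties(docs)
#   proj[project_docids] == "1" and proj[project_docid_number] == 1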
def generate_packages_properties(list_docs):
    '''
    Generate the package (sub-project) properties.
    :param list_docs:
    :return:
    '''
    list_properties = []
    set_key = set()
    for _doc in list_docs:
        sub_docs = _doc.get("sub_docs")
        if sub_docs is not None:
            for _d in sub_docs:
                sub_project_code = _d.get(project_sub_project_code,"")
                sub_project_name = _d.get(project_sub_project_name,"")
                win_tenderer = _d.get(project_win_tenderer,"")
                win_bid_price = _d.get(project_win_bid_price,"")
                if sub_project_name=="Project":
                    win_exists = False
                    win_price_exists = False
                    win_sum = 0
                    for _d1 in sub_docs:
                        # skip the placeholder "Project" rows themselves
                        if _d1.get(project_sub_project_name,"")=="Project":
                            continue
                        if _d1.get(project_win_tenderer,"")==win_tenderer:
                            win_exists = True
                        if _d1.get(project_win_tenderer,"")==win_tenderer and _d1.get(project_win_bid_price,"")!="":
                            win_sum += float(_d1.get(project_win_bid_price,0))
                        if _d1.get(project_win_bid_price,"")==win_bid_price:
                            win_price_exists = True
                    if win_exists and (win_price_exists or win_bid_price=="" or float(win_bid_price)==0 or float(win_bid_price)==win_sum):
                        continue
                _key = "%s-%s-%s-%s"%(sub_project_code,sub_project_name,win_tenderer,win_bid_price)
                if _key in set_key:
                    continue
                set_key.add(_key)
                list_properties.append(_d)
    return list_properties
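# The dedup key is "code-name-winner-price", so a real package row suppresses
# a placeholder "Project" row that only repeats the same winner/price data.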
def generate_projects(list_docs):
    '''
    Generate projects from the announcement documents.
    :param list_docs:
    :return:
    '''
    # determine the packages
    list_projects = []
    project_dict = generate_common_properties(list_docs)
    list_package_properties = generate_packages_properties(list_docs)
    # build one project per package
    for _pp in list_package_properties:
        _pp.update(project_dict)
        list_projects.append(_pp)
    return list_projects
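# Usage sketch: for a two-package award document, generate_projects returns two
# dicts that share the common fields but differ in sub_project_name,
# win_tenderer and win_bid_price.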
@annotate("string->bigint")
class totimestamp(object):

    def __init__(self):
        import time
        global time
        import logging
        import json
        import re
        global json,logging,re
        self.time_pattern = r"\d{4}-\d{2}-\d{2}.*"
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def evaluate(self, str_time):
        try:
            logging.info(str_time)
            if str_time is not None and re.search(self.time_pattern,str_time) is not None:
                timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
                timeStamp = int(time.mktime(timeArray))
                return timeStamp
            else:
                return 0
        except Exception as e:
            return 0
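# e.g. totimestamp().evaluate("2022-01-02 10:00:00") returns the epoch seconds
# of 2022-01-02 00:00:00 local time; non-matching or malformed input returns 0.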
@annotate('bigint,string,string,bigint,string,bigint,string,string,string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string -> string,string,bigint,string,string,string,string,string,double,string,double,string,string')
class f_generate_projects_from_document(BaseUDTF):

    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.ToTimeStamp = totimestamp()

    def process(self, docid,
                extract_json,
                doctitle,
                save,
                bidway,
                status,
                page_time,
                info_source,
                fingerprint,
                docchannel,
                life_docchannel,
                area,
                province,
                city,
                district,
                sub_docs_json,
                industry,
                info_type,
                qcodes,
                project_name,
                project_code,
                tenderee,
                tenderee_addr,
                tenderee_phone,
                tenderee_contact,
                agency,
                agency_phone,
                agency_contact,
                procurement_system,
                project_codes,
                product,
                moneysource,
                time_bidclose,
                time_bidopen,
                time_bidstart,
                time_commencement,
                time_completion,
                time_earnest_money_start,
                time_earnest_money_end,
                time_get_file_end,
                time_get_file_start,
                time_publicity_end,
                time_publicity_start,
                time_registration_end,
                time_registration_start,
                time_release,
                extract_count,
                uuids):
        attrs_dict = {}
        _extract = {}
        try:
            attrs_dict["sub_docs"] = json.loads(sub_docs_json)
            _extract = json.loads(extract_json)
        except Exception as e:
            pass
        attrs_dict[document_nlp_enterprise] = json.dumps(_extract.get(document_nlp_enterprise,[]),ensure_ascii=False)
        attrs_dict[document_nlp_enterprise_attachment] = json.dumps(_extract.get(document_nlp_enterprise_attachment,[]),ensure_ascii=False)
        attrs_dict[document_docid] = docid
        attrs_dict[document_doctitle] = doctitle
        attrs_dict[document_tmp_save] = save
        attrs_dict[document_bidway] = bidway
        attrs_dict[document_status] = status
        attrs_dict[document_page_time] = page_time
        attrs_dict[document_info_source] = info_source
        attrs_dict[document_fingerprint] = fingerprint
        attrs_dict[document_docchannel] = docchannel
        if life_docchannel is not None:
            attrs_dict[document_life_docchannel] = life_docchannel
        else:
            attrs_dict[document_life_docchannel] = docchannel
        attrs_dict[document_area] = area
        attrs_dict[document_province] = province
        attrs_dict[document_city] = city
        attrs_dict[document_district] = district
        attrs_dict[document_tmp_sub_docs_json] = sub_docs_json
        attrs_dict[document_industry] = industry
        attrs_dict[document_info_type] = info_type
        attrs_dict[document_qcodes] = qcodes
        attrs_dict[document_project_name] = project_name
        attrs_dict[document_project_code] = project_code
        attrs_dict[document_tenderee] = tenderee
        attrs_dict[document_tenderee_addr] = tenderee_addr
        attrs_dict[document_tenderee_phone] = tenderee_phone
        attrs_dict[document_tenderee_contact] = tenderee_contact
        attrs_dict[document_agency] = agency
        attrs_dict[document_agency_phone] = agency_phone
        attrs_dict[document_agency_contact] = agency_contact
        attrs_dict[project_procurement_system] = procurement_system
        attrs_dict[document_project_codes] = project_codes
        attrs_dict[document_product] = product
        attrs_dict[document_moneysource] = moneysource
        attrs_dict[document_time_bidclose] = time_bidclose
        attrs_dict[document_time_bidopen] = time_bidopen
        attrs_dict[document_time_bidstart] = time_bidstart
        attrs_dict[document_time_commencement] = time_commencement
        attrs_dict[document_time_completion] = time_completion
        attrs_dict[document_time_earnest_money_start] = time_earnest_money_start
        attrs_dict[document_time_earnest_money_end] = time_earnest_money_end
        attrs_dict[document_time_get_file_end] = time_get_file_end
        attrs_dict[document_time_get_file_start] = time_get_file_start
        attrs_dict[document_time_publicity_end] = time_publicity_end
        attrs_dict[document_time_publicity_start] = time_publicity_start
        attrs_dict[document_time_registration_end] = time_registration_end
        attrs_dict[document_time_registration_start] = time_registration_start
        attrs_dict[document_time_release] = time_release
        attrs_dict[document_tmp_extract_count] = _extract.get(document_tmp_extract_count,0)
        attrs_dict["table_name"] = "document"
        list_projects = generate_projects([attrs_dict])
        if len(list_projects)>0:
            list_projects[0][project_delete_uuid] = uuids if uuids is not None else ""
        log(str(list_projects))
        for _project in list_projects:
            _uuid = uuid4().hex
            docids = _project.get(project_docids,"")
            page_time = _project.get(project_page_time,"")
            project_name = _project.get(project_project_name,"")
            project_codes = _project.get(project_project_codes,"")
            tenderee = _project.get(project_tenderee,"")
            agency = _project.get(project_agency,"")
            bidding_budget = float(_project.get(project_bidding_budget,-1))
            win_tenderer = _project.get(project_win_tenderer,"")
            win_bid_price = float(_project.get(project_win_bid_price,-1))
            product = _project.get(project_product,"")
            attrs_json = json.dumps(_project,ensure_ascii=False)
            list_codes = project_codes.split(",")
            page_time_stamp = self.ToTimeStamp.evaluate(page_time)
            if len(list_codes)==0:
                list_codes.append("")
            list_product = product.split(",")
            if len(list_product)==0:
                list_product.append("")
            for _i in range(min(max(len(list_codes),len(list_product)),20)):
                _project_code = list_codes[_i%len(list_codes)]
                _product = list_product[_i%len(list_product)]
                self.forward(_uuid,page_time,page_time_stamp,docids,project_name,_project_code,tenderee,agency,bidding_budget,win_tenderer,win_bid_price,_product,attrs_json)
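# Each project fans out once per (code, product) pairing, capped at 20 rows,
# so the downstream merge stage can join on either key.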
@annotate('string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,double,string,double,string,string,string,double,string,string,string,double,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string -> string,string,bigint,string,string,string,string,string,double,string,double,string,string')
class f_generate_projects_from_project(BaseUDTF):

    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.ToTimeStamp = totimestamp()

    def process(self, uuid,
                docids,
                zhao_biao_page_time,
                zhong_biao_page_time,
                page_time,
                area,
                province,
                city,
                district,
                info_type,
                industry,
                qcodes,
                project_name,
                project_code,
                project_codes,
                project_addr,
                tenderee,
                tenderee_addr,
                tenderee_phone,
                tenderee_contact,
                agency,
                agency_phone,
                agency_contact,
                sub_project_name,
                sub_project_code,
                bidding_budget,
                win_tenderer,
                win_bid_price,
                win_tenderer_manager,
                win_tenderer_phone,
                second_tenderer,
                second_bid_price,
                second_tenderer_manager,
                second_tenderer_phone,
                third_tenderer,
                third_bid_price,
                third_tenderer_manager,
                third_tenderer_phone,
                procurement_system,
                bidway,
                dup_data,
                docid_number,
                project_dynamic,
                product,
                moneysource,
                service_time,
                time_bidclose,
                time_bidopen,
                time_bidstart,
                time_commencement,
                time_completion,
                time_earnest_money_start,
                time_earnest_money_end,
                time_get_file_end,
                time_get_file_start,
                time_publicity_end,
                time_publicity_start,
                time_registration_end,
                time_registration_start,
                time_release,
                dup_docid,
                info_source,
                nlp_enterprise,
                nlp_enterprise_attachment,
                update_time):
        attrs_dict = {}
        attrs_dict[project_uuid] = uuid
        attrs_dict[project_docids] = docids
        attrs_dict[project_zhao_biao_page_time] = zhao_biao_page_time
        attrs_dict[project_zhong_biao_page_time] = zhong_biao_page_time
        attrs_dict[project_page_time] = page_time
        attrs_dict[project_area] = area
        attrs_dict[project_province] = province
        attrs_dict[project_city] = city
        attrs_dict[project_district] = district
        attrs_dict[project_info_type] = info_type
        attrs_dict[project_industry] = industry
        attrs_dict[project_qcodes] = qcodes
        attrs_dict[project_project_name] = project_name
        attrs_dict[project_project_code] = project_code
        attrs_dict[project_project_codes] = project_codes
        attrs_dict[project_project_addr] = project_addr
        attrs_dict[project_tenderee] = tenderee
        attrs_dict[project_tenderee_addr] = tenderee_addr
        attrs_dict[project_tenderee_phone] = tenderee_phone
        attrs_dict[project_tenderee_contact] = tenderee_contact
        attrs_dict[project_agency] = agency
        attrs_dict[project_agency_phone] = agency_phone
        attrs_dict[project_agency_contact] = agency_contact
        attrs_dict[project_sub_project_name] = sub_project_name
        attrs_dict[project_sub_project_code] = sub_project_code
        attrs_dict[project_bidding_budget] = bidding_budget
        attrs_dict[project_win_tenderer] = win_tenderer
        attrs_dict[project_win_bid_price] = win_bid_price
        attrs_dict[project_win_tenderer_manager] = win_tenderer_manager
        attrs_dict[project_win_tenderer_phone] = win_tenderer_phone
        attrs_dict[project_second_tenderer] = second_tenderer
        attrs_dict[project_second_bid_price] = second_bid_price
        attrs_dict[project_second_tenderer_manager] = second_tenderer_manager
        attrs_dict[project_second_tenderer_phone] = second_tenderer_phone
        attrs_dict[project_third_tenderer] = third_tenderer
        attrs_dict[project_third_bid_price] = third_bid_price
        attrs_dict[project_third_tenderer_manager] = third_tenderer_manager
        attrs_dict[project_third_tenderer_phone] = third_tenderer_phone
        attrs_dict[project_procurement_system] = procurement_system
        attrs_dict[project_bidway] = bidway
        attrs_dict[project_dup_data] = dup_data
        attrs_dict[project_docid_number] = docid_number
        attrs_dict[project_project_dynamics] = project_dynamic
        attrs_dict[project_product] = product
        attrs_dict[project_moneysource] = moneysource
        attrs_dict[project_service_time] = service_time
        attrs_dict[project_time_bidclose] = time_bidclose
        attrs_dict[project_time_bidopen] = time_bidopen
        attrs_dict[project_time_bidstart] = time_bidstart
        attrs_dict[project_time_commencement] = time_commencement
        attrs_dict[project_time_completion] = time_completion
        attrs_dict[project_time_earnest_money_start] = time_earnest_money_start
        attrs_dict[project_time_earnest_money_end] = time_earnest_money_end
        attrs_dict[project_time_get_file_end] = time_get_file_end
        attrs_dict[project_time_get_file_start] = time_get_file_start
        attrs_dict[project_time_publicity_end] = time_publicity_end
        attrs_dict[project_time_publicity_start] = time_publicity_start
        attrs_dict[project_time_registration_end] = time_registration_end
        attrs_dict[project_time_registration_start] = time_registration_start
        attrs_dict[project_time_release] = time_release
        attrs_dict[project_dup_docid] = dup_docid
        attrs_dict[project_info_source] = info_source
        attrs_dict[project_nlp_enterprise] = nlp_enterprise
        attrs_dict[project_nlp_enterprise_attachment] = nlp_enterprise_attachment
        attrs_dict[project_update_time] = update_time
        popNoneFromDict(attrs_dict)
        attrs_json = json.dumps(attrs_dict,ensure_ascii=False)
        if bidding_budget is None:
            bidding_budget = -1
        if win_bid_price is None:
            win_bid_price = -1
        if project_codes is None:
            project_codes = ""
        list_codes = project_codes.split(",")
        page_time_stamp = self.ToTimeStamp.evaluate(page_time)
        if len(list_codes)==0:
            list_codes.append("")
        if product is None:
            product = ""
        list_product = product.split(",")
        if len(list_product)==0:
            list_product.append("")
        for _i in range(min(max(len(list_codes),len(list_product)),20)):
            _project_code = list_codes[_i%len(list_codes)]
            _product = list_product[_i%len(list_product)]
            self.forward(uuid,page_time,page_time_stamp,docids,project_name,_project_code,tenderee,agency,bidding_budget,win_tenderer,win_bid_price,_product,attrs_json)
def appendKeyvalueCount(list_projects,keys=[project_tenderee,project_agency,project_win_tenderer,project_win_bid_price,project_bidding_budget,project_product]):
    for _proj in list_projects:
        _count = 0
        for k in keys:
            v = _proj.get(k,"")
            if isinstance(v,str):
                if v!="":
                    _count += 1
            elif isinstance(v,(int,float)):
                if v>0:
                    _count += 1
        _proj["keyvaluecount"] = _count
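# e.g. a project with a tenderee and bidding_budget 1000.0 gets
# _proj["keyvaluecount"] == 2; empty strings and non-positive numbers add 0.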
def dumplicate_projects(list_projects,b_log=False):
    '''
    Deduplicate multi-package projects.
    :return:
    '''
    appendKeyvalueCount(list_projects)
    list_projects.sort(key=lambda x:str(x.get(project_page_time,"")))
    list_projects.sort(key=lambda x:x.get("keyvaluecount",0),reverse=True)
    cluster_projects = list_projects[:50]
    _count = 10
    print("dumplicate projects rest",len(cluster_projects))
    while _count>0:
        _count -= 1
        _update = False
        list_p = []
        # log("================")
        # for _p in cluster_projects:
        #     log("docids:%s"%(_p.get(project_docids,"")))
        _c = 0
        for _pp in cluster_projects:
            _c += 1
            _find = False
            list_prob = []
            for _p in list_p:
                is_check,_prob = check_merge_rule(_p,_pp,b_log,return_prob=True)
                list_prob.append([_p,is_check,_prob])
            list_prob.sort(key=lambda x:x[2],reverse=True)
            if len(list_prob)>0:
                _p,is_check,_prob = list_prob[0]
                if is_check:
                    update_projects_by_project(_pp,[_p])
                    _find = True
                    _update = True
            if not _find:
                list_p.append(_pp)
        if len(cluster_projects)==len(list_p):
            break
        cluster_projects = list_p
        print("dumplicate projects rest",len(cluster_projects))
    return cluster_projects
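# The loop re-clusters until a full pass merges nothing (the list length stops
# shrinking) or 10 passes elapse; the [:50] cap bounds the O(n^2) rule checks.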
def update_projects_by_project(project_dict,projects):
    _dict = {}
    # overwrite common properties where the target value is missing or empty
    for k,v in project_dict.items():
        if k in (project_project_dynamics,project_page_time,project_sub_project_name,project_product,project_project_codes,project_docids,project_uuid,project_nlp_enterprise,project_nlp_enterprise_attachment):
            continue
        for _proj in projects:
            if k not in _proj:
                _dict[k] = v
            else:
                _v = _proj.get(k)
                if type(v)==type(_v):
                    if isinstance(_v,str):
                        if _v in ('',"未知","全国"):
                            _dict[k] = v
                    elif isinstance(_v,(int,float)):
                        if _v==0:
                            _dict[k] = v
    for _proj in projects:
        _proj.update(_dict)
        if str(_proj.get(project_page_time,""))<str(project_dict.get(project_page_time,"")):
            _proj[project_page_time] = project_dict.get(project_page_time,"")
        if project_dict.get(project_sub_project_name) is not None and project_dict.get(project_sub_project_name) not in {"","Project"}:
            if not (_proj.get(project_sub_project_name) is not None and _proj.get(project_sub_project_name) not in {"","Project"}):
                _proj[project_sub_project_name] = project_dict.get(project_sub_project_name)
    # union the list-valued properties
    append_dict = {}
    set_docid = set()
    set_product = set()
    set_code = set()
    set_uuid = set()
    set_delete_uuid = set()
    set_nlp_enterprise = set()
    set_nlp_enterprise_attachment = set()
    for _proj in projects:
        _docids = _proj.get(project_docids,"")
        _codes = _proj.get(project_project_codes,"")
        _product = _proj.get(project_product,"")
        _uuid = _proj.get(project_uuid,"")
        delete_uuid = _proj.get(project_delete_uuid,"")
        set_docid = set_docid | set(_docids.split(","))
        set_code = set_code | set(_codes.split(","))
        set_product = set_product | set(_product.split(","))
        set_uuid = set_uuid | set(_uuid.split(","))
        set_delete_uuid = set_delete_uuid | set(delete_uuid.split(","))
        try:
            set_nlp_enterprise |= set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
            set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
        except Exception as e:
            pass
    set_docid = set_docid | set(project_dict.get(project_docids,"").split(","))
    set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
    set_product = set_product | set(project_dict.get(project_product,"").split(","))
    set_uuid = set_uuid | set(project_dict.get(project_uuid,"").split(","))
    set_delete_uuid = set_delete_uuid | set(project_dict.get(project_delete_uuid,"").split(","))
    try:
        set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
        set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
    except Exception as e:
        pass
    append_dict[project_docids] = ",".join([a for a in list(set_docid) if a!=""])
    append_dict[project_docid_number] = len(set_docid)
    append_dict[project_project_codes] = ",".join([a for a in list(set_code) if a!=""][:30])
    append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""][:30])
    append_dict[project_uuid] = ",".join([a for a in list(set_uuid) if a!=""])
    append_dict[project_delete_uuid] = ",".join([a for a in list(set_delete_uuid) if a!=""])
    append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
    append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
    dict_dynamic = {}
    set_docid = set()
    for _proj in projects:
        _dynamic = json.loads(_proj.get(project_project_dynamics,"[]"))
        for _dy in _dynamic:
            _docid = _dy.get("docid")
            dict_dynamic[_docid] = _dy
    _dynamic = json.loads(project_dict.get(project_project_dynamics,"[]"))
    for _dy in _dynamic:
        _docid = _dy.get("docid")
        dict_dynamic[_docid] = _dy
    list_dynamics = []
    for k,v in dict_dynamic.items():
        list_dynamics.append(v)
    list_dynamics.sort(key=lambda x:str(x.get(document_page_time,"")))
    append_dict[project_project_dynamics] = json.dumps(list_dynamics[:100],ensure_ascii=False)
    for _proj in projects:
        _proj.update(append_dict)
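# After update_projects_by_project(p, [q]), q carries the union of docids,
# codes, products, uuids and dynamics (dynamics are keyed by docid, so
# duplicates collapse, then truncated to 100 entries).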
def getTimeStamp(page_time):
    try:
        return time.mktime(time.strptime(page_time,'%Y-%m-%d'))
    except Exception as e:
        return 0

def timeAdd(_time,days,format="%Y-%m-%d",minutes=0):
    try:
        a = time.mktime(time.strptime(_time,format))+86400*days+60*minutes
        _time1 = time.strftime(format,time.localtime(a))
        return _time1
    except Exception as e:
        return None

# def timeAdd(_time,days):
#     try:
#         a = time.mktime(time.strptime(_time,'%Y-%m-%d'))+86400*days
#
#         _time1 = time.strftime("%Y-%m-%d",time.localtime(a))
#         return _time1
#     except Exception as e:
#         return None
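# e.g. timeAdd("2022-01-01", 5) -> "2022-01-06"; unparseable input -> None.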
def check_time_merge(json_time_less,json_time_greater,b_log,set_time_key=set([project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start])):
    same_count = 0
    if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
        if isinstance(json_time_less,dict):
            time_less = json_time_less
        else:
            time_less = json.loads(json_time_less)
        if isinstance(json_time_greater,dict):
            time_greater = json_time_greater
        else:
            time_greater = json.loads(json_time_greater)
        for k,v in time_less.items():
            if k in set_time_key:
                if getLength(v)>0:
                    v1 = time_greater.get(k,"")
                    if getLength(v1)>0:
                        _dis = getTimeStamp(v[:10])-getTimeStamp(v1[:10])
                        if _dis>86400*5 or _dis<-86400*5:
                            if b_log:
                                log("check time failed %s-%s-%s"%(str(k),str(v),str(v1)))
                            return -1
                        else:
                            same_count += 1
    if same_count>0:
        return 1
    return 0
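# Returns -1 if any shared time field differs by more than 5 days, 1 if at
# least one shared field agrees within that tolerance, 0 when nothing is
# comparable.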
def check_product_merge(product,product_to_merge,b_log):
    # check product overlap
    set_product = set([a for a in product.split(",") if a!=""])
    set_product_to_merge = set([a for a in product_to_merge.split(",") if a!=""])
    if len(set_product)>0 and len(set_product_to_merge)>0:
        if len(set_product&set_product_to_merge)==0:
            if b_log:
                log("check product failed %s===%s"%(str(product),str(product_to_merge)))
            return -1
        return 1
    return 0
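# e.g. ("pc,printer","printer") -> 1; disjoint non-empty product sets -> -1;
# either side empty -> 0 (no evidence).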
def check_page_time_merge(page_time,page_time_to_merge,b_log,time_limit):
    page_time_stamp = getTimeStamp(page_time)
    page_time_to_merge_stamp = getTimeStamp(page_time_to_merge)
    if page_time_stamp is not None and page_time_to_merge_stamp is not None:
        _dis = max(page_time_stamp,page_time_to_merge_stamp)-min(page_time_stamp,page_time_to_merge_stamp)
        if _dis>time_limit:
            if b_log:
                log("check page_time_dis failed %s===%s"%(str(page_time),str(page_time_to_merge)))
            return -1
        if _dis<time_limit//8:
            return 1
    return 0
def check_dynamics_title_merge(project_dynamics,project_dynamics_to_merge,b_log):
    # compare announcement titles after stripping boilerplate words
    if project_dynamics is not None and project_dynamics_to_merge is not None:
        try:
            project_dynamics = json.loads(project_dynamics)
            project_dynamics_to_merge = json.loads(project_dynamics_to_merge)
            for _d in project_dynamics:
                _title1 = _d.get(document_doctitle,"")
                _title1 = re.sub(r'项目|工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _title1)
                for _dm in project_dynamics_to_merge:
                    _title2 = _dm.get(document_doctitle,"")
                    _title2 = re.sub(r'项目|工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _title2)
                    _sim = getSimilarityOfString(_title1,_title2)
                    # log("title1,title2 %s==%s"%(_title1,_title2))
                    if _sim>0.8:
                        return 1
                    if len(_title1)>15 and len(_title2)>15:
                        if _sim<0.5:
                            return -1
        except Exception as e:
            pass
    return 0
def check_project_name_merge(project_name,project_name_to_merge,b_log):
    # compare project names after stripping boilerplate words
    project_name = re.sub(r'项目|工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', project_name)
    project_name_to_merge = re.sub(r'项目|工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', project_name_to_merge)
    _sim = getSimilarityOfString(project_name,project_name_to_merge)
    if _sim>0.7:
        return 1
    if len(project_name)>15 and len(project_name_to_merge)>15:
        if _sim<0.7:
            if b_log:
                log("check project_name failed %s %s===%s"%(str(_sim),str(project_name),str(project_name_to_merge)))
            return -1
        return 1
    return 0
def check_zhaozhong_page_time_merge(zhao_biao_page_time,zhong_biao_page_time,zhao_biao_page_time_to_merge,zhong_biao_page_time_to_merge,_proj,_proj_to_merge,b_log):
    if getLength(zhong_biao_page_time)>0:
        bidopen = _proj.get(project_time_bidopen)
        if getLength(bidopen)==0:
            bidopen = _proj.get(project_time_bidclose)
        if getLength(bidopen)>0 and bidopen>zhong_biao_page_time:
            zhong_biao_page_time = bidopen
    if getLength(zhong_biao_page_time_to_merge)>0:
        bidopen_to_merge = _proj_to_merge.get(project_time_bidopen)
        if getLength(bidopen_to_merge)==0:
            bidopen_to_merge = _proj_to_merge.get(project_time_bidclose)
        if getLength(bidopen_to_merge)>0 and bidopen_to_merge>zhong_biao_page_time_to_merge:
            zhong_biao_page_time_to_merge = bidopen_to_merge
    if (getLength(zhong_biao_page_time)>0 and getLength(zhao_biao_page_time_to_merge)>0 and zhong_biao_page_time<zhao_biao_page_time_to_merge) or (getLength(zhong_biao_page_time_to_merge)>0 and getLength(zhao_biao_page_time)>0 and zhong_biao_page_time_to_merge<zhao_biao_page_time):
        if b_log:
            log("check zhaobiao zhongbiao page_time failed %s=%s===%s=%s"%(str(zhao_biao_page_time),str(zhong_biao_page_time),str(zhao_biao_page_time_to_merge),str(zhong_biao_page_time_to_merge)))
        return -1
    return 1
def check_sub_project_name_merge(sub_project_name,sub_project_name_to_merge,project_dynamics,project_dynamics_to_merge,b_log,package_number_pattern = re.compile(r"((包|标[段号的包]|分?包|包组|项目)编?号?[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")):
    # check sub_project_name
    sub_project_name = str(sub_project_name).replace("Project","")
    sub_project_name_to_merge = str(sub_project_name_to_merge).replace("Project","")
    _set = set([a for a in [sub_project_name,sub_project_name_to_merge] if a!=""])
    if sub_project_name!="" and sub_project_name_to_merge!="":
        if len(_set)>1:
            if b_log:
                log("check sub_project_name failed %s===%s"%(str(sub_project_name),str(sub_project_name_to_merge)))
            return -1
        return 1
    if project_dynamics is not None and project_dynamics_to_merge is not None:
        try:
            project_dynamics = json.loads(project_dynamics)
            project_dynamics_to_merge = json.loads(project_dynamics_to_merge)
            set_title_name = set()
            set_title_name_to_merge = set()
            for _d in project_dynamics:
                _title1 = _d.get(document_doctitle,"")
                _title_name = None
                _title_name_search = re.search(package_number_pattern,_title1)
                if _title_name_search is not None:
                    _title_name = _title_name_search.group()
                    _title_name = re.sub("[^0-9A-Za-z一二三四五六七八九十]",'',_title_name)
                    if _title_name!="":
                        set_title_name.add(_title_name)
            for _dm in project_dynamics_to_merge:
                _title2 = _dm.get(document_doctitle,"")
                _title_name = None
                _title_name_search = re.search(package_number_pattern,_title2)
                if _title_name_search is not None:
                    _title_name = _title_name_search.group()
                    _title_name = re.sub("[^0-9A-Za-z一二三四五六七八九十]",'',_title_name)
                    if _title_name!="":
                        set_title_name_to_merge.add(_title_name)
            if len(set_title_name)>0 and len(set_title_name_to_merge)>0:
                if len(set_title_name&set_title_name_to_merge)==0:
                    if b_log:
                        log("check sub_project_name title set failed %s===%s"%(str(set_title_name),str(set_title_name_to_merge)))
                    return -1
                else:
                    return 1
        except Exception as e:
            traceback.print_exc()
    return 0
def check_roles_merge(enterprise,enterprise_to_merge,tenderee,tenderee_to_merge,agency,agency_to_merge,win_tenderer,win_tenderer_to_merge,b_log):
    _set1 = set([a for a in [tenderee,tenderee_to_merge] if a!=""])
    if len(_set1)>1:
        if tenderee in enterprise_to_merge or tenderee_to_merge in enterprise:
            pass
        else:
            if getSimilarityOfString(re.sub("[省市]",'',tenderee),re.sub("[省市]",'',tenderee_to_merge))==1:
                pass
            else:
                if b_log:
                    log("check tenderee failed %s===%s"%(str(tenderee),str(tenderee_to_merge)))
                return -1
    _set2 = set([a for a in [agency,agency_to_merge] if a!=""])
    if len(_set2)>1:
        if agency in enterprise_to_merge or agency_to_merge in enterprise:
            pass
        else:
            if getSimilarityOfString(re.sub("[省市]",'',agency),re.sub("[省市]",'',agency_to_merge))==1:
                pass
            else:
                if b_log:
                    log("check agency failed %s===%s"%(str(agency),str(agency_to_merge)))
                return -1
    _set3 = set([a for a in [win_tenderer,win_tenderer_to_merge] if a!=""])
    if len(_set3)>1:
        if win_tenderer in enterprise_to_merge or win_tenderer_to_merge in enterprise:
            pass
        else:
            if getSimilarityOfString(re.sub("[省市]",'',win_tenderer),re.sub("[省市]",'',win_tenderer_to_merge))==1:
                pass
            else:
                if b_log:
                    log("check win_tenderer failed %s===%s"%(str(win_tenderer),str(win_tenderer_to_merge)))
                return -1
    if len(_set1)+len(_set2)+len(_set3)>=2:
        if (tenderee!="" or agency!="" or win_tenderer!="") and (tenderee_to_merge!="" or agency_to_merge!="" or win_tenderer_to_merge!=""):
            return 1
    return 0
def check_money_merge(bidding_budget,bidding_budget_to_merge,win_bid_price,win_bid_price_to_merge,b_log):
    # compare only the leading (top five) significant digits
    bidding_budget = round(bidding_budget)
    bidding_budget = round(bidding_budget,6-len(str(bidding_budget)))
    bidding_budget_to_merge = round(bidding_budget_to_merge)
    bidding_budget_to_merge = round(bidding_budget_to_merge,6-len(str(bidding_budget_to_merge)))
    win_bid_price = round(win_bid_price)
    win_bid_price = round(win_bid_price,6-len(str(win_bid_price)))
    win_bid_price_to_merge = round(win_bid_price_to_merge)
    win_bid_price_to_merge = round(win_bid_price_to_merge,6-len(str(win_bid_price_to_merge)))
    _set = set([a for a in [bidding_budget,bidding_budget_to_merge] if a>0])
    if len(_set)>1:
        if b_log:
            log("check bidding_budget failed %s===%s"%(str(bidding_budget),str(bidding_budget_to_merge)))
        return -1
    _set1 = set([a for a in [win_bid_price,win_bid_price_to_merge] if a>0])
    if len(_set1)>1:
        if b_log:
            log("check win_bid_price failed %s===%s"%(str(win_bid_price),str(win_bid_price_to_merge)))
        return -1
    # check money
    if len(_set)==1 and len(_set1)==0:
        if (bidding_budget>0 and bidding_budget_to_merge>0):
            return 1
    if len(_set)==1 and len(_set1)==1:
        max_win_bid_price = max(_set1)
        max_bidding_budget = max(_set)
        ratio = max_win_bid_price/max_bidding_budget
        if (bidding_budget>0 and bidding_budget_to_merge>0) or (win_bid_price>0 and win_bid_price_to_merge>0):
            return 1
        # allow the winning bid to exceed the budget by up to 10%
        if max_win_bid_price>max_bidding_budget*(1.1):
            if b_log:
                log("check max_win_bid_price<=max_bidding_budget*(1.1) failed %s===%s"%(str(max(_set1)),str(max(_set))))
            return -1
        else:
            if ratio<0.3:
                if b_log:
                    log("check money failed ratio<0.3 %s===%s"%(str(max(_set1)),str(max(_set))))
                return 0
                # return -1
    return 0
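# e.g. budgets 1000000.0 and 1000004.0 round to the same value and pass, while
# two clearly different positive budgets return -1.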
def check_project_codes_merge(list_code,list_code_to_merge,b_log):
    # check project_codes
    has_same = False
    has_similar = False
    for _c in list_code[:100]:
        for _c1 in list_code_to_merge[:100]:
            _c = str(_c).replace("【","[").replace("】","]")
            _c1 = str(_c1).replace("【","[").replace("】","]")
            _simi = getSimilarityOfString(_c,_c1,3)
            if _simi==1:
                has_same = True
            elif _simi>0.6:
                has_similar = True
            else:
                if len(_c)==len(_c1) and len(_c)>8 and _c!=_c1:
                    has_similar = True
    if not has_same and has_similar:
        if b_log:
            log("check code failed %s===%s"%(str(list_code),str(list_code_to_merge)))
        return -1
    if has_same:
        return 1
    return 0
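# Codes are normalized (【】 -> []) before comparison: one exact match anywhere
# returns 1; only near-misses (similarity > 0.6, or equal-length long codes
# that differ) return -1; no evidence returns 0.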

def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=False,simple_check=False):
    docids = _proj.get(project_docids,"")
    page_time = _proj.get(project_page_time,"")
    project_codes = _proj.get(project_project_codes,"")
    project_name = _proj.get(project_project_name,"")
    tenderee = _proj.get(project_tenderee,"")
    agency = _proj.get(project_agency,"")
    product = _proj.get(project_product,"")
    sub_project_name = _proj.get(project_sub_project_name,"")
    bidding_budget = float(_proj.get(project_bidding_budget,-1))
    win_tenderer = _proj.get(project_win_tenderer,"")
    win_bid_price = float(_proj.get(project_win_bid_price,-1))
    project_code = _proj.get(project_project_code,"")
    zhao_biao_page_time = _proj.get(project_zhao_biao_page_time,"")
    zhong_biao_page_time = _proj.get(project_zhong_biao_page_time,"")
    project_dynamics = _proj.get(project_project_dynamics)

    enterprise = _proj.get("enterprise")
    if enterprise is None:
        try:
            enterprise = set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
            enterprise |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
            _proj["enterprise"] = enterprise
        except Exception as e:
            traceback.print_exc()

    list_code = [a for a in project_codes.split(",") if a!='']
    if project_code!="":
        list_code.append(project_code)
    list_code = [a for a in list_code if a is not None]

    docids_to_merge = _dict.get(project_docids,"")
    page_time_to_merge = _dict.get(project_page_time,"")
    project_codes_to_merge = _dict.get(project_project_codes,"")
    project_name_to_merge = _dict.get(project_project_name,"")
    tenderee_to_merge = _dict.get(project_tenderee,"")
    agency_to_merge = _dict.get(project_agency,"")
    product_to_merge = _dict.get(project_product,"")
    sub_project_name_to_merge = _dict.get(project_sub_project_name,"")
    bidding_budget_to_merge = float(_dict.get(project_bidding_budget,-1))
    win_tenderer_to_merge = _dict.get(project_win_tenderer,"")
    win_bid_price_to_merge = float(_dict.get(project_win_bid_price,-1))
    project_code_to_merge = _dict.get(project_project_code,"")
    zhao_biao_page_time_to_merge = _dict.get(project_zhao_biao_page_time,"")
    zhong_biao_page_time_to_merge = _dict.get(project_zhong_biao_page_time,"")
    project_dynamics_to_merge = _dict.get(project_project_dynamics)

    # a project is "sparse" when at most one of its key attributes is filled
    is_few = False
    if (0 if project_codes=="" else 1) + (0 if project_name=="" else 1) + (0 if bidding_budget<0 else 1) + (0 if tenderee=="" else 1) + (0 if win_bid_price<0 else 1) + (0 if win_tenderer=="" else 1)<=1:
        is_few = True
    if (0 if project_codes_to_merge=="" else 1) + (0 if project_name_to_merge=="" else 1) + (0 if bidding_budget_to_merge<0 else 1) + (0 if tenderee_to_merge=="" else 1) + (0 if win_bid_price_to_merge<0 else 1) + (0 if win_tenderer_to_merge=="" else 1)<=1:
        is_few = True

    list_code_to_merge = [a for a in project_codes_to_merge.split(",") if a!='']
    if project_code_to_merge!="":
        list_code_to_merge.append(project_code_to_merge)
    list_code_to_merge = [a for a in list_code_to_merge if a is not None]

    if b_log:
        log("checking docids:%s and %s"%(str(docids),str(docids_to_merge)))

    enterprise_to_merge = _dict.get("enterprise")
    if enterprise_to_merge is None:
        try:
            enterprise_to_merge = set(json.loads(_dict.get(project_nlp_enterprise,"[]")))
            enterprise_to_merge |= set(json.loads(_dict.get(project_nlp_enterprise_attachment,"[]")))
            _dict["enterprise"] = enterprise_to_merge
        except Exception as e:
            traceback.print_exc()

    check_dict = {0:0,1:0,-1:0}
    prob_count = 0

    # time check: tender/award announcement times
    _zhaozhong_check = check_zhaozhong_page_time_merge(zhao_biao_page_time,zhong_biao_page_time,zhao_biao_page_time_to_merge,zhong_biao_page_time_to_merge,_proj,_dict,b_log)
    check_dict[_zhaozhong_check] += 1
    if check_dict[-1]>0:
        if return_prob:
            return False,0
        return False

    # attribute check: money
    _money_check = check_money_merge(bidding_budget,bidding_budget_to_merge,win_bid_price,win_bid_price_to_merge,b_log)
    check_dict[_money_check] += 1
    prob_count += _money_check

    # attribute check: roles
    _roles_check = check_roles_merge(enterprise,enterprise_to_merge,tenderee,tenderee_to_merge,agency,agency_to_merge,win_tenderer,win_tenderer_to_merge,b_log)
    check_dict[_roles_check] += 1
    prob_count += _roles_check

    _product_check = check_product_merge(product,product_to_merge,b_log)
    _project_name_check = check_project_name_merge(project_name,project_name_to_merge,b_log)
    _title_check = check_dynamics_title_merge(project_dynamics,project_dynamics_to_merge,b_log)

    # attribute check: project codes
    _codes_check = check_project_codes_merge(list_code,list_code_to_merge,b_log)
    check_dict[_codes_check] += 1
    prob_count += _codes_check

    if is_few:
        if _codes_check!=1:
            if _title_check!=1:
                if return_prob:
                    return False,0
                return False

    if len(enterprise)>0 and len(enterprise_to_merge)>0:
        if len(enterprise & enterprise_to_merge)==0:
            if return_prob:
                return False,0
            return False

    if _product_check==-1:
        if return_prob:
            return False,0
        return False

    min_count = 2
    if product=="" or product_to_merge=="":
        min_count = 1
    # attribute check: product, project name and title must together pass at
    # least min_count checks (two normally, one when a product is missing)
    if max(_project_name_check,0)+max(_product_check,0)+max(_title_check,0)<min_count:
        if b_log:
            log("project_name,project_name_to_merge %s %s"%(project_name,project_name_to_merge))
            log("product,product_to_merge %s %s"%(product,product_to_merge))
            log("check _project_name_check+_product_check+_title_check<2 failed %d %s,%s,%s"%(_project_name_check+_product_check+_title_check,str(_project_name_check),str(_product_check),str(_title_check)))
        # if return_prob:
        #     return False,0
        # return False
        prob_count += -1
    else:
        prob_count += 2

    if simple_check:
        if return_prob:
            _prob = check_dict[1]/(check_dict[-1]+check_dict[0]+check_dict[1])
            return True,_prob
        return True

    # time check: other extracted times
    _time_check = check_time_merge(_proj,_dict,b_log)
    check_dict[_time_check] += 1

    # attribute check: sub-project (lot) name
    _sub_project_name_check = check_sub_project_name_merge(sub_project_name,sub_project_name_to_merge,project_dynamics,project_dynamics_to_merge,b_log)
    if docids==docids_to_merge and _sub_project_name_check==-1:
        if return_prob:
            return False,0
        return False
    check_dict[_sub_project_name_check] += 1
    prob_count += _sub_project_name_check*3

    # time check: publish time
    _page_time_check = check_page_time_merge(page_time,page_time_to_merge,b_log,time_limit)
    check_dict[_page_time_check] += 1

    _prob = prob_count/8
    if b_log:
        log("check %s-%s result%s"%(docids,docids_to_merge,str(check_dict)))
    if _prob<0.15:
        if b_log:
            log("prob less than 0.15 prob_count:%d"%(prob_count))
        if return_prob:
            return False,_prob
        return False

    if check_dict[-1]>0:
        if check_dict[-1]==1:
            if _roles_check==-1:
                if return_prob:
                    return False,0
                return False
            if (_codes_check==1 and _roles_check==1 and _product_check==1 and _money_check>=0) or (_roles_check==1 and _money_check==1 and _product_check==1) or (_money_check==1 and _product_check==1 and _codes_check==1) or (_money_check>=0 and _roles_check==1 and _codes_check==1 and (_title_check==1 or _project_name_check==1 or _product_check==1)):
                if return_prob:
                    return True,_prob
                return True
        if return_prob:
            return False,0
        return False

    if return_prob:
        return True,_prob
    return True
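
# Usage sketch (illustrative): check_merge_rule returns True/False, or a
# (bool, prob) pair when return_prob=True. prob is prob_count/8, where the
# money/roles/codes verdicts each contribute -1/0/1, the product+name+title
# group contributes +2 or -1, and the sub-project check is weighted *3.
# proj_a and proj_b stand for any two project dicts built by this pipeline.
def _example_check_merge_rule(proj_a, proj_b):
    ok, prob = check_merge_rule(proj_a, proj_b, b_log=True, return_prob=True)
    # a caller could, for instance, treat borderline merges (low prob) more cautiously
    return ok, prob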

@annotate('string,bigint,string->string')
class f_group_merge_projects(BaseUDAF):
    '''
    aggregate a group into a single record
    '''
    def __init__(self):
        import json
        global json

    def new_buffer(self):
        return [[]]

    def iterate(self, buffer,_uuid,page_time_stamp,attrs_json):
        buffer[0].append([_uuid,page_time_stamp,attrs_json])
        buffer[0] = buffer[0][:1000]

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0][:1000])
        buffer[0] = buffer[0][:1000]

    def terminate(self, buffer):
        set_uuid = set()
        list_data = []
        for _uuid,page_time_stamp,attrs_json in buffer[0]:
            if _uuid in set_uuid:
                continue
            try:
                attrs = json.loads(attrs_json)
                list_data.append([_uuid,page_time_stamp,attrs])
                set_uuid.add(_uuid)
            except Exception as e:
                pass
        list_group_data = []
        list_group = split_with_time(list_data,1)
        _time = time.time()
        for _group in list_group[:100]:
            list_group_pair = []
            _group = _group[:50]
            for _i in range(len(_group)):
                for _j in range(_i+1,len(_group)):
                    _p_uuid,_,_p = _group[_i]
                    _pp_uuid,_,_pp = _group[_j]
                    if check_merge_rule(_p,_pp,False):
                        list_group_pair.append([_p_uuid,_pp_uuid])
            if len(list_group_pair)>0:
                list_group_data.append(list_group_pair)
            if time.time()-_time>600:
                break
        return json.dumps(list_group_data)
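
# Illustrative sketch (not part of the pipeline): terminate() above tests every
# unordered pair inside each time-bucketed, size-capped group, so a group of n
# rows costs n*(n-1)/2 check_merge_rule calls. The same pattern in isolation:
def _example_pairwise(group):
    pairs = []
    for i in range(len(group)):
        for j in range(i + 1, len(group)):
            pairs.append((group[i], group[j]))
    return pairs

# _example_pairwise(["a", "b", "c"]) -> [("a","b"), ("a","c"), ("b","c")]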

@annotate('string -> string,string')
class f_extract_uuid_groups(BaseUDTF):
    '''
    split groups into individual records
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,json_groups):
        if json_groups is not None:
            list_group = json.loads(json_groups)
            for l_group in list_group:
                for _group in l_group:
                    self.forward(_group[0],_group[1])
                    self.forward(_group[1],_group[0])

@annotate('string,string->string')
class f_group_uuids(BaseUDAF):
    '''
    aggregate a group into a single record
    '''
    def __init__(self):
        import json
        global json

    def new_buffer(self):
        return [[]]

    def iterate(self, buffer,uuid_1,uuid_2):
        buffer[0].append([uuid_1,uuid_2])
        buffer[0] = buffer[0][:1000]

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0][:1000])
        buffer[0] = buffer[0][:1000]

    def terminate(self, buffer):
        set_uuid = set()
        for uuid_1,uuid_2 in buffer[0]:
            set_uuid.add(uuid_1)
            set_uuid.add(uuid_2)
        list_uuid = list(set_uuid)
        list_uuid.sort()
        return ",".join(list_uuid)

@annotate('string -> string,string')
class f_extract_union_group(BaseUDTF):
    '''
    split groups into individual records
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,str_uuids):
        if str_uuids is not None:
            list_uuid = [a for a in str_uuids.split(",") if a!=""]
            if len(list_uuid)>0:
                for i in range(len(list_uuid)):
                    for j in range(i,len(list_uuid)):
                        self.forward(list_uuid[i],list_uuid[j])
                        self.forward(list_uuid[j],list_uuid[i])
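
# Illustrative sketch (not part of the pipeline): f_extract_uuid_groups,
# f_group_uuids and f_extract_union_group together grow uuid groups by
# repeatedly emitting pairs and re-aggregating them. A single-process
# equivalent of that fixpoint is a plain union-find over the pairs:
def _example_union_pairs(pairs):
    parent = {}
    def find(x):
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x
    for a, b in pairs:
        parent[find(a)] = find(b)
    groups = {}
    for x in list(parent):
        groups.setdefault(find(x), set()).add(x)
    return [sorted(g) for g in groups.values()]

# _example_union_pairs([("u1","u2"), ("u2","u3"), ("u4","u5")])
# -> [["u1","u2","u3"], ["u4","u5"]]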

@annotate('string -> string,string')
class f_extract_group_uuids(BaseUDTF):
    '''
    split groups into individual records
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,str_uuids):
        if str_uuids is not None:
            list_uuid = [a for a in str_uuids.split(",") if a!=""]
            if len(list_uuid)>0:
                main_uuid = list_uuid[0]
                for _uuid in list_uuid:
                    self.forward(main_uuid,_uuid)

class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        elif isinstance(obj, (np.float_, np.float16, np.float32,
                              np.float64)):
            return float(obj)
        elif isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)
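
# Usage sketch (illustrative): MyEncoder lets json.dumps serialize numpy
# arrays, numpy floats and utf-8 bytes that the default encoder rejects.
def _example_myencoder():
    import numpy as np
    return json.dumps({"v": np.float32(1.5), "a": np.arange(3)}, cls=MyEncoder)
    # -> '{"v": 1.5, "a": [0, 1, 2]}'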

def to_project_json(projects):
    list_proj = []
    for _proj in projects:
        _uuid = _proj.get(project_uuid,"")
        if "enterprise" in _proj:
            _proj.pop("enterprise")
        list_uuid = [a for a in _uuid.split(",") if a!=""]
        if len(list_uuid)>0:
            _proj["keep_uuid"] = list_uuid[0]
            _proj["delete_uuid"] = ",".join(list_uuid[1:])
        else:
            _proj["keep_uuid"] = _proj.get("keep_uuid","")
            to_delete = _proj.get("to_delete","")
            if to_delete=="" and _proj.get("keep_uuid","")=="":
                _uuid = uuid4()
                _proj["keep_uuid_generated"] = str(_uuid)
            _proj["delete_uuid"] = _proj.get("delete_uuid","")
        list_proj.append(_proj)
        if project_uuid in _proj:
            _proj.pop(project_uuid)
    return json.dumps(list_proj,cls=MyEncoder,ensure_ascii=False)
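
# Illustrative sketch (not part of the pipeline): to_project_json keeps the
# first uuid of a merged group and marks the remainder for deletion.
def _example_to_project_json():
    out = json.loads(to_project_json([{project_uuid: "u1,u2,u3"}]))
    assert out[0]["keep_uuid"] == "u1"
    assert out[0]["delete_uuid"] == "u2,u3"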

def get_page_time_dis(page_time,n_page_time):
    _dis = -1
    try:
        page_time_stamp = time.mktime(time.strptime(page_time,'%Y-%m-%d'))
        n_page_time_stamp = time.mktime(time.strptime(n_page_time,'%Y-%m-%d'))
        _dis = (max(page_time_stamp,n_page_time_stamp)-min(page_time_stamp,n_page_time_stamp))//86400
    except Exception as e:
        pass
    return _dis

def check_page_time_dup(page_time,n_page_time):
    _dis = get_page_time_dis(page_time,n_page_time)
    if _dis>=0 and _dis<=20:
        return True
    return False
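
# Illustrative sketch (not part of the pipeline): two publish dates only count
# as a duplicate window when they are at most 20 days apart; unparseable dates
# yield a distance of -1 and therefore never match.
def _example_check_page_time_dup():
    assert get_page_time_dis("2022-09-08", "2022-09-15") == 7
    assert check_page_time_dup("2022-09-08", "2022-09-15") is True
    assert check_page_time_dup("2022-09-08", "2022-10-15") is False  # 37 days apart
    assert check_page_time_dup("2022-09-08", "not-a-date") is False  # parse failure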

def dumplicate_document_in_merge(list_projects,dup_docid):
    '''
    deduplicate documents while merging
    :param list_projects:
    :return:
    '''
    dup_docid = set(dup_docid)
    set_dup_total = set()
    for _proj in list_projects:
        try:
            docids = _proj.get(project_docids,"")
            set_docids = set([a for a in docids.split(",") if a!=""])
            dict_channel_proj = {}
            _project_dynamics = _proj.get(project_project_dynamics,"[]")
            list_dynamics = json.loads(_project_dynamics)
            set_dup_docid = set()
            _time = time.time()
            for _d in list_dynamics:
                docid = _d.get(document_docid)
                doctitle = _d.get(document_doctitle,"")
                title_search = re.search("[一二三四五六七八九十1-9]+(?:次|标|包)",doctitle)
                if str(docid) not in set_docids:
                    continue
                if docid in dup_docid:
                    continue
                _status = _d.get(document_status,201)
                is_multipack = _d.get("is_multipack",True)
                extract_count = _d.get(document_tmp_extract_count,0)
                docchannel = _d.get(document_docchannel,0)
                page_time = _d.get(document_page_time,"")
                # if _status>=401 and _status<=450:
                #     print(":1",docid)
                #     set_dup_docid.add(str(docid))
                if docchannel in {52,101,118,119,120} and extract_count>5:
                    if docchannel in dict_channel_proj:
                        n_d = dict_channel_proj[docchannel]
                        n_docid = n_d.get(document_docid)
                        n_is_multipack = n_d.get("is_multipack",True)
                        n_extract_count = n_d.get(document_tmp_extract_count,0)
                        n_page_time = n_d.get(document_page_time,"")
                        n_doctitle = n_d.get(document_doctitle,"")
                        if docid==n_docid:
                            continue
                        if not check_page_time_dup(page_time,n_page_time):
                            continue
                        if is_multipack or n_is_multipack:
                            continue
                        n_title_search = re.search("[一二三四五六七八九十1-9]+(?:次|标|包)",n_doctitle)
                        if title_search is None and n_title_search is None:
                            pass
                        elif title_search is not None and n_title_search is not None and str(title_search.group())==str(n_title_search.group()):
                            pass
                        else:
                            continue
                        if extract_count>n_extract_count:
                            n_d[document_status] = 401
                            set_dup_docid.add(str(n_docid))
                            dict_channel_proj[docchannel] = _d
                        elif extract_count==n_extract_count:
                            if int(n_docid)>int(docid):
                                n_d[document_status] = 401
                                set_dup_docid.add(str(n_docid))
                                dict_channel_proj[docchannel] = _d
                            elif int(n_docid)<int(docid):
                                _d[document_status] = 401
                                set_dup_docid.add(str(docid))
                        else:
                            _d[document_status] = 401
                            set_dup_docid.add(str(docid))
                    else:
                        # first qualifying document of this channel becomes the reference
                        dict_channel_proj[docchannel] = _d
            set_docids = set_docids-set_dup_docid
            set_dup_total |= set_dup_docid
            if len(set_docids)==0:
                log("projects set_docids length is zero %s"%(docids))
            else:
                _proj[project_docids] = ",".join(list(set_docids))
            _proj[project_project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
            _proj[project_docid_number] = len(set_docids)
            _proj[project_dup_docid] = ",".join(list(set_dup_docid))
            # log("dumplicate_document docid%s dynamic %d takes%.3f"%(str(docid),len(list_dynamics),time.time()-_time))
        except Exception as e:
            traceback.print_exc()
    return list(set_dup_total)
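
# Illustrative sketch (not part of the pipeline): for two single-package
# documents of the same channel published within the 20-day window, the one
# with the higher extract_count survives and the other is marked status 401.
# The keys below mirror the dynamics records used elsewhere in this file.
def _example_dumplicate_document_in_merge():
    dynamics = [
        {"docid": 1, "doctitle": "t", "docchannel": 101, "page_time": "2022-09-08",
         "status": 201, "is_multipack": False, "extract_count": 9},
        {"docid": 2, "doctitle": "t", "docchannel": 101, "page_time": "2022-09-11",
         "status": 201, "is_multipack": False, "extract_count": 7},
    ]
    proj = {project_docids: "1,2",
            project_project_dynamics: json.dumps(dynamics)}
    assert dumplicate_document_in_merge([proj], []) == ["2"]
    assert proj[project_docids] == "1"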

@annotate('string,string->string')
class f_dumplicate_projects(BaseUDAF):
    '''
    aggregate a group into a single record
    '''
    def __init__(self):
        import json
        import sys
        global json,sys

    def new_buffer(self):
        return [[]]

    def iterate(self, buffer,_uuid,attrs_json):
        buffer[0].append([_uuid,attrs_json])
        buffer[0] = buffer[0][:1000]

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0][:1000])
        buffer[0] = buffer[0][:1000]

    def terminate(self, buffer):
        set_uuid = set()
        list_data = []
        for uuid_1,attrs_json in buffer[0]:
            if attrs_json is None:
                continue
            if uuid_1 in set_uuid:
                continue
            list_data.append(json.loads(attrs_json))
            set_uuid.add(uuid_1)
        list_projects = dumplicate_projects(list_data,False)
        # dumplicate_document_in_merge(list_projects)
        project_json = to_project_json(list_projects)
        return project_json

@annotate('string -> string')
class f_generate_project_with_attrs_json(BaseUDTF):
    '''
    wrap a single attrs json as a project record
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,attrs_json):
        if attrs_json is not None:
            _group = json.loads(attrs_json)
            project_json = to_project_json([_group])
            self.forward(project_json)

@annotate('string -> string')
class f_generate_project_with_delete_uuid(BaseUDTF):
    '''
    generate a to-delete project record from a delete uuid
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,delete_uuid):
        if delete_uuid is not None:
            _group = {project_delete_uuid:delete_uuid,
                      "to_delete":True}
            self.forward(json.dumps([_group],ensure_ascii=False))

def test_remerge():
    a = f_remege_limit_num_contain_bychannel()
    buffer = a.new_buffer()
    # sample rows; the parser below expects 17 tab-separated fields per line
    tmp_s = '''
266523906 266539038 2022-09-08 1662566400 SDGP371525000202201000421_A 冠县第二实验小学平台教育信息化设备采购智慧屏 冠县第二实验小学平台教育信息化设备采购智慧屏成交公告 冠县第二实验小学平台教育信息化设备智慧屏 冠县第二实验小学 聊城市采购中心 山东润博网络有限公司 246890.0 101 0 12 "{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
266523906 266523906 2022-09-15 1663171200 SDGP371525000202201000421_A 冠县第二实验小学平台教育信息化设备采购智慧屏 冠县第二实验小学平台教育信息化设备采购智慧屏成交公告 冠县第二实验小学平台教育信息化设备智慧屏 冠县第二实验小学 聊城市采购中心 山东润博网络有限公司 246890.0 101 999 12 "{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
'''
    for _s in tmp_s.split("\n"):
        ls = _s.split("\t")
        if len(ls)!=17:
            continue
        _confid = 1 if ls[14]=="" else ls[14]
        a.iterate(buffer,ls[1],ls[13],int(ls[3]),ls[8],ls[10],ls[11],ls[12],ls[7],ls[5],ls[4],_confid,ls[15],ls[16][1:-1])
    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-22", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
    print(a.terminate(buffer))
    print(1)
    print(getSimilarityOfString('37168100014015220220012_40785671','SDGP371681000202201000912'))

@annotate('string,bigint,bigint->string')
class f_check_projects_by_num(BaseUDTF):
    def process(self,json_projects,len_start,len_end):
        if json_projects is not None:
            list_projects = json.loads(json_projects)
            for _proj in list_projects:
                _num = _proj.get(project_docid_number,0)
                if _num>=len_start and _num<=len_end:
                    self.forward(json.dumps(_proj,ensure_ascii=False))

@annotate('string->string,string')
class f_check_projects_by_time(BaseUDTF):
    def process(self,json_projects):
        if json_projects is not None:
            list_projects = json.loads(json_projects)
            _type = ""
            for _proj in list_projects:
                zhaobiao = _proj.get(project_zhao_biao_page_time)
                zhongbiao = _proj.get(project_zhong_biao_page_time)
                if getLength(zhaobiao)>0 and getLength(zhongbiao)>0:
                    _type = "招中标"  # both tender and award
                elif getLength(zhaobiao)>0 and getLength(zhongbiao)==0:
                    _type = "招标"  # tender only
                elif getLength(zhaobiao)==0 and getLength(zhongbiao)>0:
                    _type = "中标"  # award only
                else:
                    _type = "其他"  # other
                self.forward(json.dumps(_proj,ensure_ascii=False),_type)
                # if (zhongbiao is None or zhongbiao=="") and zhaobiao is not None and zhaobiao!="":
                # if zhaobiao is not None and zhongbiao is not None and zhaobiao!="" and zhongbiao!="":
                #     self.forward(json.dumps(_proj,ensure_ascii=False))

@annotate('string->string,string,double')
class f_extract_year_win_and_price(BaseUDTF):
    def process(self,json_projects):
        if json_projects is not None:
            list_projects = json.loads(json_projects)
            for _proj in list_projects:
                win_tenderer = _proj.get(project_win_tenderer,"")
                win_bid_price = float(_proj.get(project_win_bid_price,0))
                page_time = _proj.get(project_zhong_biao_page_time,"")
                if win_tenderer!="":
                    self.forward(page_time,win_tenderer,win_bid_price)

def test_merge_rule():
    o_a = {
        "bidding_budget":0,
        "bidding_budget_unit":"",
        "sub_project_code":"",
        "sub_project_name":"Project",
        "win_bid_price":0,
        "win_bid_price_unit":"",
        "win_service_time":"",
        "win_tenderer":"日照华中机电贸易有限公司",
        "district":"未知",
        "city":"日照",
        "province":"山东",
        "area":"华东",
        "industry":"建筑建材",
        "info_type":"有色金属冶炼及压延产品",
        "info_source":"企业采购",
        "qcodes":"",
        "project_code":"DLGCB-X001302",
        "tenderee":"日照港通通信工程有限公司动力分公司",
        "procurement_system":"企业采购系统",
        "time_release":"2020-05-22",
        "extract_count":3,
        "project_dynamic":"[{\"docid\": 99800062, \"doctitle\": \"DLGCB-X001302\", \"docchannel\": 101, \"bidway\": \"\", \"page_time\": \"2020-05-22\", \"status\": 201, \"is_multipack\": false, \"extract_count\": 3}]",
        "docid_number":1,
        "docids":"99800062",
        "zhong_biao_page_time":"2020-05-22",
        "project_codes":"DLGCB-X001302",
        "page_time":"2020-05-22",
        "product":"铜辫子",
        "nlp_enterprise":"[\"日照华中机电贸易有限公司\", \"乐清\", \"日照港通通信工程有限公司动力分公司\"]",
        "nlp_enterprise_attachment":"[]",
        "delete_uuid":"03f60e46-3036-4f2a-a4bb-f5a326c2755e"
    }
    o_b = {
        "bidding_budget":0,
        "bidding_budget_unit":"",
        "sub_project_code":"",
        "sub_project_name":"Project",
        "district":"未知",
        "city":"日照",
        "province":"山东",
        "area":"华东",
        "industry":"建筑建材",
        "info_type":"有色金属冶炼及压延产品",
        "info_source":"企业采购",
        "qcodes":"",
        "project_code":"DLGCB-X001302",
        "tenderee":"日照港通通信工程有限公司动力分公司",
        "procurement_system":"企业采购系统",
        "time_release":"2020-05-19",
        "extract_count":2,
        "project_dynamic":"[{\"docid\": 99403871, \"doctitle\": \"DLGCB-X001302\", \"docchannel\": 52, \"bidway\": \"\", \"page_time\": \"2020-05-19\", \"status\": 201, \"is_multipack\": false, \"extract_count\": 2}]",
        "docid_number":1,
        "docids":"99403871",
        "zhao_biao_page_time":"2020-05-19",
        "project_codes":"DLGCB-X001302",
        "page_time":"2020-05-19",
        "product":"铜辫子",
        "nlp_enterprise":"[\"日照港通通信工程有限公司动力分公司\"]",
        "nlp_enterprise_attachment":"[]",
        "delete_uuid":"03f60e46-3036-4f2a-a4bb-f5a326c2755e"
    }
    o_c = {
        "district":"未知",
        "city":"日照",
        "province":"山东",
        "area":"华东",
        "industry":"建筑建材",
        "info_type":"有色金属冶炼及压延产品",
        "info_source":"企业采购",
        "qcodes":"",
        "project_code":"ZBCGZX-X039338",
        "tenderee_addr":"",
        "procurement_system":"",
        "extract_count":1,
        "project_dynamic":"[{\"docid\": 110153883, \"doctitle\": \"ZBCGZX-X039338\", \"docchannel\": 101, \"bidway\": \"\", \"page_time\": \"2020-08-31\", \"status\": 201, \"is_multipack\": false, \"extract_count\": 1}]",
        "docid_number":1,
        "docids":"110153883",
        "zhong_biao_page_time":"2020-08-31",
        "project_codes":"ZBCGZX-X039338",
        "page_time":"2020-08-31",
        "product":"",
        "nlp_enterprise":"[]",
        "nlp_enterprise_attachment":"[]",
        "delete_uuid":"4b4967be-b387-4259-9eb4-cd228a6b223f"
    }
    # print(check_merge_rule(o_a,o_b,True))
    print(dumplicate_projects([o_a,o_b,o_c],True))

if __name__ == '__main__':
    test_merge_rule()
    # a = uuid4()
    # print(str(a))
    # print(to_project_json([{"keep_uuid":"123"}]))