documentDumplicate.py 168 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617
  1. #coding:UTF8
  2. from odps.udf import annotate
  3. from odps.udf import BaseUDTF
  4. from odps.udf import BaseUDAF
  5. import re
  6. import os
  7. import traceback
  8. @annotate('string,string -> string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string')
  9. class f_decode_extract(BaseUDTF):
  10. def __init__(self):
  11. import logging
  12. import json
  13. import time,re
  14. global json,logging,time,re
  15. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  16. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  17. self.dict_channel = {"公告变更":51,
  18. "招标公告":52,
  19. "中标信息":101,
  20. "招标预告":102,
  21. "招标答疑":103,
  22. "资审结果":105,
  23. "法律法规":106,
  24. "新闻资讯":107,
  25. "采购意向":114,
  26. "拍卖出让":115,
  27. "土地矿产":116,
  28. "产权交易":117,
  29. "废标公告":118,
  30. "候选人公示":119,
  31. "合同公告":120}
  32. def process(self, extractjson,otherjson):
  33. if extractjson is not None:
  34. _extract = json.loads(extractjson)
  35. else:
  36. _extract = {}
  37. if otherjson is not None:
  38. _other = json.loads(otherjson)
  39. else:
  40. _other = {}
  41. project_code = ""
  42. project_name = ""
  43. tenderee = ""
  44. agency = ""
  45. win_tenderer = ""
  46. bidding_budget = ""
  47. win_bid_price = ""
  48. fingerprint = ""
  49. page_time_stamp = 0
  50. docchannel = 0
  51. extract_count = 0
  52. page_time = _other.get("pageTime",time.strftime('%Y-%m-%d',time.localtime()))
  53. doctitle = _other.get("doctitle","")
  54. doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', doctitle)
  55. area = _other.get("area","")
  56. province = _other.get("province","")
  57. city = _other.get("city","")
  58. district = _other.get("district","")
  59. web_source_no = _other.get("webSourceNo","")
  60. time_bidclose = _extract.get("time_bidclose")
  61. time_bidopen = _extract.get("time_bidopen")
  62. time_bidstart = _extract.get("time_bidstart")
  63. time_commencement = _extract.get("time_commencement")
  64. time_completion = _extract.get("time_completion")
  65. time_earnest_money_end = _extract.get("time_earnestMoneyEnd")
  66. time_earnest_money_start = _extract.get("time_earnestMoneyStart")
  67. time_get_file_end = _extract.get("time_getFileEnd")
  68. time_get_file_start = _extract.get("time_getFileStart")
  69. time_publicity_end = _extract.get("time_publicityEnd")
  70. time_publicity_start = _extract.get("time_publicityStart")
  71. time_registration_end = _extract.get("time_registrationEnd")
  72. time_registration_start = _extract.get("time_registrationStart")
  73. time_release = _extract.get("time_release")
  74. # docchannel = _other.get("docchannel",0)
  75. docchannel_name = _extract.get("docchannel",{}).get("docchannel")
  76. doctype_name = _extract.get("docchannel",{}).get("doctype")
  77. if doctype_name in ["法律法规","新闻资讯","拍卖出让","土地矿产"]:
  78. docchannel_name = doctype_name
  79. docchannel = self.dict_channel.get(docchannel_name,0)
  80. if re.search(self.time_pattern,page_time) is not None:
  81. try:
  82. timeArray = time.strptime(page_time[:11], "%Y-%m-%d")
  83. page_time_stamp = int(time.mktime(timeArray))
  84. except Exception as e:
  85. pass
  86. list_code = _extract.get("code",[])
  87. if len(list_code)>0:
  88. project_code = list_code[0]
  89. project_name = _extract.get("name","")
  90. fingerprint = _extract.get("fingerprint","")
  91. dict_pack = _extract.get("prem",{})
  92. logging.info(dict_pack)
  93. for _key in dict_pack.keys():
  94. if dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
  95. extract_count += 1
  96. if bidding_budget=="":
  97. bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
  98. for _role in dict_pack[_key]["roleList"]:
  99. if isinstance(_role,list):
  100. extract_count += 1
  101. if _role[2]!='' and float(_role[2])>0:
  102. extract_count += 1
  103. if _role[0]=="tenderee":
  104. tenderee = _role[1]
  105. if _role[0]=="win_tenderer":
  106. if win_tenderer=="":
  107. win_tenderer = _role[1]
  108. if _role[2]!='' and float(_role[2])>0:
  109. extract_count += 1
  110. if win_bid_price=="":
  111. win_bid_price = str(float(_role[2]))
  112. if _role[0]=="agency":
  113. agency = _role[1]
  114. if isinstance(_role,dict):
  115. extract_count += 1
  116. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  117. extract_count += 1
  118. if _role["role_name"]=="tenderee":
  119. tenderee = _role["role_text"]
  120. if _role["role_name"]=="win_tenderer":
  121. if win_tenderer=="":
  122. win_tenderer = _role["role_text"]
  123. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  124. extract_count += 1
  125. if win_bid_price=="":
  126. win_bid_price = str(float(_role["role_money"]["money"]))
  127. if _role["role_name"]=="agency":
  128. agency = _role["role_text"]
  129. if project_code!="":
  130. extract_count += 1
  131. if project_name!="":
  132. extract_count += 1
  133. logging.info(page_time+doctitle+doctitle_refine+area+province+city+
  134. district+web_source_no+project_code+project_name+tenderee+agency+win_tenderer+bidding_budget+win_bid_price)
  135. self.forward(page_time,page_time_stamp,docchannel,doctitle,doctitle_refine,area,province,city,
  136. district,web_source_no,fingerprint,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,
  137. time_bidclose,time_bidopen,time_bidstart,time_commencement,time_completion,time_earnest_money_end,time_earnest_money_start,
  138. time_get_file_end,time_get_file_start,time_publicity_end,time_publicity_start,time_registration_end,time_registration_start,time_release)
  139. @annotate("string->string")
  140. class f_get_product(object):
  141. def __init__(self):
  142. import time
  143. global time
  144. import logging
  145. import json
  146. import re
  147. global json,logging,re
  148. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  149. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  150. def evaluate(self, extractjson):
  151. if extractjson is None or extractjson=="":
  152. extractjson = "{}"
  153. _extract = json.loads(extractjson)
  154. return ",".join(_extract.get("product",[]))
  155. @annotate("string->string")
  156. class f_get_package(object):
  157. def __init__(self):
  158. import time
  159. global time
  160. import logging
  161. import json
  162. import re
  163. global json,logging,re
  164. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  165. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  166. def evaluate(self, extractjson):
  167. if extractjson is None or extractjson=="":
  168. extractjson = "{}"
  169. _extract = json.loads(extractjson)
  170. prem = _extract.get("prem",{})
  171. list_pack = []
  172. for k,v in prem.items():
  173. if k!="Project":
  174. list_pack.append(k)
  175. return ",".join(list_pack)
  176. @annotate("string->string")
  177. class f_get_nlp_enterprise(object):
  178. def __init__(self):
  179. import time
  180. global time
  181. import logging
  182. import json
  183. import re
  184. global json,logging,re
  185. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  186. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  187. def evaluate(self, extractjson):
  188. if extractjson is None or extractjson=="":
  189. extractjson = "{}"
  190. _extract = json.loads(extractjson)
  191. nlp_enterprise = _extract.get("nlp_enterprise",[])
  192. nlp_enterprise_attachment = _extract.get("nlp_enterprise_attachment",[])
  193. if len(nlp_enterprise)==0 and len(nlp_enterprise_attachment)==0:
  194. dict_pack = _extract.get("prem",{})
  195. for _key in dict_pack.keys():
  196. for _role in dict_pack[_key]["roleList"]:
  197. if isinstance(_role,list):
  198. _entity = _role[1]
  199. nlp_enterprise.append(_entity)
  200. if isinstance(_role,dict):
  201. _entity = _role["role_text"]
  202. nlp_enterprise.append(_entity)
  203. nlp_enterprise = list(set(nlp_enterprise))
  204. dict_entity = {"indoctextcon":nlp_enterprise,
  205. "notindoctextcon":nlp_enterprise_attachment}
  206. return json.dumps(dict_entity,ensure_ascii=False)
  207. @annotate("string->bigint")
  208. class f_get_extractCount(object):
  209. def __init__(self):
  210. import time
  211. global time
  212. import logging
  213. import json
  214. import re
  215. global json,logging,re
  216. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  217. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  218. def evaluate(self, extractjson):
  219. if extractjson is not None:
  220. _extract = json.loads(extractjson)
  221. return _extract.get("extract_count",0)
  222. else:
  223. _extract = {}
  224. dict_pack = _extract.get("prem",{})
  225. extract_count = 0
  226. list_code = _extract.get("code",[])
  227. if len(list_code)>0:
  228. project_code = list_code[0]
  229. else:
  230. project_code = ""
  231. project_name = _extract.get("name","")
  232. bidding_budget = ""
  233. win_tenderer = ""
  234. win_bid_price = ""
  235. linklist_count = 0
  236. for _key in dict_pack.keys():
  237. if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
  238. extract_count += 1
  239. if bidding_budget=="":
  240. bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
  241. for _role in dict_pack[_key]["roleList"]:
  242. if isinstance(_role,list):
  243. extract_count += 1
  244. if _role[2]!='' and float(_role[2])>0:
  245. extract_count += 1
  246. if _role[0]=="tenderee":
  247. tenderee = _role[1]
  248. if _role[0]=="win_tenderer":
  249. if win_tenderer=="":
  250. win_tenderer = _role[1]
  251. if _role[2]!='' and float(_role[2])>0:
  252. extract_count += 1
  253. if win_bid_price=="":
  254. win_bid_price = str(float(_role[2]))
  255. if _role[0]=="agency":
  256. agency = _role[1]
  257. if isinstance(_role,dict):
  258. extract_count += 1
  259. if "role_money" in _role:
  260. if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
  261. extract_count += 1
  262. if _role.get("role_name")=="tenderee":
  263. tenderee = _role["role_text"]
  264. if _role.get("role_name")=="win_tenderer":
  265. if win_tenderer=="":
  266. win_tenderer = _role["role_text"]
  267. if "role_money" in _role:
  268. if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
  269. extract_count += 1
  270. if win_bid_price=="":
  271. win_bid_price = str(float(_role["role_money"]["money"]))
  272. if _role["role_name"]=="agency":
  273. agency = _role["role_text"]
  274. linklist = _role.get("linklist",[])
  275. for link in linklist:
  276. for l in link:
  277. if l!="":
  278. linklist_count += 1
  279. extract_count += linklist_count//2
  280. if project_code!="":
  281. extract_count += 1
  282. if project_name!="":
  283. extract_count += 1
  284. return extract_count
  285. @annotate('string,string,string,string,string -> string,string,string,bigint')
  286. class f_decode_sub_docs_json(BaseUDTF):
  287. def __init__(self):
  288. import logging
  289. import json
  290. global json,logging
  291. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  292. def process(self, project_code,project_name,tenderee,agency,sub_docs_json):
  293. columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
  294. extract_count = 0
  295. if project_code is not None and project_code!="":
  296. extract_count += 1
  297. if project_name is not None and project_name!="":
  298. extract_count += 1
  299. if tenderee is not None and tenderee!="":
  300. extract_count += 1
  301. if agency is not None and agency!="":
  302. extract_count += 1
  303. if sub_docs_json is not None:
  304. for sub_docs in json.loads(sub_docs_json):
  305. for _key_sub_docs in sub_docs.keys():
  306. extract_count += 1
  307. if _key_sub_docs in columns:
  308. if columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
  309. if _key_sub_docs in ["bidding_budget","win_bid_price"]:
  310. if float(sub_docs[_key_sub_docs])>0:
  311. columns[_key_sub_docs] = str(float(sub_docs[_key_sub_docs]))
  312. else:
  313. columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
  314. self.forward(columns["win_tenderer"],columns["bidding_budget"],columns["win_bid_price"],extract_count)
  315. @annotate('string,string,string -> string,string,string,string,string,string,string')
  316. class f_decode_for_dumplicate(BaseUDTF):
  317. def __init__(self):
  318. import logging
  319. import json
  320. global json,logging
  321. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  322. def process(self,sub_docs_json,extractjson,extract):
  323. if extractjson is None or extractjson=="":
  324. extractjson = "{}"
  325. try:
  326. _extract = json.loads(extractjson)
  327. except Exception as e:
  328. _extract = {}
  329. product = ",".join(_extract.get("product",[]))
  330. list_product = product.split(",")
  331. project_codes = ",".join(_extract.get("code",[]))
  332. list_code = project_codes.split(",")
  333. if sub_docs_json is not None:
  334. list_sub_docs = json.loads(sub_docs_json)
  335. else:
  336. list_sub_docs = [{}]
  337. max_len = max([len(list_product),len(list_code),len(list_sub_docs)])
  338. if extract!="extract":
  339. win_tenderer = ""
  340. bidding_budget = ""
  341. win_bid_price = ""
  342. for _subdoc in list_sub_docs:
  343. win_tenderer = _subdoc.get("win_tenderer","")
  344. bidding_budget = _subdoc.get("bidding_budget","0")
  345. if float(bidding_budget)==0:
  346. bidding_budget = ""
  347. else:
  348. bidding_budget = str(float(bidding_budget))
  349. win_bid_price = _subdoc.get("win_bid_price","0")
  350. if float(win_bid_price)==0:
  351. win_bid_price = ""
  352. else:
  353. win_bid_price = str(float(win_bid_price))
  354. if len(set([win_tenderer,bidding_budget,win_bid_price]))>=3:
  355. break
  356. print(("",product,"",project_codes,win_tenderer,bidding_budget,win_bid_price))
  357. self.forward("",product,"",project_codes,win_tenderer,bidding_budget,win_bid_price)
  358. else:
  359. for _i in range(max_len):
  360. _product = list_product[_i%len(list_product)]
  361. _code = list_code[_i%len(list_code)]
  362. _subdoc = list_sub_docs[_i%len(list_sub_docs)]
  363. win_tenderer = _subdoc.get("win_tenderer","")
  364. bidding_budget = _subdoc.get("bidding_budget","0")
  365. if float(bidding_budget)==0:
  366. bidding_budget = ""
  367. else:
  368. bidding_budget = str(float(bidding_budget))
  369. win_bid_price = _subdoc.get("win_bid_price","0")
  370. if float(win_bid_price)==0:
  371. win_bid_price = ""
  372. else:
  373. win_bid_price = str(float(win_bid_price))
  374. self.forward(_product,product,_code,project_codes,win_tenderer,bidding_budget,win_bid_price)
  375. @annotate("string->bigint")
  376. class totimestamp(object):
  377. def __init__(self):
  378. import time
  379. global time
  380. import logging
  381. import json
  382. import re
  383. global json,logging,re
  384. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  385. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  386. def evaluate(self, str_time):
  387. try:
  388. logging.info(str_time)
  389. if str_time is not None and re.search(self.time_pattern,str_time) is not None:
  390. timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
  391. timeStamp = int(time.mktime(timeArray))
  392. return timeStamp
  393. else:
  394. return 0
  395. except Exception as e:
  396. return 0
  397. @annotate("string->string")
  398. class refind_name(object):
  399. def __init__(self):
  400. import logging
  401. import re
  402. global logging,re
  403. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  404. def evaluate(self, title):
  405. if title is not None:
  406. return re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|\[|\]|【|】', '', title)
  407. return ""
@annotate('bigint,bigint,bigint,string,bigint,string->string')
class f_set_docid(BaseUDAF):
    '''
    Aggregate documents into candidate duplicate groups, windowed by
    publish time and filtered by column/tenderee consistency.
    Original note: project code / winning bidder; len(project code)>7;
    winning bidder <> "".
    '''
    def __init__(self):
        # Import lazily and publish as a module global so terminate()
        # (possibly running in a different worker context) can use it.
        import json
        global json

    def new_buffer(self):
        # Buffer layout: one list accumulating a dict per document.
        return [[]]

    def iterate(self, buffer,docid, page_time_stamp,extract_count,defind_column,defind_count,tenderee):
        # Collect the raw per-document fields for later grouping.
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
                          "defind_column":defind_column,"defind_count":defind_count,"tenderee":tenderee})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        # Sort documents by timestamp, then split them into runs where
        # consecutive documents are no more than 7 days apart.  Each closed
        # run is emitted (as part of the JSON result) when it passes the
        # column/tenderee consistency rules below.
        list_docs = buffer[0]
        list_docs.sort(key=lambda x:x["page_time_stamp"])
        list_group = []
        _begin = 0
        defind_count = 0
        if len(list_docs)>0:
            defind_count = list_docs[0]["defind_count"]
        print(defind_count)
        for i in range(len(list_docs)-1):
            if abs(list_docs[i]["page_time_stamp"]-list_docs[i+1]["page_time_stamp"])<=86400*7:
                continue
            else:
                # Gap larger than 7 days: close the current run [_begin, i].
                _group = []
                _set_column = set()
                _set_tenderee = set()
                for j in range(_begin,i+1):
                    if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
                        _set_tenderee.add(list_docs[j]["tenderee"])
                    _set_column.add(list_docs[j]["defind_column"])
                    _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
                if len(_group)>=3 and len(_set_tenderee)>1:
                    # 3+ documents naming more than one tenderee: drop the run.
                    pass
                else:
                    print(defind_count,len(_set_column))
                    if len(_group)>1:
                        # defind_count==2: docs must span at least two columns;
                        # ==1: exactly one column; ==0: no column constraint.
                        if defind_count==2:
                            if len(_set_column)>=2:
                                list_group.append(_group)
                        elif defind_count==1:
                            if len(_set_column)==1:
                                list_group.append(_group)
                        elif defind_count==0:
                            list_group.append(_group)
                _begin = i+1
        if len(list_docs)>1:
            # Flush the trailing run [_begin, end) with the same rules.
            _set_column = set()
            _set_tenderee = set()
            _group = []
            for j in range(_begin,len(list_docs)):
                if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
                    _set_tenderee.add(list_docs[j]["tenderee"])
                _set_column.add(list_docs[j]["defind_column"])
                _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
            if len(_group)>=3 and len(_set_tenderee)>1:
                pass
            else:
                if len(_group)>1:
                    if defind_count==2:
                        if len(_set_column)>=2:
                            list_group.append(_group)
                    elif defind_count==1:
                        if len(_set_column)==1:
                            list_group.append(_group)
                    elif defind_count==0:
                        list_group.append(_group)
        return json.dumps(list_group)

    # def terminate(self, buffer):
    #
    #
    # list_docs = buffer[0]
    # if len(list_docs)>0:
    # defind_count = list_docs[0]["defind_count"]
    #
    # list_time_group = split_with_time(list_docs,"page_time_stamp",86400*2)
    #
    # list_group = []
    # for time_group in list_time_group:
    # _group = []
    # _set_column = set()
    # base_tenderee = ""
    # _set_tenderee = set()
    # for j in range(len(time_group)):
    # if time_group[j]["tenderee"] is not None and time_group[j]["tenderee"]!="":
    # # if base_tenderee =="":
    # # base_tenderee = time_group[j]["tenderee"]
    # # _set_tenderee.add(time_group[j]["tenderee"])
    # # simi = getSimilarityOfString(base_tenderee,time_group[j]["tenderee"])
    # # if simi<0.8:
    # # _set_tenderee.add(time_group[j]["tenderee"])
    #
    # _set_tenderee.add(time_group[j]["tenderee"])
    # _set_column.add(time_group[j]["defind_column"])
    # _group.append({"docid":time_group[j]["docid"],"extract_count":time_group[j]["extract_count"]})
    #
    # if len(_group)>=3 and len(_set_tenderee)>1:
    # pass
    # else:
    # if len(_group)>1:
    # if defind_count==2:
    # if len(_set_column)>=2:
    # list_group.append(_group)
    # elif defind_count==1:
    # if len(_set_column)==1:
    # list_group.append(_group)
    # elif defind_count==0:
    # list_group.append(_group)
    #
    # return json.dumps(list_group)
  522. def isEmpty(_str):
  523. if _str is None or _str=="":
  524. return True
  525. return False
@annotate('bigint->string')
class f_group_fingerprint(BaseUDAF):
    '''
    Aggregate the docids sharing a fingerprint into one sorted,
    comma-joined string.
    '''
    def __init__(self):
        import json
        global json

    def new_buffer(self):
        return [[]]

    def iterate(self, buffer,docid):
        buffer[0].append(docid)

    def merge(self, buffer, pbuffer):
        # Cap each partial buffer at 100k ids to bound memory/output size.
        buffer[0].extend(pbuffer[0][:100000])

    def terminate(self, buffer):
        # Sort ascending so the smallest docid comes first in the output.
        list_docid = buffer[0][:100000]
        list_docid.sort(key=lambda x:x)
        return ",".join([str(a) for a in list_docid])
@annotate('string->bigint,string')
class f_ungroup_fingerprint(BaseUDTF):
    '''
    Split a comma-joined docid group into (first docid, remaining ids).
    '''
    def process(self,dumplicates):
        # The first id (the sorted minimum from f_group_fingerprint)
        # becomes the group key.
        list_docid = dumplicates.split(",")
        self.forward(int(list_docid[0]),",".join(list_docid[1:]))
@annotate('bigint,bigint,string->string')
class f_dump_probability(BaseUDAF):
    '''
    Merge a group into one JSON record (original: 合并组为一条记录).
    '''
    def __init__(self):
        import json
        global json

    def new_buffer(self):
        return [[]]

    def iterate(self, buffer,docid,page_time_stamp,_type):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"type":_type})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        # De-duplicate by docid (keeping first occurrence), cap at ~10k docs,
        # then split into windows where consecutive docs are at most 7 days
        # apart (split_with_time is defined elsewhere in this file).
        list_dict = buffer[0]
        _set = set()
        list_data = []
        for _dict in list_dict:
            docid = _dict["docid"]
            if docid in _set:
                continue
            _set.add(docid)
            list_data.append(_dict)
            if len(list_data)>10000:
                break
        list_group = split_with_time(list_data,sort_key="page_time_stamp",timedelta=86400*7)
        return json.dumps(list_group)
  574. @annotate('string -> bigint,bigint,bigint,bigint,string')
  575. class f_split_dumplicate_probability(BaseUDTF):
  576. def __init__(self):
  577. import logging
  578. import json
  579. global logging,json
  580. logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  581. def process(self,list_group_str):
  582. logging.info("0")
  583. logging.info(list_group_str)
  584. if list_group_str is not None:
  585. logging.info("1")
  586. try:
  587. list_group = json.loads(list_group_str)
  588. logging.info("2")
  589. for _group in list_group:
  590. if len(_group)>0:
  591. _type = _group[0].get("type","")
  592. logging.info("3%d"%len(list_group))
  593. # _group.sort(key=lambda x:x["page_time_stamp"])
  594. _len = min(100,len(_group))
  595. for _index_i in range(_len):
  596. _count = 0
  597. for _index_j in range(_index_i+1,_len):
  598. if abs(_group[_index_j]["page_time_stamp"]-_group[_index_i]["page_time_stamp"])>86400*120:
  599. break
  600. _count += 1
  601. _docid1 = _group[_index_i]["docid"]
  602. _docid2 = _group[_index_j]["docid"]
  603. if _docid1<_docid2:
  604. self.forward(_docid1,_docid2,1,_len,_type)
  605. elif _docid1>_docid2:
  606. self.forward(_docid2,_docid1,1,_len,_type)
  607. except Exception as e:
  608. logging(str(e))
@annotate('bigint,bigint,string->string')
class f_dumplicate_groupPairs(BaseUDAF):
    '''
    Merge pair rows into one JSON record (original: 合并组为一条记录),
    capped at 10k entries.
    '''
    def __init__(self):
        import json
        global json

    def new_buffer(self):
        return [[]]

    def iterate(self, buffer,is_exists,counts,_type):
        buffer[0].append({"is_exists":is_exists,"counts":counts,"_type":_type})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        # Keep at most 10000 pair entries to bound the serialized size.
        list_dict = buffer[0]
        list_dict = list_dict[:10000]
        return json.dumps(list_dict)
  627. from decimal import Decimal
  628. # 高精度四舍五入方法,参数同round,结果更准确
  629. def precise_round(number, decimals=0):
  630. # 转换为Decimal对象
  631. d = Decimal(str(number))
  632. # 构造四舍五入规则 (ROUND_HALF_UP为标准四舍五入)
  633. result = d.quantize(Decimal("1e%d"%-decimals), rounding='ROUND_HALF_UP')
  634. # result = d.quantize(Decimal("1e-%d"%decimals) if decimals>=0 else Decimal("1e%d"%-decimals), rounding='ROUND_HALF_UP')
  635. return float(result)
def check_columns(tenderee_less,tenderee_greater,
                  agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
                  win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
                  bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
    """Pairwise column-consistency check between two documents.

    Returns False when tenderee, project code, winning bidder, winning
    price or budget conflicts; True otherwise.  agency, project_name and
    doctitle_refine are accepted but not currently compared.
    """
    flag = True  # NOTE(review): never used
    # Both non-empty tenderees must be identical.
    _set_tenderee = set()
    if tenderee_less is not None and tenderee_less!="":
        _set_tenderee.add(tenderee_less)
    if tenderee_greater is not None and tenderee_greater!="":
        _set_tenderee.add(tenderee_greater)
    if len(_set_tenderee)>1:
        return False
    # Codes that are similar but not identical count as a conflict.
    code_sim = getSimilarityOfString(project_code_less,project_code_greater)
    if code_sim>0.6 and code_sim<1:
        return False
    # Same batch but different sub-code, e.g. "ABC-1" vs "ABC-2"
    # (original: 同批次不同编号).
    if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
        _split_code_less = project_code_less.split("-")
        _split_code_greater = project_code_greater.split("-")
        if len(_split_code_less)>1 and len(_split_code_greater)>1:
            if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
                return False
    # Both non-empty winning bidders must be identical.
    _set_win_tenderer = set()
    if win_tenderer_less is not None and win_tenderer_less!="":
        _set_win_tenderer.add(win_tenderer_less)
    if win_tenderer_greater is not None and win_tenderer_greater!="":
        _set_win_tenderer.add(win_tenderer_greater)
    if len(_set_win_tenderer)>1:
        return False
    # Both non-empty winning prices must be numerically equal.
    _set_win_bid_price = set()
    if win_bid_price_less is not None and win_bid_price_less!="":
        _set_win_bid_price.add(float(win_bid_price_less))
    if win_bid_price_greater is not None and win_bid_price_greater!="":
        _set_win_bid_price.add(float(win_bid_price_greater))
    if len(_set_win_bid_price)>1:
        return False
    # Both non-empty budgets must be numerically equal.
    _set_bidding_budget = set()
    if bidding_budget_less is not None and bidding_budget_less!="":
        _set_bidding_budget.add(float(bidding_budget_less))
    if bidding_budget_greater is not None and bidding_budget_greater!="":
        _set_bidding_budget.add(float(bidding_budget_greater))
    if len(_set_bidding_budget)>1:
        return False
    return True
  680. import math
  681. def featurnCount(_count,max_count=100):
  682. return max(0,min(1,_count))*(1/math.sqrt(max(1,_count-1)))
  683. def getSimLevel(str1,str2):
  684. str1_null = False
  685. str2_null = False
  686. _v = 0
  687. if str1 is None or str1=="":
  688. str1_null = True
  689. if str2 is None or str2=="":
  690. str2_null = True
  691. if str1_null and str2_null:
  692. _v = 2
  693. elif str1_null and not str2_null:
  694. _v = 4
  695. elif not str1_null and str2_null:
  696. _v = 6
  697. elif not str1_null and not str2_null:
  698. if str1==str2:
  699. _v = 10
  700. else:
  701. _v = 0
  702. return _v
  703. def getLength(_str):
  704. return len(str(_str) if _str is not None else "")
def check_money(bidding_budget_less,bidding_budget_greater,
                win_bid_price_less,win_bid_price_greater,
                moneys_less,moneys_greater,
                moneys_attachment_less,moneys_attachment_greater):
    """Check whether the budgets and winning prices of two documents agree.

    moneys_* / moneys_attachment_* are presumably collections of amounts
    extracted from each document's body/attachments (they are only
    membership-tested below) -- confirm against callers.
    Returns False on a genuine amount conflict, True otherwise.
    """
    # print('bidding_budget_less',bidding_budget_less,'bidding_budget_greater',bidding_budget_greater,'win_bid_price_less',win_bid_price_less,'win_bid_price_greater',win_bid_price_greater)
    # Keep the raw float values; the working copies below are normalized.
    bidding_budget_less_source = bidding_budget_less
    bidding_budget_greater_source = bidding_budget_greater
    win_bid_price_less_source = win_bid_price_less
    win_bid_price_greater_source = win_bid_price_greater
    # Compare only the 6 most significant digits (original: 只判断最高前六位):
    # round to an integer first, then round again to 6 significant digits.
    if getLength(bidding_budget_less)>0:
        bidding_budget_less_source = float(bidding_budget_less_source)
        # bidding_budget_less = round(float(bidding_budget_less))
        bidding_budget_less = int(precise_round(float(bidding_budget_less)))
        # bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
        bidding_budget_less = str(precise_round(bidding_budget_less,6-len(str(bidding_budget_less))))
    if getLength(bidding_budget_greater)>0:
        bidding_budget_greater_source = float(bidding_budget_greater_source)
        # bidding_budget_greater = round(float(bidding_budget_greater))
        bidding_budget_greater = int(precise_round(float(bidding_budget_greater)))
        # bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
        bidding_budget_greater = str(precise_round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
    if getLength(win_bid_price_less)>0:
        win_bid_price_less_source = float(win_bid_price_less_source)
        # win_bid_price_less = round(float(win_bid_price_less))
        win_bid_price_less = int(precise_round(float(win_bid_price_less)))
        # win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
        win_bid_price_less = str(precise_round(win_bid_price_less,6-len(str(win_bid_price_less))))
    if getLength(win_bid_price_greater)>0:
        win_bid_price_greater_source = float(win_bid_price_greater_source)
        # win_bid_price_greater = round(float(win_bid_price_greater))
        win_bid_price_greater = int(precise_round(float(win_bid_price_greater)))
        # win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
        win_bid_price_greater = str(precise_round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
    #check saming
    budget_is_same = ""
    price_is_same = ""
    if getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
        budget_less = float(bidding_budget_less)
        budget_greater = float(bidding_budget_greater)
        if budget_less!=budget_greater:
            if min(budget_less,budget_greater)>0:
                # if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
                # Unit mismatch (yuan vs 万元): the ratio is ~10000x; allow
                # 9999 < ratio < 10001 to tolerate decimal fractions
                # (original: 金额单位错误,对比时为一万倍,考虑部分小数点后的数).
                if (max(budget_less,budget_greater)/min(budget_less,budget_greater)>9999 and max(budget_less,budget_greater)/min(budget_less,budget_greater)<10001)\
                        or (max(bidding_budget_less_source,bidding_budget_greater_source)/min(bidding_budget_less_source,bidding_budget_greater_source)>9999 and max(bidding_budget_less_source,bidding_budget_greater_source)/min(bidding_budget_less_source,bidding_budget_greater_source)<10001):
                    budget_is_same = True
            # Same value once expressed in 万元 rounded to 2 decimals.
            if budget_less>10000 and budget_greater>10000 and precise_round(budget_less/10000,2)==precise_round(budget_greater/10000,2):
                budget_is_same = True
            # One side's amount also appears among the other side's extracted amounts.
            if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
                budget_is_same = True
            if bidding_budget_less_source in moneys_greater or bidding_budget_less_source in moneys_attachment_greater:
                budget_is_same = True
            if budget_greater in moneys_less or budget_greater in moneys_attachment_less:
                budget_is_same = True
            if bidding_budget_greater_source in moneys_less or bidding_budget_greater_source in moneys_attachment_less:
                budget_is_same = True
            if budget_is_same=="":
                return False
    if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
        price_less = float(win_bid_price_less)
        price_greater = float(win_bid_price_greater)
        if price_less!=price_greater:
            if min(price_less,price_greater)>0:
                # if max(price_less,price_greater)/min(price_less,price_greater)==10000:
                # Same unit-mismatch tolerance as for the budget above.
                if (max(price_less,price_greater)/min(price_less,price_greater)>9999 and max(price_less,price_greater)/min(price_less,price_greater)<10001)\
                        or (max(win_bid_price_less_source,win_bid_price_greater_source)/min(win_bid_price_less_source,win_bid_price_greater_source)>9999 and max(win_bid_price_less_source,win_bid_price_greater_source)/min(win_bid_price_less_source,win_bid_price_greater_source)<10001):
                    price_is_same = True
            if price_less>10000 and price_greater>10000 and precise_round(price_less/10000,2)==precise_round(price_greater/10000,2):
                price_is_same = True
            if price_less in moneys_greater or price_less in moneys_attachment_greater:
                price_is_same = True
            if win_bid_price_less_source in moneys_greater or win_bid_price_less_source in moneys_attachment_greater:
                price_is_same = True
            if price_greater in moneys_less or price_greater in moneys_attachment_less:
                price_is_same = True
            if win_bid_price_greater_source in moneys_less or win_bid_price_greater_source in moneys_attachment_less:
                price_is_same = True
            if price_is_same=="":
                return False
    return True
def check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                 tenderee_less,tenderee_greater,
                 agency_less,agency_greater,
                 win_tenderer_less,win_tenderer_greater,
                 similarity=0.85):
    """Check tenderee/agency/winning-bidder consistency of two documents.

    Two entities count as the same when their jaccard_score exceeds
    *similarity*, or when one entity occurs inside the other document's
    recognized-enterprise string.  Returns False on any conflict.
    """
    def get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,entity_less,entity_greater,similarity):
        # Only compare when both entities are present and differ.
        if getLength(entity_less)>0 and getLength(entity_greater)>0:
            if entity_less!=entity_greater:
                is_same = ''
                _sim = jaccard_score(entity_less,entity_greater)
                if _sim>similarity:
                    is_same = True
                if is_same=='':
                    # NOTE(review): `.find(...)>0` ignores a match at index 0;
                    # presumably nlp_enterprise never starts with the entity
                    # (e.g. a JSON-list string) -- confirm.
                    if str(nlp_enterprise_less).find(entity_greater)>0 or str(nlp_enterprise_greater).find(entity_less)>0:
                        is_same = True
                if is_same=='':
                    return False
        return True
    if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,similarity):
        return False
    if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,agency_less,agency_greater,similarity):
        return False
    if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,win_tenderer_less,win_tenderer_greater,similarity):
        return False
    return True
  811. def check_punish(punish_less,punish_greater):
  812. same_count = 0
  813. not_same_count = 0
  814. _flag = True
  815. keys = list(set(list(punish_less.keys())) | set(list(punish_greater.keys())))
  816. for k in keys:
  817. v1 = punish_less.get(k)
  818. v2 = punish_greater.get(k)
  819. if getLength(v1)>0 and getLength(v2)>0:
  820. if k=="punish_code":
  821. if not check_codes([v1],[v2]):
  822. not_same_count += 1
  823. _flag = False
  824. else:
  825. same_count += 1
  826. if k=="punishDecision":
  827. if getSimilarityOfString(v1,v2)>0.8:
  828. same_count += 1
  829. if k in ("complainants","punishPeople","institutions"):
  830. if v1==v2:
  831. same_count += 1
  832. else:
  833. not_same_count == 1
  834. _flag = False
  835. return _flag,same_count,not_same_count
  836. def check_source_type(source_type_less,source_type_greater):
  837. if getLength(source_type_less)>0 and getLength(source_type_greater)>0:
  838. if source_type_less!=source_type_greater:
  839. return False
  840. return True
def check_approval(approval_less,approval_greater,b_log):
    """Compare two lists of approval records pairwise.

    Returns (flag, same_count, not_same_count) for the first record pair
    that matches on more than one field without a conflict.  Otherwise
    returns (False, 0, 0) when both lists are non-empty, (True, 0, 0)
    when either is empty.  *b_log* enables diagnostic logging.
    """
    if b_log:
        logging.info("approval_less %s==approval_greater %s"%(approval_less,approval_greater))
    for _less in approval_less:
        for _greater in approval_greater:
            same_count = 0
            not_same_count = 0
            flag = True
            # NOTE(review): "construction_scale" and "time_approval" are listed
            # but never handled by any branch below -- confirm intended.
            keys = ["source_stage","source_type","doc_num","project_code","project_name","approval_items","approval_result","approver","construct_company","construction_scale","declare_company","evaluation_agency","legal_person","compilation_unit","time_approval"]
            for k in keys:
                v1 = _less.get(k)
                v2 = _greater.get(k)
                if getLength(v1)>0 and getLength(v2)>0:
                    if k in ("source_stage","source_type"):
                        if v1!=v2:
                            flag = False
                    if k in ("project_code","doc_num"):
                        if check_codes([v1],[v2]):
                            same_count += 1
                        else:
                            # NOTE(review): decrements (unlike check_punish's
                            # increment), so not_same_count goes negative --
                            # confirm intended.
                            not_same_count -= 1
                            if b_log:
                                logging.info("check approval %s false %s-%s"%(k,v1,v2))
                            flag = False
                    if k in ("approval_items","approval_result","project_name"):
                        if getSimilarityOfString(v1,v2)>0.8:
                            same_count += 1
                        else:
                            not_same_count -= 1
                    if k in ("approver","construct_company","declare_company","evaluation_agency","legal_person","compilation_unit"):
                        if v1==v2:
                            same_count += 1
                        else:
                            not_same_count -= 1
                            if b_log:
                                logging.info("check approval %s false %s-%s"%(k,v1,v2))
                            flag = False
            # Accept the first pair with >1 matching fields and no conflict.
            if flag and same_count>1:
                return flag,same_count,not_same_count
    flag = True
    if len(approval_less)>0 and len(approval_greater)>0:
        flag = False
    return flag,0,0
def check_codes(project_codes_less,project_codes_greater,word_count_less={},word_count_greater={}):
    """Cross-compare two lists of project codes.

    Returns True when some pair is effectively identical, or when nothing
    is even similar; returns False when pairs are similar but distinct.
    NOTE(review): word_count_less/word_count_greater use mutable default
    dicts; they are only read here, so it is harmless today.
    """
    #check the similarity
    is_same = False
    is_sim = False
    for project_code_less in project_codes_less:
        project_code_less = str(project_code_less).upper()
        # Keep only CJK + alphanumeric characters for the refined comparison.
        project_code_refine_less = "".join(re.findall("[\u4e00-\u9fa5a-zA-Z\d]+", project_code_less))
        for project_code_greater in project_codes_greater:
            project_code_greater = str(project_code_greater).upper()
            project_code_refine_greater = "".join(re.findall("[\u4e00-\u9fa5a-zA-Z\d]+", project_code_greater))
            code_sim = getSimilarityOfString(project_code_less,project_code_greater)
            # print('code_sim',code_sim,project_code_less,project_code_greater)
            if project_code_refine_less == project_code_refine_greater:
                is_same = True
            if project_code_less is not None and project_code_greater is not None:
                if code_sim>0.6:
                    # Highly similar: containment counts as identical,
                    # otherwise merely similar.
                    if str(project_code_less).find(str(project_code_greater))>=0 or str(project_code_greater).find(str(project_code_less))>=0:
                        is_same = True
                    else:
                        is_sim = True
            if project_code_less!=project_code_greater:
                if code_sim>0.4 and len(project_code_less)==len(project_code_greater):
                    is_sim = True
                    if word_count_less.get("附件",0)>20 or word_count_greater.get("附件",0)>20:# one of the two notices contains attachment content
                        # Codes similar and equally long: fall back to edit distance
                        # (original: code相似且长度相等时计算编辑距离).
                        distance, differences = edit_distance_with_diff(project_code_less,project_code_greater)
                        is_all_same = True
                        if distance >= len(project_code_less)/2:
                            is_all_same = False
                        else:
                            for diff in differences:
                                if diff[0] == '替换':
                                    if (diff[1] in similar_char_dict and diff[2] in similar_char_dict.get(diff[1],[])) or \
                                            (diff[2] in similar_char_dict and diff[1] in similar_char_dict.get(diff[2],[])):
                                        pass
                                    else:
                                        is_all_same = False
                                        break
                                else:
                                    is_all_same = False
                                    break
                        # Accept only when every edit is an OCR-confusable
                        # character pair, e.g. "0-O", "1-I/L".
                        if is_all_same:
                            is_same = True
    if is_same:
        return True
    if is_sim:
        return False
    return True
  933. def check_demand():
  934. return True
# OCR-confusable character pairs: each key maps to the characters it is
# commonly misrecognized as (used by check_codes to tolerate OCR noise
# when comparing project codes).
similar_char_dict = {
    "0":['O','Q'],
    "O":["0",'Q'],
    'Q':['0','O'],
    "1":["L","I"],
    "L":["1"],
    "I":["1"]
}
  943. def edit_distance_with_diff(s1, s2):
  944. m, n = len(s1), len(s2)
  945. # 创建动态规划表
  946. dp = [[0] * (n + 1) for _ in range(m + 1)]
  947. # 初始化动态规划表
  948. for i in range(m + 1):
  949. dp[i][0] = i
  950. for j in range(n + 1):
  951. dp[0][j] = j
  952. # 填充动态规划表
  953. for i in range(1, m + 1):
  954. for j in range(1, n + 1):
  955. if s1[i - 1] == s2[j - 1]:
  956. dp[i][j] = dp[i - 1][j - 1]
  957. else:
  958. dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])
  959. # 回溯找到差异部分
  960. diff = []
  961. i, j = m, n
  962. while i > 0 and j > 0:
  963. if s1[i - 1] == s2[j - 1]:
  964. i -= 1
  965. j -= 1
  966. elif dp[i][j] == dp[i - 1][j] + 1:
  967. diff.append(("删除",s1[i - 1]))
  968. i -= 1
  969. elif dp[i][j] == dp[i][j - 1] + 1:
  970. diff.append(("插入",s2[j - 1]))
  971. j -= 1
  972. else:
  973. diff.append(("替换",s1[i - 1],s2[j - 1]))
  974. i -= 1
  975. j -= 1
  976. # 处理剩余部分
  977. while i > 0:
  978. diff.append(("删除",s1[i - 1]))
  979. i -= 1
  980. while j > 0:
  981. diff.append(("插入",s2[j - 1]))
  982. j -= 1
  983. # 返回编辑距离和差异部分
  984. return dp[m][n], diff[::-1] # 将差异部分反转,因为我们是从后往前回溯的
# Package/lot number in titles, e.g. "标段1", "第2包".  (Original note: the
# "第?" quantifier was made optional so titles like "纯木浆8包/箱复印" are not
# parsed as a lot number.)
package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型|项目)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))")
# Extracts the bare lot/package token; keep in sync with package_number_pattern above.
package_number_pattern2 = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]+")
code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.-]+")  # project-code-like tokens
num_pattern = re.compile("^\d+(?:\.\d+)?$")  # pure (possibly decimal) numbers
num1_pattern = re.compile("[一二三四五六七八九十A-Za-z]+")
num2_pattern = re.compile("[一二三四五六七八九十A-Za-z\d-]+")
num3_pattern = re.compile("[一二三四五六七八九十A-Za-z\d-]+|.")
location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")  # short place names ending in 市/区/镇/县/村/路
building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九十1-9]+[)\)]?[次批]"
# Emphasized title content wrapped in brackets
brackets_pattern = "【([^【】]+?)】" # |{([^{}]+?)}
rebid_pattern = "再次|重新招标|[一二三四五六七八九十]+次"  # re-bid / repeat-tender keywords
date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
def check_doctitle(doctitle_refind_less, doctitle_refind_greater,docchannel_less,docchannel_greater, codes_less=[], code_greater=[],page_time_less="",page_time_greater=""):
    """Decide whether two refined titles may refer to the same notice.

    Strips known project codes and dates, then compares package numbers,
    embedded numbers/codes, keywords, re-bid markers and location words;
    any conflict returns False, otherwise True.
    NOTE(review): codes_less / code_greater default to mutable lists and
    are sorted in place below -- shared-default pitfall; confirm callers
    always pass fresh lists.
    """
    # print('doctitle',doctitle_refind_less,doctitle_refind_greater)
    if code_greater is None:
        code_greater = []
    # Normalize full-width parentheses before comparing.
    doctitle_refind_less = str(doctitle_refind_less).replace("(","(").replace(")",")")
    doctitle_refind_greater = str(doctitle_refind_greater).replace("(","(").replace(")",")")
    if doctitle_refind_less==doctitle_refind_greater:
        return True
    # Remove each side's known project codes (longest first) from its title.
    codes_less.sort(key=lambda x:len(x),reverse=True)
    for _c in codes_less:
        doctitle_refind_less = str(doctitle_refind_less).replace(_c,"")
    code_greater.sort(key=lambda x:len(x), reverse=True)
    for _c in code_greater:
        doctitle_refind_greater = str(doctitle_refind_greater).replace(_c,"")
    # Remove dates before any further comparison.
    doctitle_refind_less = re.sub(date_pattern,"",doctitle_refind_less)
    doctitle_refind_greater = re.sub(date_pattern,"",doctitle_refind_greater)
    #check the package
    if doctitle_refind_less is None:
        doctitle_refind_less = ""
    if doctitle_refind_greater is None:
        doctitle_refind_greater = ""
    if doctitle_refind_less==doctitle_refind_greater:
        return True
    _pack1 = None
    _pack2 = None
    #if contain then pass
    if page_time_less and page_time_less == page_time_greater:
        if doctitle_refind_less.find(doctitle_refind_greater)>=0 or doctitle_refind_greater.find(doctitle_refind_less)>=0:
            return True
    #check the package in title
    _match = re.search(package_number_pattern,doctitle_refind_less)
    if _match is not None:
        _pack1 = _match.groupdict()["name"]
    _match = re.search(package_number_pattern,doctitle_refind_greater)
    if _match is not None:
        _pack2 = _match.groupdict()["name"]
    if _pack1 is not None and _pack2 is not None:
        # if _pack1!=_pack2:
        #     return False
        if _pack1 != _pack2:
            # Compare only the extracted numeric/alpha token of each package.
            _pack1_num = re.search(package_number_pattern2,_pack1)
            _pack1_num = _pack1_num.group() if _pack1_num else ""
            _pack2_num = re.search(package_number_pattern2,_pack2)
            _pack2_num = _pack2_num.group() if _pack2_num else ""
            if _pack1_num and _pack2_num:
                if _pack1_num != _pack2_num:
                    return False
            else:
                return False
    #check the nums in title
    doctitle_refind_less = re.sub(package_number_pattern,"",doctitle_refind_less)
    doctitle_refind_greater = re.sub(package_number_pattern,"",doctitle_refind_greater)
    #check the nums,location,building in title
    for _p in [code_pattern]:
        num_all_l = re.findall(_p,doctitle_refind_less)
        num_all_g = re.findall(_p,doctitle_refind_greater)
        set_num_l = set()
        set_num_g = set()
        # Keep decimals and short (<4 digit) integers only.
        for _l in num_all_l:
            if re.search(num_pattern,_l) is not None:
                if _l.find(".")>0:
                    set_num_l.add(_l)
                elif len(_l)<4:
                    set_num_l.add(_l)
        for _g in num_all_g:
            if re.search(num_pattern,_g) is not None:
                if _g.find(".")>0:
                    set_num_g.add(_g)
                elif len(_g)<4:
                    set_num_g.add(_g)
        if len(set_num_l)>0 and len(set_num_g)>0:
            if len(set_num_l&set_num_g)!=len(set_num_l):
                return False
    #check location and keywords
    for _p in [num1_pattern,building_pattern,brackets_pattern]:
        num_all_l = re.findall(_p,doctitle_refind_less)
        num_all_g = re.findall(_p,doctitle_refind_greater)
        set_num_l = set(num_all_l)
        set_num_g = set(num_all_g)
        if len(set_num_l)==len(set_num_g):
            if len(set_num_l&set_num_g)!=len(set_num_l):
                return False
    # Property-auction channels (115/116/117): unit numbers matter,
    # e.g. "小区6号楼2单元1302号".
    if docchannel_less==docchannel_greater and docchannel_less in [115,116,117]:
        for _p in [num2_pattern]:
            num_all_l = re.findall(_p, doctitle_refind_less)
            num_all_g = re.findall(_p, doctitle_refind_greater)
            set_num_l = set(num_all_l)
            set_num_g = set(num_all_g)
            if len(set_num_l) == len(set_num_g):
                if len(set_num_l & set_num_g) != len(set_num_l):
                    return False
    # Similar titles: if any edit-distance substitution swaps one
    # alphanumeric token for another, treat the titles as different.
    if getSimilarityOfString(doctitle_refind_less,doctitle_refind_greater) > 0.7:
        doctitle_refind_less_re = re.findall(num3_pattern,doctitle_refind_less)
        doctitle_refind_greater_re = re.findall(num3_pattern,doctitle_refind_greater)
        distance, differences = edit_distance_with_diff(doctitle_refind_less_re, doctitle_refind_greater_re)
        for diff in differences:
            if diff[0]=='替换':
                if re.search("^[一二三四五六七八九十A-Za-z\d-]+$",diff[1]) and re.search("^[一二三四五六七八九十A-Za-z\d-]+$",diff[2]):
                    # print("标题编辑距离中替换字段前后 数字字母字符串不同")
                    return False
    # Re-bid (repeat tender) keywords must agree between the titles.
    for _p in [rebid_pattern]:
        num_all_l = re.findall(_p,doctitle_refind_less)
        num_all_g = re.findall(_p,doctitle_refind_greater)
        set_num_l = set(num_all_l)
        set_num_g = set(num_all_g)
        if len(set_num_l)==len(set_num_g):
            if len(set_num_l&set_num_g)!=len(set_num_l):
                return False
        # if page_time_less and page_time_less != page_time_greater:
        if (len(set_num_l) and not len(set_num_g)) or (len(set_num_g) and not len(set_num_l)):
            return False
    #check the location has conflict
    for _p in [location_pattern]:
        num_all_l = re.findall(_p,doctitle_refind_less)
        num_all_g = re.findall(_p,doctitle_refind_greater)
        # Bucket location words by their last (type) character: 市/区/镇/...
        dict_num_l = {}
        dict_num_g = {}
        for _l in num_all_l:
            if len(_l)>0:
                key = _l[-1:]
                if key not in dict_num_l:
                    dict_num_l[key] = set()
                dict_num_l[key].add(_l)
        for _g in num_all_g:
            if len(_g)>0:
                key = _g[-1:]
                if key not in dict_num_g:
                    dict_num_g[key] = set()
                dict_num_g[key].add(_g)
        # A shared bucket with no common location is a conflict.
        for k,v in dict_num_l.items():
            if k in dict_num_g:
                if len(v&dict_num_g[k])==0:
                    return False
    return True
  1135. def product_dump(list_product):
  1136. _product_l_l = []
  1137. list_product.sort(key=lambda x:len(x))
  1138. for _l in list_product:
  1139. _exists = False
  1140. for l1 in _product_l_l:
  1141. if l1 in _l:
  1142. _exists = True
  1143. break
  1144. if not _exists:
  1145. _product_l_l.append(_l)
  1146. return _product_l_l
def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
    """Check whether the product lists of two documents overlap enough.

    Splits both product strings, deduplicates with product_dump, and
    requires that at least half of the shorter list matches the longer
    one (by similarity >= 0.8 or title containment).  Returns True when
    either side has no products.
    """
    # print('product_less',product_less,'product_greater',product_greater)
    if getLength(product_less)>0 and getLength(product_greater)>0:
        _product_l = product_less.split(split_char)
        _product_l = product_dump(_product_l)
        _product_g = product_greater.split(split_char)
        _product_g = product_dump(_product_g)
        _title_l = doctitle_refine_less
        _title_g = doctitle_refine_greater
        same_count = 0
        # Ensure _product_l is the shorter list (swap titles accordingly).
        if len(_product_l)>len(_product_g):
            a = _product_g
            _product_g = _product_l
            _product_l = a
            _title_l = doctitle_refine_greater
            _title_g = doctitle_refine_less
        set_product_l_in_title = set()
        set_product_g_in_title = set()
        for _l in _product_l:
            if _title_l.find(_l)>=0:
                set_product_l_in_title.add(_l)
        for _g in _product_g:
            if _title_g.find(_g)>=0:
                set_product_g_in_title.add(_g)
        # Products that appear in the titles must overlap
        # (original: 限制标题出现的产品要有重叠).
        if len(set_product_l_in_title)>0 and len(set_product_g_in_title)>0:
            _set_union = set_product_l_in_title & set_product_g_in_title
            # 不同的部门若有重叠则通过
            # diff_l = set_product_l_in_title-_set_union
            # diff_g = set_product_g_in_title-_set_union
            # Exclude differences caused by the model dropping characters
            # (original: 排除因模型识别缺漏字导致结果不同的情况).
            # NOTE(review): `not _title_g.find(p)` is True only when p sits at
            # index 0 of the other title (find()==0); possibly intended
            # `_title_g.find(p) < 0` -- confirm.
            diff_l = {p for p in set_product_l_in_title - _set_union if not _title_g.find(p)}
            diff_g = {p for p in set_product_g_in_title - _set_union if not _title_l.find(p)}
            diff_dump = product_dump(list(diff_l.union(diff_g)))
            if not(len(diff_dump)<=len(diff_l) or len(diff_dump)<=len(diff_g)):
                return False
            # Too strict, disabled for now (original: 过于严格,暂时取消)
            # if len(_set_union)==0:
            #     return False
            # if len(_set_union)!=len(set_product_l_in_title) and len(_set_union)!=len(set_product_g_in_title):
            #     _l1 = list(set_product_l_in_title)
            #     _l2 = list(set_product_g_in_title)
            #     _l1.extend(_l2)
            #     _l1 = product_dump(_l1)
            #     if len(_l1)!=len(_set_union):
            #         return False
        for _l in _product_l:
            for _g in _product_g:
                # if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0 or doctitle_refine_less.find(_g)>=0:
                if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0:
                    same_count += 1
                    break
        # print('check product',same_count,len(_product_l))
        if same_count/len(_product_l)>=0.5:
            return True
        return False
    return True
  1204. def check_package(package_less,package_greater,split_char=","):
  1205. if getLength(package_less)>0 and getLength(package_greater)>0:
  1206. _product_l = package_less.split(split_char)
  1207. _product_g = package_greater.split(split_char)
  1208. same_level = False
  1209. for _l in _product_l:
  1210. for _g in _product_g:
  1211. if abs(len(_l)-len(_g))<=2:
  1212. same_level = True
  1213. if _l==_g:
  1214. return True
  1215. if same_level:
  1216. return False
  1217. return True
def check_time(json_time_less,json_time_greater):
    # Compare the extracted time dicts of two documents, field by field.
    # Inputs may be dicts or JSON strings.
    # Returns:
    #   2 -> times agree (or neither document carries any time info)
    #   1 -> neutral (no overlapping fields, or a mix of same and different)
    #   0 -> conflict: overlapping fields differ and none agree
    has_same = False
    has_diff = False
    time_count_less = 0
    time_count_greater = 0
    if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
        # normalize both sides to dicts
        if isinstance(json_time_less,dict):
            time_less = json_time_less
        else:
            time_less = json.loads(json_time_less)
        time_count_less += sum([1 for k,v in time_less.items() if v])
        if isinstance(json_time_greater,dict):
            time_greater = json_time_greater
        else:
            time_greater = json.loads(json_time_greater)
        time_count_greater += sum([1 for k, v in time_greater.items() if v])
        for k,v in time_less.items():
            if getLength(v)>0:
                v1 = time_greater.get(k,"")
                if getLength(v1)>0:
                    # compare the date part only (first 10 chars, YYYY-MM-DD)
                    if v[:10]!=v1[:10]:
                        # print('time diff',k,v,v1)
                        has_diff = True
                    else:
                        has_same = True
    if time_count_less==0 and time_count_greater==0:
        # no time information on either side - treat as agreeing
        return 2
    if has_same:
        if has_diff:
            return 1
        return 2
    if has_diff:
        return 0
    return 1
  1252. def check_products(products_less,products_greater):
  1253. if isinstance(products_less, list):
  1254. pass
  1255. else:
  1256. products_less = json.loads(products_less) if products_less else []
  1257. if isinstance(products_greater, list):
  1258. pass
  1259. else:
  1260. products_greater = json.loads(products_greater) if products_greater else []
  1261. # if len(products_less)>0 and len(products_greater)>0:
  1262. if len(products_less)>=4 and len(products_greater)>=4:
  1263. products_less_list = [p['product'].upper() for p in products_less]
  1264. products_less_list = product_dump(products_less_list)
  1265. products_greater_list = [p['product'].upper() for p in products_greater]
  1266. products_greater_list = product_dump(products_greater_list)
  1267. if len(products_less_list)>len(products_greater_list):
  1268. a = products_greater_list
  1269. products_greater_list = products_less_list
  1270. products_less_list = a
  1271. # print('products_less_list',products_less_list)
  1272. # print('products_greater_list',products_greater_list)
  1273. same_count = 0
  1274. for _l in products_less_list:
  1275. for _g in products_greater_list:
  1276. if getSimilarityOfString(_l,_g)>=0.8:
  1277. same_count += 1
  1278. break
  1279. if same_count/len(products_less_list)<0.5:
  1280. # print('check_products false')
  1281. return False
  1282. return True
  1283. def get_login_web_set():
  1284. file = os.path.join(os.path.dirname(__file__),"login_weblist.txt")
  1285. list_web = []
  1286. try:
  1287. if os.path.exists(file):
  1288. with open(file,"r",encoding="utf8") as f:
  1289. while 1:
  1290. line = f.readline()
  1291. if not line:
  1292. break
  1293. line = line.strip()
  1294. if line:
  1295. list_web.append(line)
  1296. except Exception as e:
  1297. traceback.print_exc()
  1298. _set = set(list_web)
  1299. # log("get_login_web_set length %d"%(len(_set)))
  1300. return _set
# Module-level cache: names of web sources that require login, loaded once at import time.
set_login_web = get_login_web_set()
  1302. def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,hard_level=1):
  1303. docid_less = document_less["docid"]
  1304. docchannel_less = document_less.get("docchannel",0)
  1305. page_time_less = document_less.get("page_time")
  1306. doctitle_refine_less = document_less.get("doctitle_refine","").upper()
  1307. doctitle_less = document_less.get("doctitle","").upper()
  1308. project_codes_less = document_less.get("project_codes")
  1309. nlp_enterprise_less = document_less["nlp_enterprise"]
  1310. tenderee_less = document_less.get("tenderee","")
  1311. agency_less = document_less.get("agency")
  1312. win_tenderer_less = document_less["win_tenderer"]
  1313. bidding_budget_less = document_less["bidding_budget"]
  1314. win_bid_price_less = document_less["win_bid_price"]
  1315. product_less = document_less.get("product").upper()
  1316. package_less = document_less.get("package").upper()
  1317. json_time_less = document_less.get("dict_time")
  1318. project_name_less = document_less.get("project_name").upper()
  1319. fingerprint_less = document_less.get("fingerprint")
  1320. extract_count_less = document_less.get("extract_count",0)
  1321. web_source_no_less = document_less.get("web_source_no")
  1322. web_source_name_less = document_less.get("web_source_name")
  1323. province_less = document_less.get("province")
  1324. city_less = document_less.get("city")
  1325. district_less = document_less.get("district")
  1326. moneys_less = document_less.get("moneys")
  1327. moneys_attachment_less = document_less.get("moneys_attachment")
  1328. page_attachments_less = document_less.get("page_attachments","[]")
  1329. punish_less = document_less.get("punish",{})
  1330. approval_less = document_less.get("approval",[])
  1331. source_type_less = document_less.get("source_type")
  1332. detail_link_less = document_less.get("detail_link")
  1333. is_special_bonds_less = document_less.get("is_special_bonds")
  1334. products_less = document_less.get("products")
  1335. products_original_less = document_less.get("products_original",[])
  1336. change_content_less = document_less.get("change_content","")
  1337. change_time_less = document_less.get("change_time","")
  1338. word_count_less = document_less.get("word_count",{})
  1339. docid_greater = document_greater["docid"]
  1340. page_time_greater = document_greater["page_time"]
  1341. docchannel_greater = document_greater.get("docchannel",0)
  1342. doctitle_refine_greater = document_greater.get("doctitle_refine","").upper()
  1343. doctitle_greater = document_greater.get("doctitle","").upper()
  1344. project_codes_greater = document_greater["project_codes"]
  1345. nlp_enterprise_greater = document_greater["nlp_enterprise"]
  1346. tenderee_greater = document_greater.get("tenderee","")
  1347. agency_greater = document_greater.get("agency","")
  1348. win_tenderer_greater = document_greater["win_tenderer"]
  1349. bidding_budget_greater = document_greater["bidding_budget"]
  1350. win_bid_price_greater = document_greater["win_bid_price"]
  1351. product_greater = document_greater.get("product").upper()
  1352. package_greater = document_greater.get("package").upper()
  1353. json_time_greater = document_greater["dict_time"]
  1354. project_name_greater = document_greater.get("project_name").upper()
  1355. fingerprint_greater = document_greater.get("fingerprint")
  1356. extract_count_greater = document_greater.get("extract_count",0)
  1357. web_source_no_greater = document_greater.get("web_source_no")
  1358. web_source_name_greater = document_greater.get("web_source_name")
  1359. province_greater = document_greater.get("province")
  1360. city_greater = document_greater.get("city")
  1361. district_greater = document_greater.get("district")
  1362. detail_link_greater = document_greater.get("detail_link")
  1363. is_special_bonds_greater = document_greater.get("is_special_bonds")
  1364. products_greater = document_greater.get("products")
  1365. products_original_greater = document_greater.get("products_original", [])
  1366. change_content_greater = document_greater.get("change_content", "")
  1367. change_time_greater = document_greater.get("change_time", "")
  1368. word_count_greater = document_greater.get("word_count", {})
  1369. moneys_greater = document_greater.get("moneys")
  1370. moneys_attachment_greater = document_greater.get("moneys_attachment")
  1371. page_attachments_greater = document_greater.get("page_attachments","[]")
  1372. punish_greater = document_greater.get("punish",{})
  1373. approval_greater = document_greater.get("approval",[])
  1374. source_type_greater = document_greater.get("source_type")
  1375. if isinstance(project_codes_less,str):
  1376. project_codes_less = [a.upper() for a in project_codes_less.split(",") if a!=""]
  1377. elif isinstance(project_codes_less,list):
  1378. project_codes_less = [a.upper() for a in project_codes_less if a!=""]
  1379. elif project_codes_less is None:
  1380. project_codes_less = []
  1381. if isinstance(project_codes_greater,str):
  1382. project_codes_greater = [a.upper() for a in project_codes_greater.split(",") if a!=""]
  1383. elif isinstance(project_codes_greater,list):
  1384. project_codes_greater = [a.upper() for a in project_codes_greater if a!=""]
  1385. elif project_codes_greater is None:
  1386. project_codes_greater = []
  1387. # print('docid:',docid_less,docid_greater)
  1388. if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
  1389. # print('fingerprint same')
  1390. return 1
  1391. # 专项债去重
  1392. if is_special_bonds_greater==is_special_bonds_less==1:
  1393. detail_link_less = detail_link_less.strip() if detail_link_less else ""
  1394. detail_link_greater = detail_link_greater.strip() if detail_link_greater else ""
  1395. if "bondId=" in detail_link_less:
  1396. bondId_less = detail_link_less.split("bondId=")[1]
  1397. bondId_less = bondId_less.split(",") if bondId_less else []
  1398. else:
  1399. bondId_less = []
  1400. if "bondId=" in detail_link_greater:
  1401. bondId_greater = detail_link_greater.split("bondId=")[1]
  1402. bondId_greater = bondId_greater.split(",") if bondId_greater else []
  1403. else:
  1404. bondId_greater = []
  1405. # print('bondId_less',bondId_less)
  1406. # print('bondId_greater',bondId_greater)
  1407. if bondId_less and bondId_greater:
  1408. bondId_less = set(bondId_less)
  1409. bondId_greater = set(bondId_greater)
  1410. if bondId_less.issubset(bondId_greater) or bondId_greater.issubset(bondId_less):
  1411. return 1
  1412. # 站源相同时,除了fingerprint一样和detail_link一样,其他不去重
  1413. if web_source_no_less==web_source_no_greater and getLength(web_source_no_less)>0:
  1414. if getLength(detail_link_less)>0 and getLength(detail_link_greater)>0:
  1415. if detail_link_less != detail_link_greater:
  1416. # print('站源相同时,detail_link不一样,直接不去重')
  1417. return 0
  1418. else: # 链接一样时,判断其是否为主页或者列表页
  1419. detail_link_split_less = re.sub("https?://","",detail_link_less.strip())
  1420. detail_link_split_less = re.split("/",detail_link_split_less)
  1421. detail_link_split_less = [i for i in detail_link_split_less if i]
  1422. if len(detail_link_split_less)==1: #链接为站源主页域名
  1423. # print('站源相同时,detail_link一样,链接为站源主页域名')
  1424. return 0
  1425. elif re.search("(index|list)(\.html?|\.do)?$",detail_link_split_less[-1],re.I): #链接为列表页
  1426. # print('站源相同时,detail_link一样,链接为列表页')
  1427. return 0
  1428. # 采购产品products对比
  1429. if getLength(products_less)>0 and getLength(products_greater)>0:
  1430. if products_original_less:# products不是AI补充提取的
  1431. _products_less = products_original_less
  1432. else:
  1433. _products_less = products_less
  1434. if products_original_greater:
  1435. _products_greater = products_original_greater
  1436. else:
  1437. _products_greater = products_greater
  1438. if not check_products(_products_less,_products_greater):
  1439. # print("check_products error")
  1440. return 0
  1441. # 变更答疑公告 变更内容对比
  1442. if docchannel_less in [51,103] and docchannel_less==docchannel_greater:
  1443. if getLength(change_time_less)>0 and getLength(change_time_greater)>0:
  1444. if change_time_less != change_time_greater:
  1445. # print("change_time diff")
  1446. return 0
  1447. if getLength(change_content_less) > 10 and getLength(change_content_greater) > 10:
  1448. _change_content_less = re.findall("[\u4e00-\u9fa5a-zA-Z0-9]+", change_content_less)
  1449. _change_content_less = "".join(_change_content_less)
  1450. _change_content_greater = re.findall("[\u4e00-\u9fa5a-zA-Z0-9]+", change_content_greater)
  1451. _change_content_greater = "".join(_change_content_greater)
  1452. if _change_content_less == _change_content_greater:
  1453. # print("change_content same 1")
  1454. return 1
  1455. elif _change_content_less.find(_change_content_greater)>=0 or _change_content_greater.find(_change_content_less)>=0:
  1456. # print("change_content same 2")
  1457. return 1
  1458. # elif getSimilarityOfString(_change_content_less,_change_content_greater)>0.8:
  1459. # print("change_content same 3")
  1460. # print(_change_content_less)
  1461. # print(_change_content_greater)
  1462. # print(getSimilarityOfString(_change_content_less,_change_content_greater))
  1463. # return 1
  1464. #一篇要素都在附件,且两篇附件md5有重叠
  1465. set_md5_less = set()
  1466. set_md5_greater = set()
  1467. list_md5_less = []
  1468. if page_attachments_less:
  1469. try:
  1470. list_md5_less = json.loads(page_attachments_less)
  1471. except Exception as e:
  1472. pass
  1473. list_md5_greater = []
  1474. if page_attachments_greater:
  1475. try:
  1476. list_md5_greater = json.loads(page_attachments_greater)
  1477. except Exception as e:
  1478. pass
  1479. for _l in list_md5_less:
  1480. _md5 = _l.get("fileMd5")
  1481. if _md5 is not None:
  1482. set_md5_less.add(_md5)
  1483. for _l in list_md5_greater:
  1484. _md5 = _l.get("fileMd5")
  1485. if _md5 is not None:
  1486. set_md5_greater.add(_md5)
  1487. # if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
  1488. # one_in_attach = False
  1489. # dict_enterprise_less = json.loads(nlp_enterprise_less)
  1490. # dict_enterprise_greater = json.loads(nlp_enterprise_greater)
  1491. # indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
  1492. # notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
  1493. # indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
  1494. # notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
  1495. # if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
  1496. # one_in_attach = True
  1497. # if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
  1498. # one_in_attach = True
  1499. # if one_in_attach:
  1500. # if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
  1501. # return 1
  1502. #同一个站源,都有附件但附件没有重叠则不去重
  1503. if web_source_no_less==web_source_no_greater and len(set_md5_less)>0 and len(set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==0:
  1504. if b_log:
  1505. logging.info("same web_site,both has attach but not same web_source_no_less:%s,web_source_no_greater:%s"%(web_source_no_less,web_source_no_greater))
  1506. return 0
  1507. # 采购意向去重
  1508. if docchannel_greater==docchannel_less==114:
  1509. sign = True
  1510. demand_info_less = document_less.get("demand_info",[])
  1511. demand_info_greater = document_greater.get("demand_info",[])
  1512. # if demand_info_less and not demand_info_greater:
  1513. # sign = False
  1514. # elif not demand_info_less and demand_info_greater:
  1515. # sign = False
  1516. # elif demand_info_less and demand_info_greater:
  1517. if demand_info_less and demand_info_greater:
  1518. # 重新确定demand_info的数量排序,按大小排序
  1519. if len(demand_info_greater)<len(demand_info_less):
  1520. _demand_info_less = demand_info_greater
  1521. _demand_info_greater = demand_info_less
  1522. demand_info_less = _demand_info_less
  1523. demand_info_greater = _demand_info_greater
  1524. for item1 in demand_info_less:
  1525. tmp_project_name_less = re.sub("\s","",item1.get("project_name","").strip())
  1526. tmp_project_name_less = tmp_project_name_less.replace("(","(").replace(")",")").upper()
  1527. tmp_budget_less = float(item1.get("budget",0) if item1.get("budget",0) else 0)
  1528. tmp_order_begin_less = item1.get("order_begin","")
  1529. tmp_order_end_less = item1.get("order_end", "")
  1530. get_same = False
  1531. for item2 in demand_info_greater:
  1532. tmp_project_name_greater = re.sub("\s", "", item2.get("project_name", "").strip())
  1533. tmp_project_name_greater = tmp_project_name_greater.replace("(", "(").replace(")", ")").upper()
  1534. tmp_budget_greater = float(item2.get("budget",0) if item2.get("budget",0) else 0)
  1535. tmp_order_begin_greater = item2.get("order_begin", "")
  1536. tmp_order_end_greater = item2.get("order_end", "")
  1537. # 项目名称相同或包含关系,预算金额对比,预计采购时间开始或结束相等(只对比到月份)
  1538. if (tmp_project_name_less==tmp_project_name_greater or
  1539. (len(tmp_project_name_less)>0 and len(tmp_project_name_greater)>0 and (tmp_project_name_less.find(tmp_project_name_greater)>=0 or tmp_project_name_greater.find(tmp_project_name_less)>=0))) and \
  1540. (check_money(tmp_budget_less,tmp_budget_greater,0,0,[],[],[],[]) or (tmp_budget_less>=100000 and tmp_budget_greater>=100000 and precise_round(tmp_budget_less/10000,0)==precise_round(tmp_budget_greater/10000,0 and (tmp_budget_less%10000==0 or tmp_budget_greater%10000==0)))) and \
  1541. (tmp_order_begin_less[:7]==tmp_order_begin_greater[:7] or tmp_order_end_less[:7]==tmp_order_end_greater[:7]):
  1542. get_same = True
  1543. break
  1544. if not get_same:
  1545. sign = False
  1546. break
  1547. if not sign:
  1548. return 0
  1549. else:
  1550. if demand_info_greater and len(demand_info_greater)==len(demand_info_less):# demand_info完全相同
  1551. return 1
  1552. same_count = 0
  1553. all_count = 8
  1554. if len(set(project_codes_less) & set(project_codes_greater))>0:
  1555. same_count += 1
  1556. if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
  1557. same_count += 1
  1558. if getLength(agency_less)>0 and agency_less==agency_greater:
  1559. same_count += 1
  1560. if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
  1561. same_count += 1
  1562. if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
  1563. same_count += 1
  1564. if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
  1565. same_count += 1
  1566. if getLength(project_name_less)>0 and project_name_less==project_name_greater:
  1567. same_count += 1
  1568. if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
  1569. same_count += 1
  1570. _flag,_c1,_c2 = check_punish(punish_less,punish_greater)
  1571. if not _flag:
  1572. if b_log:
  1573. logging.info("check_punish failed")
  1574. return 0
  1575. else:
  1576. if b_log:
  1577. logging.info("check_punish true %d"%(_c1))
  1578. same_count += _c1
  1579. _flag,_c1,_c2 = check_approval(approval_less,approval_greater,b_log)
  1580. if not _flag:
  1581. if b_log:
  1582. logging.info("check approval failed")
  1583. return 0
  1584. else:
  1585. if b_log:
  1586. logging.info("check approval true %d"%(_c1))
  1587. same_count += _c1
  1588. _flag = check_source_type(source_type_less,source_type_greater)
  1589. if not _flag:
  1590. if b_log:
  1591. logging.info("check source type failed")
  1592. return 0
  1593. base_prob = 0
  1594. if min_counts<3:
  1595. base_prob = 0.9
  1596. elif min_counts<5:
  1597. base_prob = 0.8
  1598. elif min_counts<8:
  1599. base_prob = 0.7
  1600. else:
  1601. base_prob = 0.6
  1602. _prob = base_prob*same_count/all_count
  1603. # print('base_prob',base_prob,'min_counts',min_counts,'same_count',same_count,'all_count',all_count)
  1604. # web_source_name在set_login_web的站源表中时,extract_count加回3再比较
  1605. if min(extract_count_less if web_source_name_less not in set_login_web else extract_count_less+3,extract_count_greater if web_source_name_greater not in set_login_web else extract_count_greater+3)<=3 and \
  1606. max(extract_count_less if web_source_name_less not in set_login_web else extract_count_less+3,extract_count_greater if web_source_name_greater not in set_login_web else extract_count_greater+3)>=5:
  1607. if _prob<0.1 and str(page_time_less)==str(page_time_greater):
  1608. if str(docchannel_less) not in ("302","303"):
  1609. _prob = 0.15
  1610. if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
  1611. if doctitle_refine_less!=doctitle_refine_greater and len(set(project_codes_less) & set(project_codes_greater))==0:
  1612. if b_log:
  1613. logging.info("%d-%d,province not same:%s-%s"%(docid_less,docid_greater,province_less,province_greater))
  1614. return 0
  1615. if _prob<0.1:
  1616. if b_log:
  1617. logging.info("prob too low:%f"%(_prob))
  1618. return _prob
  1619. check_result = {"pass":1}
  1620. if docchannel_less in (51,102,103,104,115,116,117):
  1621. if doctitle_refine_less!=doctitle_refine_greater:
  1622. if page_time_less!=page_time_greater:
  1623. check_result["docchannel"] = 0
  1624. check_result["pass"] = 0
  1625. else:
  1626. check_result["docchannel"] = 2
  1627. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,docchannel_less,docchannel_greater,project_codes_less,project_codes_greater,page_time_less,page_time_greater):
  1628. check_result["doctitle"] = 0
  1629. check_result["pass"] = 0
  1630. if b_log:
  1631. logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
  1632. else:
  1633. check_result["doctitle"] = 2
  1634. #added check
  1635. if not check_codes(project_codes_less,project_codes_greater,word_count_less,word_count_greater):
  1636. check_result["code"] = 0
  1637. check_result["pass"] = 0
  1638. if b_log:
  1639. logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
  1640. else:
  1641. if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
  1642. check_result["code"] = 2
  1643. else:
  1644. check_result["code"] = 1
  1645. # if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
  1646. if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_less,doctitle_refine_greater=doctitle_greater):
  1647. check_result["product"] = 0
  1648. check_result["pass"] = 0
  1649. if b_log:
  1650. logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
  1651. else:
  1652. if getLength(product_less)>0 and getLength(product_greater)>0:
  1653. check_result["product"] = 2
  1654. else:
  1655. check_result["product"] = 1
  1656. if not check_demand():
  1657. check_result["pass"] = 0
  1658. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  1659. tenderee_less,tenderee_greater,
  1660. agency_less,agency_greater,
  1661. win_tenderer_less,win_tenderer_greater):
  1662. check_result["entity"] = 0
  1663. check_result["pass"] = 0
  1664. if b_log:
  1665. logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
  1666. else:
  1667. if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
  1668. check_result["entity"] = 2
  1669. elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
  1670. check_result["entity"] = 2
  1671. else:
  1672. check_result["entity"] = 1
  1673. if not check_money(bidding_budget_less,bidding_budget_greater,
  1674. win_bid_price_less,win_bid_price_greater,
  1675. moneys_less,moneys_greater,
  1676. moneys_attachment_less,moneys_attachment_greater):
  1677. if b_log:
  1678. logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
  1679. check_result["money"] = 0
  1680. check_result["pass"] = 0
  1681. else:
  1682. if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  1683. check_result["money"] = 2
  1684. elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  1685. check_result["money"] = 2
  1686. else:
  1687. check_result["money"] = 1
  1688. #added check
  1689. if not check_package(package_less,package_greater):
  1690. if b_log:
  1691. logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
  1692. check_result["package"] = 0
  1693. check_result["pass"] = 0
  1694. else:
  1695. if getLength(package_less)>0 and getLength(package_greater)>0:
  1696. check_result["package"] = 2
  1697. else:
  1698. check_result["package"] = 1
  1699. #added check
  1700. _time_check = check_time(json_time_less,json_time_greater)
  1701. # if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
  1702. if not _time_check or (_time_check==1 and docchannel_less in (51,103) and
  1703. len([k for k,v in json_time_less.items() if v])>0 and len([k for k,v in json_time_greater.items() if v])>0):
  1704. if b_log:
  1705. logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
  1706. if isinstance(json_time_less,dict):
  1707. time_less = json_time_less
  1708. else:
  1709. time_less = json.loads(json_time_less)
  1710. if isinstance(json_time_greater,dict):
  1711. time_greater = json_time_greater
  1712. else:
  1713. time_greater = json.loads(json_time_greater)
  1714. for k,v in time_less.items():
  1715. if getLength(v)>0:
  1716. v1 = time_greater.get(k,"")
  1717. if getLength(v1)>0:
  1718. if v!=v1:
  1719. logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
  1720. check_result["time"] = 0
  1721. check_result["pass"] = 0
  1722. else:
  1723. if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
  1724. check_result["time"] = 2
  1725. else:
  1726. check_result["time"] = 1
  1727. if hard_level==2 and check_result["product"]<=1:
  1728. if b_log:
  1729. logging.info("hard_level %s and check_product less than 2"%(str(hard_level)))
  1730. return 0
  1731. # print('check_result',check_result,'_prob',_prob)
  1732. if check_result.get("pass",0)==0:
  1733. if b_log:
  1734. logging.info(str(check_result))
  1735. if check_result.get("money",1)==0:
  1736. return 0
  1737. if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
  1738. return _prob
  1739. elif check_result.get("entity",1)==2 and check_result.get("code",1)>=1 and check_result.get("doctitle",2)==2 and check_result.get("package",2)==2 and check_result.get("money",0)==2:
  1740. return _prob
  1741. else:
  1742. return 0
  1743. return _prob
  1744. def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
  1745. if web_source_no_less==web_source_no_greater:
  1746. if fingerprint_less==fingerprint_greater:
  1747. return 1
  1748. else:
  1749. return 0
  1750. if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
  1751. return 1
  1752. if isinstance(project_codes_less,str):
  1753. project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
  1754. elif project_codes_less is None:
  1755. project_codes_less = []
  1756. if isinstance(project_codes_greater,str):
  1757. project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
  1758. elif project_codes_greater is None:
  1759. project_codes_greater = []
  1760. same_count = 0
  1761. all_count = 8
  1762. if len(set(project_codes_less) & set(project_codes_greater))>0:
  1763. same_count += 1
  1764. if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
  1765. same_count += 1
  1766. if getLength(agency_less)>0 and agency_less==agency_greater:
  1767. same_count += 1
  1768. if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
  1769. same_count += 1
  1770. if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
  1771. same_count += 1
  1772. if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
  1773. same_count += 1
  1774. if getLength(project_name_less)>0 and project_name_less==project_name_greater:
  1775. same_count += 1
  1776. if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
  1777. same_count += 1
  1778. base_prob = 0
  1779. if min_counts<3:
  1780. base_prob = 0.9
  1781. elif min_counts<5:
  1782. base_prob = 0.8
  1783. elif min_counts<8:
  1784. base_prob = 0.7
  1785. else:
  1786. base_prob = 0.6
  1787. _prob = base_prob*same_count/all_count
  1788. if min(extract_count_less,extract_count_greater)<=3:
  1789. if _prob<0.1:
  1790. _prob = 0.15
  1791. if province_less!=province_greater:
  1792. return 0
  1793. if _prob<0.1:
  1794. return _prob
  1795. check_result = {"pass":1}
  1796. if docchannel_less in (51,102,103,104,115,116,117):
  1797. if doctitle_refine_less!=doctitle_refine_greater:
  1798. if page_time_less!=page_time_greater:
  1799. check_result["docchannel"] = 0
  1800. check_result["pass"] = 0
  1801. else:
  1802. check_result["docchannel"] = 2
  1803. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
  1804. check_result["doctitle"] = 0
  1805. check_result["pass"] = 0
  1806. if b_log:
  1807. logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
  1808. else:
  1809. check_result["doctitle"] = 2
  1810. #added check
  1811. if not check_codes(project_codes_less,project_codes_greater):
  1812. check_result["code"] = 0
  1813. check_result["pass"] = 0
  1814. if b_log:
  1815. logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
  1816. else:
  1817. if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
  1818. check_result["code"] = 2
  1819. else:
  1820. check_result["code"] = 1
  1821. if not check_product(product_less,product_greater):
  1822. check_result["product"] = 0
  1823. check_result["pass"] = 0
  1824. if b_log:
  1825. logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
  1826. else:
  1827. if getLength(product_less)>0 and getLength(product_greater)>0:
  1828. check_result["product"] = 2
  1829. else:
  1830. check_result["product"] = 1
  1831. if not check_demand():
  1832. check_result["pass"] = 0
  1833. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  1834. tenderee_less,tenderee_greater,
  1835. agency_less,agency_greater,
  1836. win_tenderer_less,win_tenderer_greater):
  1837. check_result["entity"] = 0
  1838. check_result["pass"] = 0
  1839. if b_log:
  1840. logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
  1841. else:
  1842. if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
  1843. check_result["entity"] = 2
  1844. elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
  1845. check_result["entity"] = 2
  1846. else:
  1847. check_result["entity"] = 1
  1848. if not check_money(bidding_budget_less,bidding_budget_greater,
  1849. win_bid_price_less,win_bid_price_greater):
  1850. if b_log:
  1851. logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
  1852. check_result["money"] = 0
  1853. check_result["pass"] = 0
  1854. else:
  1855. if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  1856. check_result["money"] = 2
  1857. elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  1858. check_result["money"] = 2
  1859. else:
  1860. check_result["money"] = 1
  1861. #added check
  1862. if not check_package(package_less,package_greater):
  1863. if b_log:
  1864. logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
  1865. check_result["package"] = 0
  1866. check_result["pass"] = 0
  1867. else:
  1868. if getLength(package_less)>0 and getLength(package_greater)>0:
  1869. check_result["package"] = 2
  1870. else:
  1871. check_result["package"] = 1
  1872. #added check
  1873. if not check_time(json_time_less,json_time_greater):
  1874. if b_log:
  1875. logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
  1876. if isinstance(json_time_less,dict):
  1877. time_less = json_time_less
  1878. else:
  1879. time_less = json.loads(json_time_less)
  1880. if isinstance(json_time_greater,dict):
  1881. time_greater = json_time_greater
  1882. else:
  1883. time_greater = json.loads(json_time_greater)
  1884. for k,v in time_less.items():
  1885. if getLength(v)>0:
  1886. v1 = time_greater.get(k,"")
  1887. if getLength(v1)>0:
  1888. if v!=v1:
  1889. logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
  1890. check_result["time"] = 0
  1891. check_result["pass"] = 0
  1892. else:
  1893. if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
  1894. check_result["time"] = 2
  1895. else:
  1896. check_result["time"] = 1
  1897. if hard_level==2 and check_result["product"]<=1:
  1898. return 0
  1899. if check_result.get("pass",0)==0:
  1900. if b_log:
  1901. logging.info(str(check_result))
  1902. if check_result.get("money",1)==0:
  1903. return 0
  1904. if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
  1905. return _prob
  1906. else:
  1907. return 0
  1908. if check_result.get("time",1)==0:
  1909. return 0
  1910. return _prob
  1911. @annotate("bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->double")
  1912. class f_dumplicate_check(BaseUDTF):
  1913. def __init__(self):
  1914. import logging
  1915. import json
  1916. global logging,json
  1917. def process(self,docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,
  1918. tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,
  1919. bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,
  1920. project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,
  1921. extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,
  1922. page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,
  1923. package_less,package_greater,json_time_less,json_time_greater,json_context,
  1924. province_less,province_greater,city_less,city_greater,district_less,district_greater,
  1925. web_source_no_less,web_source_no_greater,
  1926. extract_json_less,extract_json_greater,page_attachments_less,page_attachments_greater):
  1927. min_counts = 100
  1928. if json_context is not None:
  1929. _context = json.loads(json_context)
  1930. for item in _context:
  1931. if item.get("counts",0)>0 and item.get("counts",0)<min_counts:
  1932. min_counts = item["counts"]
  1933. _extract_less = {}
  1934. if extract_json_less is not None:
  1935. _extract_less = json.loads(extract_json_less)
  1936. _extract_less["docid"] = docid_less
  1937. _extract_less["win_tenderer"] = win_tenderer_less
  1938. _extract_less["win_bid_price"] = win_bid_price_less
  1939. _extract_less["bidding_budget"] = bidding_budget_less
  1940. _extract_less["product"] = product_less
  1941. _extract_less["page_attachments"] = page_attachments_less
  1942. _extract_less["page_time"] = page_time_less
  1943. _extract_less["fingerprint"] = fingerprint_less
  1944. _extract_less["project_codes"] = project_codes_less
  1945. _extract_less["tenderee"] = tenderee_less
  1946. _extract_less["agency"] = agency_less
  1947. _extract_less["docchannel"] = docchannel_less
  1948. _extract_less["project_name"] = project_name_less
  1949. _extract_less["doctitle_refine"] = doctitle_refine_less
  1950. _extract_less["province"] = province_less
  1951. _extract_less["city"] = city_less
  1952. _extract_less["district"] = district_less
  1953. _extract_less["web_source_no"] = web_source_no_less
  1954. _extract_less["extract_count"] = extract_count_less
  1955. _extract_less["json_time"] = json_time_less
  1956. _extract_less["nlp_enterprise"] = nlp_enterprise_less
  1957. _extract_less["package"] = package_less
  1958. _extract_greater = {}
  1959. if extract_json_greater is not None:
  1960. _extract_greater = json.loads(extract_json_greater)
  1961. _extract_greater["docid"] = docid_greater
  1962. _extract_greater["win_tenderer"] = win_tenderer_greater
  1963. _extract_greater["win_bid_price"] = win_bid_price_greater
  1964. _extract_greater["bidding_budget"] = bidding_budget_greater
  1965. _extract_greater["product"] = product_greater
  1966. _extract_greater["page_attachments"] = page_attachments_greater
  1967. _extract_greater["page_time"] = page_time_greater
  1968. _extract_greater["fingerprint"] = fingerprint_greater
  1969. _extract_greater["project_codes"] = project_codes_greater
  1970. _extract_greater["tenderee"] = tenderee_greater
  1971. _extract_greater["agency"] = agency_greater
  1972. _extract_greater["docchannel"] = docchannel_greater
  1973. _extract_greater["project_name"] = project_name_greater
  1974. _extract_greater["doctitle_refine"] = doctitle_refine_greater
  1975. _extract_greater["province"] = province_greater
  1976. _extract_greater["city"] = city_greater
  1977. _extract_greater["district"] = district_greater
  1978. _extract_greater["web_source_no"] = web_source_no_greater
  1979. _extract_greater["extract_count"] = extract_count_greater
  1980. _extract_greater["json_time"] = json_time_greater
  1981. _extract_greater["nlp_enterprise"] = nlp_enterprise_greater
  1982. _extract_greater["package"] = package_greater
  1983. moneys_less = set(_extract_less.get("moneys",[]))
  1984. moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
  1985. moneys_greater = set(_extract_greater.get("moneys",[]))
  1986. moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
  1987. if page_attachments_less is None:
  1988. page_attachments_less = '[]'
  1989. if page_attachments_greater is None:
  1990. page_attachments_greater = '[]'
  1991. punish_less = _extract_less.get("punish",{})
  1992. punish_greater = _extract_greater.get("punish",{})
  1993. approval_less = _extract_less.get("approval",[])
  1994. approval_greater = _extract_greater.get("approval",[])
  1995. _prob = check_dumplicate_rule(_extract_less,_extract_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater,punish_less = punish_less,punish_greater = punish_greater,approval_less = approval_less,approval_greater = approval_greater)
  1996. self.forward(_prob)
  1997. @annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
  1998. class f_dumplicate_featureMatrix(BaseUDTF):
  1999. def __init__(self):
  2000. import logging
  2001. import json
  2002. global logging,json
  2003. def process(self,json_context,docchannel_less,docchannel_greater,page_time_less,page_time_greater,nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,
  2004. agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
  2005. win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
  2006. bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater,product_less,product_greater):
  2007. #check the page_time by special docchannel
  2008. if docchannel_less in (51,102,103,104,115,116,117):
  2009. if doctitle_refine_less!=doctitle_refine_greater:
  2010. if page_time_less!=page_time_greater:
  2011. self.forward("[1-%s]"%(str(docchannel_less)),0)
  2012. return
  2013. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
  2014. self.forward("[2-%s]"%(str(doctitle_refine_less)+"=="+str(doctitle_refine_greater)),0)
  2015. return
  2016. # if not check_codes([project_code_less],[project_code_greater]):
  2017. # self.forward("[3-%s]"%(str(project_code_less)+"=="+str(project_code_greater)),0)
  2018. # return
  2019. if not check_demand():
  2020. self.forward("[4-]",0)
  2021. return
  2022. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  2023. tenderee_less,tenderee_greater,
  2024. agency_less,agency_greater,
  2025. win_tenderer_less,win_tenderer_greater):
  2026. _error = ""
  2027. for a in [nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater]:
  2028. _error += str(a)
  2029. self.forward("[5-%s]"%_error,0)
  2030. return
  2031. if not check_money(bidding_budget_less,bidding_budget_greater,
  2032. win_bid_price_less,win_bid_price_greater):
  2033. _error = ""
  2034. for a in [bidding_budget_less,bidding_budget_greater,
  2035. win_bid_price_less,win_bid_price_greater]:
  2036. _error += str(a)
  2037. self.forward("[6-%s]"%_error,0)
  2038. return
  2039. if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
  2040. _error = "%s=%s"%(str(product_less),str(product_greater))
  2041. self.forward("7-%s"%_error,0)
  2042. return
  2043. _context = json.loads(json_context)
  2044. min_counts = 100
  2045. dict_context = {}
  2046. for item in _context:
  2047. if item["counts"]<min_counts:
  2048. min_counts = item["counts"]
  2049. dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
  2050. context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
  2051. list_matrix = []
  2052. #get the featurn of the context into matrix
  2053. # for index_i in range(len(context_key)):
  2054. # for index_j in range(index_i+1,len(context_key)):
  2055. # _key = "%s&%s"%(context_key[index_i],context_key[index_j])
  2056. # _v = featurnCount(dict_context.get(_key,[0,0])[1])
  2057. # list_matrix.append(_v)
  2058. # context3_key = ["tenderee","agency","win_tenderer","win_bid_price","bidding_budget"]
  2059. # for index_i in range(len(context3_key)):
  2060. # for index_j in range(index_i+1,len(context3_key)):
  2061. # for index_k in range(index_j+1,len(context3_key)):
  2062. # _key = "%s&%s&%s"%(context3_key[index_i],context3_key[index_j],context3_key[index_k])
  2063. # _v = featurnCount(dict_context.get(_key,[0,0])[1])
  2064. # list_matrix.append(_v)
  2065. # list_matrix.append(getSimLevel(tenderee_less,tenderee_greater)/10)
  2066. # list_matrix.append(getSimLevel(agency_less,agency_greater)/10)
  2067. # list_matrix.append(getSimilarityOfString(project_code_less,project_code_greater))
  2068. # list_matrix.append(getSimilarityOfString(project_name_less,project_name_greater))
  2069. # list_matrix.append(getSimLevel(win_tenderer_less,win_tenderer_greater)/10)
  2070. # list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
  2071. # list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
  2072. # list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
  2073. json_matrix = json.dumps(list_matrix)
  2074. same_count = 0
  2075. all_count = 8
  2076. if getSimilarityOfString(project_code_less,project_code_greater)==1:
  2077. same_count += 1
  2078. if getSimilarityOfString(tenderee_less,tenderee_greater)==1:
  2079. same_count += 1
  2080. if getSimilarityOfString(agency_less,agency_greater)==1:
  2081. same_count += 1
  2082. if getSimilarityOfString(win_tenderer_less,win_tenderer_greater)==1:
  2083. same_count += 1
  2084. if getSimilarityOfString(bidding_budget_less,bidding_budget_greater)==1:
  2085. same_count += 1
  2086. if getSimilarityOfString(win_bid_price_less,win_bid_price_greater)==1:
  2087. same_count += 1
  2088. if getSimilarityOfString(project_name_less,project_name_greater)==1:
  2089. same_count += 1
  2090. if getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater)==1:
  2091. same_count += 1
  2092. base_prob = 0
  2093. if min_counts<3:
  2094. base_prob = 0.9
  2095. elif min_counts<5:
  2096. base_prob = 0.8
  2097. elif min_counts<8:
  2098. base_prob = 0.7
  2099. else:
  2100. base_prob = 0.6
  2101. _prob = base_prob*same_count/all_count
  2102. json_matrix = "[==%s]"%(str(base_prob)+"="+str(same_count)+"="+str(all_count)+str(product_less)+str(product_greater))
  2103. self.forward(json_matrix,_prob)
  2104. return
@annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double,string,string,string,string,string,string->string')
class f_redump_probability_final_check(BaseUDAF):
    '''
    Re-check after dedup merge. When the group has more than 5 members,
    doctitle/tenderee/win_tenderer/bidding_budget must each take a single
    value within the group; with 5 or fewer members, tenderee/win_tenderer/
    bidding_budget must each take a single value.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def new_buffer(self):
        # single aggregation buffer: a list of per-document dicts
        return [list()]

    def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_codes,project_name,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence,
                province,city,district,web_source_no,extract_json,page_attachments):
        # collect one row per document; `newly` is accepted but not stored
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
                          "project_codes":project_codes,"project_name":project_name,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
                          "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence,
                          "province":province,"city":city,"district":district,"web_source_no":web_source_no,"extract_json":extract_json,"page_attachments":page_attachments})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        # Greedily re-validate the group in descending confidence order:
        # each candidate must pass check_dumplicate_rule against every
        # document already accepted into final_group; the first failure
        # stops the whole scan (documents are confidence-sorted, so later
        # ones are assumed weaker).
        list_group = []
        the_group = buffer[0]
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        _index = 0
        final_group = []
        if len(the_group)>0:
            _index = 0
            while _index<len(the_group):
                document_greater = the_group[_index]
                docid_greater = document_greater["docid"]
                docchannel_greater = document_greater["docchannel"]
                page_time_greater = document_greater["page_time"]
                doctitle_refine_greater = document_greater["doctitle_refine"]
                project_codes_greater = document_greater["project_codes"]
                nlp_enterprise_greater = document_greater["nlp_enterprise"]
                tenderee_greater = document_greater["tenderee"]
                agency_greater = document_greater["agency"]
                win_tenderer_greater = document_greater["win_tenderer"]
                bidding_budget_greater = document_greater["bidding_budget"]
                win_bid_price_greater = document_greater["win_bid_price"]
                product_greater = document_greater["product"]
                package_greater = document_greater["package"]
                json_time_greater = document_greater["json_dicttime"]
                fingerprint_greater = document_greater.get("fingerprint","")
                project_name_greater = document_greater["project_name"]
                extract_count_greater = document_greater["extract_count"]
                province_greater = document_greater["province"]
                city_greater = document_greater["city"]
                district_greater = document_greater["district"]
                web_source_no_greater = document_greater["web_source_no"]
                extract_json_greater = document_greater["extract_json"]
                page_attachments_greater = document_greater["page_attachments"]
                _pass = True
                for document_less in final_group:
                    docid_less = document_less["docid"]
                    docchannel_less = document_less["docchannel"]
                    page_time_less = document_less["page_time"]
                    doctitle_refine_less = document_less["doctitle_refine"]
                    project_codes_less = document_less["project_codes"]
                    nlp_enterprise_less = document_less["nlp_enterprise"]
                    tenderee_less = document_less["tenderee"]
                    agency_less = document_less["agency"]
                    win_tenderer_less = document_less["win_tenderer"]
                    bidding_budget_less = document_less["bidding_budget"]
                    win_bid_price_less = document_less["win_bid_price"]
                    product_less = document_less["product"]
                    package_less = document_less["package"]
                    json_time_less = document_less["json_dicttime"]
                    fingerprint_less = document_less.get("fingerprint","")
                    project_name_less = document_less["project_name"]
                    extract_count_less = document_less["extract_count"]
                    province_less = document_less["province"]
                    city_less = document_less["city"]
                    district_less = document_less["district"]
                    web_source_no_less = document_less["web_source_no"]
                    extract_json_less = document_less["extract_json"]
                    page_attachments_less = document_less["page_attachments"]
                    _extract_less = {}
                    if extract_json_less is not None:
                        _extract_less = json.loads(extract_json_less)
                    _extract_greater = {}
                    if extract_json_greater is not None:
                        _extract_greater = json.loads(extract_json_greater)
                    moneys_less = set(_extract_less.get("moneys",[]))
                    moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
                    moneys_greater = set(_extract_greater.get("moneys",[]))
                    moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
                    if page_attachments_less is None:
                        page_attachments_less = '[]'
                    if page_attachments_greater is None:
                        page_attachments_greater = '[]'
                    punish_less = _extract_less.get("punish",{})
                    punish_greater = _extract_greater.get("punish",{})
                    approval_less = _extract_less.get("approval",[])
                    approval_greater = _extract_greater.get("approval",[])
                    # NOTE(review): only the parsed extract dicts and the group
                    # size are passed here; the moneys/punish/approval locals
                    # above are computed but not forwarded — confirm intended.
                    _prob = check_dumplicate_rule(_extract_less,_extract_greater,len(the_group),b_log=False)
                    if _prob<0.1:
                        _pass = False
                        break
                if _pass:
                    final_group.append(document_greater)
                else:
                    # stop at the first rejected candidate
                    break
                _index += 1
        # emit the accepted docids as a comma-separated string
        # (extract_count desc, then docid asc via stable sort)
        dumplicates = ""
        if _index>1:
            logging.info("index/whole:%d/%d"%(_index,len(the_group)))
            final_group.sort(key=lambda x:x["docid"])
            final_group.sort(key=lambda x:x["extract_count"],reverse=True)
            _set = set()
            for _d in final_group:
                _docid = _d["docid"]
                if _docid in _set:
                    continue
                dumplicates += "%d,"%_docid
                _set.add(_docid)
            dumplicates = dumplicates[:-1]
        return dumplicates
@annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double->string')
class f_redump_probability_final_check_bak(BaseUDAF):
    '''
    Re-check after dedup merge. When the group has more than 5 members,
    doctitle/tenderee/win_tenderer/bidding_budget must each take a single
    value within the group; with 5 or fewer members, tenderee/win_tenderer/
    bidding_budget must each take a single value.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def new_buffer(self):
        # single aggregation buffer: a list of per-document dicts
        return [list()]

    def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_code,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence):
        # collect one row per document; `newly` is accepted but not stored
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
                          "project_code":project_code,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
                          "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        # Scan the group in descending confidence order; document _index must
        # pass the full check battery against every earlier document.
        # check_result values: 0 = failed, 1 = neutral (missing data),
        # 2 = strong positive evidence for that key.
        list_group = []
        the_group = buffer[0]
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        _index = 0
        if len(the_group)>0:
            _index = 1
            while _index<len(the_group):
                document_greater = the_group[_index]
                docchannel_greater = document_greater["docchannel"]
                page_time_greater = document_greater["page_time"]
                doctitle_refine_greater = document_greater["doctitle_refine"]
                project_code_greater = document_greater["project_code"]
                nlp_enterprise_greater = document_greater["nlp_enterprise"]
                tenderee_greater = document_greater["tenderee"]
                agency_greater = document_greater["agency"]
                win_tenderer_greater = document_greater["win_tenderer"]
                bidding_budget_greater = document_greater["bidding_budget"]
                win_bid_price_greater = document_greater["win_bid_price"]
                product_greater = document_greater["product"]
                package_greater = document_greater["package"]
                json_time_greater = document_greater["json_dicttime"]
                _less_index = 0
                while _less_index<_index:
                    document_less = the_group[_less_index]
                    docchannel_less = document_less["docchannel"]
                    page_time_less = document_less["page_time"]
                    doctitle_refine_less = document_less["doctitle_refine"]
                    project_code_less = document_less["project_code"]
                    nlp_enterprise_less = document_less["nlp_enterprise"]
                    tenderee_less = document_less["tenderee"]
                    agency_less = document_less["agency"]
                    win_tenderer_less = document_less["win_tenderer"]
                    bidding_budget_less = document_less["bidding_budget"]
                    win_bid_price_less = document_less["win_bid_price"]
                    product_less = document_less["product"]
                    package_less = document_less["package"]
                    json_time_less = document_less["json_dicttime"]
                    check_result = {"pass":1}
                    # special docchannels: different titles on different days fail
                    if docchannel_less in (51,102,103,104,115,116,117):
                        if doctitle_refine_less!=doctitle_refine_greater:
                            if page_time_less!=page_time_greater:
                                check_result["docchannel"] = 0
                                check_result["pass"] = 0
                        else:
                            check_result["docchannel"] = 2
                    if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
                        check_result["doctitle"] = 0
                        check_result["pass"] = 0
                        logging.info("check_doctitle_failed:%s==%s"%(str(doctitle_refine_less),str(doctitle_refine_greater)))
                    else:
                        check_result["doctitle"] = 2
                    #added check
                    if not check_codes([project_code_less],[project_code_greater]):
                        check_result["code"] = 0
                        check_result["pass"] = 0
                        logging.info("check_code_failed:%s==%s"%(str(project_code_less),str(project_code_greater)))
                    else:
                        if getLength(project_code_less)>0 and getLength(project_code_greater)>0 and project_code_less==project_code_greater:
                            check_result["code"] = 2
                        else:
                            check_result["code"] = 1
                    if not check_product(product_less,product_greater):
                        check_result["product"] = 0
                        check_result["pass"] = 0
                        logging.info("check_product_failed:%s==%s"%(str(product_less),str(product_greater)))
                    else:
                        if getLength(product_less)>0 and getLength(product_greater)>0:
                            check_result["product"] = 2
                        else:
                            check_result["product"] = 1
                    if not check_demand():
                        check_result["pass"] = 0
                    if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                                        tenderee_less,tenderee_greater,
                                        agency_less,agency_greater,
                                        win_tenderer_less,win_tenderer_greater):
                        check_result["entity"] = 0
                        check_result["pass"] = 0
                        logging.info("check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
                    else:
                        # tenderee matters for bidding channels, win_tenderer for result channels
                        if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
                            check_result["entity"] = 2
                        elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
                            check_result["entity"] = 2
                        else:
                            check_result["entity"] = 1
                    if not check_money(bidding_budget_less,bidding_budget_greater,
                                       win_bid_price_less,win_bid_price_greater):
                        logging.info("check_money_failed:%s==%s==%s==%s"%(str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
                        check_result["money"] = 0
                        check_result["pass"] = 0
                    else:
                        if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
                            check_result["money"] = 2
                        elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
                            check_result["money"] = 2
                        else:
                            check_result["money"] = 1
                    #added check
                    if not check_package(package_less,package_greater):
                        logging.info("check_package_failed:%s==%s"%(str(package_less),str(package_greater)))
                        check_result["package"] = 0
                        check_result["pass"] = 0
                    else:
                        if getLength(package_less)>0 and getLength(package_greater)>0:
                            check_result["package"] = 2
                        else:
                            check_result["package"] = 1
                    #added check
                    if not check_time(json_time_less,json_time_greater):
                        logging.info("check_time_failed:%s==%s"%(str(json_time_less),str(json_time_greater)))
                        check_result["time"] = 0
                        check_result["pass"] = 0
                    else:
                        if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
                            check_result["time"] = 2
                        else:
                            check_result["time"] = 1
                    # a failed pass can still be forgiven when the four strong
                    # keys (entity/code/doctitle/product) all score 2 and
                    # neither time nor money failed outright
                    if check_result.get("pass",0)==0:
                        logging.info(str(check_result))
                        if check_result.get("time",1)==0:
                            break
                        if check_result.get("money",1)==0:
                            break
                        if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2:
                            pass
                        else:
                            break
                    _less_index += 1
                # inner loop broke early -> document _index rejected; stop scanning
                if _less_index!=_index:
                    break
                _index += 1
        # emit the accepted prefix as a comma-separated docid string
        dumplicates = ""
        if _index>1:
            logging.info("index/whole:%d/%d"%(_index,len(the_group)))
            final_group = the_group[:_index]
            final_group.sort(key=lambda x:x["docid"])
            final_group.sort(key=lambda x:x["extract_count"],reverse=True)
            _set = set()
            for _d in final_group:
                _docid = _d["docid"]
                if _docid in _set:
                    continue
                dumplicates += "%d,"%_docid
                _set.add(_docid)
            dumplicates = dumplicates[:-1]
        return dumplicates
@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,string->string')
class f_set_docid_binaryChart(BaseUDAF):
    '''
    Pairs "empty" documents (no project_code/budget/win info) with a matching
    non-empty document from a different web source within the same time window.
    Original note: project_code, win_tenderer, len(project_code)>7,
    win_tenderer not empty.
    '''
    def __init__(self):
        import json
        global json

    def new_buffer(self):
        return [[]]

    def iterate(self, buffer,docid, page_time_stamp,extract_count,project_code,project_name,tenderee,bidding_budget,win_tenderer,win_bid_price,agency,web_source_no):
        # collect one row per document
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,
                          "bidding_budget":bidding_budget,"win_tenderer":win_tenderer,"win_bid_price":win_bid_price,
                          "agency":agency,"web_source_no":web_source_no})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        # Split docs into 7-day windows, then within each window partition
        # into "empty" (all key fields missing) and "non-empty" docs, and try
        # to attach each empty doc to at most one non-empty doc.
        list_docs = buffer[0]
        list_timeGroups = split_with_time(list_docs,"page_time_stamp",86400*7)
        list_group = []
        empty_key = ["project_code","bidding_budget","win_tenderer","win_bid_price","agency"]
        for _timeGroups in list_timeGroups:
            list_empty = []
            list_notEmpty = []
            for _item in _timeGroups:
                empty_flag = True
                for _key in empty_key:
                    if not isEmpty(_item[_key]):
                        empty_flag = False
                        break
                if empty_flag:
                    list_empty.append(_item)
                else:
                    list_notEmpty.append(_item)
            for _e in list_empty:
                _group = [{"docid":_e["docid"],"extract_count":_e["extract_count"]}]
                _e_tenderee = _e["tenderee"]
                for _ne in list_notEmpty:
                    # lazily track which web sources each non-empty doc has
                    # already absorbed (mutates _ne across _e iterations)
                    if "set_webSource" not in _ne:
                        _ne["set_webSource"] = set()
                        _ne["set_webSource"].add(_ne["web_source_no"])
                    # tenderee must match, or the empty doc has no tenderee at all
                    _suit = False
                    if not isEmpty(_e_tenderee) and _e_tenderee==_ne["tenderee"]:
                        _suit = True
                    elif isEmpty(_e_tenderee):
                        _suit = True
                    if _suit:
                        # pair only across distinct web sources; first match wins
                        if _e["web_source_no"] not in _ne["set_webSource"]:
                            _ne["set_webSource"].add(_e["web_source_no"])
                            _group.append({"docid":_ne["docid"],"extract_count":_ne["extract_count"]})
                            break
                if len(_group)>1:
                    list_group.append(_group)
        return json.dumps(list_group)
  2447. def split_with_time(list_dict,sort_key,timedelta=86400*7):
  2448. if len(list_dict)>0:
  2449. if sort_key in list_dict[0]:
  2450. list_dict.sort(key=lambda x:x[sort_key])
  2451. list_group = []
  2452. _begin = 0
  2453. for i in range(len(list_dict)-1):
  2454. if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<=timedelta:
  2455. continue
  2456. else:
  2457. _group = []
  2458. for j in range(_begin,i+1):
  2459. _group.append(list_dict[j])
  2460. if len(_group)>1:
  2461. list_group.append(_group)
  2462. _begin = i + 1
  2463. if len(list_dict)>1:
  2464. _group = []
  2465. for j in range(_begin,len(list_dict)):
  2466. _group.append(list_dict[j])
  2467. if len(_group)>1:
  2468. list_group.append(_group)
  2469. return list_group
  2470. return [list_dict]
@annotate('bigint,bigint,bigint,string,string,string,string,string->string')
class f_set_docid_limitNum_contain(BaseUDAF):
    '''
    Dedup rule: within a 7-day window all four set_limit columns must each hold
    a single distinct value, and every non-empty contain_column text must nest
    inside the longest one seen so far.
    (original note: project code, winning bidder, len(project code)>7, winning
    bidder <> "", merged non-empty tenderee count < 2, merged same-channel
    non-empty amounts equal)
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer,docid,page_time_stamp,extract_count,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,"set_limit_column1":set_limit_column1,
                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
                          "contain_column":contain_column})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        list_split = split_with_time(buffer[0],"page_time_stamp")
        list_group = []
        for _split in list_split:
            flag = True
            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
            for _key in keys:
                logging.info(_key+str(getSet(_split,_key)))
                # more than one distinct value in any limited column disqualifies the window
                if len(getSet(_split,_key))>1:
                    flag = False
                    break
            MAX_CONTAIN_COLUMN = None
            # check that every announcement's contain_column nests within the longest one
            if flag:
                for _d in _split:
                    contain_column = _d["contain_column"]
                    if contain_column is not None and contain_column !="":
                        if MAX_CONTAIN_COLUMN is None:
                            MAX_CONTAIN_COLUMN = contain_column
                        else:
                            if len(MAX_CONTAIN_COLUMN)<len(contain_column):
                                # longer text must contain the current maximum, then becomes it
                                if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
                                    flag = False
                                    break
                                MAX_CONTAIN_COLUMN = contain_column
                            else:
                                # shorter text must be contained by the current maximum
                                if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
                                    flag = False
                                    break
            if flag:
                if len(_split)>1:
                    _group = []
                    for _item in _split:
                        _group.append({"docid":_item["docid"],"extract_count":_item["extract_count"]})
                    list_group.append(_group)
        return json.dumps(list_group)
  2525. @annotate('bigint->string')
  2526. class f_stamp_squence(BaseUDAF):
  2527. '''
  2528. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2529. '''
  2530. def __init__(self):
  2531. import json
  2532. global json
  2533. import logging
  2534. global logging
  2535. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2536. def new_buffer(self):
  2537. return [set()]
  2538. def iterate(self, buffer,page_time_stamp):
  2539. buffer[0].add(page_time_stamp)
  2540. def merge(self, buffer, pbuffer):
  2541. buffer[0] |= pbuffer[0]
  2542. def terminate(self, buffer):
  2543. if 0 in buffer[0]:
  2544. buffer[0].remove(0)
  2545. list_stamp = list(buffer[0])
  2546. list_stamp.sort(key=lambda x:x)
  2547. list_stamp_final = []
  2548. _begin = 0
  2549. _time_decase = 86400*7
  2550. logging.info(str(list_stamp))
  2551. for _index in range(len(list_stamp)-1):
  2552. if list_stamp[_index+1]-list_stamp[_index]<_time_decase:
  2553. continue
  2554. else:
  2555. list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[_index]+_time_decase])
  2556. _begin = _index+1
  2557. if len(list_stamp)>0:
  2558. list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[-1]+_time_decase])
  2559. return json.dumps(list_stamp_final)
  2560. @annotate("bigint,string->bigint")
  2561. class in_stamp(object):
  2562. def __init__(self):
  2563. import logging
  2564. import re
  2565. import json
  2566. global logging,re,json
  2567. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2568. def evaluate(self, page_time_stamp,json_stamp):
  2569. list_stamp = json.loads(json_stamp)
  2570. int_flag = 0
  2571. for item in list_stamp:
  2572. if page_time_stamp <item[0]:
  2573. break
  2574. if page_time_stamp>item[0] and page_time_stamp<item[1]:
  2575. int_flag = 1
  2576. break
  2577. return int_flag
  2578. def getConfidence(rule_id):
  2579. if rule_id ==0:
  2580. return 30
  2581. elif rule_id >=1 and rule_id <30:
  2582. return 20
  2583. else:
  2584. return 10
  2585. @annotate('string,string -> string')
  2586. class f_splitStr(BaseUDTF):
  2587. '''
  2588. 将多个组拆解成多条记录
  2589. '''
  2590. def __init__(self):
  2591. import logging
  2592. import json
  2593. global json,logging
  2594. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2595. def process(self, str_split,_split):
  2596. try:
  2597. for _s in str_split.split(_split):
  2598. self.forward(_s)
  2599. except Exception as e:
  2600. pass
  2601. @annotate('string,bigint -> bigint,bigint,bigint,bigint,bigint')
  2602. class f_split_group_single(BaseUDTF):
  2603. '''
  2604. 将多个组拆解成多条记录
  2605. '''
  2606. def __init__(self):
  2607. import logging
  2608. import json
  2609. global json,logging
  2610. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2611. def process(self, json_set_docid,rule_id):
  2612. list_group = json.loads(json_set_docid)
  2613. for item in list_group:
  2614. if len(item)>100:
  2615. item.sort(key=lambda x:x["docid"],reverse=True)
  2616. index_i = 0
  2617. for index_j in range(1,len(item)):
  2618. if item[index_i]["docid"]!=item[index_j]["docid"]:
  2619. self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
  2620. else:
  2621. for index_i in range(len(item)):
  2622. for index_j in range(len(item)):
  2623. if index_i!=index_j and item[index_i]["docid"]!=item[index_j]["docid"]:
  2624. self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
  2625. @annotate('bigint,string->string')
  2626. class group_document(BaseUDAF):
  2627. '''
  2628. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2629. '''
  2630. def __init__(self):
  2631. import json
  2632. global json
  2633. def new_buffer(self):
  2634. return [[]]
  2635. def iterate(self, buffer,id,json_set_docid):
  2636. buffer[0].append({"id":id,"json_set_docid":json.loads(json_set_docid)})
  2637. def merge(self, buffer, pbuffer):
  2638. buffer[0].extend(pbuffer[0])
  2639. def terminate(self, buffer):
  2640. return json.dumps(buffer[0])
@annotate('bigint,string,bigint,string -> bigint,bigint,string')
class decare_document(BaseUDTF):
    '''
    Cartesian self-join of document-group rows: for every pair of dedup groups
    that share at least one docid, forward the union of their docid lists.
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def process(self,group_id1, json_list_doc1,group_id2,json_list_doc2):
        # only one triangle of the join (y=x), cutting nearly half of the pairs
        if group_id1>=group_id2:
            list_doc1 = json.loads(json_list_doc1)
            list_doc2 = json.loads(json_list_doc2)
            for _doc1 in list_doc1:
                for _doc2 in list_doc2:
                    # never compare a dedup group with itself
                    if _doc1["id"]!=_doc2["id"]:
                        # do the two groups share any docid?
                        _set1 = set()
                        for _item1 in _doc1["json_set_docid"]:
                            _set1.add(_item1["docid"])
                        _set2 = set()
                        for _item2 in _doc2["json_set_docid"]:
                            _set2.add(_item2["docid"])
                        if len(_set1&_set2)>0:
                            # NOTE(review): new_json_set_docid aliases _doc1["json_set_docid"],
                            # so appends accumulate into _doc1 across successive _doc2 matches —
                            # confirm this growing union is intended
                            new_json_set_docid = _doc1["json_set_docid"]
                            for _item2 in _doc2["json_set_docid"]:
                                if _item2["docid"] not in _set1:
                                    new_json_set_docid.append(_item2)
                            self.forward(_doc1["id"],_doc2["id"],json.dumps(new_json_set_docid))
  2673. def getBestDocid(list_pair):
  2674. # [docid1,extract_count1,docid2,extract_count2]
  2675. # list_pair.sort(key=lambda x:x[3],reverse=True)
  2676. # _max_count = max(list_pair[0][3],list_pair[0][1])
  2677. # set_candidate = set()
  2678. # if list_pair[0][1]==_max_count:
  2679. # set_candidate.add(list_pair[0][0])
  2680. # for item in list_pair:
  2681. # if item[3]==_max_count:
  2682. # set_candidate.add(item[2])
  2683. # else:
  2684. # break
  2685. # list_candidate = list(set_candidate)
  2686. # list_candidate.sort(key=lambda x:x)
  2687. new_pair = []
  2688. new_pair.append([list_pair[0][0],list_pair[0][0],list_pair[0][1]])
  2689. for item in list_pair:
  2690. new_pair.append([item[0],item[2],item[3]])
  2691. new_pair.sort(key=lambda x:x[1])
  2692. new_pair.sort(key=lambda x:x[2],reverse=True)
  2693. return new_pair[0][1]
  2694. @annotate('bigint,bigint,bigint,bigint->string')
  2695. class choose_document(BaseUDAF):
  2696. '''
  2697. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2698. '''
  2699. def __init__(self):
  2700. import json
  2701. global json
  2702. def new_buffer(self):
  2703. return [[]]
  2704. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  2705. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  2706. def merge(self, buffer, pbuffer):
  2707. buffer[0].extend(pbuffer[0])
  2708. def terminate(self, buffer):
  2709. list_pair = buffer[0]
  2710. _set = set()
  2711. for item in buffer[0]:
  2712. _set.add(str(item[2]))
  2713. list_dumplicate = list(_set)
  2714. best_docid = getBestDocid(list_pair)
  2715. if best_docid==list_pair[0][0]:
  2716. save_flag = 1
  2717. else:
  2718. save_flag = 0
  2719. return json.dumps({"save_flag":save_flag,"dumplicates":list_dumplicate})
  2720. @annotate('string -> bigint,string')
  2721. class f_get_choose_document(BaseUDTF):
  2722. '''
  2723. 将多个组拆解成多条记录
  2724. '''
  2725. def __init__(self):
  2726. import logging
  2727. import json
  2728. global json,logging
  2729. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2730. def process(self,json_choose):
  2731. if json_choose is None:
  2732. self.forward(1,None)
  2733. else:
  2734. _choose = json.loads(json_choose)
  2735. self.forward(_choose["save_flag"],",".join(_choose["dumplicates"]))
  2736. @annotate('string->bigint')
  2737. class f_get_codes_count(object):
  2738. def evaluate(self,extract_json):
  2739. if extract_json is None or extract_json=="":
  2740. extract_json = "{}"
  2741. _extract = json.loads(extract_json)
  2742. _codes = _extract.get("code",[])
  2743. return len(_codes)
  2744. @annotate('string->string')
  2745. class f_get_codes(object):
  2746. def evaluate(self,extract_json):
  2747. if extract_json is None or extract_json=="":
  2748. extract_json = "{}"
  2749. _extract = json.loads(extract_json)
  2750. _codes = _extract.get("code",[])
  2751. return ",".join(_codes)
  2752. @annotate('bigint,bigint,bigint,bigint->string')
  2753. class group_document_bestFirst(BaseUDAF):
  2754. '''
  2755. 将组里面最优的放在前面
  2756. '''
  2757. def __init__(self):
  2758. import json
  2759. global json
  2760. def new_buffer(self):
  2761. return [[]]
  2762. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  2763. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  2764. def merge(self, buffer, pbuffer):
  2765. buffer[0].extend(pbuffer[0])
  2766. def terminate(self, buffer):
  2767. list_pair = buffer[0]
  2768. _set = set()
  2769. for item in buffer[0]:
  2770. _set.add(item[2])
  2771. _set.add(list_pair[0][0])
  2772. best_docid = getBestDocid(list_pair)
  2773. _set.remove(best_docid)
  2774. list_dumplicate = list(_set)
  2775. list_dumplicate.sort(key=lambda x:x)
  2776. list_dumplicate.insert(0,best_docid)
  2777. list_dumplicate_str = []
  2778. for item in list_dumplicate:
  2779. list_dumplicate_str.append(str(item))
  2780. return ",".join(list_dumplicate_str)
  2781. @annotate('string -> bigint,string')
  2782. class f_get_best_dumplicates(BaseUDTF):
  2783. '''
  2784. 得到每个分组中最优的那一条及其重复记录
  2785. '''
  2786. def __init__(self):
  2787. import logging
  2788. import json
  2789. global json,logging
  2790. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2791. def process(self,list_dumplicate_str):
  2792. if list_dumplicate_str is None or list_dumplicate_str=='':
  2793. pass
  2794. else:
  2795. list_dumplicate = list_dumplicate_str.split(",")
  2796. if len(list_dumplicate)>0:
  2797. self.forward(int(list_dumplicate[0]),",".join(list_dumplicate[1:]))
  2798. else:
  2799. pass
  2800. @annotate('bigint,bigint->string')
  2801. class bridge2group(BaseUDAF):
  2802. '''
  2803. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2804. '''
  2805. def __init__(self):
  2806. import json
  2807. global json
  2808. def new_buffer(self):
  2809. return [set()]
  2810. def iterate(self, buffer,docid1,docid2):
  2811. buffer[0].add(docid1)
  2812. buffer[0].add(docid2)
  2813. def merge(self, buffer, pbuffer):
  2814. buffer[0] |= pbuffer[0]
  2815. def terminate(self, buffer):
  2816. list_pair = list(buffer[0])
  2817. list_pair.sort(key=lambda x:x,reverse=True)
  2818. return json.dumps(list_pair)
  2819. @annotate('string -> bigint,bigint')
  2820. class group2bridge(BaseUDTF):
  2821. '''
  2822. 将多个组拆解成多条记录
  2823. '''
  2824. def __init__(self):
  2825. import logging
  2826. import json
  2827. global json,logging
  2828. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2829. def process(self,json_list_docid):
  2830. list_docid = json.loads(json_list_docid)
  2831. for _docid in list_docid:
  2832. self.forward(list_docid[-1],_docid)
  2833. @annotate('string->string')
  2834. class to_url(object):
  2835. def evaluate(self,_s):
  2836. if _s is None or _s=="":
  2837. return
  2838. else:
  2839. list_l = []
  2840. for l in _s.split(","):
  2841. list_l.append("http://www.bidizhaobiao.com/info-%s.html"%l)
  2842. return ",".join(list_l)
  2843. @annotate('bigint,bigint,string -> bigint')
  2844. class f_get_dump_docid(BaseUDTF):
  2845. '''
  2846. 将多个组拆解成多条记录
  2847. '''
  2848. def __init__(self):
  2849. import logging
  2850. import json
  2851. global json,logging
  2852. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2853. def process(self,docid,save_flag,dumplicates):
  2854. if save_flag==0:
  2855. self.forward(docid)
  2856. if dumplicates is not None:
  2857. list_docid = dumplicates.split(",")
  2858. if len(list_docid)>0:
  2859. for _docid in list_docid[1:]:
  2860. self.forward(int(_docid))
  2861. else:
  2862. if dumplicates is not None:
  2863. list_docid = dumplicates.split(",")
  2864. if len(list_docid)>0:
  2865. for _docid in list_docid:
  2866. self.forward(int(_docid))
  2867. @annotate('string -> bigint,bigint')
  2868. class f_get_docid(BaseUDTF):
  2869. '''
  2870. 将多个组拆解成多条记录
  2871. '''
  2872. def __init__(self):
  2873. import logging
  2874. import json
  2875. global json,logging
  2876. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2877. def process(self,json_set_docid):
  2878. team_id = 0
  2879. if json_set_docid is not None:
  2880. list_docses = json.loads(json_set_docid)
  2881. for list_docs in list_docses:
  2882. team_id += 1
  2883. for item in list_docs:
  2884. self.forward(team_id,item["docid"])
  2885. @annotate("string->bigint")
  2886. class get_count_dump(object):
  2887. def __init__(self):
  2888. import logging
  2889. import re
  2890. global logging,re
  2891. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2892. def evaluate(self, title):
  2893. _count = 0
  2894. if title is not None:
  2895. _count = len(title.split(","))
  2896. return _count
  2897. def getSet(list_dict,key):
  2898. _set = set()
  2899. for item in list_dict:
  2900. if key in item:
  2901. if item[key]!='' and item[key] is not None:
  2902. if re.search("^\d[\d\.]*$",item[key]) is not None:
  2903. _set.add(str(float(item[key])))
  2904. else:
  2905. _set.add(str(item[key]))
  2906. return _set
def getDiffIndex(list_dict,key,confidence=100):
    '''
    Similarity-based divergence check (original note: optimized to a similarity
    comparison).  Return the index of the first element whose `key` value
    diverges from the values seen before it, or len(list_dict) when the whole
    list is consistent.  Rows with confidence >= `confidence` are skipped.

    Numeric-looking values are compared after bucketing amounts above 100000
    down to 10k granularity; string values are compared against the first
    string seen using getSimilarityOfString with a 0.8 threshold.

    :param list_dict: rows sorted by descending confidence (each has "confidence")
    :param key: field to compare
    :param confidence: rows at or above this confidence are exempt from the check
    :return: index of the first diverging row, or len(list_dict)
    '''
    _set = set()      # unused; retained from the original implementation
    _set_m = set()    # distinct (bucketed) numeric values seen so far
    base_s = ""       # first string value seen; later strings compared against it
    for _i in range(len(list_dict)):
        item = list_dict[_i]
        if item["confidence"]>=confidence:
            continue
        if key in item:
            if item[key]!='' and item[key] is not None:
                if re.search("^\d+(\.\d+)?$",item[key]) is not None:
                    _m = float(item[key])
                    if _m>100000:
                        # bucket large amounts to 10k granularity to tolerate rounding
                        _m = _m//10000*10000
                    _set_m.add(str(_m))
                else:
                    _s = str(item[key])
                    if base_s=="":
                        base_s = _s
                    else:
                        simi = getSimilarityOfString(base_s,_s)
                        if simi<0.8:
                            return _i
        # two distinct numeric values means the group diverged at this row
        if len(_set_m)>1:
            return _i
    return len(list_dict)
  2954. @annotate('bigint,string -> bigint,bigint')
  2955. class f_getGroup_dumpFinal(BaseUDTF):
  2956. '''
  2957. 从最后的结果中获取组
  2958. '''
  2959. def __init__(self):
  2960. import logging
  2961. import json
  2962. global json,logging
  2963. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2964. def process(self,docid,dumplicates):
  2965. self.forward(int(docid),int(docid))
  2966. if dumplicates is not None:
  2967. list_docids = dumplicates.split(",")
  2968. for _docid in list_docids:
  2969. self.forward(int(docid),int(_docid))
@annotate('bigint,bigint,string,string,string,string,bigint,bigint,bigint->string')
class f_redump_limit_num(BaseUDAF):
    '''
    Re-check a merged dedup group (original note: after dedup merge, when the
    group has more than 5 members doctitle/tenderee/win_tenderer/bidding_budget
    may each take only one value; with 5 or fewer members doctitle is not
    constrained).  Keeps only the consistent high-confidence prefix of the
    group and returns it as JSON pair records against the main docid.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer,main_docid,docid,doctitle,set_limit_column2,set_limit_column3,set_limit_column4,extract_count1,extract_count2,confidence):
        buffer[0].append({"main_docid":main_docid,"docid":docid,"doctitle":doctitle,"set_limit_column2":set_limit_column2,
                          "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,
                          "extract_count2":extract_count2,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        list_group = []
        the_group = buffer[0]
        # highest-confidence rows first so the prefix kept below is the most trusted one
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        if len(the_group)>5:
            keys = ["doctitle","set_limit_column2","set_limit_column3","set_limit_column4"]
        else:
            keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
        final_group = []
        # for each key find the first row index where its values diverge; the
        # group is cut at the smallest such index
        list_key_index = []
        for _k in keys:
            if _k=="doctitle":
                # doctitle only constrains rows whose confidence is below 30
                list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
            else:
                list_key_index.append(getDiffIndex(the_group,_k))
        _index = min(list_key_index)
        if _index>1:
            main_docid = the_group[0]["main_docid"]
            for item in the_group[:_index]:
                if item["docid"]!=main_docid:
                    final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"],"confidence":item["confidence"]})
        return json.dumps(final_group)
  3023. @annotate('string -> bigint,bigint,bigint,bigint,bigint')
  3024. class f_get_dumpFinal_checked(BaseUDTF):
  3025. '''
  3026. 从最后的结果中获取组
  3027. '''
  3028. def __init__(self):
  3029. import logging
  3030. import json
  3031. global json,logging
  3032. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  3033. def process(self,list_group):
  3034. if list_group is not None:
  3035. final_group = json.loads(list_group)
  3036. for _group in final_group:
  3037. self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"],_group["confidence"])
  3038. @annotate('string -> bigint')
  3039. class f_getDumplicateDocids(BaseUDTF):
  3040. '''
  3041. 从最后的结果中获取组
  3042. '''
  3043. def __init__(self):
  3044. import logging
  3045. import json
  3046. global json,logging
  3047. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  3048. def process(self,dumplicates):
  3049. list_docids = dumplicates.split(",")
  3050. for _d in list_docids:
  3051. self.forward(int(_d))
  3052. def jaccard_score(source,target):
  3053. source_set = set([s for s in source])
  3054. target_set = set([s for s in target])
  3055. if len(source_set)==0 or len(target_set)==0:
  3056. return 0
  3057. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  3058. def getSimilarityOfString(str1,str2):
  3059. _set1 = set()
  3060. _set2 = set()
  3061. if str1 is not None:
  3062. for i in range(1,len(str1)):
  3063. _set1.add(str1[i-1:i+1])
  3064. for i in range(2,len(str1)):
  3065. _set1.add(str1[i-2:i+1])
  3066. if str2 is not None:
  3067. for i in range(1,len(str2)):
  3068. _set2.add(str2[i-1:i+1])
  3069. for i in range(2,len(str2)):
  3070. _set2.add(str2[i-2:i+1])
  3071. _len = max(1,min(len(_set1),len(_set2)))
  3072. return len(_set1&_set2)/_len
  3073. @annotate("string,string,string,string,string,string,string,string,string,string->bigint")
  3074. class f_is_legal(object):
  3075. def __init__(self):
  3076. import logging
  3077. import re
  3078. global logging,re
  3079. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  3080. def evaluate(self, tenderee1,tenderee2,bidding_budget1,budding_budget2,win_tenderee1,win_tenderee2,win_bid_price1,win_bid_price2,project_code1,project_code2):
  3081. if tenderee1 is not None and tenderee1!="" and tenderee2 is not None and tenderee2!="" and tenderee1!=tenderee2:
  3082. return 0
  3083. if bidding_budget1 is not None and bidding_budget1!="" and budding_budget2 is not None and budding_budget2!="" and bidding_budget1!=budding_budget2:
  3084. return 0
  3085. if win_tenderee1 is not None and win_tenderee1!="" and win_tenderee2 is not None and win_tenderee2!="" and win_tenderee1!=win_tenderee2:
  3086. return 0
  3087. if win_bid_price1 is not None and win_bid_price1!="" and win_bid_price2 is not None and win_bid_price2!="" and win_bid_price1!=win_bid_price2:
  3088. return 0
  3089. _sim = getSimilarityOfString(project_code1,project_code2)
  3090. if _sim>0.7 and _sim<1:
  3091. return 0
  3092. return 1
@annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint->string')
class f_autorule_group(BaseUDAF):
    '''
    Rule mining over a merged dedup group (original note: after dedup merge,
    groups with more than 5 members constrain doctitle/tenderee/win_tenderer/
    bidding_budget to one value each; groups of 5 or fewer skip doctitle).
    For every pair of documents in the consistent prefix of the group, emits
    the "="-joined sorted list of fields on which the two documents agree,
    as JSON triples [rule, docid1, docid2].
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer,main_docid,docid,docchannel,doctitle,doctitle_refine,area,province,city,district,web_source_no,fingerprint,
                project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count1,extract_count2,confidence):
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"doctitle":doctitle,
                          "doctitle_refine":doctitle_refine,"area":area,"province":province,
                          "city":city,"district":district,"web_source_no":web_source_no,"fingerprint":fingerprint,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,"agency":agency,
                          "win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price,
                          "extract_count1":extract_count1,"extract_count2":extract_count2,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        # cap the buffer at 100 rows to bound memory and pairwise output size
        buffer[0].extend(pbuffer[0][:100])
        buffer[0] = buffer[0][:100]
    def getSameKeys(self,_dict1,_dict2):
        # fields shared by both documents (non-empty and equal); location/count
        # bookkeeping fields never participate in rule mining
        list_keys = []
        for k,v in _dict1.items():
            if k in ["area","city","confidence","district","extract_count1","extract_count2","main_docid","province"]:
                continue
            v2 = _dict2.get(k,"")
            if v is not None and v!="" and v2 is not None and v2!="" and v==v2:
                list_keys.append(k)
        list_keys.sort(key=lambda x:x)
        return "=".join(list_keys)
    def terminate(self, buffer):
        list_group = []
        the_group = buffer[0]
        # highest-confidence rows first so the consistent prefix is the most trusted
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        if len(the_group)>5:
            keys = ["doctitle","tenderee","win_tenderer","bidding_budget","win_bid_price"]
        else:
            keys = ["tenderee","win_tenderer","bidding_budget","win_bid_price"]
        # for each key find where values start to diverge; cut at the smallest index
        list_key_index = []
        for _k in keys:
            if _k=="doctitle":
                # doctitle only constrains rows whose confidence is below 30
                list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
            else:
                list_key_index.append(getDiffIndex(the_group,_k))
        final_group = []
        _index = min(list_key_index)
        if _index>1:
            for item in the_group[:_index]:
                final_group.append(item)
        # emit one rule triple per surviving document pair
        list_rules = []
        for i in range(len(final_group)):
            for j in range(i+1,len(final_group)):
                _dict1 = final_group[i]
                _dict2 = final_group[j]
                _rule = self.getSameKeys(_dict1,_dict2)
                list_rules.append([_rule,_dict1.get("docid"),_dict2.get("docid")])
        return json.dumps(list_rules)
  3155. @annotate('string -> string,bigint,bigint')
  3156. class f_autorule_group_extract(BaseUDTF):
  3157. '''
  3158. 从最后的结果中获取组
  3159. '''
  3160. def __init__(self):
  3161. import logging
  3162. import json
  3163. global json,logging
  3164. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  3165. def process(self,rules_json):
  3166. list_rules = json.loads(rules_json)
  3167. for _rule in list_rules:
  3168. self.forward(_rule[0],_rule[1],_rule[2])
  3169. if __name__ == '__main__':
  3170. # f = f_decode_for_dumplicate()
  3171. # b = f.process('[{}]','{ "attachmentTypes": "", "bidway": "", "candidate": "", "code": [], "cost_time": { "attrs": 0.0, "codename": 0.03, "deposit": 0.0, "district": 0.03, "moneygrade": 0.0, "nerToken": 0.06, "person": 0.0, "prem": 0.02, "preprocess": 0.1, "product": 0.04, "product_attrs": 0.01, "roleRuleFinal": 0.0, "rolegrade": 0.0, "rule": 0.0, "rule_channel": 0.05, "tableToText": 0.030002145767211913, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "district": { "area": "华东", "city": "厦门", "district": "未知", "is_in_text": false, "province": "福建" }, "docchannel": { "docchannel": "招标公告", "doctype": "采招数据", "life_docchannel": "招标公告" }, "docid": "", "doctitle_refine": "C70U264COM6项目所需直流屏", "exist_table": 1, "extract_count": 1, "fail_reason": "", "fingerprint": "md5=3da15e8c6f69a1d766bfe155092b1638", "industry": { "class": "零售批发", "class_name": "广播、电视、电影设备", "subclass": "通用设备" }, "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "C70U264COM6项目所需直流屏", "nlp_enterprise": [], "nlp_enterprise_attachment": [], "person_review": [], "prem": {}, "process_time": "2022-12-08 04:43:18", "product": [ "直流屏" ], "product_attrs": { "data": [ { "brand": "", "product": "直流屏65AH", "quantity": "1.0", "quantity_unit": "台", "specs": "带逆变,蓄电池采用原装进口免维护蓄电池(必须是原产地进口,注明电池进口产地)等,由供应商负责采购,使用寿命10年及以上", "unitPrice": "" } ], "header": [ "产品名称_产品数量____产品规格" ], "header_col": [ "产品名称_产品编号_产品规格_产品材质_产品数量_备注" ] }, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "", "time_getFileStart": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", 
"version_date": "2022-11-24" }','')
# print(b)
  3173. print(check_doctitle(doctitle_refind_less="山西银行晋城分行对A公司清算处置审计服务项目供应商征集公告",doctitle_refind_greater="山西银行晋城分行对B公司清算处置审计服务项目供应商征集公告"))
# f = f_get_extractCount()
  3175. # j = '''{ "attachmentTypes": "", "bidway": "", "candidate": "湖南省金达工程建设有限公司", "code": [ "丰汇-YCYZ2022-001-1" ], "cost_time": { "attrs": 0.33, "codename": 0.14, "deposit": 0.0, "district": 0.02, "moneygrade": 0.0, "nerToken": 0.27, "person": 0.01, "prem": 0.06, "preprocess": 0.71, "product": 0.15, "product_attrs": 0.02, "roleRuleFinal": 0.0, "rolegrade": 0.0, "rule": 0.0, "rule_channel": 0.26, "tableToText": 0.11000882148742676, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "district": { "area": "华东", "city": "宜春", "district": "袁州", "is_in_text": false, "province": "江西" }, "docchannel": { "docchannel": "中标信息", "doctype": "采招数据", "life_docchannel": "中标信息" }, "docid": "", "doctitle_refine": "2022年宜春市袁州区县乡村道安全生命防护项目(二)(第二次)", "exist_table": 1, "extract_count": 6, "fail_reason": "", "fingerprint": "md5=23e9e56f2a6ec0c73e1838670e630948", "industry": { "class": "建筑业", "class_name": "其他土木工程建筑", "subclass": "土木工程建筑业" }, "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "2022年宜春市袁州区县乡村道安全生命防护工程采购项目", "nlp_enterprise": [ "湖南省金达工程建设有限公司", "丰汇国际项目管理有限公司" ], "nlp_enterprise_attachment": [], "person_review": [ "宋明勇", "刘定良", "张来弟", "许卫秀", "宋明勇", "刘定良", "张来弟", "许卫秀" ], "prem": { "Project": { "code": "", "roleList": [ { "address": "宜春市袁州区明月袁山中路356号", "linklist": [ [ "胡柯", "13766445188" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "丰汇国际项目管理有限公司", "serviceTime": "" }, { "address": "湖南省长沙市开福区中山路589号开福万达广场C区2号写字楼", "linklist": [ [ "刘华夏", "18570640155" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": "4351680.70", "money_unit": "元" }, "role_name": "win_tenderer", "role_text": "湖南省金达工程建设有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" } }, 
"process_time": "2023-02-28 02:04:42", "product": [ "安全生命防护工程" ], "product_attrs": { "data": [ { "brand": "详见开标一览表明细", "product": "2022年宜春市袁州区县乡村道安全生命防护工程采购项目", "quantity": "1", "quantity_unit": "", "specs": "详见开标一览表明细", "unitPrice": "4351680.7" } ], "header": [ "名称_数量__单价_品牌_规格型号" ], "header_col": [ "名称_品牌_规格型号_数量_单价" ] }, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "", "time_getFileStart": "", "time_listingEnd": "", "time_listingStart": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2023-02-28", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", "version_date": "2023-02-20" }'''
# print(f.evaluate(j))
# _str1 = "PMJJ-202211030004001"
# _str2 = "PMJJ-202211030001001"
# print(getSimilarityOfString(_str1,_str2))
# print(check_doctitle("强化桂城街道工地扬尘防控监管巡查第三方(二次)","广东省强化桂城街道工地扬尘防控监管巡查第三方(二次)"))
# print(check_codes(["F-2022-027(MASCG-2-F-F-2022-0462)"],["F-2022-027(MASCG-2-F-F-2022-0462)"]))
# print(check_product(None,None))
# print(check_code("4451020073383382206021325","4451020073383382206021322"))
# print(check_money("550.0","440.0","",""))
# for i in range(0,2):
#     print(i)
# location_pattern = re.compile(".{1,2}市|.{1,2}区|.{1,2}镇|.{1,2}县|.{1,2}村")
# print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇前路富代都村示范点农用塑料薄膜棚)"))
# print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇天湖村粮蔬基地农用塑料薄膜棚)"))
# package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))") # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
# _match = re.search(package_number_pattern,"2021年盘山县高标准农田建设项目三标段(高升街道)开标记录")
# if _match is not None:
#     print(_match.groupdict()["name"])
# print(re.findall("((标[段号的包])[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4})","[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ"))
# print(check_doctitle("[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ","桂林银行南宁办公大楼装修工程标段ⅡGXYLG20182005-N中标公告"))
# c = f_get_extractCount()
# _json = '''
  3198. # { "attachmentTypes": "", "bidway": "", "code": [ "LCQTCG-2022-313" ], "cost_time": { "attrs": 0.02, "codename": 0.16, "deposit": 0.0, "nerToken": 0.8400000000000001, "person": 0.01, "prem": 0.02, "preprocess": 0.96, "product": 0.12, "product_attrs": 0.01, "punish": 0.11, "roleRuleFinal": 0.0, "rule": 0.0, "rule_channel": 0.0, "tableToText": 0.09000381469726562, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "docchannel": { "docchannel": "招标公告", "doctype": "采招数据" }, "docid": "", "doctitle_refine": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设(一标段)膨胀剂(暂估价)项目", "exist_table": 1, "extract_count": 5, "fail_reason": "", "fingerprint": "md5=b1ab0ee9cf9e1c5acc17477b9c0433cc", "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设工程(一标段)膨胀剂(暂估价)采购项目", "nlp_enterprise": [ "中建八局第一建设有限公司", "山东东岳项目管理有限公司", "聊城市公共资源交易中心", "江苏国泰新点软件有限公司" ], "person_review": [], "prem": { "Project": { "code": "", "roleList": [ { "linklist": [ [ "", "15540110649" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "tenderee", "role_text": "中建八局第一建设有限公司", "serviceTime": "" }, { "linklist": [ [ "武工", "0635-2992305" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "山东东岳项目管理有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" }, "一": { "code": "", "roleList": [], "tendereeMoney": 3267000.0, "tendereeMoneyUnit": "万元" } }, "process_time": "2022-05-30 14:31:13", "product": [ "枢纽功能区建设工程", "膨胀剂", "配套基础设施建设" ], "product_attrs": { "data": [], "header": [], "header_col": [] }, "serviceTime": "", "success": true, "time_bidclose": "2022-06-16", "time_bidopen": "2022-06-16", "time_bidstart": "", "time_commencement": "", "time_completion": 
"", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "2022-06-01", "time_getFileStart": "2022-05-26", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2022-05-25", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "" }
# '''
# c = f_get_nlp_enterprise()
# print(c.evaluate("山东东岳项目管理有限公司",_json))
# print(c.evaluate(_json))
# c = f_set_docid()
# _s = '''
# 154064190 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
# 154064188 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
# 154064175 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
# 30201228 1512489600 4 04111-1 1 大连市妇女儿童医疗中心
# 154064160 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
# 154064168 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
# '''
# buffer = c.new_buffer()
# for _line in _s.split("\n"):
#     _line = _line.strip()
#     if _line=="":
#         continue
#     l_column = _line.split("\t")
#     print(l_column)
#     docid,page_time_stamp,extract_count,web_source_no,num,tenderee = l_column
#     page_time_stamp = int(page_time_stamp)
#     extract_count = int(extract_count)
#     num = 1
#     c.iterate(buffer,docid,page_time_stamp,extract_count,web_source_no,num,tenderee)
# print(c.terminate(buffer))