documentDumplicate.py 147 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258
  1. #coding:UTF8
  2. from odps.udf import annotate
  3. from odps.udf import BaseUDTF
  4. from odps.udf import BaseUDAF
  5. import re
@annotate('string,string -> string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string')
class f_decode_extract(BaseUDTF):
    # UDTF: decode one document's extraction JSON plus its metadata JSON into
    # the flat columns consumed by the downstream de-duplication tables.
    def __init__(self):
        import logging
        import json
        import time,re
        global json,logging,time,re
        # NOTE(review): non-raw string relies on Python keeping unknown escapes
        # (\d, \-) literal; a raw string would be cleaner.
        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        # Channel-name -> numeric channel id (keys are the Chinese channel
        # labels emitted by the extractor; runtime strings, do not translate).
        self.dict_channel = {"公告变更":51,
                             "招标公告":52,
                             "中标信息":101,
                             "招标预告":102,
                             "招标答疑":103,
                             "资审结果":105,
                             "法律法规":106,
                             "新闻资讯":107,
                             "采购意向":114,
                             "拍卖出让":115,
                             "土地矿产":116,
                             "产权交易":117,
                             "废标公告":118,
                             "候选人公示":119,
                             "合同公告":120}
    def process(self, extractjson,otherjson):
        # NULL input columns are treated as empty JSON objects.
        if extractjson is not None:
            _extract = json.loads(extractjson)
        else:
            _extract = {}
        if otherjson is not None:
            _other = json.loads(otherjson)
        else:
            _other = {}
        project_code = ""
        project_name = ""
        tenderee = ""
        agency = ""
        win_tenderer = ""
        bidding_budget = ""
        win_bid_price = ""
        fingerprint = ""
        page_time_stamp = 0
        docchannel = 0
        # Number of informative fields found; higher means a richer extraction.
        extract_count = 0
        # Publication date; defaults to "today" when missing.
        page_time = _other.get("pageTime",time.strftime('%Y-%m-%d',time.localtime()))
        doctitle = _other.get("doctitle","")
        # Title with procurement boilerplate words stripped, for fuzzy matching.
        doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', doctitle)
        area = _other.get("area","")
        province = _other.get("province","")
        city = _other.get("city","")
        district = _other.get("district","")
        web_source_no = _other.get("webSourceNo","")
        time_bidclose = _extract.get("time_bidclose")
        time_bidopen = _extract.get("time_bidopen")
        time_bidstart = _extract.get("time_bidstart")
        time_commencement = _extract.get("time_commencement")
        time_completion = _extract.get("time_completion")
        time_earnest_money_end = _extract.get("time_earnestMoneyEnd")
        time_earnest_money_start = _extract.get("time_earnestMoneyStart")
        time_get_file_end = _extract.get("time_getFileEnd")
        time_get_file_start = _extract.get("time_getFileStart")
        time_publicity_end = _extract.get("time_publicityEnd")
        time_publicity_start = _extract.get("time_publicityStart")
        time_registration_end = _extract.get("time_registrationEnd")
        time_registration_start = _extract.get("time_registrationStart")
        time_release = _extract.get("time_release")
        # docchannel = _other.get("docchannel",0)
        docchannel_name = _extract.get("docchannel",{}).get("docchannel")
        doctype_name = _extract.get("docchannel",{}).get("doctype")
        # For these document types the doctype replaces the channel name.
        if doctype_name in ["法律法规","新闻资讯","拍卖出让","土地矿产"]:
            docchannel_name = doctype_name
        docchannel = self.dict_channel.get(docchannel_name,0)
        # Parse the leading YYYY-MM-DD of page_time into a unix timestamp.
        if re.search(self.time_pattern,page_time) is not None:
            try:
                timeArray = time.strptime(page_time[:11], "%Y-%m-%d")
                page_time_stamp = int(time.mktime(timeArray))
            except Exception as e:
                pass
        list_code = _extract.get("code",[])
        if len(list_code)>0:
            project_code = list_code[0]
        project_name = _extract.get("name","")
        fingerprint = _extract.get("fingerprint","")
        dict_pack = _extract.get("prem",{})
        logging.info(dict_pack)
        for _key in dict_pack.keys():
            # Per-package budget; the first non-zero value wins.
            if dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
                extract_count += 1
                if bidding_budget=="":
                    bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
            for _role in dict_pack[_key]["roleList"]:
                # Legacy list layout: presumably [role_name, entity_text, money, ...]
                # (mirrors the dict branch below) — TODO confirm with the extractor.
                if isinstance(_role,list):
                    extract_count += 1
                    if _role[2]!='' and float(_role[2])>0:
                        extract_count += 1
                    if _role[0]=="tenderee":
                        tenderee = _role[1]
                    if _role[0]=="win_tenderer":
                        if win_tenderer=="":
                            win_tenderer = _role[1]
                        if _role[2]!='' and float(_role[2])>0:
                            extract_count += 1
                            if win_bid_price=="":
                                win_bid_price = str(float(_role[2]))
                    if _role[0]=="agency":
                        agency = _role[1]
                # Current dict layout: role_name / role_text / role_money.money.
                if isinstance(_role,dict):
                    extract_count += 1
                    if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
                        extract_count += 1
                    if _role["role_name"]=="tenderee":
                        tenderee = _role["role_text"]
                    if _role["role_name"]=="win_tenderer":
                        if win_tenderer=="":
                            win_tenderer = _role["role_text"]
                        if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
                            extract_count += 1
                            if win_bid_price=="":
                                win_bid_price = str(float(_role["role_money"]["money"]))
                    if _role["role_name"]=="agency":
                        agency = _role["role_text"]
        if project_code!="":
            extract_count += 1
        if project_name!="":
            extract_count += 1
        logging.info(page_time+doctitle+doctitle_refine+area+province+city+
                     district+web_source_no+project_code+project_name+tenderee+agency+win_tenderer+bidding_budget+win_bid_price)
        self.forward(page_time,page_time_stamp,docchannel,doctitle,doctitle_refine,area,province,city,
                     district,web_source_no,fingerprint,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,
                     time_bidclose,time_bidopen,time_bidstart,time_commencement,time_completion,time_earnest_money_end,time_earnest_money_start,
                     time_get_file_end,time_get_file_start,time_publicity_end,time_publicity_start,time_registration_end,time_registration_start,time_release)
  137. @annotate("string->string")
  138. class f_get_product(object):
  139. def __init__(self):
  140. import time
  141. global time
  142. import logging
  143. import json
  144. import re
  145. global json,logging,re
  146. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  147. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  148. def evaluate(self, extractjson):
  149. if extractjson is None or extractjson=="":
  150. extractjson = "{}"
  151. _extract = json.loads(extractjson)
  152. return ",".join(_extract.get("product",[]))
  153. @annotate("string->string")
  154. class f_get_package(object):
  155. def __init__(self):
  156. import time
  157. global time
  158. import logging
  159. import json
  160. import re
  161. global json,logging,re
  162. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  163. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  164. def evaluate(self, extractjson):
  165. if extractjson is None or extractjson=="":
  166. extractjson = "{}"
  167. _extract = json.loads(extractjson)
  168. prem = _extract.get("prem",{})
  169. list_pack = []
  170. for k,v in prem.items():
  171. if k!="Project":
  172. list_pack.append(k)
  173. return ",".join(list_pack)
  174. @annotate("string->string")
  175. class f_get_nlp_enterprise(object):
  176. def __init__(self):
  177. import time
  178. global time
  179. import logging
  180. import json
  181. import re
  182. global json,logging,re
  183. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  184. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  185. def evaluate(self, extractjson):
  186. if extractjson is None or extractjson=="":
  187. extractjson = "{}"
  188. _extract = json.loads(extractjson)
  189. nlp_enterprise = _extract.get("nlp_enterprise",[])
  190. nlp_enterprise_attachment = _extract.get("nlp_enterprise_attachment",[])
  191. if len(nlp_enterprise)==0 and len(nlp_enterprise_attachment)==0:
  192. dict_pack = _extract.get("prem",{})
  193. for _key in dict_pack.keys():
  194. for _role in dict_pack[_key]["roleList"]:
  195. if isinstance(_role,list):
  196. _entity = _role[1]
  197. nlp_enterprise.append(_entity)
  198. if isinstance(_role,dict):
  199. _entity = _role["role_text"]
  200. nlp_enterprise.append(_entity)
  201. nlp_enterprise = list(set(nlp_enterprise))
  202. dict_entity = {"indoctextcon":nlp_enterprise,
  203. "notindoctextcon":nlp_enterprise_attachment}
  204. return json.dumps(dict_entity,ensure_ascii=False)
@annotate("string->bigint")
class f_get_extractCount(object):
    # UDF: return the "extract_count" richness score for a document.
    def __init__(self):
        import time
        global time
        import logging
        import json
        import re
        global json,logging,re
        # NOTE(review): non-raw escapes (\d, \-); a raw string would be cleaner.
        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def evaluate(self, extractjson):
        # Fast path: the extractor already stored a precomputed count.
        if extractjson is not None:
            _extract = json.loads(extractjson)
            return _extract.get("extract_count",0)
        else:
            _extract = {}
        # NOTE(review): everything below only runs when extractjson is None,
        # i.e. with _extract == {}, so the loops iterate nothing and the
        # method always returns 0 here. Kept as-is (presumably a legacy
        # recount path) — confirm before relying on it.
        dict_pack = _extract.get("prem",{})
        extract_count = 0
        list_code = _extract.get("code",[])
        if len(list_code)>0:
            project_code = list_code[0]
        else:
            project_code = ""
        project_name = _extract.get("name","")
        bidding_budget = ""
        win_tenderer = ""
        win_bid_price = ""
        linklist_count = 0
        for _key in dict_pack.keys():
            # Per-package budget counts once; first non-zero value is kept.
            if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
                extract_count += 1
                if bidding_budget=="":
                    bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
            for _role in dict_pack[_key]["roleList"]:
                # Legacy list layout: presumably [role_name, entity_text, money, ...] —
                # mirrors the dict branch below; TODO confirm with the extractor.
                if isinstance(_role,list):
                    extract_count += 1
                    if _role[2]!='' and float(_role[2])>0:
                        extract_count += 1
                    if _role[0]=="tenderee":
                        tenderee = _role[1]
                    if _role[0]=="win_tenderer":
                        if win_tenderer=="":
                            win_tenderer = _role[1]
                        if _role[2]!='' and float(_role[2])>0:
                            extract_count += 1
                            if win_bid_price=="":
                                win_bid_price = str(float(_role[2]))
                    if _role[0]=="agency":
                        agency = _role[1]
                if isinstance(_role,dict):
                    extract_count += 1
                    if "role_money" in _role:
                        if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
                            extract_count += 1
                    if _role.get("role_name")=="tenderee":
                        tenderee = _role["role_text"]
                    if _role.get("role_name")=="win_tenderer":
                        if win_tenderer=="":
                            win_tenderer = _role["role_text"]
                        if "role_money" in _role:
                            if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
                                extract_count += 1
                                if win_bid_price=="":
                                    win_bid_price = str(float(_role["role_money"]["money"]))
                    if _role["role_name"]=="agency":
                        agency = _role["role_text"]
                    # Count non-empty entries of each link; presumably contact
                    # pairs, credited at one point per two entries — confirm.
                    linklist = _role.get("linklist",[])
                    for link in linklist:
                        for l in link:
                            if l!="":
                                linklist_count += 1
        extract_count += linklist_count//2
        if project_code!="":
            extract_count += 1
        if project_name!="":
            extract_count += 1
        return extract_count
  283. @annotate('string,string,string,string,string -> string,string,string,bigint')
  284. class f_decode_sub_docs_json(BaseUDTF):
  285. def __init__(self):
  286. import logging
  287. import json
  288. global json,logging
  289. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  290. def process(self, project_code,project_name,tenderee,agency,sub_docs_json):
  291. columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
  292. extract_count = 0
  293. if project_code is not None and project_code!="":
  294. extract_count += 1
  295. if project_name is not None and project_name!="":
  296. extract_count += 1
  297. if tenderee is not None and tenderee!="":
  298. extract_count += 1
  299. if agency is not None and agency!="":
  300. extract_count += 1
  301. if sub_docs_json is not None:
  302. for sub_docs in json.loads(sub_docs_json):
  303. for _key_sub_docs in sub_docs.keys():
  304. extract_count += 1
  305. if _key_sub_docs in columns:
  306. if columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
  307. if _key_sub_docs in ["bidding_budget","win_bid_price"]:
  308. if float(sub_docs[_key_sub_docs])>0:
  309. columns[_key_sub_docs] = str(float(sub_docs[_key_sub_docs]))
  310. else:
  311. columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
  312. self.forward(columns["win_tenderer"],columns["bidding_budget"],columns["win_bid_price"],extract_count)
  313. @annotate('string,string,string -> string,string,string,string,string,string,string')
  314. class f_decode_for_dumplicate(BaseUDTF):
  315. def __init__(self):
  316. import logging
  317. import json
  318. global json,logging
  319. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  320. def process(self,sub_docs_json,extractjson,extract):
  321. if extractjson is None or extractjson=="":
  322. extractjson = "{}"
  323. try:
  324. _extract = json.loads(extractjson)
  325. except Exception as e:
  326. _extract = {}
  327. product = ",".join(_extract.get("product",[]))
  328. list_product = product.split(",")
  329. project_codes = ",".join(_extract.get("code",[]))
  330. list_code = project_codes.split(",")
  331. if sub_docs_json is not None:
  332. list_sub_docs = json.loads(sub_docs_json)
  333. else:
  334. list_sub_docs = [{}]
  335. max_len = max([len(list_product),len(list_code),len(list_sub_docs)])
  336. if extract!="extract":
  337. win_tenderer = ""
  338. bidding_budget = ""
  339. win_bid_price = ""
  340. for _subdoc in list_sub_docs:
  341. win_tenderer = _subdoc.get("win_tenderer","")
  342. bidding_budget = _subdoc.get("bidding_budget","0")
  343. if float(bidding_budget)==0:
  344. bidding_budget = ""
  345. else:
  346. bidding_budget = str(float(bidding_budget))
  347. win_bid_price = _subdoc.get("win_bid_price","0")
  348. if float(win_bid_price)==0:
  349. win_bid_price = ""
  350. else:
  351. win_bid_price = str(float(win_bid_price))
  352. if len(set([win_tenderer,bidding_budget,win_bid_price]))>=3:
  353. break
  354. print(("",product,"",project_codes,win_tenderer,bidding_budget,win_bid_price))
  355. self.forward("",product,"",project_codes,win_tenderer,bidding_budget,win_bid_price)
  356. else:
  357. for _i in range(max_len):
  358. _product = list_product[_i%len(list_product)]
  359. _code = list_code[_i%len(list_code)]
  360. _subdoc = list_sub_docs[_i%len(list_sub_docs)]
  361. win_tenderer = _subdoc.get("win_tenderer","")
  362. bidding_budget = _subdoc.get("bidding_budget","0")
  363. if float(bidding_budget)==0:
  364. bidding_budget = ""
  365. else:
  366. bidding_budget = str(float(bidding_budget))
  367. win_bid_price = _subdoc.get("win_bid_price","0")
  368. if float(win_bid_price)==0:
  369. win_bid_price = ""
  370. else:
  371. win_bid_price = str(float(win_bid_price))
  372. self.forward(_product,product,_code,project_codes,win_tenderer,bidding_budget,win_bid_price)
  373. @annotate("string->bigint")
  374. class totimestamp(object):
  375. def __init__(self):
  376. import time
  377. global time
  378. import logging
  379. import json
  380. import re
  381. global json,logging,re
  382. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  383. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  384. def evaluate(self, str_time):
  385. try:
  386. logging.info(str_time)
  387. if str_time is not None and re.search(self.time_pattern,str_time) is not None:
  388. timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
  389. timeStamp = int(time.mktime(timeArray))
  390. return timeStamp
  391. else:
  392. return 0
  393. except Exception as e:
  394. return 0
  395. @annotate("string->string")
  396. class refind_name(object):
  397. def __init__(self):
  398. import logging
  399. import re
  400. global logging,re
  401. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  402. def evaluate(self, title):
  403. if title is not None:
  404. return re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|\[|\]|【|】', '', title)
  405. return ""
@annotate('bigint,bigint,bigint,string,bigint,string->string')
class f_set_docid(BaseUDAF):
    '''
    Group candidate duplicate docids.
    (Original note: project code, winning bidder, len(project code)>7,
    winning bidder <> "".)
    '''
    def __init__(self):
        import json
        global json
    def new_buffer(self):
        # buffer[0] accumulates one dict per input row.
        return [[]]
    def iterate(self, buffer,docid, page_time_stamp,extract_count,defind_column,defind_count,tenderee):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
                          "defind_column":defind_column,"defind_count":defind_count,"tenderee":tenderee})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        # Sort rows by publication timestamp, then split them into runs where
        # adjacent docs are at most 7 days apart; each run may become a group.
        list_docs = buffer[0]
        list_docs.sort(key=lambda x:x["page_time_stamp"])
        list_group = []
        _begin = 0
        # Taken from the first row (assumed uniform across the group — confirm).
        defind_count = 0
        if len(list_docs)>0:
            defind_count = list_docs[0]["defind_count"]
        print(defind_count)
        for i in range(len(list_docs)-1):
            if abs(list_docs[i]["page_time_stamp"]-list_docs[i+1]["page_time_stamp"])<=86400*7:
                continue
            else:
                # Gap larger than 7 days: close the current run [_begin, i].
                _group = []
                _set_column = set()
                _set_tenderee = set()
                for j in range(_begin,i+1):
                    if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
                        _set_tenderee.add(list_docs[j]["tenderee"])
                    _set_column.add(list_docs[j]["defind_column"])
                    _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
                # Reject runs of >=3 docs that name more than one tenderee.
                if len(_group)>=3 and len(_set_tenderee)>1:
                    pass
                else:
                    print(defind_count,len(_set_column))
                    # Acceptance rules by defind_count:
                    #   2 -> needs >=2 distinct defind_column values,
                    #   1 -> needs exactly 1, 0 -> any run of more than one doc.
                    if len(_group)>1:
                        if defind_count==2:
                            if len(_set_column)>=2:
                                list_group.append(_group)
                        elif defind_count==1:
                            if len(_set_column)==1:
                                list_group.append(_group)
                        elif defind_count==0:
                            list_group.append(_group)
                _begin = i+1
        # Flush the trailing run with the same acceptance rules as above.
        if len(list_docs)>1:
            _set_column = set()
            _set_tenderee = set()
            _group = []
            for j in range(_begin,len(list_docs)):
                if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
                    _set_tenderee.add(list_docs[j]["tenderee"])
                _set_column.add(list_docs[j]["defind_column"])
                _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
            if len(_group)>=3 and len(_set_tenderee)>1:
                pass
            else:
                if len(_group)>1:
                    if defind_count==2:
                        if len(_set_column)>=2:
                            list_group.append(_group)
                    elif defind_count==1:
                        if len(_set_column)==1:
                            list_group.append(_group)
                    elif defind_count==0:
                        list_group.append(_group)
        return json.dumps(list_group)
  520. def isEmpty(_str):
  521. if _str is None or _str=="":
  522. return True
  523. return False
  524. @annotate('bigint->string')
  525. class f_group_fingerprint(BaseUDAF):
  526. def __init__(self):
  527. import json
  528. global json
  529. def new_buffer(self):
  530. return [[]]
  531. def iterate(self, buffer,docid):
  532. buffer[0].append(docid)
  533. def merge(self, buffer, pbuffer):
  534. buffer[0].extend(pbuffer[0][:100000])
  535. def terminate(self, buffer):
  536. list_docid = buffer[0][:100000]
  537. list_docid.sort(key=lambda x:x)
  538. return ",".join([str(a) for a in list_docid])
  539. @annotate('string->bigint,string')
  540. class f_ungroup_fingerprint(BaseUDTF):
  541. def process(self,dumplicates):
  542. list_docid = dumplicates.split(",")
  543. self.forward(int(list_docid[0]),",".join(list_docid[1:]))
  544. @annotate('bigint,bigint,string->string')
  545. class f_dump_probability(BaseUDAF):
  546. '''
  547. 合并组为一条记录
  548. '''
  549. def __init__(self):
  550. import json
  551. global json
  552. def new_buffer(self):
  553. return [[]]
  554. def iterate(self, buffer,docid,page_time_stamp,_type):
  555. buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"type":_type})
  556. def merge(self, buffer, pbuffer):
  557. buffer[0].extend(pbuffer[0])
  558. def terminate(self, buffer):
  559. list_dict = buffer[0]
  560. _set = set()
  561. list_data = []
  562. for _dict in list_dict:
  563. docid = _dict["docid"]
  564. if docid in _set:
  565. continue
  566. _set.add(docid)
  567. list_data.append(_dict)
  568. if len(list_data)>10000:
  569. break
  570. list_group = split_with_time(list_data,sort_key="page_time_stamp",timedelta=86400*7)
  571. return json.dumps(list_group)
  572. @annotate('string -> bigint,bigint,bigint,bigint,string')
  573. class f_split_dumplicate_probability(BaseUDTF):
  574. def __init__(self):
  575. import logging
  576. import json
  577. global logging,json
  578. logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  579. def process(self,list_group_str):
  580. logging.info("0")
  581. logging.info(list_group_str)
  582. if list_group_str is not None:
  583. logging.info("1")
  584. try:
  585. list_group = json.loads(list_group_str)
  586. logging.info("2")
  587. for _group in list_group:
  588. if len(_group)>0:
  589. _type = _group[0].get("type","")
  590. logging.info("3%d"%len(list_group))
  591. # _group.sort(key=lambda x:x["page_time_stamp"])
  592. _len = min(100,len(_group))
  593. for _index_i in range(_len):
  594. _count = 0
  595. for _index_j in range(_index_i+1,_len):
  596. if abs(_group[_index_j]["page_time_stamp"]-_group[_index_i]["page_time_stamp"])>86400*120:
  597. break
  598. _count += 1
  599. _docid1 = _group[_index_i]["docid"]
  600. _docid2 = _group[_index_j]["docid"]
  601. if _docid1<_docid2:
  602. self.forward(_docid1,_docid2,1,_len,_type)
  603. elif _docid1>_docid2:
  604. self.forward(_docid2,_docid1,1,_len,_type)
  605. except Exception as e:
  606. logging(str(e))
  607. @annotate('bigint,bigint,string->string')
  608. class f_dumplicate_groupPairs(BaseUDAF):
  609. '''
  610. 合并组为一条记录
  611. '''
  612. def __init__(self):
  613. import json
  614. global json
  615. def new_buffer(self):
  616. return [[]]
  617. def iterate(self, buffer,is_exists,counts,_type):
  618. buffer[0].append({"is_exists":is_exists,"counts":counts,"_type":_type})
  619. def merge(self, buffer, pbuffer):
  620. buffer[0].extend(pbuffer[0])
  621. def terminate(self, buffer):
  622. list_dict = buffer[0]
  623. list_dict = list_dict[:10000]
  624. return json.dumps(list_dict)
  625. def check_columns(tenderee_less,tenderee_greater,
  626. agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
  627. win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
  628. bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
  629. flag = True
  630. _set_tenderee = set()
  631. if tenderee_less is not None and tenderee_less!="":
  632. _set_tenderee.add(tenderee_less)
  633. if tenderee_greater is not None and tenderee_greater!="":
  634. _set_tenderee.add(tenderee_greater)
  635. if len(_set_tenderee)>1:
  636. return False
  637. code_sim = getSimilarityOfString(project_code_less,project_code_greater)
  638. if code_sim>0.6 and code_sim<1:
  639. return False
  640. #同批次不同编号
  641. if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
  642. _split_code_less = project_code_less.split("-")
  643. _split_code_greater = project_code_greater.split("-")
  644. if len(_split_code_less)>1 and len(_split_code_greater)>1:
  645. if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
  646. return False
  647. _set_win_tenderer = set()
  648. if win_tenderer_less is not None and win_tenderer_less!="":
  649. _set_win_tenderer.add(win_tenderer_less)
  650. if win_tenderer_greater is not None and win_tenderer_greater!="":
  651. _set_win_tenderer.add(win_tenderer_greater)
  652. if len(_set_win_tenderer)>1:
  653. return False
  654. _set_win_bid_price = set()
  655. if win_bid_price_less is not None and win_bid_price_less!="":
  656. _set_win_bid_price.add(float(win_bid_price_less))
  657. if win_bid_price_greater is not None and win_bid_price_greater!="":
  658. _set_win_bid_price.add(float(win_bid_price_greater))
  659. if len(_set_win_bid_price)>1:
  660. return False
  661. _set_bidding_budget = set()
  662. if bidding_budget_less is not None and bidding_budget_less!="":
  663. _set_bidding_budget.add(float(bidding_budget_less))
  664. if bidding_budget_greater is not None and bidding_budget_greater!="":
  665. _set_bidding_budget.add(float(bidding_budget_greater))
  666. if len(_set_bidding_budget)>1:
  667. return False
  668. return True
  669. import math
  670. def featurnCount(_count,max_count=100):
  671. return max(0,min(1,_count))*(1/math.sqrt(max(1,_count-1)))
  672. def getSimLevel(str1,str2):
  673. str1_null = False
  674. str2_null = False
  675. _v = 0
  676. if str1 is None or str1=="":
  677. str1_null = True
  678. if str2 is None or str2=="":
  679. str2_null = True
  680. if str1_null and str2_null:
  681. _v = 2
  682. elif str1_null and not str2_null:
  683. _v = 4
  684. elif not str1_null and str2_null:
  685. _v = 6
  686. elif not str1_null and not str2_null:
  687. if str1==str2:
  688. _v = 10
  689. else:
  690. _v = 0
  691. return _v
  692. def getLength(_str):
  693. return len(str(_str) if _str is not None else "")
def check_money(bidding_budget_less,bidding_budget_greater,
                win_bid_price_less,win_bid_price_greater,
                moneys_less,moneys_greater,
                moneys_attachment_less,moneys_attachment_greater):
    """Check whether the money fields of two documents are compatible.

    Budgets and winning-bid prices are first rounded to their six most
    significant digits.  Two differing amounts are still accepted when one
    is the other scaled by exactly 10000 (yuan vs. 万元 unit confusion),
    when they agree after rounding to 0.01 万, or when one document's
    amount appears among the other document's extracted money values.

    moneys_* / moneys_attachment_* are assumed to be collections of float
    amounts extracted from the body text / attachments -- TODO confirm the
    element type against callers (the `in` checks compare floats).

    Returns False on an irreconcilable budget or price mismatch,
    True otherwise (including when either side is missing a value).
    """
    # keep the unrounded originals for the cross-document membership checks
    bidding_budget_less_source = bidding_budget_less
    bidding_budget_greater_source = bidding_budget_greater
    win_bid_price_less_source = win_bid_price_less
    win_bid_price_greater_source = win_bid_price_greater
    # Only compare the six most significant digits: round to an int, then
    # round again with a negative ndigits derived from its decimal length.
    if getLength(bidding_budget_less)>0:
        bidding_budget_less_source = float(bidding_budget_less_source)
        bidding_budget_less = round(float(bidding_budget_less))
        bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
    if getLength(bidding_budget_greater)>0:
        bidding_budget_greater_source = float(bidding_budget_greater_source)
        bidding_budget_greater = round(float(bidding_budget_greater))
        bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
    if getLength(win_bid_price_less)>0:
        win_bid_price_less_source = float(win_bid_price_less_source)
        win_bid_price_less = round(float(win_bid_price_less))
        win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
    if getLength(win_bid_price_greater)>0:
        win_bid_price_greater_source = float(win_bid_price_greater_source)
        win_bid_price_greater = round(float(win_bid_price_greater))
        win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
    # check sameness; "" means "no evidence yet", True means "reconciled"
    budget_is_same = ""
    price_is_same = ""
    if getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
        budget_less = float(bidding_budget_less)
        budget_greater = float(bidding_budget_greater)
        if budget_less!=budget_greater:
            if min(budget_less,budget_greater)>0:
                # one amount is the other in units of 10k (yuan vs. 万元)
                if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
                    budget_is_same = True
            # equal once both are expressed in 万 with two decimals
            if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
                budget_is_same = True
            # one document's budget appears among the other's money values
            if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
                budget_is_same = True
            if bidding_budget_less_source in moneys_greater or bidding_budget_less_source in moneys_attachment_greater:
                budget_is_same = True
            if budget_greater in moneys_less or budget_greater in moneys_attachment_less:
                budget_is_same = True
            if bidding_budget_greater_source in moneys_less or bidding_budget_greater_source in moneys_attachment_less:
                budget_is_same = True
            if budget_is_same=="":
                return False
    if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
        price_less = float(win_bid_price_less)
        price_greater = float(win_bid_price_greater)
        if price_less!=price_greater:
            if min(price_less,price_greater)>0:
                # one amount is the other in units of 10k (yuan vs. 万元)
                if max(price_less,price_greater)/min(price_less,price_greater)==10000:
                    price_is_same = True
            # equal once both are expressed in 万 with two decimals
            if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
                price_is_same = True
            # one document's price appears among the other's money values
            if price_less in moneys_greater or price_less in moneys_attachment_greater:
                price_is_same = True
            if win_bid_price_less_source in moneys_greater or win_bid_price_less_source in moneys_attachment_greater:
                price_is_same = True
            if price_greater in moneys_less or price_greater in moneys_attachment_less:
                price_is_same = True
            if win_bid_price_greater_source in moneys_less or win_bid_price_greater_source in moneys_attachment_less:
                price_is_same = True
            if price_is_same=="":
                return False
    return True
  761. def check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  762. tenderee_less,tenderee_greater,
  763. agency_less,agency_greater,
  764. win_tenderer_less,win_tenderer_greater,
  765. similarity=0.85):
  766. def get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,entity_less,entity_greater,similarity):
  767. if getLength(entity_less)>0 and getLength(entity_greater)>0:
  768. if entity_less!=entity_greater:
  769. is_same = ''
  770. _sim = jaccard_score(entity_less,entity_greater)
  771. if _sim>similarity:
  772. is_same = True
  773. if is_same=='':
  774. if str(nlp_enterprise_less).find(entity_greater)>0 or str(nlp_enterprise_greater).find(entity_less)>0:
  775. is_same = True
  776. if is_same=='':
  777. return False
  778. return True
  779. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,similarity):
  780. return False
  781. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,agency_less,agency_greater,similarity):
  782. return False
  783. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,win_tenderer_less,win_tenderer_greater,similarity):
  784. return False
  785. return True
  786. def check_punish(punish_less,punish_greater):
  787. same_count = 0
  788. not_same_count = 0
  789. _flag = True
  790. keys = list(set(list(punish_less.keys())) | set(list(punish_greater.keys())))
  791. for k in keys:
  792. v1 = punish_less.get(k)
  793. v2 = punish_greater.get(k)
  794. if getLength(v1)>0 and getLength(v2)>0:
  795. if k=="punish_code":
  796. if not check_codes([v1],[v2]):
  797. not_same_count += 1
  798. _flag = False
  799. else:
  800. same_count += 1
  801. if k=="punishDecision":
  802. if getSimilarityOfString(v1,v2)>0.8:
  803. same_count += 1
  804. if k in ("complainants","punishPeople","institutions"):
  805. if v1==v2:
  806. same_count += 1
  807. else:
  808. not_same_count == 1
  809. _flag = False
  810. return _flag,same_count,not_same_count
  811. def check_source_type(source_type_less,source_type_greater):
  812. if getLength(source_type_less)>0 and getLength(source_type_greater)>0:
  813. if source_type_less!=source_type_greater:
  814. return False
  815. return True
def check_approval(approval_less,approval_greater,b_log):
    """Pairwise comparison of two lists of approval records (dicts).

    Each record pair is scored field by field (only fields non-empty on
    both sides are considered).  The first pair with no hard conflict and
    more than one agreeing field short-circuits and is returned.

    Returns (flag, same_count, not_same_count):
      - a matching pair's (True, same_count, not_same_count), or
      - (False, 0, 0) when both lists are non-empty but no pair matched, or
      - (True, 0, 0) when either list is empty (nothing to compare).
    """
    if b_log:
        logging.info("approval_less %s==approval_greater %s"%(approval_less,approval_greater))
    for _less in approval_less:
        for _greater in approval_greater:
            same_count = 0
            not_same_count = 0
            flag = True
            # fields considered when both records carry a non-empty value
            keys = ["source_stage","source_type","doc_num","project_code","project_name","approval_items","approval_result","approver","construct_company","construction_scale","declare_company","evaluation_agency","legal_person","compilation_unit","time_approval"]
            for k in keys:
                v1 = _less.get(k)
                v2 = _greater.get(k)
                if getLength(v1)>0 and getLength(v2)>0:
                    if k in ("source_stage","source_type"):
                        # stage/type must match exactly; mismatch is a hard conflict
                        if v1!=v2:
                            flag = False
                    if k in ("project_code","doc_num"):
                        if check_codes([v1],[v2]):
                            same_count += 1
                        else:
                            # NOTE(review): decrements where check_punish *increments*
                            # its mismatch counter -- possibly intended as `+= 1`;
                            # left unchanged in case callers rely on the sign.
                            not_same_count -= 1
                            if b_log:
                                logging.info("check approval %s false %s-%s"%(k,v1,v2))
                            flag = False
                    if k in ("approval_items","approval_result","project_name"):
                        # fuzzy-text fields: similarity above 0.8 counts as same
                        if getSimilarityOfString(v1,v2)>0.8:
                            same_count += 1
                        else:
                            not_same_count -= 1
                    if k in ("approver","construct_company","declare_company","evaluation_agency","legal_person","compilation_unit"):
                        # entity fields require exact equality
                        if v1==v2:
                            same_count += 1
                        else:
                            not_same_count -= 1
                            if b_log:
                                logging.info("check approval %s false %s-%s"%(k,v1,v2))
                            flag = False
            # accept the first pair that agrees on more than one field
            if flag and same_count>1:
                return flag,same_count,not_same_count
    flag = True
    if len(approval_less)>0 and len(approval_greater)>0:
        # both sides have records but none matched
        flag = False
    return flag,0,0
  859. def check_codes(project_codes_less,project_codes_greater):
  860. #check the similarity
  861. is_same = False
  862. is_sim = False
  863. for project_code_less in project_codes_less:
  864. for project_code_greater in project_codes_greater:
  865. project_code_less = str(project_code_less).upper()
  866. project_code_greater = str(project_code_greater).upper()
  867. code_sim = getSimilarityOfString(project_code_less,project_code_greater)
  868. if project_code_less is not None and project_code_greater is not None:
  869. if code_sim>0.6:
  870. if str(project_code_less).find(str(project_code_greater))>=0 or str(project_code_greater).find(str(project_code_less))>=0:
  871. is_same = True
  872. else:
  873. is_sim = True
  874. if project_code_less!=project_code_greater:
  875. if code_sim>0.4 and len(project_code_less)==len(project_code_greater):
  876. is_sim = True
  877. if is_same:
  878. return True
  879. if is_sim:
  880. return False
  881. return True
  882. def check_demand():
  883. return True
# Patterns used by check_doctitle to normalise and compare document titles.

# Matches package/lot numbers such as "包号3", "标段一", "第Ⅱ包".
# (Original note, translated: the "第" prefix was made optional and a stray
# "?" removed so titles like "纯木浆8包/箱复印" are no longer mistaken for
# package numbers.)
package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型|项目)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))") # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
# alphanumeric code fragments (may include dashes, brackets, dots)
code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.-]+")
# plain integers / decimals (whole-token match)
num_pattern = re.compile("^\d+(?:\.\d+)?$")
# ordinal-like tokens: Chinese numerals and latin letters
num1_pattern = re.compile("[一二三四五六七八九A-Za-z]+")
# short location mentions ending in 市/区/镇/县/村/路
location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
# keywords for the kind of work (plain alternation string, used via re.findall)
building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
# re-tender markers ("再次", "重新招标", "N次")
rebid_pattern = "再次|重新招标|[一二三四五六七八九十]+次"
# dates like 2021-01-02 / 2021.1.2 / 2021年1月2
date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
  892. def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[]):
  893. if code_greater is None:
  894. code_greater = []
  895. doctitle_refind_less = str(doctitle_refind_less).replace("(","(").replace(")",")")
  896. doctitle_refind_greater = str(doctitle_refind_greater).replace("(","(").replace(")",")")
  897. for _c in codes_less:
  898. doctitle_refind_less = str(doctitle_refind_less).replace(_c,"")
  899. for _c in code_greater:
  900. doctitle_refind_greater = str(doctitle_refind_greater).replace(_c,"")
  901. doctitle_refind_less = re.sub(date_pattern,"",doctitle_refind_less)
  902. doctitle_refind_greater = re.sub(date_pattern,"",doctitle_refind_greater)
  903. #check the package
  904. if doctitle_refind_less is None:
  905. doctitle_refind_less = ""
  906. if doctitle_refind_greater is None:
  907. doctitle_refind_greater = ""
  908. _pack1 = None
  909. _pack2 = None
  910. #if contain then pass
  911. if doctitle_refind_less.find(doctitle_refind_greater)>=0 or doctitle_refind_greater.find(doctitle_refind_less)>=0:
  912. return True
  913. #check the package in title
  914. _match = re.search(package_number_pattern,doctitle_refind_less)
  915. if _match is not None:
  916. _pack1 = _match.groupdict()["name"]
  917. _match = re.search(package_number_pattern,doctitle_refind_greater)
  918. if _match is not None:
  919. _pack2 = _match.groupdict()["name"]
  920. if _pack1 is not None and _pack2 is not None:
  921. if _pack1!=_pack2:
  922. return False
  923. #check the nums in title
  924. doctitle_refind_less = re.sub(package_number_pattern,"",doctitle_refind_less)
  925. doctitle_refind_greater = re.sub(package_number_pattern,"",doctitle_refind_greater)
  926. #check the nums,location,building in title
  927. for _p in [code_pattern]:
  928. num_all_l = re.findall(_p,doctitle_refind_less)
  929. num_all_g = re.findall(_p,doctitle_refind_greater)
  930. set_num_l = set()
  931. set_num_g = set()
  932. for _l in num_all_l:
  933. if re.search(num_pattern,_l) is not None:
  934. if _l.find(".")>0:
  935. set_num_l.add(_l)
  936. elif len(_l)<4:
  937. set_num_l.add(_l)
  938. for _g in num_all_g:
  939. if re.search(num_pattern,_g) is not None:
  940. if _g.find(".")>0:
  941. set_num_g.add(_g)
  942. elif len(_g)<4:
  943. set_num_g.add(_g)
  944. if len(set_num_l)>0 and len(set_num_g)>0:
  945. if len(set_num_l&set_num_g)!=len(set_num_l):
  946. return False
  947. #check location and keywords
  948. for _p in [num1_pattern,building_pattern,rebid_pattern]:
  949. num_all_l = re.findall(_p,doctitle_refind_less)
  950. num_all_g = re.findall(_p,doctitle_refind_greater)
  951. set_num_l = set(num_all_l)
  952. set_num_g = set(num_all_g)
  953. if len(set_num_l)==len(set_num_g):
  954. if len(set_num_l&set_num_g)!=len(set_num_l):
  955. return False
  956. #check the location has conflict
  957. for _p in [location_pattern]:
  958. num_all_l = re.findall(_p,doctitle_refind_less)
  959. num_all_g = re.findall(_p,doctitle_refind_greater)
  960. dict_num_l = {}
  961. dict_num_g = {}
  962. for _l in num_all_l:
  963. if len(_l)>0:
  964. key = _l[-1:]
  965. if key not in dict_num_l:
  966. dict_num_l[key] = set()
  967. dict_num_l[key].add(_l)
  968. for _g in num_all_g:
  969. if len(_g)>0:
  970. key = _g[-1:]
  971. if key not in dict_num_g:
  972. dict_num_g[key] = set()
  973. dict_num_g[key].add(_g)
  974. for k,v in dict_num_l.items():
  975. if k in dict_num_g:
  976. if len(v&dict_num_g[k])==0:
  977. return False
  978. return True
  979. def product_dump(list_product):
  980. _product_l_l = []
  981. list_product.sort(key=lambda x:len(x))
  982. for _l in list_product:
  983. _exists = False
  984. for l1 in _product_l_l:
  985. if l1 in _l:
  986. _exists = True
  987. break
  988. if not _exists:
  989. _product_l_l.append(_l)
  990. return _product_l_l
def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
    """Check whether the product lists of two documents overlap enough.

    Both product strings are split on *split_char* and de-duplicated via
    product_dump; the shorter list is compared against the longer one.
    Products that also appear in their own document title are treated as
    stronger evidence.  Returns True when at least half of the shorter
    list matches (similarity >= 0.8 or mentioned in the other title),
    True when either side has no products, False otherwise.
    """
    if getLength(product_less)>0 and getLength(product_greater)>0:
        _product_l = product_less.split(split_char)
        _product_l = product_dump(_product_l)
        _product_g = product_greater.split(split_char)
        _product_g = product_dump(_product_g)
        _title_l = doctitle_refine_less
        _title_g = doctitle_refine_greater
        same_count = 0
        # make _product_l the shorter list (and keep titles aligned with it)
        if len(_product_l)>len(_product_g):
            a = _product_g
            _product_g = _product_l
            _product_l = a
            _title_l = doctitle_refine_greater
            _title_g = doctitle_refine_less
        set_product_l_in_title = set()
        set_product_g_in_title = set()
        for _l in _product_l:
            if _title_l.find(_l)>=0:
                set_product_l_in_title.add(_l)
        for _g in _product_g:
            if _title_g.find(_g)>=0:
                set_product_g_in_title.add(_g)
        # products that appear in their own titles must overlap
        if len(set_product_l_in_title)>0 and len(set_product_g_in_title)>0:
            _set_union = set_product_l_in_title & set_product_g_in_title
            # differing products may still pass if they de-duplicate into each other
            diff_l = set_product_l_in_title-_set_union
            diff_g = set_product_g_in_title-_set_union
            diff_dump = product_dump(list(diff_l.union(diff_g)))
            if not(len(diff_dump)<=len(diff_l) or len(diff_dump)<=len(diff_g)):
                return False
        # too strict, disabled for now:
        # if len(_set_union)==0:
        # return False
        # if len(_set_union)!=len(set_product_l_in_title) and len(_set_union)!=len(set_product_g_in_title):
        # _l1 = list(set_product_l_in_title)
        # _l2 = list(set_product_g_in_title)
        # _l1.extend(_l2)
        # _l1 = product_dump(_l1)
        # if len(_l1)!=len(_set_union):
        # return False
        # count products of the shorter list matched by the longer list or titles
        for _l in _product_l:
            for _g in _product_g:
                if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0 or doctitle_refine_less.find(_g)>=0:
                    same_count += 1
                    break
        if same_count/len(_product_l)>=0.5:
            return True
        return False
    return True
  1042. def check_package(package_less,package_greater,split_char=","):
  1043. if getLength(package_less)>0 and getLength(package_greater)>0:
  1044. _product_l = package_less.split(split_char)
  1045. _product_g = package_greater.split(split_char)
  1046. same_level = False
  1047. for _l in _product_l:
  1048. for _g in _product_g:
  1049. if abs(len(_l)-len(_g))<=2:
  1050. save_level = True
  1051. if _l==_g:
  1052. return True
  1053. if same_level:
  1054. return False
  1055. return True
  1056. def check_time(json_time_less,json_time_greater):
  1057. has_same = False
  1058. has_diff = False
  1059. if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
  1060. if isinstance(json_time_less,dict):
  1061. time_less = json_time_less
  1062. else:
  1063. time_less = json.loads(json_time_less)
  1064. if isinstance(json_time_greater,dict):
  1065. time_greater = json_time_greater
  1066. else:
  1067. time_greater = json.loads(json_time_greater)
  1068. for k,v in time_less.items():
  1069. if getLength(v)>0:
  1070. v1 = time_greater.get(k,"")
  1071. if getLength(v1)>0:
  1072. if v[:10]!=v1[:10]:
  1073. has_diff = True
  1074. else:
  1075. has_same = True
  1076. if has_same:
  1077. if has_diff:
  1078. return 1
  1079. return 2
  1080. if has_diff:
  1081. return 0
  1082. return 1
  1083. def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,hard_level=1):
  1084. docid_less = document_less["docid"]
  1085. docchannel_less = document_less.get("docchannel",0)
  1086. page_time_less = document_less.get("page_time")
  1087. doctitle_refine_less = document_less["doctitle_refine"]
  1088. project_codes_less = document_less.get("project_codes")
  1089. nlp_enterprise_less = document_less["nlp_enterprise"]
  1090. tenderee_less = document_less.get("tenderee","")
  1091. agency_less = document_less.get("agency")
  1092. win_tenderer_less = document_less["win_tenderer"]
  1093. bidding_budget_less = document_less["bidding_budget"]
  1094. win_bid_price_less = document_less["win_bid_price"]
  1095. product_less = document_less.get("product")
  1096. package_less = document_less.get("package")
  1097. json_time_less = document_less.get("dict_time")
  1098. project_name_less = document_less.get("project_name")
  1099. fingerprint_less = document_less.get("fingerprint")
  1100. extract_count_less = document_less.get("extract_count",0)
  1101. web_source_no_less = document_less.get("web_source_no")
  1102. province_less = document_less.get("province")
  1103. city_less = document_less.get("city")
  1104. district_less = document_less.get("district")
  1105. moneys_less = document_less.get("moneys")
  1106. moneys_attachment_less = document_less.get("moneys_attachment")
  1107. page_attachments_less = document_less.get("page_attachments","[]")
  1108. punish_less = document_less.get("punish",{})
  1109. approval_less = document_less.get("approval",[])
  1110. source_type_less = document_less.get("source_type")
  1111. detail_link_less = document_less.get("detail_link")
  1112. is_special_bonds_less = document_less.get("is_special_bonds")
  1113. docid_greater = document_greater["docid"]
  1114. page_time_greater = document_greater["page_time"]
  1115. docchannel_greater = document_greater.get("docchannel",0)
  1116. doctitle_refine_greater = document_greater.get("doctitle_refine","")
  1117. project_codes_greater = document_greater["project_codes"]
  1118. nlp_enterprise_greater = document_greater["nlp_enterprise"]
  1119. tenderee_greater = document_greater.get("tenderee","")
  1120. agency_greater = document_greater.get("agency","")
  1121. win_tenderer_greater = document_greater["win_tenderer"]
  1122. bidding_budget_greater = document_greater["bidding_budget"]
  1123. win_bid_price_greater = document_greater["win_bid_price"]
  1124. product_greater = document_greater.get("product")
  1125. package_greater = document_greater.get("package")
  1126. json_time_greater = document_greater["dict_time"]
  1127. project_name_greater = document_greater.get("project_name")
  1128. fingerprint_greater = document_greater.get("fingerprint")
  1129. extract_count_greater = document_greater.get("extract_count",0)
  1130. web_source_no_greater = document_greater.get("web_source_no")
  1131. province_greater = document_greater.get("province")
  1132. city_greater = document_greater.get("city")
  1133. district_greater = document_greater.get("district")
  1134. detail_link_greater = document_greater.get("detail_link")
  1135. is_special_bonds_greater = document_greater.get("is_special_bonds")
  1136. moneys_greater = document_greater.get("moneys")
  1137. moneys_attachment_greater = document_greater.get("moneys_attachment")
  1138. page_attachments_greater = document_greater.get("page_attachments","[]")
  1139. punish_greater = document_greater.get("punish",{})
  1140. approval_greater = document_greater.get("approval",[])
  1141. source_type_greater = document_greater.get("source_type")
  1142. if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
  1143. return 1
  1144. #一篇要素都在附件,且两篇附件md5有重叠
  1145. set_md5_less = set()
  1146. set_md5_greater = set()
  1147. list_md5_less = []
  1148. if page_attachments_less:
  1149. try:
  1150. list_md5_less = json.loads(page_attachments_less)
  1151. except Exception as e:
  1152. pass
  1153. list_md5_greater = []
  1154. if page_attachments_greater:
  1155. try:
  1156. list_md5_greater = json.loads(page_attachments_greater)
  1157. except Exception as e:
  1158. pass
  1159. for _l in list_md5_less:
  1160. _md5 = _l.get("fileMd5")
  1161. if _md5 is not None:
  1162. set_md5_less.add(_md5)
  1163. for _l in list_md5_greater:
  1164. _md5 = _l.get("fileMd5")
  1165. if _md5 is not None:
  1166. set_md5_greater.add(_md5)
  1167. if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
  1168. one_in_attach = False
  1169. dict_enterprise_less = json.loads(nlp_enterprise_less)
  1170. dict_enterprise_greater = json.loads(nlp_enterprise_greater)
  1171. indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
  1172. notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
  1173. indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
  1174. notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
  1175. if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
  1176. one_in_attach = True
  1177. if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
  1178. one_in_attach = True
  1179. if one_in_attach:
  1180. if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
  1181. return 1
  1182. #同一个站源,都有附件但附件没有重叠则不去重
  1183. if web_source_no_less==web_source_no_greater and len(set_md5_less)>0 and len(set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==0:
  1184. if b_log:
  1185. logging.info("same web_site,both has attach but not same web_source_no_less:%s,web_source_no_greater:%s"%(web_source_no_less,web_source_no_greater))
  1186. return 0
  1187. if isinstance(project_codes_less,str):
  1188. project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
  1189. elif project_codes_less is None:
  1190. project_codes_less = []
  1191. if isinstance(project_codes_greater,str):
  1192. project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
  1193. elif project_codes_greater is None:
  1194. project_codes_greater = []
  1195. # 专项债去重
  1196. if is_special_bonds_greater==is_special_bonds_less==1:
  1197. detail_link_less = detail_link_less.strip() if detail_link_less else ""
  1198. detail_link_greater = detail_link_greater.strip() if detail_link_greater else ""
  1199. if "bondId=" in detail_link_less:
  1200. bondId_less = detail_link_less.split("bondId=")[1]
  1201. bondId_less = bondId_less.split(",") if bondId_less else []
  1202. else:
  1203. bondId_less = []
  1204. if "bondId=" in detail_link_greater:
  1205. bondId_greater = detail_link_greater.split("bondId=")[1]
  1206. bondId_greater = bondId_greater.split(",") if bondId_greater else []
  1207. else:
  1208. bondId_greater = []
  1209. # print('bondId_less',bondId_less)
  1210. # print('bondId_greater',bondId_greater)
  1211. if bondId_less and bondId_greater:
  1212. bondId_less = set(bondId_less)
  1213. bondId_greater = set(bondId_greater)
  1214. if bondId_less.issubset(bondId_greater) or bondId_greater.issubset(bondId_less):
  1215. return 1
  1216. same_count = 0
  1217. all_count = 8
  1218. if len(set(project_codes_less) & set(project_codes_greater))>0:
  1219. same_count += 1
  1220. if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
  1221. same_count += 1
  1222. if getLength(agency_less)>0 and agency_less==agency_greater:
  1223. same_count += 1
  1224. if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
  1225. same_count += 1
  1226. if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
  1227. same_count += 1
  1228. if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
  1229. same_count += 1
  1230. if getLength(project_name_less)>0 and project_name_less==project_name_greater:
  1231. same_count += 1
  1232. if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
  1233. same_count += 1
  1234. _flag,_c1,_c2 = check_punish(punish_less,punish_greater)
  1235. if not _flag:
  1236. if b_log:
  1237. logging.info("check_punish failed")
  1238. return 0
  1239. else:
  1240. if b_log:
  1241. logging.info("check_punish true %d"%(_c1))
  1242. same_count += _c1
  1243. _flag,_c1,_c2 = check_approval(approval_less,approval_greater,b_log)
  1244. if not _flag:
  1245. if b_log:
  1246. logging.info("check approval failed")
  1247. return 0
  1248. else:
  1249. if b_log:
  1250. logging.info("check approval true %d"%(_c1))
  1251. same_count += _c1
  1252. _flag = check_source_type(source_type_less,source_type_greater)
  1253. if not _flag:
  1254. if b_log:
  1255. logging.info("check source type failed")
  1256. return 0
  1257. base_prob = 0
  1258. if min_counts<3:
  1259. base_prob = 0.9
  1260. elif min_counts<5:
  1261. base_prob = 0.8
  1262. elif min_counts<8:
  1263. base_prob = 0.7
  1264. else:
  1265. base_prob = 0.6
  1266. _prob = base_prob*same_count/all_count
  1267. if min(extract_count_less,extract_count_greater)<=3 and max(extract_count_less,extract_count_greater)>=5:
  1268. if _prob<0.1 and str(page_time_less)==str(page_time_greater):
  1269. if str(docchannel_less) not in ("302","303"):
  1270. _prob = 0.15
  1271. if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
  1272. if b_log:
  1273. logging.info("province not same:%s-%s"%(province_less,province_greater))
  1274. return 0
  1275. if _prob<0.1:
  1276. if b_log:
  1277. logging.info("prob too low:%f"%(_prob))
  1278. return _prob
  1279. check_result = {"pass":1}
  1280. if docchannel_less in (51,102,103,104,115,116,117):
  1281. if doctitle_refine_less!=doctitle_refine_greater:
  1282. if page_time_less!=page_time_greater:
  1283. check_result["docchannel"] = 0
  1284. check_result["pass"] = 0
  1285. else:
  1286. check_result["docchannel"] = 2
  1287. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
  1288. check_result["doctitle"] = 0
  1289. check_result["pass"] = 0
  1290. if b_log:
  1291. logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
  1292. else:
  1293. check_result["doctitle"] = 2
  1294. #added check
  1295. if not check_codes(project_codes_less,project_codes_greater):
  1296. check_result["code"] = 0
  1297. check_result["pass"] = 0
  1298. if b_log:
  1299. logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
  1300. else:
  1301. if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
  1302. check_result["code"] = 2
  1303. else:
  1304. check_result["code"] = 1
  1305. if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
  1306. check_result["product"] = 0
  1307. check_result["pass"] = 0
  1308. if b_log:
  1309. logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
  1310. else:
  1311. if getLength(product_less)>0 and getLength(product_greater)>0:
  1312. check_result["product"] = 2
  1313. else:
  1314. check_result["product"] = 1
  1315. if not check_demand():
  1316. check_result["pass"] = 0
  1317. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  1318. tenderee_less,tenderee_greater,
  1319. agency_less,agency_greater,
  1320. win_tenderer_less,win_tenderer_greater):
  1321. check_result["entity"] = 0
  1322. check_result["pass"] = 0
  1323. if b_log:
  1324. logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
  1325. else:
  1326. if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
  1327. check_result["entity"] = 2
  1328. elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
  1329. check_result["entity"] = 2
  1330. else:
  1331. check_result["entity"] = 1
  1332. if not check_money(bidding_budget_less,bidding_budget_greater,
  1333. win_bid_price_less,win_bid_price_greater,
  1334. moneys_less,moneys_greater,
  1335. moneys_attachment_less,moneys_attachment_greater):
  1336. if b_log:
  1337. logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
  1338. check_result["money"] = 0
  1339. check_result["pass"] = 0
  1340. else:
  1341. if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  1342. check_result["money"] = 2
  1343. elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  1344. check_result["money"] = 2
  1345. else:
  1346. check_result["money"] = 1
  1347. #added check
  1348. if not check_package(package_less,package_greater):
  1349. if b_log:
  1350. logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
  1351. check_result["package"] = 0
  1352. check_result["pass"] = 0
  1353. else:
  1354. if getLength(package_less)>0 and getLength(package_greater)>0:
  1355. check_result["package"] = 2
  1356. else:
  1357. check_result["package"] = 1
  1358. #added check
  1359. _time_check = check_time(json_time_less,json_time_greater)
  1360. if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
  1361. if b_log:
  1362. logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
  1363. if isinstance(json_time_less,dict):
  1364. time_less = json_time_less
  1365. else:
  1366. time_less = json.loads(json_time_less)
  1367. if isinstance(json_time_greater,dict):
  1368. time_greater = json_time_greater
  1369. else:
  1370. time_greater = json.loads(json_time_greater)
  1371. for k,v in time_less.items():
  1372. if getLength(v)>0:
  1373. v1 = time_greater.get(k,"")
  1374. if getLength(v1)>0:
  1375. if v!=v1:
  1376. logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
  1377. check_result["time"] = 0
  1378. check_result["pass"] = 0
  1379. else:
  1380. if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
  1381. check_result["time"] = 2
  1382. else:
  1383. check_result["time"] = 1
  1384. if hard_level==2 and check_result["product"]<=1:
  1385. if b_log:
  1386. logging.info("hard_level %s and check_product less than 2"%(str(hard_level)))
  1387. return 0
  1388. if check_result.get("pass",0)==0:
  1389. if b_log:
  1390. logging.info(str(check_result))
  1391. if check_result.get("money",1)==0:
  1392. return 0
  1393. if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
  1394. return _prob
  1395. else:
  1396. return 0
  1397. return _prob
  1398. def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
  1399. if web_source_no_less==web_source_no_greater:
  1400. if fingerprint_less==fingerprint_greater:
  1401. return 1
  1402. else:
  1403. return 0
  1404. if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
  1405. return 1
  1406. if isinstance(project_codes_less,str):
  1407. project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
  1408. elif project_codes_less is None:
  1409. project_codes_less = []
  1410. if isinstance(project_codes_greater,str):
  1411. project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
  1412. elif project_codes_greater is None:
  1413. project_codes_greater = []
  1414. same_count = 0
  1415. all_count = 8
  1416. if len(set(project_codes_less) & set(project_codes_greater))>0:
  1417. same_count += 1
  1418. if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
  1419. same_count += 1
  1420. if getLength(agency_less)>0 and agency_less==agency_greater:
  1421. same_count += 1
  1422. if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
  1423. same_count += 1
  1424. if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
  1425. same_count += 1
  1426. if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
  1427. same_count += 1
  1428. if getLength(project_name_less)>0 and project_name_less==project_name_greater:
  1429. same_count += 1
  1430. if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
  1431. same_count += 1
  1432. base_prob = 0
  1433. if min_counts<3:
  1434. base_prob = 0.9
  1435. elif min_counts<5:
  1436. base_prob = 0.8
  1437. elif min_counts<8:
  1438. base_prob = 0.7
  1439. else:
  1440. base_prob = 0.6
  1441. _prob = base_prob*same_count/all_count
  1442. if min(extract_count_less,extract_count_greater)<=3:
  1443. if _prob<0.1:
  1444. _prob = 0.15
  1445. if province_less!=province_greater:
  1446. return 0
  1447. if _prob<0.1:
  1448. return _prob
  1449. check_result = {"pass":1}
  1450. if docchannel_less in (51,102,103,104,115,116,117):
  1451. if doctitle_refine_less!=doctitle_refine_greater:
  1452. if page_time_less!=page_time_greater:
  1453. check_result["docchannel"] = 0
  1454. check_result["pass"] = 0
  1455. else:
  1456. check_result["docchannel"] = 2
  1457. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
  1458. check_result["doctitle"] = 0
  1459. check_result["pass"] = 0
  1460. if b_log:
  1461. logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
  1462. else:
  1463. check_result["doctitle"] = 2
  1464. #added check
  1465. if not check_codes(project_codes_less,project_codes_greater):
  1466. check_result["code"] = 0
  1467. check_result["pass"] = 0
  1468. if b_log:
  1469. logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
  1470. else:
  1471. if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
  1472. check_result["code"] = 2
  1473. else:
  1474. check_result["code"] = 1
  1475. if not check_product(product_less,product_greater):
  1476. check_result["product"] = 0
  1477. check_result["pass"] = 0
  1478. if b_log:
  1479. logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
  1480. else:
  1481. if getLength(product_less)>0 and getLength(product_greater)>0:
  1482. check_result["product"] = 2
  1483. else:
  1484. check_result["product"] = 1
  1485. if not check_demand():
  1486. check_result["pass"] = 0
  1487. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  1488. tenderee_less,tenderee_greater,
  1489. agency_less,agency_greater,
  1490. win_tenderer_less,win_tenderer_greater):
  1491. check_result["entity"] = 0
  1492. check_result["pass"] = 0
  1493. if b_log:
  1494. logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
  1495. else:
  1496. if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
  1497. check_result["entity"] = 2
  1498. elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
  1499. check_result["entity"] = 2
  1500. else:
  1501. check_result["entity"] = 1
  1502. if not check_money(bidding_budget_less,bidding_budget_greater,
  1503. win_bid_price_less,win_bid_price_greater):
  1504. if b_log:
  1505. logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
  1506. check_result["money"] = 0
  1507. check_result["pass"] = 0
  1508. else:
  1509. if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  1510. check_result["money"] = 2
  1511. elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  1512. check_result["money"] = 2
  1513. else:
  1514. check_result["money"] = 1
  1515. #added check
  1516. if not check_package(package_less,package_greater):
  1517. if b_log:
  1518. logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
  1519. check_result["package"] = 0
  1520. check_result["pass"] = 0
  1521. else:
  1522. if getLength(package_less)>0 and getLength(package_greater)>0:
  1523. check_result["package"] = 2
  1524. else:
  1525. check_result["package"] = 1
  1526. #added check
  1527. if not check_time(json_time_less,json_time_greater):
  1528. if b_log:
  1529. logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
  1530. if isinstance(json_time_less,dict):
  1531. time_less = json_time_less
  1532. else:
  1533. time_less = json.loads(json_time_less)
  1534. if isinstance(json_time_greater,dict):
  1535. time_greater = json_time_greater
  1536. else:
  1537. time_greater = json.loads(json_time_greater)
  1538. for k,v in time_less.items():
  1539. if getLength(v)>0:
  1540. v1 = time_greater.get(k,"")
  1541. if getLength(v1)>0:
  1542. if v!=v1:
  1543. logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
  1544. check_result["time"] = 0
  1545. check_result["pass"] = 0
  1546. else:
  1547. if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
  1548. check_result["time"] = 2
  1549. else:
  1550. check_result["time"] = 1
  1551. if hard_level==2 and check_result["product"]<=1:
  1552. return 0
  1553. if check_result.get("pass",0)==0:
  1554. if b_log:
  1555. logging.info(str(check_result))
  1556. if check_result.get("money",1)==0:
  1557. return 0
  1558. if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
  1559. return _prob
  1560. else:
  1561. return 0
  1562. if check_result.get("time",1)==0:
  1563. return 0
  1564. return _prob
  1565. @annotate("bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->double")
  1566. class f_dumplicate_check(BaseUDTF):
  1567. def __init__(self):
  1568. import logging
  1569. import json
  1570. global logging,json
  1571. def process(self,docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,
  1572. tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,
  1573. bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,
  1574. project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,
  1575. extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,
  1576. page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,
  1577. package_less,package_greater,json_time_less,json_time_greater,json_context,
  1578. province_less,province_greater,city_less,city_greater,district_less,district_greater,
  1579. web_source_no_less,web_source_no_greater,
  1580. extract_json_less,extract_json_greater,page_attachments_less,page_attachments_greater):
  1581. min_counts = 100
  1582. if json_context is not None:
  1583. _context = json.loads(json_context)
  1584. for item in _context:
  1585. if item.get("counts",0)>0 and item.get("counts",0)<min_counts:
  1586. min_counts = item["counts"]
  1587. _extract_less = {}
  1588. if extract_json_less is not None:
  1589. _extract_less = json.loads(extract_json_less)
  1590. _extract_less["docid"] = docid_less
  1591. _extract_less["win_tenderer"] = win_tenderer_less
  1592. _extract_less["win_bid_price"] = win_bid_price_less
  1593. _extract_less["bidding_budget"] = bidding_budget_less
  1594. _extract_less["product"] = product_less
  1595. _extract_less["page_attachments"] = page_attachments_less
  1596. _extract_less["page_time"] = page_time_less
  1597. _extract_less["fingerprint"] = fingerprint_less
  1598. _extract_less["project_codes"] = project_codes_less
  1599. _extract_less["tenderee"] = tenderee_less
  1600. _extract_less["agency"] = agency_less
  1601. _extract_less["docchannel"] = docchannel_less
  1602. _extract_less["project_name"] = project_name_less
  1603. _extract_less["doctitle_refine"] = doctitle_refine_less
  1604. _extract_less["province"] = province_less
  1605. _extract_less["city"] = city_less
  1606. _extract_less["district"] = district_less
  1607. _extract_less["web_source_no"] = web_source_no_less
  1608. _extract_less["extract_count"] = extract_count_less
  1609. _extract_less["json_time"] = json_time_less
  1610. _extract_less["nlp_enterprise"] = nlp_enterprise_less
  1611. _extract_less["package"] = package_less
  1612. _extract_greater = {}
  1613. if extract_json_greater is not None:
  1614. _extract_greater = json.loads(extract_json_greater)
  1615. _extract_greater["docid"] = docid_greater
  1616. _extract_greater["win_tenderer"] = win_tenderer_greater
  1617. _extract_greater["win_bid_price"] = win_bid_price_greater
  1618. _extract_greater["bidding_budget"] = bidding_budget_greater
  1619. _extract_greater["product"] = product_greater
  1620. _extract_greater["page_attachments"] = page_attachments_greater
  1621. _extract_greater["page_time"] = page_time_greater
  1622. _extract_greater["fingerprint"] = fingerprint_greater
  1623. _extract_greater["project_codes"] = project_codes_greater
  1624. _extract_greater["tenderee"] = tenderee_greater
  1625. _extract_greater["agency"] = agency_greater
  1626. _extract_greater["docchannel"] = docchannel_greater
  1627. _extract_greater["project_name"] = project_name_greater
  1628. _extract_greater["doctitle_refine"] = doctitle_refine_greater
  1629. _extract_greater["province"] = province_greater
  1630. _extract_greater["city"] = city_greater
  1631. _extract_greater["district"] = district_greater
  1632. _extract_greater["web_source_no"] = web_source_no_greater
  1633. _extract_greater["extract_count"] = extract_count_greater
  1634. _extract_greater["json_time"] = json_time_greater
  1635. _extract_greater["nlp_enterprise"] = nlp_enterprise_greater
  1636. _extract_greater["package"] = package_greater
  1637. moneys_less = set(_extract_less.get("moneys",[]))
  1638. moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
  1639. moneys_greater = set(_extract_greater.get("moneys",[]))
  1640. moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
  1641. if page_attachments_less is None:
  1642. page_attachments_less = '[]'
  1643. if page_attachments_greater is None:
  1644. page_attachments_greater = '[]'
  1645. punish_less = _extract_less.get("punish",{})
  1646. punish_greater = _extract_greater.get("punish",{})
  1647. approval_less = _extract_less.get("approval",[])
  1648. approval_greater = _extract_greater.get("approval",[])
  1649. _prob = check_dumplicate_rule(_extract_less,_extract_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater,punish_less = punish_less,punish_greater = punish_greater,approval_less = approval_less,approval_greater = approval_greater)
  1650. self.forward(_prob)
  1651. @annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
  1652. class f_dumplicate_featureMatrix(BaseUDTF):
  1653. def __init__(self):
  1654. import logging
  1655. import json
  1656. global logging,json
  1657. def process(self,json_context,docchannel_less,docchannel_greater,page_time_less,page_time_greater,nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,
  1658. agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
  1659. win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
  1660. bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater,product_less,product_greater):
  1661. #check the page_time by special docchannel
  1662. if docchannel_less in (51,102,103,104,115,116,117):
  1663. if doctitle_refine_less!=doctitle_refine_greater:
  1664. if page_time_less!=page_time_greater:
  1665. self.forward("[1-%s]"%(str(docchannel_less)),0)
  1666. return
  1667. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
  1668. self.forward("[2-%s]"%(str(doctitle_refine_less)+"=="+str(doctitle_refine_greater)),0)
  1669. return
  1670. # if not check_codes([project_code_less],[project_code_greater]):
  1671. # self.forward("[3-%s]"%(str(project_code_less)+"=="+str(project_code_greater)),0)
  1672. # return
  1673. if not check_demand():
  1674. self.forward("[4-]",0)
  1675. return
  1676. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  1677. tenderee_less,tenderee_greater,
  1678. agency_less,agency_greater,
  1679. win_tenderer_less,win_tenderer_greater):
  1680. _error = ""
  1681. for a in [nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater]:
  1682. _error += str(a)
  1683. self.forward("[5-%s]"%_error,0)
  1684. return
  1685. if not check_money(bidding_budget_less,bidding_budget_greater,
  1686. win_bid_price_less,win_bid_price_greater):
  1687. _error = ""
  1688. for a in [bidding_budget_less,bidding_budget_greater,
  1689. win_bid_price_less,win_bid_price_greater]:
  1690. _error += str(a)
  1691. self.forward("[6-%s]"%_error,0)
  1692. return
  1693. if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
  1694. _error = "%s=%s"%(str(product_less),str(product_greater))
  1695. self.forward("7-%s"%_error,0)
  1696. return
  1697. _context = json.loads(json_context)
  1698. min_counts = 100
  1699. dict_context = {}
  1700. for item in _context:
  1701. if item["counts"]<min_counts:
  1702. min_counts = item["counts"]
  1703. dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
  1704. context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
  1705. list_matrix = []
  1706. #get the featurn of the context into matrix
  1707. # for index_i in range(len(context_key)):
  1708. # for index_j in range(index_i+1,len(context_key)):
  1709. # _key = "%s&%s"%(context_key[index_i],context_key[index_j])
  1710. # _v = featurnCount(dict_context.get(_key,[0,0])[1])
  1711. # list_matrix.append(_v)
  1712. # context3_key = ["tenderee","agency","win_tenderer","win_bid_price","bidding_budget"]
  1713. # for index_i in range(len(context3_key)):
  1714. # for index_j in range(index_i+1,len(context3_key)):
  1715. # for index_k in range(index_j+1,len(context3_key)):
  1716. # _key = "%s&%s&%s"%(context3_key[index_i],context3_key[index_j],context3_key[index_k])
  1717. # _v = featurnCount(dict_context.get(_key,[0,0])[1])
  1718. # list_matrix.append(_v)
  1719. # list_matrix.append(getSimLevel(tenderee_less,tenderee_greater)/10)
  1720. # list_matrix.append(getSimLevel(agency_less,agency_greater)/10)
  1721. # list_matrix.append(getSimilarityOfString(project_code_less,project_code_greater))
  1722. # list_matrix.append(getSimilarityOfString(project_name_less,project_name_greater))
  1723. # list_matrix.append(getSimLevel(win_tenderer_less,win_tenderer_greater)/10)
  1724. # list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
  1725. # list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
  1726. # list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
  1727. json_matrix = json.dumps(list_matrix)
  1728. same_count = 0
  1729. all_count = 8
  1730. if getSimilarityOfString(project_code_less,project_code_greater)==1:
  1731. same_count += 1
  1732. if getSimilarityOfString(tenderee_less,tenderee_greater)==1:
  1733. same_count += 1
  1734. if getSimilarityOfString(agency_less,agency_greater)==1:
  1735. same_count += 1
  1736. if getSimilarityOfString(win_tenderer_less,win_tenderer_greater)==1:
  1737. same_count += 1
  1738. if getSimilarityOfString(bidding_budget_less,bidding_budget_greater)==1:
  1739. same_count += 1
  1740. if getSimilarityOfString(win_bid_price_less,win_bid_price_greater)==1:
  1741. same_count += 1
  1742. if getSimilarityOfString(project_name_less,project_name_greater)==1:
  1743. same_count += 1
  1744. if getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater)==1:
  1745. same_count += 1
  1746. base_prob = 0
  1747. if min_counts<3:
  1748. base_prob = 0.9
  1749. elif min_counts<5:
  1750. base_prob = 0.8
  1751. elif min_counts<8:
  1752. base_prob = 0.7
  1753. else:
  1754. base_prob = 0.6
  1755. _prob = base_prob*same_count/all_count
  1756. json_matrix = "[==%s]"%(str(base_prob)+"="+str(same_count)+"="+str(all_count)+str(product_less)+str(product_greater))
  1757. self.forward(json_matrix,_prob)
  1758. return
  1759. @annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double,string,string,string,string,string,string->string')
  1760. class f_redump_probability_final_check(BaseUDAF):
  1761. '''
  1762. 去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1763. 组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1764. '''
  1765. def __init__(self):
  1766. import logging
  1767. import json,re
  1768. global json,logging,re
  1769. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1770. def new_buffer(self):
  1771. return [list()]
  1772. def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_codes,project_name,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence,
  1773. province,city,district,web_source_no,extract_json,page_attachments):
  1774. buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
  1775. "project_codes":project_codes,"project_name":project_name,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
  1776. "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence,
  1777. "province":province,"city":city,"district":district,"web_source_no":web_source_no,"extract_json":extract_json,"page_attachments":page_attachments})
  1778. def merge(self, buffer, pbuffer):
  1779. buffer[0].extend(pbuffer[0])
  1780. def terminate(self, buffer):
  1781. list_group = []
  1782. the_group = buffer[0]
  1783. the_group.sort(key=lambda x:x["confidence"],reverse=True)
  1784. _index = 0
  1785. final_group = []
  1786. if len(the_group)>0:
  1787. _index = 0
  1788. while _index<len(the_group):
  1789. document_greater = the_group[_index]
  1790. docid_greater = document_greater["docid"]
  1791. docchannel_greater = document_greater["docchannel"]
  1792. page_time_greater = document_greater["page_time"]
  1793. doctitle_refine_greater = document_greater["doctitle_refine"]
  1794. project_codes_greater = document_greater["project_codes"]
  1795. nlp_enterprise_greater = document_greater["nlp_enterprise"]
  1796. tenderee_greater = document_greater["tenderee"]
  1797. agency_greater = document_greater["agency"]
  1798. win_tenderer_greater = document_greater["win_tenderer"]
  1799. bidding_budget_greater = document_greater["bidding_budget"]
  1800. win_bid_price_greater = document_greater["win_bid_price"]
  1801. product_greater = document_greater["product"]
  1802. package_greater = document_greater["package"]
  1803. json_time_greater = document_greater["json_dicttime"]
  1804. fingerprint_greater = document_greater.get("fingerprint","")
  1805. project_name_greater = document_greater["project_name"]
  1806. extract_count_greater = document_greater["extract_count"]
  1807. province_greater = document_greater["province"]
  1808. city_greater = document_greater["city"]
  1809. district_greater = document_greater["district"]
  1810. web_source_no_greater = document_greater["web_source_no"]
  1811. extract_json_greater = document_greater["extract_json"]
  1812. page_attachments_greater = document_greater["page_attachments"]
  1813. _pass = True
  1814. for document_less in final_group:
  1815. docid_less = document_less["docid"]
  1816. docchannel_less = document_less["docchannel"]
  1817. page_time_less = document_less["page_time"]
  1818. doctitle_refine_less = document_less["doctitle_refine"]
  1819. project_codes_less = document_less["project_codes"]
  1820. nlp_enterprise_less = document_less["nlp_enterprise"]
  1821. tenderee_less = document_less["tenderee"]
  1822. agency_less = document_less["agency"]
  1823. win_tenderer_less = document_less["win_tenderer"]
  1824. bidding_budget_less = document_less["bidding_budget"]
  1825. win_bid_price_less = document_less["win_bid_price"]
  1826. product_less = document_less["product"]
  1827. package_less = document_less["package"]
  1828. json_time_less = document_less["json_dicttime"]
  1829. fingerprint_less = document_less.get("fingerprint","")
  1830. project_name_less = document_less["project_name"]
  1831. extract_count_less = document_less["extract_count"]
  1832. province_less = document_less["province"]
  1833. city_less = document_less["city"]
  1834. district_less = document_less["district"]
  1835. web_source_no_less = document_less["web_source_no"]
  1836. extract_json_less = document_less["extract_json"]
  1837. page_attachments_less = document_less["page_attachments"]
  1838. _extract_less = {}
  1839. if extract_json_less is not None:
  1840. _extract_less = json.loads(extract_json_less)
  1841. _extract_greater = {}
  1842. if extract_json_greater is not None:
  1843. _extract_greater = json.loads(extract_json_greater)
  1844. moneys_less = set(_extract_less.get("moneys",[]))
  1845. moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
  1846. moneys_greater = set(_extract_greater.get("moneys",[]))
  1847. moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
  1848. if page_attachments_less is None:
  1849. page_attachments_less = '[]'
  1850. if page_attachments_greater is None:
  1851. page_attachments_greater = '[]'
  1852. punish_less = _extract_less.get("punish",{})
  1853. punish_greater = _extract_greater.get("punish",{})
  1854. approval_less = _extract_less.get("approval",[])
  1855. approval_greater = _extract_greater.get("approval",[])
  1856. _prob = check_dumplicate_rule(_extract_less,_extract_greater,len(the_group),b_log=False)
  1857. if _prob<0.1:
  1858. _pass = False
  1859. break
  1860. if _pass:
  1861. final_group.append(document_greater)
  1862. else:
  1863. break
  1864. _index += 1
  1865. dumplicates = ""
  1866. if _index>1:
  1867. logging.info("index/whole:%d/%d"%(_index,len(the_group)))
  1868. final_group.sort(key=lambda x:x["docid"])
  1869. final_group.sort(key=lambda x:x["extract_count"],reverse=True)
  1870. _set = set()
  1871. for _d in final_group:
  1872. _docid = _d["docid"]
  1873. if _docid in _set:
  1874. continue
  1875. dumplicates += "%d,"%_docid
  1876. _set.add(_docid)
  1877. dumplicates = dumplicates[:-1]
  1878. return dumplicates
@annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double->string')
class f_redump_probability_final_check_bak(BaseUDAF):
    '''
    Re-validate a duplicate group after dedup merging (translated docstring):
    when the group has more than 5 members, doctitle/tenderee/win_tenderer/
    bidding_budget may only take one value inside the group; with 5 or fewer
    members the constraint applies to tenderee/win_tenderer/bidding_budget.
    NOTE(review): the implementation below actually performs pairwise rule
    checks (title/code/product/entity/money/package/time) rather than the
    single-value constraint described above — docstring looks stale; confirm.
    '''
    def __init__(self):
        # Imports are published as module globals so sibling UDFs in this
        # module can share them (ODPS worker pattern used throughout the file).
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        # Aggregation buffer: one list of per-document dicts.
        return [list()]
    def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_code,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence):
        # NOTE(review): `newly` is accepted but intentionally not stored/used.
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
                          "project_code":project_code,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
                          "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        # NOTE(review): list_group is assigned but never used below.
        list_group = []
        the_group = buffer[0]
        # Most trusted documents first; the accepted prefix grows from them.
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        _index = 0
        if len(the_group)>0:
            _index = 1
            # Grow the accepted prefix the_group[:_index]; each new candidate
            # must be pairwise compatible with every already-accepted document.
            while _index<len(the_group):
                document_greater = the_group[_index]
                docchannel_greater = document_greater["docchannel"]
                page_time_greater = document_greater["page_time"]
                doctitle_refine_greater = document_greater["doctitle_refine"]
                project_code_greater = document_greater["project_code"]
                nlp_enterprise_greater = document_greater["nlp_enterprise"]
                tenderee_greater = document_greater["tenderee"]
                agency_greater = document_greater["agency"]
                win_tenderer_greater = document_greater["win_tenderer"]
                bidding_budget_greater = document_greater["bidding_budget"]
                win_bid_price_greater = document_greater["win_bid_price"]
                product_greater = document_greater["product"]
                package_greater = document_greater["package"]
                json_time_greater = document_greater["json_dicttime"]
                _less_index = 0
                while _less_index<_index:
                    document_less = the_group[_less_index]
                    docchannel_less = document_less["docchannel"]
                    page_time_less = document_less["page_time"]
                    doctitle_refine_less = document_less["doctitle_refine"]
                    project_code_less = document_less["project_code"]
                    nlp_enterprise_less = document_less["nlp_enterprise"]
                    tenderee_less = document_less["tenderee"]
                    agency_less = document_less["agency"]
                    win_tenderer_less = document_less["win_tenderer"]
                    bidding_budget_less = document_less["bidding_budget"]
                    win_bid_price_less = document_less["win_bid_price"]
                    product_less = document_less["product"]
                    package_less = document_less["package"]
                    json_time_less = document_less["json_dicttime"]
                    # Per-rule scores: 0 = conflict, 1 = unknown/missing, 2 = strong match.
                    check_result = {"pass":1}
                    # For these channels, differing refined titles are only
                    # tolerated when page_time matches.
                    if docchannel_less in (51,102,103,104,115,116,117):
                        if doctitle_refine_less!=doctitle_refine_greater:
                            if page_time_less!=page_time_greater:
                                check_result["docchannel"] = 0
                                check_result["pass"] = 0
                            else:
                                check_result["docchannel"] = 2
                    if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
                        check_result["doctitle"] = 0
                        check_result["pass"] = 0
                        logging.info("check_doctitle_failed:%s==%s"%(str(doctitle_refine_less),str(doctitle_refine_greater)))
                    else:
                        check_result["doctitle"] = 2
                    #added check
                    if not check_codes([project_code_less],[project_code_greater]):
                        check_result["code"] = 0
                        check_result["pass"] = 0
                        logging.info("check_code_failed:%s==%s"%(str(project_code_less),str(project_code_greater)))
                    else:
                        # Only an exact, non-empty code match counts as strong.
                        if getLength(project_code_less)>0 and getLength(project_code_greater)>0 and project_code_less==project_code_greater:
                            check_result["code"] = 2
                        else:
                            check_result["code"] = 1
                    if not check_product(product_less,product_greater):
                        check_result["product"] = 0
                        check_result["pass"] = 0
                        logging.info("check_product_failed:%s==%s"%(str(product_less),str(product_greater)))
                    else:
                        if getLength(product_less)>0 and getLength(product_greater)>0:
                            check_result["product"] = 2
                        else:
                            check_result["product"] = 1
                    # NOTE(review): check_demand() takes no arguments here —
                    # presumably a stub/global check; verify its definition.
                    if not check_demand():
                        check_result["pass"] = 0
                    if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                                        tenderee_less,tenderee_greater,
                                        agency_less,agency_greater,
                                        win_tenderer_less,win_tenderer_greater):
                        check_result["entity"] = 0
                        check_result["pass"] = 0
                        logging.info("check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
                    else:
                        # Tender-phase channels judge by tenderee; award-phase
                        # channels judge by win_tenderer.
                        if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
                            check_result["entity"] = 2
                        elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
                            check_result["entity"] = 2
                        else:
                            check_result["entity"] = 1
                    if not check_money(bidding_budget_less,bidding_budget_greater,
                                       win_bid_price_less,win_bid_price_greater):
                        logging.info("check_money_failed:%s==%s==%s==%s"%(str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
                        check_result["money"] = 0
                        check_result["pass"] = 0
                    else:
                        if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
                            check_result["money"] = 2
                        elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
                            check_result["money"] = 2
                        else:
                            check_result["money"] = 1
                    #added check
                    if not check_package(package_less,package_greater):
                        logging.info("check_package_failed:%s==%s"%(str(package_less),str(package_greater)))
                        check_result["package"] = 0
                        check_result["pass"] = 0
                    else:
                        if getLength(package_less)>0 and getLength(package_greater)>0:
                            check_result["package"] = 2
                        else:
                            check_result["package"] = 1
                    #added check
                    if not check_time(json_time_less,json_time_greater):
                        logging.info("check_time_failed:%s==%s"%(str(json_time_less),str(json_time_greater)))
                        check_result["time"] = 0
                        check_result["pass"] = 0
                    else:
                        # ">10" treats only substantive JSON time dicts as strong.
                        if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
                            check_result["time"] = 2
                        else:
                            check_result["time"] = 1
                    if check_result.get("pass",0)==0:
                        logging.info(str(check_result))
                        # Hard conflicts on time or money always reject the pair.
                        if check_result.get("time",1)==0:
                            break
                        if check_result.get("money",1)==0:
                            break
                        # Otherwise the pair is rescued only when entity, code,
                        # title and product all match strongly.
                        if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2:
                            pass
                        else:
                            break
                    _less_index += 1
                # Candidate failed against some accepted document: stop growing.
                if _less_index!=_index:
                    break
                _index += 1
        dumplicates = ""
        if _index>1:
            logging.info("index/whole:%d/%d"%(_index,len(the_group)))
            final_group = the_group[:_index]
            # Order by extract_count desc, docid asc (stable double sort).
            final_group.sort(key=lambda x:x["docid"])
            final_group.sort(key=lambda x:x["extract_count"],reverse=True)
            _set = set()
            for _d in final_group:
                _docid = _d["docid"]
                if _docid in _set:
                    continue
                dumplicates += "%d,"%_docid
                _set.add(_docid)
            # Strip trailing comma.
            dumplicates = dumplicates[:-1]
        return dumplicates
@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,string->string')
class f_set_docid_binaryChart(BaseUDAF):
    '''
    (Translated docstring) project_code, win_tenderer, len(project_code)>7,
    win_tenderer <> "" — groups "empty" documents (all key fields empty) with
    compatible non-empty documents inside 7-day time windows.
    '''
    def __init__(self):
        import json
        global json
    def new_buffer(self):
        return [[]]
    def iterate(self, buffer,docid, page_time_stamp,extract_count,project_code,project_name,tenderee,bidding_budget,win_tenderer,win_bid_price,agency,web_source_no):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,
                          "bidding_budget":bidding_budget,"win_tenderer":win_tenderer,"win_bid_price":win_bid_price,
                          "agency":agency,"web_source_no":web_source_no})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        list_docs = buffer[0]
        # Partition into 7-day time windows first.
        list_timeGroups = split_with_time(list_docs,"page_time_stamp",86400*7)
        list_group = []
        # A document is "empty" when all of these fields are empty.
        empty_key = ["project_code","bidding_budget","win_tenderer","win_bid_price","agency"]
        for _timeGroups in list_timeGroups:
            list_empty = []
            list_notEmpty = []
            for _item in _timeGroups:
                empty_flag = True
                for _key in empty_key:
                    if not isEmpty(_item[_key]):
                        empty_flag = False
                        break
                if empty_flag:
                    list_empty.append(_item)
                else:
                    list_notEmpty.append(_item)
            # Pair each empty document with at most one compatible non-empty one.
            for _e in list_empty:
                _group = [{"docid":_e["docid"],"extract_count":_e["extract_count"]}]
                _e_tenderee = _e["tenderee"]
                for _ne in list_notEmpty:
                    # Lazily track which web sources already claimed this
                    # non-empty doc; NOTE this mutates _ne across iterations
                    # of the outer _e loop (intentional de-duplication state).
                    if "set_webSource" not in _ne:
                        _ne["set_webSource"] = set()
                        _ne["set_webSource"].add(_ne["web_source_no"])
                    # Compatible when tenderees match, or the empty doc has
                    # no tenderee at all.
                    _suit = False
                    if not isEmpty(_e_tenderee) and _e_tenderee==_ne["tenderee"]:
                        _suit = True
                    elif isEmpty(_e_tenderee):
                        _suit = True
                    if _suit:
                        # Only one empty doc per web source may attach to _ne.
                        if _e["web_source_no"] not in _ne["set_webSource"]:
                            _ne["set_webSource"].add(_e["web_source_no"])
                            _group.append({"docid":_ne["docid"],"extract_count":_ne["extract_count"]})
                            break
                if len(_group)>1:
                    list_group.append(_group)
        return json.dumps(list_group)
  2101. def split_with_time(list_dict,sort_key,timedelta=86400*7):
  2102. if len(list_dict)>0:
  2103. if sort_key in list_dict[0]:
  2104. list_dict.sort(key=lambda x:x[sort_key])
  2105. list_group = []
  2106. _begin = 0
  2107. for i in range(len(list_dict)-1):
  2108. if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<=timedelta:
  2109. continue
  2110. else:
  2111. _group = []
  2112. for j in range(_begin,i+1):
  2113. _group.append(list_dict[j])
  2114. if len(_group)>1:
  2115. list_group.append(_group)
  2116. _begin = i + 1
  2117. if len(list_dict)>1:
  2118. _group = []
  2119. for j in range(_begin,len(list_dict)):
  2120. _group.append(list_dict[j])
  2121. if len(_group)>1:
  2122. list_group.append(_group)
  2123. return list_group
  2124. return [list_dict]
@annotate('bigint,bigint,bigint,string,string,string,string,string->string')
class f_set_docid_limitNum_contain(BaseUDAF):
    '''
    (Translated docstring) project_code, win_tenderer, len(project_code)>7,
    win_tenderer <> "", merged non-empty tenderee count < 2, merged amounts of
    the same announcement type identical — groups documents whose four limit
    columns agree and whose contain_column texts form a containment chain.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer,docid,page_time_stamp,extract_count,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,"set_limit_column1":set_limit_column1,
                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
                          "contain_column":contain_column})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        # Split into default 7-day time windows first.
        list_split = split_with_time(buffer[0],"page_time_stamp")
        list_group = []
        for _split in list_split:
            flag = True
            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
            # Each limit column may take at most one distinct value in the window.
            for _key in keys:
                logging.info(_key+str(getSet(_split,_key)))
                if len(getSet(_split,_key))>1:
                    flag = False
                    break
            MAX_CONTAIN_COLUMN = None
            # Check that every announcement's text is contained in (or contains)
            # the longest one seen so far. (translated comment)
            if flag:
                for _d in _split:
                    contain_column = _d["contain_column"]
                    if contain_column is not None and contain_column !="":
                        if MAX_CONTAIN_COLUMN is None:
                            MAX_CONTAIN_COLUMN = contain_column
                        else:
                            if len(MAX_CONTAIN_COLUMN)<len(contain_column):
                                if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
                                    flag = False
                                    break
                                MAX_CONTAIN_COLUMN = contain_column
                            else:
                                if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
                                    flag = False
                                    break
            if flag:
                if len(_split)>1:
                    _group = []
                    for _item in _split:
                        _group.append({"docid":_item["docid"],"extract_count":_item["extract_count"]})
                    list_group.append(_group)
        return json.dumps(list_group)
  2179. @annotate('bigint->string')
  2180. class f_stamp_squence(BaseUDAF):
  2181. '''
  2182. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2183. '''
  2184. def __init__(self):
  2185. import json
  2186. global json
  2187. import logging
  2188. global logging
  2189. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2190. def new_buffer(self):
  2191. return [set()]
  2192. def iterate(self, buffer,page_time_stamp):
  2193. buffer[0].add(page_time_stamp)
  2194. def merge(self, buffer, pbuffer):
  2195. buffer[0] |= pbuffer[0]
  2196. def terminate(self, buffer):
  2197. if 0 in buffer[0]:
  2198. buffer[0].remove(0)
  2199. list_stamp = list(buffer[0])
  2200. list_stamp.sort(key=lambda x:x)
  2201. list_stamp_final = []
  2202. _begin = 0
  2203. _time_decase = 86400*7
  2204. logging.info(str(list_stamp))
  2205. for _index in range(len(list_stamp)-1):
  2206. if list_stamp[_index+1]-list_stamp[_index]<_time_decase:
  2207. continue
  2208. else:
  2209. list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[_index]+_time_decase])
  2210. _begin = _index+1
  2211. if len(list_stamp)>0:
  2212. list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[-1]+_time_decase])
  2213. return json.dumps(list_stamp_final)
  2214. @annotate("bigint,string->bigint")
  2215. class in_stamp(object):
  2216. def __init__(self):
  2217. import logging
  2218. import re
  2219. import json
  2220. global logging,re,json
  2221. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2222. def evaluate(self, page_time_stamp,json_stamp):
  2223. list_stamp = json.loads(json_stamp)
  2224. int_flag = 0
  2225. for item in list_stamp:
  2226. if page_time_stamp <item[0]:
  2227. break
  2228. if page_time_stamp>item[0] and page_time_stamp<item[1]:
  2229. int_flag = 1
  2230. break
  2231. return int_flag
  2232. def getConfidence(rule_id):
  2233. if rule_id ==0:
  2234. return 30
  2235. elif rule_id >=1 and rule_id <30:
  2236. return 20
  2237. else:
  2238. return 10
  2239. @annotate('string,string -> string')
  2240. class f_splitStr(BaseUDTF):
  2241. '''
  2242. 将多个组拆解成多条记录
  2243. '''
  2244. def __init__(self):
  2245. import logging
  2246. import json
  2247. global json,logging
  2248. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2249. def process(self, str_split,_split):
  2250. try:
  2251. for _s in str_split.split(_split):
  2252. self.forward(_s)
  2253. except Exception as e:
  2254. pass
  2255. @annotate('string,bigint -> bigint,bigint,bigint,bigint,bigint')
  2256. class f_split_group_single(BaseUDTF):
  2257. '''
  2258. 将多个组拆解成多条记录
  2259. '''
  2260. def __init__(self):
  2261. import logging
  2262. import json
  2263. global json,logging
  2264. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2265. def process(self, json_set_docid,rule_id):
  2266. list_group = json.loads(json_set_docid)
  2267. for item in list_group:
  2268. if len(item)>100:
  2269. item.sort(key=lambda x:x["docid"],reverse=True)
  2270. index_i = 0
  2271. for index_j in range(1,len(item)):
  2272. if item[index_i]["docid"]!=item[index_j]["docid"]:
  2273. self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
  2274. else:
  2275. for index_i in range(len(item)):
  2276. for index_j in range(len(item)):
  2277. if index_i!=index_j and item[index_i]["docid"]!=item[index_j]["docid"]:
  2278. self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
  2279. @annotate('bigint,string->string')
  2280. class group_document(BaseUDAF):
  2281. '''
  2282. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2283. '''
  2284. def __init__(self):
  2285. import json
  2286. global json
  2287. def new_buffer(self):
  2288. return [[]]
  2289. def iterate(self, buffer,id,json_set_docid):
  2290. buffer[0].append({"id":id,"json_set_docid":json.loads(json_set_docid)})
  2291. def merge(self, buffer, pbuffer):
  2292. buffer[0].extend(pbuffer[0])
  2293. def terminate(self, buffer):
  2294. return json.dumps(buffer[0])
@annotate('bigint,string,bigint,string -> bigint,bigint,string')
class decare_document(BaseUDTF):
    '''
    Cartesian-join two collections of duplicate groups; for every pair of
    distinct groups sharing at least one docid, forward the merged docid list.
    (translated: "split multiple groups into multiple records")
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def process(self,group_id1, json_list_doc1,group_id2,json_list_doc2):
        # Only evaluate one triangle of the id matrix (y=x), cutting the work
        # roughly in half. (translated comment)
        if group_id1>=group_id2:
            list_doc1 = json.loads(json_list_doc1)
            list_doc2 = json.loads(json_list_doc2)
            for _doc1 in list_doc1:
                for _doc2 in list_doc2:
                    # Skip identical duplicate groups. (translated comment)
                    if _doc1["id"]!=_doc2["id"]:
                        # Do the two groups overlap? (translated comment)
                        _set1 = set()
                        for _item1 in _doc1["json_set_docid"]:
                            _set1.add(_item1["docid"])
                        _set2 = set()
                        for _item2 in _doc2["json_set_docid"]:
                            _set2.add(_item2["docid"])
                        if len(_set1&_set2)>0:
                            # NOTE(review): this aliases _doc1's own list, so
                            # docids appended here accumulate into _doc1 for
                            # later _doc2 iterations — presumably intentional
                            # (progressive merging); confirm before changing.
                            new_json_set_docid = _doc1["json_set_docid"]
                            for _item2 in _doc2["json_set_docid"]:
                                if _item2["docid"] not in _set1:
                                    new_json_set_docid.append(_item2)
                            self.forward(_doc1["id"],_doc2["id"],json.dumps(new_json_set_docid))
  2327. def getBestDocid(list_pair):
  2328. # [docid1,extract_count1,docid2,extract_count2]
  2329. # list_pair.sort(key=lambda x:x[3],reverse=True)
  2330. # _max_count = max(list_pair[0][3],list_pair[0][1])
  2331. # set_candidate = set()
  2332. # if list_pair[0][1]==_max_count:
  2333. # set_candidate.add(list_pair[0][0])
  2334. # for item in list_pair:
  2335. # if item[3]==_max_count:
  2336. # set_candidate.add(item[2])
  2337. # else:
  2338. # break
  2339. # list_candidate = list(set_candidate)
  2340. # list_candidate.sort(key=lambda x:x)
  2341. new_pair = []
  2342. new_pair.append([list_pair[0][0],list_pair[0][0],list_pair[0][1]])
  2343. for item in list_pair:
  2344. new_pair.append([item[0],item[2],item[3]])
  2345. new_pair.sort(key=lambda x:x[1])
  2346. new_pair.sort(key=lambda x:x[2],reverse=True)
  2347. return new_pair[0][1]
  2348. @annotate('bigint,bigint,bigint,bigint->string')
  2349. class choose_document(BaseUDAF):
  2350. '''
  2351. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2352. '''
  2353. def __init__(self):
  2354. import json
  2355. global json
  2356. def new_buffer(self):
  2357. return [[]]
  2358. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  2359. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  2360. def merge(self, buffer, pbuffer):
  2361. buffer[0].extend(pbuffer[0])
  2362. def terminate(self, buffer):
  2363. list_pair = buffer[0]
  2364. _set = set()
  2365. for item in buffer[0]:
  2366. _set.add(str(item[2]))
  2367. list_dumplicate = list(_set)
  2368. best_docid = getBestDocid(list_pair)
  2369. if best_docid==list_pair[0][0]:
  2370. save_flag = 1
  2371. else:
  2372. save_flag = 0
  2373. return json.dumps({"save_flag":save_flag,"dumplicates":list_dumplicate})
  2374. @annotate('string -> bigint,string')
  2375. class f_get_choose_document(BaseUDTF):
  2376. '''
  2377. 将多个组拆解成多条记录
  2378. '''
  2379. def __init__(self):
  2380. import logging
  2381. import json
  2382. global json,logging
  2383. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2384. def process(self,json_choose):
  2385. if json_choose is None:
  2386. self.forward(1,None)
  2387. else:
  2388. _choose = json.loads(json_choose)
  2389. self.forward(_choose["save_flag"],",".join(_choose["dumplicates"]))
  2390. @annotate('string->bigint')
  2391. class f_get_codes_count(object):
  2392. def evaluate(self,extract_json):
  2393. if extract_json is None or extract_json=="":
  2394. extract_json = "{}"
  2395. _extract = json.loads(extract_json)
  2396. _codes = _extract.get("code",[])
  2397. return len(_codes)
  2398. @annotate('string->string')
  2399. class f_get_codes(object):
  2400. def evaluate(self,extract_json):
  2401. if extract_json is None or extract_json=="":
  2402. extract_json = "{}"
  2403. _extract = json.loads(extract_json)
  2404. _codes = _extract.get("code",[])
  2405. return ",".join(_codes)
  2406. @annotate('bigint,bigint,bigint,bigint->string')
  2407. class group_document_bestFirst(BaseUDAF):
  2408. '''
  2409. 将组里面最优的放在前面
  2410. '''
  2411. def __init__(self):
  2412. import json
  2413. global json
  2414. def new_buffer(self):
  2415. return [[]]
  2416. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  2417. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  2418. def merge(self, buffer, pbuffer):
  2419. buffer[0].extend(pbuffer[0])
  2420. def terminate(self, buffer):
  2421. list_pair = buffer[0]
  2422. _set = set()
  2423. for item in buffer[0]:
  2424. _set.add(item[2])
  2425. _set.add(list_pair[0][0])
  2426. best_docid = getBestDocid(list_pair)
  2427. _set.remove(best_docid)
  2428. list_dumplicate = list(_set)
  2429. list_dumplicate.sort(key=lambda x:x)
  2430. list_dumplicate.insert(0,best_docid)
  2431. list_dumplicate_str = []
  2432. for item in list_dumplicate:
  2433. list_dumplicate_str.append(str(item))
  2434. return ",".join(list_dumplicate_str)
  2435. @annotate('string -> bigint,string')
  2436. class f_get_best_dumplicates(BaseUDTF):
  2437. '''
  2438. 得到每个分组中最优的那一条及其重复记录
  2439. '''
  2440. def __init__(self):
  2441. import logging
  2442. import json
  2443. global json,logging
  2444. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2445. def process(self,list_dumplicate_str):
  2446. if list_dumplicate_str is None or list_dumplicate_str=='':
  2447. pass
  2448. else:
  2449. list_dumplicate = list_dumplicate_str.split(",")
  2450. if len(list_dumplicate)>0:
  2451. self.forward(int(list_dumplicate[0]),",".join(list_dumplicate[1:]))
  2452. else:
  2453. pass
  2454. @annotate('bigint,bigint->string')
  2455. class bridge2group(BaseUDAF):
  2456. '''
  2457. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2458. '''
  2459. def __init__(self):
  2460. import json
  2461. global json
  2462. def new_buffer(self):
  2463. return [set()]
  2464. def iterate(self, buffer,docid1,docid2):
  2465. buffer[0].add(docid1)
  2466. buffer[0].add(docid2)
  2467. def merge(self, buffer, pbuffer):
  2468. buffer[0] |= pbuffer[0]
  2469. def terminate(self, buffer):
  2470. list_pair = list(buffer[0])
  2471. list_pair.sort(key=lambda x:x,reverse=True)
  2472. return json.dumps(list_pair)
  2473. @annotate('string -> bigint,bigint')
  2474. class group2bridge(BaseUDTF):
  2475. '''
  2476. 将多个组拆解成多条记录
  2477. '''
  2478. def __init__(self):
  2479. import logging
  2480. import json
  2481. global json,logging
  2482. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2483. def process(self,json_list_docid):
  2484. list_docid = json.loads(json_list_docid)
  2485. for _docid in list_docid:
  2486. self.forward(list_docid[-1],_docid)
  2487. @annotate('string->string')
  2488. class to_url(object):
  2489. def evaluate(self,_s):
  2490. if _s is None or _s=="":
  2491. return
  2492. else:
  2493. list_l = []
  2494. for l in _s.split(","):
  2495. list_l.append("http://www.bidizhaobiao.com/info-%s.html"%l)
  2496. return ",".join(list_l)
@annotate('bigint,bigint,string -> bigint')
class f_get_dump_docid(BaseUDTF):
    '''
    Emit the docids that are to be marked as duplicates.
    (translated: "split multiple groups into multiple records")
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def process(self,docid,save_flag,dumplicates):
        if save_flag==0:
            # The document itself is dropped...
            self.forward(docid)
            if dumplicates is not None:
                list_docid = dumplicates.split(",")
                if len(list_docid)>0:
                    # NOTE(review): this branch skips the first listed id while
                    # the else-branch below emits all of them — presumably the
                    # first duplicate replaces the dropped document as the kept
                    # one; confirm with the producer of `dumplicates`.
                    for _docid in list_docid[1:]:
                        self.forward(int(_docid))
        else:
            # Document is kept: all listed duplicates are dropped.
            if dumplicates is not None:
                list_docid = dumplicates.split(",")
                if len(list_docid)>0:
                    for _docid in list_docid:
                        self.forward(int(_docid))
  2521. @annotate('string -> bigint,bigint')
  2522. class f_get_docid(BaseUDTF):
  2523. '''
  2524. 将多个组拆解成多条记录
  2525. '''
  2526. def __init__(self):
  2527. import logging
  2528. import json
  2529. global json,logging
  2530. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2531. def process(self,json_set_docid):
  2532. team_id = 0
  2533. if json_set_docid is not None:
  2534. list_docses = json.loads(json_set_docid)
  2535. for list_docs in list_docses:
  2536. team_id += 1
  2537. for item in list_docs:
  2538. self.forward(team_id,item["docid"])
  2539. @annotate("string->bigint")
  2540. class get_count_dump(object):
  2541. def __init__(self):
  2542. import logging
  2543. import re
  2544. global logging,re
  2545. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2546. def evaluate(self, title):
  2547. _count = 0
  2548. if title is not None:
  2549. _count = len(title.split(","))
  2550. return _count
  2551. def getSet(list_dict,key):
  2552. _set = set()
  2553. for item in list_dict:
  2554. if key in item:
  2555. if item[key]!='' and item[key] is not None:
  2556. if re.search("^\d[\d\.]*$",item[key]) is not None:
  2557. _set.add(str(float(item[key])))
  2558. else:
  2559. _set.add(str(item[key]))
  2560. return _set
  2561. def getDiffIndex(list_dict,key,confidence=100):
  2562. '''
  2563. 优化为相似度判断
  2564. :param list_dict:
  2565. :param key:
  2566. :param confidence:
  2567. :return:
  2568. '''
  2569. # _set = set()
  2570. # for _i in range(len(list_dict)):
  2571. # item = list_dict[_i]
  2572. # if item["confidence"]>=confidence:
  2573. # continue
  2574. # if key in item:
  2575. # if item[key]!='' and item[key] is not None:
  2576. # if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  2577. # _set.add(str(float(item[key])))
  2578. # else:
  2579. # _set.add(str(item[key]))
  2580. # if len(_set)>1:
  2581. # return _i
  2582. # ==============================
  2583. _set = set()
  2584. _set_m = set()
  2585. base_s = ""
  2586. for _i in range(len(list_dict)):
  2587. item = list_dict[_i]
  2588. if item["confidence"]>=confidence:
  2589. continue
  2590. if key in item:
  2591. if item[key]!='' and item[key] is not None:
  2592. if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  2593. _m = float(item[key])
  2594. if _m>100000:
  2595. _m = _m//10000*10000
  2596. _set_m.add(str(_m))
  2597. else:
  2598. _s = str(item[key])
  2599. if base_s=="":
  2600. base_s = _s
  2601. else:
  2602. simi = getSimilarityOfString(base_s,_s)
  2603. if simi<0.8:
  2604. return _i
  2605. if len(_set_m)>1:
  2606. return _i
  2607. return len(list_dict)
  2608. @annotate('bigint,string -> bigint,bigint')
  2609. class f_getGroup_dumpFinal(BaseUDTF):
  2610. '''
  2611. 从最后的结果中获取组
  2612. '''
  2613. def __init__(self):
  2614. import logging
  2615. import json
  2616. global json,logging
  2617. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2618. def process(self,docid,dumplicates):
  2619. self.forward(int(docid),int(docid))
  2620. if dumplicates is not None:
  2621. list_docids = dumplicates.split(",")
  2622. for _docid in list_docids:
  2623. self.forward(int(docid),int(_docid))
  2624. @annotate('bigint,bigint,string,string,string,string,bigint,bigint,bigint->string')
  2625. class f_redump_limit_num(BaseUDAF):
  2626. '''
  2627. 去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
  2628. 组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
  2629. '''
  2630. def __init__(self):
  2631. import logging
  2632. import json,re
  2633. global json,logging,re
  2634. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2635. def new_buffer(self):
  2636. return [list()]
  2637. def iterate(self, buffer,main_docid,docid,doctitle,set_limit_column2,set_limit_column3,set_limit_column4,extract_count1,extract_count2,confidence):
  2638. buffer[0].append({"main_docid":main_docid,"docid":docid,"doctitle":doctitle,"set_limit_column2":set_limit_column2,
  2639. "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,
  2640. "extract_count2":extract_count2,"confidence":confidence})
  2641. def merge(self, buffer, pbuffer):
  2642. buffer[0].extend(pbuffer[0])
  2643. def terminate(self, buffer):
  2644. list_group = []
  2645. the_group = buffer[0]
  2646. the_group.sort(key=lambda x:x["confidence"],reverse=True)
  2647. if len(the_group)>5:
  2648. keys = ["doctitle","set_limit_column2","set_limit_column3","set_limit_column4"]
  2649. else:
  2650. keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
  2651. final_group = []
  2652. #置信度
  2653. list_key_index = []
  2654. for _k in keys:
  2655. if _k=="doctitle":
  2656. list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
  2657. else:
  2658. list_key_index.append(getDiffIndex(the_group,_k))
  2659. _index = min(list_key_index)
  2660. if _index>1:
  2661. main_docid = the_group[0]["main_docid"]
  2662. for item in the_group[:_index]:
  2663. if item["docid"]!=main_docid:
  2664. final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"],"confidence":item["confidence"]})
  2665. # stay = True
  2666. # for _key in keys:
  2667. # if len(getSet(the_group,_key))>1:
  2668. # stay = False
  2669. # break
  2670. #
  2671. # if stay:
  2672. # main_docid = the_group[0]["main_docid"]
  2673. # for item in the_group:
  2674. # if item["docid"]!=main_docid:
  2675. # final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"]})
  2676. return json.dumps(final_group)
  2677. @annotate('string -> bigint,bigint,bigint,bigint,bigint')
  2678. class f_get_dumpFinal_checked(BaseUDTF):
  2679. '''
  2680. 从最后的结果中获取组
  2681. '''
  2682. def __init__(self):
  2683. import logging
  2684. import json
  2685. global json,logging
  2686. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2687. def process(self,list_group):
  2688. if list_group is not None:
  2689. final_group = json.loads(list_group)
  2690. for _group in final_group:
  2691. self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"],_group["confidence"])
  2692. @annotate('string -> bigint')
  2693. class f_getDumplicateDocids(BaseUDTF):
  2694. '''
  2695. 从最后的结果中获取组
  2696. '''
  2697. def __init__(self):
  2698. import logging
  2699. import json
  2700. global json,logging
  2701. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2702. def process(self,dumplicates):
  2703. list_docids = dumplicates.split(",")
  2704. for _d in list_docids:
  2705. self.forward(int(_d))
  2706. def jaccard_score(source,target):
  2707. source_set = set([s for s in source])
  2708. target_set = set([s for s in target])
  2709. if len(source_set)==0 or len(target_set)==0:
  2710. return 0
  2711. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  2712. def getSimilarityOfString(str1,str2):
  2713. _set1 = set()
  2714. _set2 = set()
  2715. if str1 is not None:
  2716. for i in range(1,len(str1)):
  2717. _set1.add(str1[i-1:i+1])
  2718. for i in range(2,len(str1)):
  2719. _set1.add(str1[i-2:i+1])
  2720. if str2 is not None:
  2721. for i in range(1,len(str2)):
  2722. _set2.add(str2[i-1:i+1])
  2723. for i in range(2,len(str2)):
  2724. _set2.add(str2[i-2:i+1])
  2725. _len = max(1,min(len(_set1),len(_set2)))
  2726. return len(_set1&_set2)/_len
  2727. @annotate("string,string,string,string,string,string,string,string,string,string->bigint")
  2728. class f_is_legal(object):
  2729. def __init__(self):
  2730. import logging
  2731. import re
  2732. global logging,re
  2733. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2734. def evaluate(self, tenderee1,tenderee2,bidding_budget1,budding_budget2,win_tenderee1,win_tenderee2,win_bid_price1,win_bid_price2,project_code1,project_code2):
  2735. if tenderee1 is not None and tenderee1!="" and tenderee2 is not None and tenderee2!="" and tenderee1!=tenderee2:
  2736. return 0
  2737. if bidding_budget1 is not None and bidding_budget1!="" and budding_budget2 is not None and budding_budget2!="" and bidding_budget1!=budding_budget2:
  2738. return 0
  2739. if win_tenderee1 is not None and win_tenderee1!="" and win_tenderee2 is not None and win_tenderee2!="" and win_tenderee1!=win_tenderee2:
  2740. return 0
  2741. if win_bid_price1 is not None and win_bid_price1!="" and win_bid_price2 is not None and win_bid_price2!="" and win_bid_price1!=win_bid_price2:
  2742. return 0
  2743. _sim = getSimilarityOfString(project_code1,project_code2)
  2744. if _sim>0.7 and _sim<1:
  2745. return 0
  2746. return 1
@annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint->string')
class f_autorule_group(BaseUDAF):
    '''
    Re-check a merged duplicate group and mine the "agreement rules" among
    its surviving members. (Original note: with more than 5 members,
    doctitle/tenderee/win_tenderer/bidding_budget may each hold only one
    value inside the group; with 5 or fewer, doctitle is not constrained.)
    Returns a JSON list of [rule, docid1, docid2] triples.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        # Single-element buffer holding the accumulated row dicts.
        return [list()]
    def iterate(self, buffer,main_docid,docid,docchannel,doctitle,doctitle_refine,area,province,city,district,web_source_no,fingerprint,
                project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count1,extract_count2,confidence):
        # One dict per input row; the key names mirror the UDAF arguments
        # and are relied upon by getSameKeys/terminate below.
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"doctitle":doctitle,
                          "doctitle_refine":doctitle_refine,"area":area,"province":province,
                          "city":city,"district":district,"web_source_no":web_source_no,"fingerprint":fingerprint,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,"agency":agency,
                          "win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price,
                          "extract_count1":extract_count1,"extract_count2":extract_count2,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        # Cap the group at 100 rows to bound the cost of the O(n^2) pairing
        # done in terminate().
        buffer[0].extend(pbuffer[0][:100])
        buffer[0] = buffer[0][:100]
    def getSameKeys(self,_dict1,_dict2):
        # Build a "rule": the sorted, '='-joined names of the fields on which
        # the two rows agree, skipping location/bookkeeping fields.
        list_keys = []
        for k,v in _dict1.items():
            if k in ["area","city","confidence","district","extract_count1","extract_count2","main_docid","province"]:
                continue
            v2 = _dict2.get(k,"")
            # A field counts only when both sides are non-empty and equal.
            if v is not None and v!="" and v2 is not None and v2!="" and v==v2:
                list_keys.append(k)
        list_keys.sort(key=lambda x:x)
        return "=".join(list_keys)
    def terminate(self, buffer):
        list_group = []  # NOTE(review): unused; kept for byte-identity
        the_group = buffer[0]
        # Highest confidence first, so the agreeing prefix keeps the most
        # trusted rows.
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        if len(the_group)>5:
            keys = ["doctitle","tenderee","win_tenderer","bidding_budget","win_bid_price"]
        else:
            keys = ["tenderee","win_tenderer","bidding_budget","win_bid_price"]
        # confidence: first index at which any constrained column diverges;
        # doctitle uses a lower confidence cut-off (30).
        list_key_index = []
        for _k in keys:
            if _k=="doctitle":
                list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
            else:
                list_key_index.append(getDiffIndex(the_group,_k))
        final_group = []
        _index = min(list_key_index)
        if _index>1:
            for item in the_group[:_index]:
                final_group.append(item)
        # Emit one rule per unordered pair of surviving rows.
        list_rules = []
        for i in range(len(final_group)):
            for j in range(i+1,len(final_group)):
                _dict1 = final_group[i]
                _dict2 = final_group[j]
                _rule = self.getSameKeys(_dict1,_dict2)
                list_rules.append([_rule,_dict1.get("docid"),_dict2.get("docid")])
        return json.dumps(list_rules)
  2809. @annotate('string -> string,bigint,bigint')
  2810. class f_autorule_group_extract(BaseUDTF):
  2811. '''
  2812. 从最后的结果中获取组
  2813. '''
  2814. def __init__(self):
  2815. import logging
  2816. import json
  2817. global json,logging
  2818. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2819. def process(self,rules_json):
  2820. list_rules = json.loads(rules_json)
  2821. for _rule in list_rules:
  2822. self.forward(_rule[0],_rule[1],_rule[2])
  2823. if __name__ == '__main__':
  2824. # f = f_decode_for_dumplicate()
  2825. # b = f.process('[{}]','{ "attachmentTypes": "", "bidway": "", "candidate": "", "code": [], "cost_time": { "attrs": 0.0, "codename": 0.03, "deposit": 0.0, "district": 0.03, "moneygrade": 0.0, "nerToken": 0.06, "person": 0.0, "prem": 0.02, "preprocess": 0.1, "product": 0.04, "product_attrs": 0.01, "roleRuleFinal": 0.0, "rolegrade": 0.0, "rule": 0.0, "rule_channel": 0.05, "tableToText": 0.030002145767211913, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "district": { "area": "华东", "city": "厦门", "district": "未知", "is_in_text": false, "province": "福建" }, "docchannel": { "docchannel": "招标公告", "doctype": "采招数据", "life_docchannel": "招标公告" }, "docid": "", "doctitle_refine": "C70U264COM6项目所需直流屏", "exist_table": 1, "extract_count": 1, "fail_reason": "", "fingerprint": "md5=3da15e8c6f69a1d766bfe155092b1638", "industry": { "class": "零售批发", "class_name": "广播、电视、电影设备", "subclass": "通用设备" }, "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "C70U264COM6项目所需直流屏", "nlp_enterprise": [], "nlp_enterprise_attachment": [], "person_review": [], "prem": {}, "process_time": "2022-12-08 04:43:18", "product": [ "直流屏" ], "product_attrs": { "data": [ { "brand": "", "product": "直流屏65AH", "quantity": "1.0", "quantity_unit": "台", "specs": "带逆变,蓄电池采用原装进口免维护蓄电池(必须是原产地进口,注明电池进口产地)等,由供应商负责采购,使用寿命10年及以上", "unitPrice": "" } ], "header": [ "产品名称_产品数量____产品规格" ], "header_col": [ "产品名称_产品编号_产品规格_产品材质_产品数量_备注" ] }, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "", "time_getFileStart": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", 
"version_date": "2022-11-24" }','')
  2826. # print(b)
  2827. print(check_doctitle(doctitle_refind_less="山西银行晋城分行对A公司清算处置审计服务项目供应商征集公告",doctitle_refind_greater="山西银行晋城分行对B公司清算处置审计服务项目供应商征集公告"))
  2828. # f = f_get_extractCount()
  2829. # j = '''{ "attachmentTypes": "", "bidway": "", "candidate": "湖南省金达工程建设有限公司", "code": [ "丰汇-YCYZ2022-001-1" ], "cost_time": { "attrs": 0.33, "codename": 0.14, "deposit": 0.0, "district": 0.02, "moneygrade": 0.0, "nerToken": 0.27, "person": 0.01, "prem": 0.06, "preprocess": 0.71, "product": 0.15, "product_attrs": 0.02, "roleRuleFinal": 0.0, "rolegrade": 0.0, "rule": 0.0, "rule_channel": 0.26, "tableToText": 0.11000882148742676, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "district": { "area": "华东", "city": "宜春", "district": "袁州", "is_in_text": false, "province": "江西" }, "docchannel": { "docchannel": "中标信息", "doctype": "采招数据", "life_docchannel": "中标信息" }, "docid": "", "doctitle_refine": "2022年宜春市袁州区县乡村道安全生命防护项目(二)(第二次)", "exist_table": 1, "extract_count": 6, "fail_reason": "", "fingerprint": "md5=23e9e56f2a6ec0c73e1838670e630948", "industry": { "class": "建筑业", "class_name": "其他土木工程建筑", "subclass": "土木工程建筑业" }, "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "2022年宜春市袁州区县乡村道安全生命防护工程采购项目", "nlp_enterprise": [ "湖南省金达工程建设有限公司", "丰汇国际项目管理有限公司" ], "nlp_enterprise_attachment": [], "person_review": [ "宋明勇", "刘定良", "张来弟", "许卫秀", "宋明勇", "刘定良", "张来弟", "许卫秀" ], "prem": { "Project": { "code": "", "roleList": [ { "address": "宜春市袁州区明月袁山中路356号", "linklist": [ [ "胡柯", "13766445188" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "丰汇国际项目管理有限公司", "serviceTime": "" }, { "address": "湖南省长沙市开福区中山路589号开福万达广场C区2号写字楼", "linklist": [ [ "刘华夏", "18570640155" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": "4351680.70", "money_unit": "元" }, "role_name": "win_tenderer", "role_text": "湖南省金达工程建设有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" } }, 
"process_time": "2023-02-28 02:04:42", "product": [ "安全生命防护工程" ], "product_attrs": { "data": [ { "brand": "详见开标一览表明细", "product": "2022年宜春市袁州区县乡村道安全生命防护工程采购项目", "quantity": "1", "quantity_unit": "", "specs": "详见开标一览表明细", "unitPrice": "4351680.7" } ], "header": [ "名称_数量__单价_品牌_规格型号" ], "header_col": [ "名称_品牌_规格型号_数量_单价" ] }, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "", "time_getFileStart": "", "time_listingEnd": "", "time_listingStart": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2023-02-28", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", "version_date": "2023-02-20" }'''
  2830. # print(f.evaluate(j))
  2831. # _str1 = "PMJJ-202211030004001"
  2832. # _str2 = "PMJJ-202211030001001"
  2833. # print(getSimilarityOfString(_str1,_str2))
  2834. # print(check_doctitle("强化桂城街道工地扬尘防控监管巡查第三方(二次)","广东省强化桂城街道工地扬尘防控监管巡查第三方(二次)"))
  2835. # print(check_codes(["F-2022-027(MASCG-2-F-F-2022-0462)"],["F-2022-027(MASCG-2-F-F-2022-0462)"]))
  2836. # print(check_product(None,None))
  2837. # print(check_code("4451020073383382206021325","4451020073383382206021322"))
  2838. # print(check_money("550.0","440.0","",""))
  2839. # for i in range(0,2):
  2840. # print(i)
  2841. # location_pattern = re.compile(".{1,2}市|.{1,2}区|.{1,2}镇|.{1,2}县|.{1,2}村")
  2842. # print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇前路富代都村示范点农用塑料薄膜棚)"))
  2843. # print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇天湖村粮蔬基地农用塑料薄膜棚)"))
  2844. # package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))") # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
  2845. # _match = re.search(package_number_pattern,"2021年盘山县高标准农田建设项目三标段(高升街道)开标记录")
  2846. # if _match is not None:
  2847. # print(_match.groupdict()["name"])
  2848. # print(re.findall("((标[段号的包])[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4})","[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ"))
  2849. # print(check_doctitle("[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ","桂林银行南宁办公大楼装修工程标段ⅡGXYLG20182005-N中标公告"))
  2850. # c = f_get_extractCount()
  2851. # _json = '''
  2852. # { "attachmentTypes": "", "bidway": "", "code": [ "LCQTCG-2022-313" ], "cost_time": { "attrs": 0.02, "codename": 0.16, "deposit": 0.0, "nerToken": 0.8400000000000001, "person": 0.01, "prem": 0.02, "preprocess": 0.96, "product": 0.12, "product_attrs": 0.01, "punish": 0.11, "roleRuleFinal": 0.0, "rule": 0.0, "rule_channel": 0.0, "tableToText": 0.09000381469726562, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "docchannel": { "docchannel": "招标公告", "doctype": "采招数据" }, "docid": "", "doctitle_refine": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设(一标段)膨胀剂(暂估价)项目", "exist_table": 1, "extract_count": 5, "fail_reason": "", "fingerprint": "md5=b1ab0ee9cf9e1c5acc17477b9c0433cc", "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设工程(一标段)膨胀剂(暂估价)采购项目", "nlp_enterprise": [ "中建八局第一建设有限公司", "山东东岳项目管理有限公司", "聊城市公共资源交易中心", "江苏国泰新点软件有限公司" ], "person_review": [], "prem": { "Project": { "code": "", "roleList": [ { "linklist": [ [ "", "15540110649" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "tenderee", "role_text": "中建八局第一建设有限公司", "serviceTime": "" }, { "linklist": [ [ "武工", "0635-2992305" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "山东东岳项目管理有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" }, "一": { "code": "", "roleList": [], "tendereeMoney": 3267000.0, "tendereeMoneyUnit": "万元" } }, "process_time": "2022-05-30 14:31:13", "product": [ "枢纽功能区建设工程", "膨胀剂", "配套基础设施建设" ], "product_attrs": { "data": [], "header": [], "header_col": [] }, "serviceTime": "", "success": true, "time_bidclose": "2022-06-16", "time_bidopen": "2022-06-16", "time_bidstart": "", "time_commencement": "", "time_completion": 
"", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "2022-06-01", "time_getFileStart": "2022-05-26", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2022-05-25", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "" }
  2853. # '''
  2854. # c = f_get_nlp_enterprise()
  2855. # print(c.evaluate("山东东岳项目管理有限公司",_json))
  2856. # print(c.evaluate(_json))
  2857. # c = f_set_docid()
  2858. # _s = '''
  2859. # 154064190 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2860. # 154064188 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2861. # 154064175 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2862. # 30201228 1512489600 4 04111-1 1 大连市妇女儿童医疗中心
  2863. # 154064160 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2864. # 154064168 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2865. # '''
  2866. # buffer = c.new_buffer()
  2867. # for _line in _s.split("\n"):
  2868. # _line = _line.strip()
  2869. # if _line=="":
  2870. # continue
  2871. # l_column = _line.split("\t")
  2872. # print(l_column)
  2873. # docid,page_time_stamp,extract_count,web_source_no,num,tenderee = l_column
  2874. # page_time_stamp = int(page_time_stamp)
  2875. # extract_count = int(extract_count)
  2876. # num = 1
  2877. # c.iterate(buffer,docid,page_time_stamp,extract_count,web_source_no,num,tenderee)
  2878. # print(c.terminate(buffer))