# documentDumplicate.py — ODPS UDF/UDTF/UDAF definitions for document duplicate detection

  1. #coding:UTF8
  2. from odps.udf import annotate
  3. from odps.udf import BaseUDTF
  4. from odps.udf import BaseUDAF
  5. import re
  6. @annotate('string,string -> string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string')
  7. class f_decode_extract(BaseUDTF):
  8. def __init__(self):
  9. import logging
  10. import json
  11. import time,re
  12. global json,logging,time,re
  13. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  14. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  15. self.dict_channel = {"公告变更":51,
  16. "招标公告":52,
  17. "中标信息":101,
  18. "招标预告":102,
  19. "招标答疑":103,
  20. "资审结果":105,
  21. "法律法规":106,
  22. "新闻资讯":107,
  23. "采购意向":114,
  24. "拍卖出让":115,
  25. "土地矿产":116,
  26. "产权交易":117,
  27. "废标公告":118,
  28. "候选人公示":119,
  29. "合同公告":120}
  30. def process(self, extractjson,otherjson):
  31. if extractjson is not None:
  32. _extract = json.loads(extractjson)
  33. else:
  34. _extract = {}
  35. if otherjson is not None:
  36. _other = json.loads(otherjson)
  37. else:
  38. _other = {}
  39. project_code = ""
  40. project_name = ""
  41. tenderee = ""
  42. agency = ""
  43. win_tenderer = ""
  44. bidding_budget = ""
  45. win_bid_price = ""
  46. fingerprint = ""
  47. page_time_stamp = 0
  48. docchannel = 0
  49. extract_count = 0
  50. page_time = _other.get("pageTime",time.strftime('%Y-%m-%d',time.localtime()))
  51. doctitle = _other.get("doctitle","")
  52. doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', doctitle)
  53. area = _other.get("area","")
  54. province = _other.get("province","")
  55. city = _other.get("city","")
  56. district = _other.get("district","")
  57. web_source_no = _other.get("webSourceNo","")
  58. time_bidclose = _extract.get("time_bidclose")
  59. time_bidopen = _extract.get("time_bidopen")
  60. time_bidstart = _extract.get("time_bidstart")
  61. time_commencement = _extract.get("time_commencement")
  62. time_completion = _extract.get("time_completion")
  63. time_earnest_money_end = _extract.get("time_earnestMoneyEnd")
  64. time_earnest_money_start = _extract.get("time_earnestMoneyStart")
  65. time_get_file_end = _extract.get("time_getFileEnd")
  66. time_get_file_start = _extract.get("time_getFileStart")
  67. time_publicity_end = _extract.get("time_publicityEnd")
  68. time_publicity_start = _extract.get("time_publicityStart")
  69. time_registration_end = _extract.get("time_registrationEnd")
  70. time_registration_start = _extract.get("time_registrationStart")
  71. time_release = _extract.get("time_release")
  72. # docchannel = _other.get("docchannel",0)
  73. docchannel_name = _extract.get("docchannel",{}).get("docchannel")
  74. doctype_name = _extract.get("docchannel",{}).get("doctype")
  75. if doctype_name in ["法律法规","新闻资讯","拍卖出让","土地矿产"]:
  76. docchannel_name = doctype_name
  77. docchannel = self.dict_channel.get(docchannel_name,0)
  78. if re.search(self.time_pattern,page_time) is not None:
  79. try:
  80. timeArray = time.strptime(page_time[:11], "%Y-%m-%d")
  81. page_time_stamp = int(time.mktime(timeArray))
  82. except Exception as e:
  83. pass
  84. list_code = _extract.get("code",[])
  85. if len(list_code)>0:
  86. project_code = list_code[0]
  87. project_name = _extract.get("name","")
  88. fingerprint = _extract.get("fingerprint","")
  89. dict_pack = _extract.get("prem",{})
  90. logging.info(dict_pack)
  91. for _key in dict_pack.keys():
  92. if dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
  93. extract_count += 1
  94. if bidding_budget=="":
  95. bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
  96. for _role in dict_pack[_key]["roleList"]:
  97. if isinstance(_role,list):
  98. extract_count += 1
  99. if _role[2]!='' and float(_role[2])>0:
  100. extract_count += 1
  101. if _role[0]=="tenderee":
  102. tenderee = _role[1]
  103. if _role[0]=="win_tenderer":
  104. if win_tenderer=="":
  105. win_tenderer = _role[1]
  106. if _role[2]!='' and float(_role[2])>0:
  107. extract_count += 1
  108. if win_bid_price=="":
  109. win_bid_price = str(float(_role[2]))
  110. if _role[0]=="agency":
  111. agency = _role[1]
  112. if isinstance(_role,dict):
  113. extract_count += 1
  114. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  115. extract_count += 1
  116. if _role["role_name"]=="tenderee":
  117. tenderee = _role["role_text"]
  118. if _role["role_name"]=="win_tenderer":
  119. if win_tenderer=="":
  120. win_tenderer = _role["role_text"]
  121. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  122. extract_count += 1
  123. if win_bid_price=="":
  124. win_bid_price = str(float(_role["role_money"]["money"]))
  125. if _role["role_name"]=="agency":
  126. agency = _role["role_text"]
  127. if project_code!="":
  128. extract_count += 1
  129. if project_name!="":
  130. extract_count += 1
  131. logging.info(page_time+doctitle+doctitle_refine+area+province+city+
  132. district+web_source_no+project_code+project_name+tenderee+agency+win_tenderer+bidding_budget+win_bid_price)
  133. self.forward(page_time,page_time_stamp,docchannel,doctitle,doctitle_refine,area,province,city,
  134. district,web_source_no,fingerprint,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,
  135. time_bidclose,time_bidopen,time_bidstart,time_commencement,time_completion,time_earnest_money_end,time_earnest_money_start,
  136. time_get_file_end,time_get_file_start,time_publicity_end,time_publicity_start,time_registration_end,time_registration_start,time_release)
  137. @annotate("string->string")
  138. class f_get_product(object):
  139. def __init__(self):
  140. import time
  141. global time
  142. import logging
  143. import json
  144. import re
  145. global json,logging,re
  146. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  147. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  148. def evaluate(self, extractjson):
  149. if extractjson is None or extractjson=="":
  150. extractjson = "{}"
  151. _extract = json.loads(extractjson)
  152. return ",".join(_extract.get("product",[]))
  153. @annotate("string->string")
  154. class f_get_package(object):
  155. def __init__(self):
  156. import time
  157. global time
  158. import logging
  159. import json
  160. import re
  161. global json,logging,re
  162. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  163. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  164. def evaluate(self, extractjson):
  165. if extractjson is None or extractjson=="":
  166. extractjson = "{}"
  167. _extract = json.loads(extractjson)
  168. prem = _extract.get("prem",{})
  169. list_pack = []
  170. for k,v in prem.items():
  171. if k!="Project":
  172. list_pack.append(k)
  173. return ",".join(list_pack)
  174. @annotate("string->string")
  175. class f_get_nlp_enterprise(object):
  176. def __init__(self):
  177. import time
  178. global time
  179. import logging
  180. import json
  181. import re
  182. global json,logging,re
  183. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  184. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  185. def evaluate(self, extractjson):
  186. if extractjson is None or extractjson=="":
  187. extractjson = "{}"
  188. _extract = json.loads(extractjson)
  189. nlp_enterprise = _extract.get("nlp_enterprise",[])
  190. nlp_enterprise_attachment = _extract.get("nlp_enterprise_attachment",[])
  191. if len(nlp_enterprise)==0 and len(nlp_enterprise_attachment)==0:
  192. dict_pack = _extract.get("prem",{})
  193. for _key in dict_pack.keys():
  194. for _role in dict_pack[_key]["roleList"]:
  195. if isinstance(_role,list):
  196. _entity = _role[1]
  197. nlp_enterprise.append(_entity)
  198. if isinstance(_role,dict):
  199. _entity = _role["role_text"]
  200. nlp_enterprise.append(_entity)
  201. nlp_enterprise = list(set(nlp_enterprise))
  202. dict_entity = {"indoctextcon":nlp_enterprise,
  203. "notindoctextcon":nlp_enterprise_attachment}
  204. return json.dumps(dict_entity,ensure_ascii=False)
  205. @annotate("string->bigint")
  206. class f_get_extractCount(object):
  207. def __init__(self):
  208. import time
  209. global time
  210. import logging
  211. import json
  212. import re
  213. global json,logging,re
  214. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  215. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  216. def evaluate(self, extractjson):
  217. if extractjson is not None:
  218. _extract = json.loads(extractjson)
  219. return _extract.get("extract_count",0)
  220. else:
  221. _extract = {}
  222. dict_pack = _extract.get("prem",{})
  223. extract_count = 0
  224. list_code = _extract.get("code",[])
  225. if len(list_code)>0:
  226. project_code = list_code[0]
  227. else:
  228. project_code = ""
  229. project_name = _extract.get("name","")
  230. bidding_budget = ""
  231. win_tenderer = ""
  232. win_bid_price = ""
  233. linklist_count = 0
  234. for _key in dict_pack.keys():
  235. if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
  236. extract_count += 1
  237. if bidding_budget=="":
  238. bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
  239. for _role in dict_pack[_key]["roleList"]:
  240. if isinstance(_role,list):
  241. extract_count += 1
  242. if _role[2]!='' and float(_role[2])>0:
  243. extract_count += 1
  244. if _role[0]=="tenderee":
  245. tenderee = _role[1]
  246. if _role[0]=="win_tenderer":
  247. if win_tenderer=="":
  248. win_tenderer = _role[1]
  249. if _role[2]!='' and float(_role[2])>0:
  250. extract_count += 1
  251. if win_bid_price=="":
  252. win_bid_price = str(float(_role[2]))
  253. if _role[0]=="agency":
  254. agency = _role[1]
  255. if isinstance(_role,dict):
  256. extract_count += 1
  257. if "role_money" in _role:
  258. if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
  259. extract_count += 1
  260. if _role.get("role_name")=="tenderee":
  261. tenderee = _role["role_text"]
  262. if _role.get("role_name")=="win_tenderer":
  263. if win_tenderer=="":
  264. win_tenderer = _role["role_text"]
  265. if "role_money" in _role:
  266. if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
  267. extract_count += 1
  268. if win_bid_price=="":
  269. win_bid_price = str(float(_role["role_money"]["money"]))
  270. if _role["role_name"]=="agency":
  271. agency = _role["role_text"]
  272. linklist = _role.get("linklist",[])
  273. for link in linklist:
  274. for l in link:
  275. if l!="":
  276. linklist_count += 1
  277. extract_count += linklist_count//2
  278. if project_code!="":
  279. extract_count += 1
  280. if project_name!="":
  281. extract_count += 1
  282. return extract_count
  283. @annotate('string,string,string,string,string -> string,string,string,bigint')
  284. class f_decode_sub_docs_json(BaseUDTF):
  285. def __init__(self):
  286. import logging
  287. import json
  288. global json,logging
  289. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  290. def process(self, project_code,project_name,tenderee,agency,sub_docs_json):
  291. columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
  292. extract_count = 0
  293. if project_code is not None and project_code!="":
  294. extract_count += 1
  295. if project_name is not None and project_name!="":
  296. extract_count += 1
  297. if tenderee is not None and tenderee!="":
  298. extract_count += 1
  299. if agency is not None and agency!="":
  300. extract_count += 1
  301. if sub_docs_json is not None:
  302. for sub_docs in json.loads(sub_docs_json):
  303. for _key_sub_docs in sub_docs.keys():
  304. extract_count += 1
  305. if _key_sub_docs in columns:
  306. if columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
  307. if _key_sub_docs in ["bidding_budget","win_bid_price"]:
  308. if float(sub_docs[_key_sub_docs])>0:
  309. columns[_key_sub_docs] = str(float(sub_docs[_key_sub_docs]))
  310. else:
  311. columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
  312. self.forward(columns["win_tenderer"],columns["bidding_budget"],columns["win_bid_price"],extract_count)
  313. @annotate('string,string,string -> string,string,string,string,string,string,string')
  314. class f_decode_for_dumplicate(BaseUDTF):
  315. def __init__(self):
  316. import logging
  317. import json
  318. global json,logging
  319. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  320. def process(self,sub_docs_json,extractjson,extract):
  321. if extractjson is None or extractjson=="":
  322. extractjson = "{}"
  323. try:
  324. _extract = json.loads(extractjson)
  325. except Exception as e:
  326. _extract = {}
  327. product = ",".join(_extract.get("product",[]))
  328. list_product = product.split(",")
  329. project_codes = ",".join(_extract.get("code",[]))
  330. list_code = project_codes.split(",")
  331. if sub_docs_json is not None:
  332. list_sub_docs = json.loads(sub_docs_json)
  333. else:
  334. list_sub_docs = [{}]
  335. max_len = max([len(list_product),len(list_code),len(list_sub_docs)])
  336. if extract!="extract":
  337. win_tenderer = ""
  338. bidding_budget = ""
  339. win_bid_price = ""
  340. for _subdoc in list_sub_docs:
  341. win_tenderer = _subdoc.get("win_tenderer","")
  342. bidding_budget = _subdoc.get("bidding_budget","0")
  343. if float(bidding_budget)==0:
  344. bidding_budget = ""
  345. else:
  346. bidding_budget = str(float(bidding_budget))
  347. win_bid_price = _subdoc.get("win_bid_price","0")
  348. if float(win_bid_price)==0:
  349. win_bid_price = ""
  350. else:
  351. win_bid_price = str(float(win_bid_price))
  352. if len(set([win_tenderer,bidding_budget,win_bid_price]))>=3:
  353. break
  354. print(("",product,"",project_codes,win_tenderer,bidding_budget,win_bid_price))
  355. self.forward("",product,"",project_codes,win_tenderer,bidding_budget,win_bid_price)
  356. else:
  357. for _i in range(max_len):
  358. _product = list_product[_i%len(list_product)]
  359. _code = list_code[_i%len(list_code)]
  360. _subdoc = list_sub_docs[_i%len(list_sub_docs)]
  361. win_tenderer = _subdoc.get("win_tenderer","")
  362. bidding_budget = _subdoc.get("bidding_budget","0")
  363. if float(bidding_budget)==0:
  364. bidding_budget = ""
  365. else:
  366. bidding_budget = str(float(bidding_budget))
  367. win_bid_price = _subdoc.get("win_bid_price","0")
  368. if float(win_bid_price)==0:
  369. win_bid_price = ""
  370. else:
  371. win_bid_price = str(float(win_bid_price))
  372. self.forward(_product,product,_code,project_codes,win_tenderer,bidding_budget,win_bid_price)
  373. @annotate("string->bigint")
  374. class totimestamp(object):
  375. def __init__(self):
  376. import time
  377. global time
  378. import logging
  379. import json
  380. import re
  381. global json,logging,re
  382. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  383. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  384. def evaluate(self, str_time):
  385. try:
  386. logging.info(str_time)
  387. if str_time is not None and re.search(self.time_pattern,str_time) is not None:
  388. timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
  389. timeStamp = int(time.mktime(timeArray))
  390. return timeStamp
  391. else:
  392. return 0
  393. except Exception as e:
  394. return 0
  395. @annotate("string->string")
  396. class refind_name(object):
  397. def __init__(self):
  398. import logging
  399. import re
  400. global logging,re
  401. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  402. def evaluate(self, title):
  403. if title is not None:
  404. return re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|\[|\]|【|】', '', title)
  405. return ""
  406. @annotate('bigint,bigint,bigint,string,bigint,string->string')
  407. class f_set_docid(BaseUDAF):
  408. '''
  409. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  410. '''
  411. def __init__(self):
  412. import json
  413. global json
  414. def new_buffer(self):
  415. return [[]]
  416. def iterate(self, buffer,docid, page_time_stamp,extract_count,defind_column,defind_count,tenderee):
  417. buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
  418. "defind_column":defind_column,"defind_count":defind_count,"tenderee":tenderee})
  419. def merge(self, buffer, pbuffer):
  420. buffer[0].extend(pbuffer[0])
  421. def terminate(self, buffer):
  422. list_docs = buffer[0]
  423. list_docs.sort(key=lambda x:x["page_time_stamp"])
  424. list_group = []
  425. _begin = 0
  426. defind_count = 0
  427. if len(list_docs)>0:
  428. defind_count = list_docs[0]["defind_count"]
  429. print(defind_count)
  430. for i in range(len(list_docs)-1):
  431. if abs(list_docs[i]["page_time_stamp"]-list_docs[i+1]["page_time_stamp"])<=86400*7:
  432. continue
  433. else:
  434. _group = []
  435. _set_column = set()
  436. _set_tenderee = set()
  437. for j in range(_begin,i+1):
  438. if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
  439. _set_tenderee.add(list_docs[j]["tenderee"])
  440. _set_column.add(list_docs[j]["defind_column"])
  441. _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
  442. if len(_group)>=3 and len(_set_tenderee)>1:
  443. pass
  444. else:
  445. print(defind_count,len(_set_column))
  446. if len(_group)>1:
  447. if defind_count==2:
  448. if len(_set_column)>=2:
  449. list_group.append(_group)
  450. elif defind_count==1:
  451. if len(_set_column)==1:
  452. list_group.append(_group)
  453. elif defind_count==0:
  454. list_group.append(_group)
  455. _begin = i+1
  456. if len(list_docs)>1:
  457. _set_column = set()
  458. _set_tenderee = set()
  459. _group = []
  460. for j in range(_begin,len(list_docs)):
  461. if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
  462. _set_tenderee.add(list_docs[j]["tenderee"])
  463. _set_column.add(list_docs[j]["defind_column"])
  464. _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
  465. if len(_group)>=3 and len(_set_tenderee)>1:
  466. pass
  467. else:
  468. if len(_group)>1:
  469. if defind_count==2:
  470. if len(_set_column)>=2:
  471. list_group.append(_group)
  472. elif defind_count==1:
  473. if len(_set_column)==1:
  474. list_group.append(_group)
  475. elif defind_count==0:
  476. list_group.append(_group)
  477. return json.dumps(list_group)
  478. # def terminate(self, buffer):
  479. #
  480. #
  481. # list_docs = buffer[0]
  482. # if len(list_docs)>0:
  483. # defind_count = list_docs[0]["defind_count"]
  484. #
  485. # list_time_group = split_with_time(list_docs,"page_time_stamp",86400*2)
  486. #
  487. # list_group = []
  488. # for time_group in list_time_group:
  489. # _group = []
  490. # _set_column = set()
  491. # base_tenderee = ""
  492. # _set_tenderee = set()
  493. # for j in range(len(time_group)):
  494. # if time_group[j]["tenderee"] is not None and time_group[j]["tenderee"]!="":
  495. # # if base_tenderee =="":
  496. # # base_tenderee = time_group[j]["tenderee"]
  497. # # _set_tenderee.add(time_group[j]["tenderee"])
  498. # # simi = getSimilarityOfString(base_tenderee,time_group[j]["tenderee"])
  499. # # if simi<0.8:
  500. # # _set_tenderee.add(time_group[j]["tenderee"])
  501. #
  502. # _set_tenderee.add(time_group[j]["tenderee"])
  503. # _set_column.add(time_group[j]["defind_column"])
  504. # _group.append({"docid":time_group[j]["docid"],"extract_count":time_group[j]["extract_count"]})
  505. #
  506. # if len(_group)>=3 and len(_set_tenderee)>1:
  507. # pass
  508. # else:
  509. # if len(_group)>1:
  510. # if defind_count==2:
  511. # if len(_set_column)>=2:
  512. # list_group.append(_group)
  513. # elif defind_count==1:
  514. # if len(_set_column)==1:
  515. # list_group.append(_group)
  516. # elif defind_count==0:
  517. # list_group.append(_group)
  518. #
  519. # return json.dumps(list_group)
  520. def isEmpty(_str):
  521. if _str is None or _str=="":
  522. return True
  523. return False
  524. @annotate('bigint->string')
  525. class f_group_fingerprint(BaseUDAF):
  526. def __init__(self):
  527. import json
  528. global json
  529. def new_buffer(self):
  530. return [[]]
  531. def iterate(self, buffer,docid):
  532. buffer[0].append(docid)
  533. def merge(self, buffer, pbuffer):
  534. buffer[0].extend(pbuffer[0][:100000])
  535. def terminate(self, buffer):
  536. list_docid = buffer[0][:100000]
  537. list_docid.sort(key=lambda x:x)
  538. return ",".join([str(a) for a in list_docid])
  539. @annotate('string->bigint,string')
  540. class f_ungroup_fingerprint(BaseUDTF):
  541. def process(self,dumplicates):
  542. list_docid = dumplicates.split(",")
  543. self.forward(int(list_docid[0]),",".join(list_docid[1:]))
  544. @annotate('bigint,bigint,string->string')
  545. class f_dump_probability(BaseUDAF):
  546. '''
  547. 合并组为一条记录
  548. '''
  549. def __init__(self):
  550. import json
  551. global json
  552. def new_buffer(self):
  553. return [[]]
  554. def iterate(self, buffer,docid,page_time_stamp,_type):
  555. buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"type":_type})
  556. def merge(self, buffer, pbuffer):
  557. buffer[0].extend(pbuffer[0])
  558. def terminate(self, buffer):
  559. list_dict = buffer[0]
  560. _set = set()
  561. list_data = []
  562. for _dict in list_dict:
  563. docid = _dict["docid"]
  564. if docid in _set:
  565. continue
  566. _set.add(docid)
  567. list_data.append(_dict)
  568. if len(list_data)>10000:
  569. break
  570. list_group = split_with_time(list_data,sort_key="page_time_stamp",timedelta=86400*7)
  571. return json.dumps(list_group)
  572. @annotate('string -> bigint,bigint,bigint,bigint,string')
  573. class f_split_dumplicate_probability(BaseUDTF):
  574. def __init__(self):
  575. import logging
  576. import json
  577. global logging,json
  578. logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  579. def process(self,list_group_str):
  580. logging.info("0")
  581. logging.info(list_group_str)
  582. if list_group_str is not None:
  583. logging.info("1")
  584. try:
  585. list_group = json.loads(list_group_str)
  586. logging.info("2")
  587. for _group in list_group:
  588. if len(_group)>0:
  589. _type = _group[0].get("type","")
  590. logging.info("3%d"%len(list_group))
  591. # _group.sort(key=lambda x:x["page_time_stamp"])
  592. _len = min(100,len(_group))
  593. for _index_i in range(_len):
  594. _count = 0
  595. for _index_j in range(_index_i+1,_len):
  596. if abs(_group[_index_j]["page_time_stamp"]-_group[_index_i]["page_time_stamp"])>86400*120:
  597. break
  598. _count += 1
  599. _docid1 = _group[_index_i]["docid"]
  600. _docid2 = _group[_index_j]["docid"]
  601. if _docid1<_docid2:
  602. self.forward(_docid1,_docid2,1,_len,_type)
  603. elif _docid1>_docid2:
  604. self.forward(_docid2,_docid1,1,_len,_type)
  605. except Exception as e:
  606. logging(str(e))
  607. @annotate('bigint,bigint,string->string')
  608. class f_dumplicate_groupPairs(BaseUDAF):
  609. '''
  610. 合并组为一条记录
  611. '''
  612. def __init__(self):
  613. import json
  614. global json
  615. def new_buffer(self):
  616. return [[]]
  617. def iterate(self, buffer,is_exists,counts,_type):
  618. buffer[0].append({"is_exists":is_exists,"counts":counts,"_type":_type})
  619. def merge(self, buffer, pbuffer):
  620. buffer[0].extend(pbuffer[0])
  621. def terminate(self, buffer):
  622. list_dict = buffer[0]
  623. list_dict = list_dict[:10000]
  624. return json.dumps(list_dict)
  625. def check_columns(tenderee_less,tenderee_greater,
  626. agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
  627. win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
  628. bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
  629. flag = True
  630. _set_tenderee = set()
  631. if tenderee_less is not None and tenderee_less!="":
  632. _set_tenderee.add(tenderee_less)
  633. if tenderee_greater is not None and tenderee_greater!="":
  634. _set_tenderee.add(tenderee_greater)
  635. if len(_set_tenderee)>1:
  636. return False
  637. code_sim = getSimilarityOfString(project_code_less,project_code_greater)
  638. if code_sim>0.6 and code_sim<1:
  639. return False
  640. #同批次不同编号
  641. if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
  642. _split_code_less = project_code_less.split("-")
  643. _split_code_greater = project_code_greater.split("-")
  644. if len(_split_code_less)>1 and len(_split_code_greater)>1:
  645. if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
  646. return False
  647. _set_win_tenderer = set()
  648. if win_tenderer_less is not None and win_tenderer_less!="":
  649. _set_win_tenderer.add(win_tenderer_less)
  650. if win_tenderer_greater is not None and win_tenderer_greater!="":
  651. _set_win_tenderer.add(win_tenderer_greater)
  652. if len(_set_win_tenderer)>1:
  653. return False
  654. _set_win_bid_price = set()
  655. if win_bid_price_less is not None and win_bid_price_less!="":
  656. _set_win_bid_price.add(float(win_bid_price_less))
  657. if win_bid_price_greater is not None and win_bid_price_greater!="":
  658. _set_win_bid_price.add(float(win_bid_price_greater))
  659. if len(_set_win_bid_price)>1:
  660. return False
  661. _set_bidding_budget = set()
  662. if bidding_budget_less is not None and bidding_budget_less!="":
  663. _set_bidding_budget.add(float(bidding_budget_less))
  664. if bidding_budget_greater is not None and bidding_budget_greater!="":
  665. _set_bidding_budget.add(float(bidding_budget_greater))
  666. if len(_set_bidding_budget)>1:
  667. return False
  668. return True
  669. import math
  670. def featurnCount(_count,max_count=100):
  671. return max(0,min(1,_count))*(1/math.sqrt(max(1,_count-1)))
  672. def getSimLevel(str1,str2):
  673. str1_null = False
  674. str2_null = False
  675. _v = 0
  676. if str1 is None or str1=="":
  677. str1_null = True
  678. if str2 is None or str2=="":
  679. str2_null = True
  680. if str1_null and str2_null:
  681. _v = 2
  682. elif str1_null and not str2_null:
  683. _v = 4
  684. elif not str1_null and str2_null:
  685. _v = 6
  686. elif not str1_null and not str2_null:
  687. if str1==str2:
  688. _v = 10
  689. else:
  690. _v = 0
  691. return _v
  692. def getLength(_str):
  693. return len(_str if _str is not None else "")
  694. def check_money(bidding_budget_less,bidding_budget_greater,
  695. win_bid_price_less,win_bid_price_greater,
  696. moneys_less,moneys_greater,
  697. moneys_attachment_less,moneys_attachment_greater):
  698. #只判断最高前六位
  699. if getLength(bidding_budget_less)>0:
  700. bidding_budget_less = round(float(bidding_budget_less))
  701. bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
  702. if getLength(bidding_budget_greater)>0:
  703. bidding_budget_greater = round(float(bidding_budget_greater))
  704. bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
  705. if getLength(win_bid_price_less)>0:
  706. win_bid_price_less = round(float(win_bid_price_less))
  707. win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
  708. if getLength(win_bid_price_greater)>0:
  709. win_bid_price_greater = round(float(win_bid_price_greater))
  710. win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
  711. #check saming
  712. budget_is_same = ""
  713. price_is_same = ""
  714. if getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  715. budget_less = float(bidding_budget_less)
  716. budget_greater = float(bidding_budget_greater)
  717. if budget_less!=budget_greater:
  718. if min(budget_less,budget_greater)>0:
  719. if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
  720. budget_is_same = True
  721. if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
  722. budget_is_same = True
  723. if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
  724. budget_is_same = True
  725. if budget_greater in moneys_less or budget_greater in moneys_attachment_less:
  726. budget_is_same = True
  727. if budget_is_same=="":
  728. return False
  729. if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  730. price_less = float(win_bid_price_less)
  731. price_greater = float(win_bid_price_greater)
  732. if price_less!=price_greater:
  733. if min(price_less,price_greater)>0:
  734. if max(price_less,price_greater)/min(price_less,price_greater)==10000:
  735. price_is_same = True
  736. if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
  737. price_is_same = True
  738. if price_less in moneys_greater or price_less in moneys_attachment_greater:
  739. price_is_same = True
  740. if price_greater in moneys_less or price_greater in moneys_attachment_less:
  741. price_is_same = True
  742. if price_is_same=="":
  743. return False
  744. return True
  745. def check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  746. tenderee_less,tenderee_greater,
  747. agency_less,agency_greater,
  748. win_tenderer_less,win_tenderer_greater,
  749. similarity=0.85):
  750. def get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,entity_less,entity_greater,similarity):
  751. if getLength(entity_less)>0 and getLength(entity_greater)>0:
  752. if entity_less!=entity_greater:
  753. is_same = ''
  754. _sim = jaccard_score(entity_less,entity_greater)
  755. if _sim>similarity:
  756. is_same = True
  757. if is_same=='':
  758. if str(nlp_enterprise_less).find(entity_greater)>0 or str(nlp_enterprise_greater).find(entity_less)>0:
  759. is_same = True
  760. if is_same=='':
  761. return False
  762. return True
  763. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,similarity):
  764. return False
  765. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,agency_less,agency_greater,similarity):
  766. return False
  767. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,win_tenderer_less,win_tenderer_greater,similarity):
  768. return False
  769. return True
def check_codes(project_codes_less,project_codes_greater):
    """Compare two lists of project codes.

    Returns True when some pair of codes is identical or one contains the
    other ("same"), False when there are only near-misses (similar but
    conflicting codes), and True when nothing comparable was found.
    """
    #check the similarity
    is_same = False
    is_sim = False
    for project_code_less in project_codes_less:
        for project_code_greater in project_codes_greater:
            code_sim = getSimilarityOfString(project_code_less,project_code_greater)
            if project_code_less is not None and project_code_greater is not None:
                if code_sim>0.6:
                    # containment (either direction) counts as the same code
                    if str(project_code_less).find(str(project_code_greater))>=0 or str(project_code_greater).find(str(project_code_less))>=0:
                        is_same = True
                    else:
                        is_sim = True
                # same length and moderately similar but not equal: near-miss
                if project_code_less!=project_code_greater:
                    if code_sim>0.4 and len(project_code_less)==len(project_code_greater):
                        is_sim = True
    # any exact/containment match wins over near-misses
    if is_same:
        return True
    if is_sim:
        return False
    return True
  791. def check_demand():
  792. return True
# Regex helpers shared by the title/code duplicate checks below.
# Matches a package/lot number mention such as "包号3" / "标段2" / "第Ⅱ包".
# ('第?' keeps the question mark removed so strings like "纯木浆8包/箱复印"
# are not mistaken for a package number.)
package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型|项目)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))")
# Runs of code-like characters (letters, digits, dashes, brackets).
code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.-]+")
# A whole token that is a plain (possibly decimal) number.
num_pattern = re.compile("^\d+(?:\.\d+)?$")
# Chinese numerals / letters used as ordinal markers.
num1_pattern = re.compile("[一二三四五六七八九A-Za-z]+")
# Two-char location mention ending in an administrative suffix (市/区/镇/县/村/路).
location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
# Keyword alternation of construction/procurement phases.
# NOTE(review): kept as a plain string (not compiled) because it is passed
# to re.findall directly elsewhere in this file.
building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
# Date like 2023-01-02 / 2023.1.2 / 2023年1月2日 (day suffix not required).
date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[]):
    """Decide whether two refined titles could describe the same notice.

    Project codes and dates are stripped from both titles first.  A title
    containing the other passes immediately.  Then the titles must not
    conflict on: package/lot number, short numeric tokens, ordinal/phase
    keywords, and location mentions grouped by their suffix character.
    Returns True when no conflict is found.

    NOTE(review): the mutable default args are only iterated, never
    mutated, so the shared-default pitfall is harmless here.
    """
    if code_greater is None:
        code_greater = []
    # normalize full-width parentheses to half-width
    doctitle_refind_less = str(doctitle_refind_less).replace("(","(").replace(")",")")
    doctitle_refind_greater = str(doctitle_refind_greater).replace("(","(").replace(")",")")
    # remove each doc's own project codes from its title
    for _c in codes_less:
        doctitle_refind_less = str(doctitle_refind_less).replace(_c,"")
    for _c in code_greater:
        doctitle_refind_greater = str(doctitle_refind_greater).replace(_c,"")
    # remove dates
    doctitle_refind_less = re.sub(date_pattern,"",doctitle_refind_less)
    doctitle_refind_greater = re.sub(date_pattern,"",doctitle_refind_greater)
    #check the package
    if doctitle_refind_less is None:
        doctitle_refind_less = ""
    if doctitle_refind_greater is None:
        doctitle_refind_greater = ""
    _pack1 = None
    _pack2 = None
    #if contain then pass
    if doctitle_refind_less.find(doctitle_refind_greater)>=0 or doctitle_refind_greater.find(doctitle_refind_less)>=0:
        return True
    #check the package in title
    _match = re.search(package_number_pattern,doctitle_refind_less)
    if _match is not None:
        _pack1 = _match.groupdict()["name"]
    _match = re.search(package_number_pattern,doctitle_refind_greater)
    if _match is not None:
        _pack2 = _match.groupdict()["name"]
    # both titles carry a package number and they differ -> not duplicates
    if _pack1 is not None and _pack2 is not None:
        if _pack1!=_pack2:
            return False
    #check the nums in title
    doctitle_refind_less = re.sub(package_number_pattern,"",doctitle_refind_less)
    doctitle_refind_greater = re.sub(package_number_pattern,"",doctitle_refind_greater)
    #check the nums,location,building in title
    for _p in [code_pattern]:
        num_all_l = re.findall(_p,doctitle_refind_less)
        num_all_g = re.findall(_p,doctitle_refind_greater)
        set_num_l = set()
        set_num_g = set()
        # keep only decimal numbers or short (<4 char) integers
        for _l in num_all_l:
            if re.search(num_pattern,_l) is not None:
                if _l.find(".")>0:
                    set_num_l.add(_l)
                elif len(_l)<4:
                    set_num_l.add(_l)
        for _g in num_all_g:
            if re.search(num_pattern,_g) is not None:
                if _g.find(".")>0:
                    set_num_g.add(_g)
                elif len(_g)<4:
                    set_num_g.add(_g)
        # both have numbers but one side's numbers are not a subset -> conflict
        if len(set_num_l)>0 and len(set_num_g)>0:
            if len(set_num_l&set_num_g)!=len(set_num_l):
                return False
    #check location and keywords
    for _p in [num1_pattern,building_pattern]:
        num_all_l = re.findall(_p,doctitle_refind_less)
        num_all_g = re.findall(_p,doctitle_refind_greater)
        set_num_l = set(num_all_l)
        set_num_g = set(num_all_g)
        # same number of distinct tokens but different tokens -> conflict
        if len(set_num_l)==len(set_num_g):
            if len(set_num_l&set_num_g)!=len(set_num_l):
                return False
    #check the location has conflict
    for _p in [location_pattern]:
        num_all_l = re.findall(_p,doctitle_refind_less)
        num_all_g = re.findall(_p,doctitle_refind_greater)
        # group location mentions by their suffix character (市/区/镇/...)
        dict_num_l = {}
        dict_num_g = {}
        for _l in num_all_l:
            if len(_l)>0:
                key = _l[-1:]
                if key not in dict_num_l:
                    dict_num_l[key] = set()
                dict_num_l[key].add(_l)
        for _g in num_all_g:
            if len(_g)>0:
                key = _g[-1:]
                if key not in dict_num_g:
                    dict_num_g[key] = set()
                dict_num_g[key].add(_g)
        # both mention locations of the same kind but share none -> conflict
        for k,v in dict_num_l.items():
            if k in dict_num_g:
                if len(v&dict_num_g[k])==0:
                    return False
    return True
  887. def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
  888. if getLength(product_less)>0 and getLength(product_greater)>0:
  889. _product_l = product_less.split(split_char)
  890. _product_g = product_greater.split(split_char)
  891. same_count = 0
  892. if len(_product_l)>len(_product_g):
  893. a = _product_g
  894. _product_g = _product_l
  895. _product_l = a
  896. for _l in _product_l:
  897. for _g in _product_g:
  898. if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>-0 or doctitle_refine_less.find(_g)>=0:
  899. same_count += 1
  900. break
  901. if same_count/len(_product_l)>=0.5:
  902. return True
  903. return False
  904. return True
  905. def check_package(package_less,package_greater,split_char=","):
  906. if getLength(package_less)>0 and getLength(package_greater)>0:
  907. _product_l = package_less.split(split_char)
  908. _product_g = package_greater.split(split_char)
  909. for _l in _product_l:
  910. for _g in _product_g:
  911. if _l==_g:
  912. return True
  913. return False
  914. return True
  915. def check_time(json_time_less,json_time_greater):
  916. has_same = False
  917. has_diff = False
  918. if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
  919. if isinstance(json_time_less,dict):
  920. time_less = json_time_less
  921. else:
  922. time_less = json.loads(json_time_less)
  923. if isinstance(json_time_greater,dict):
  924. time_greater = json_time_greater
  925. else:
  926. time_greater = json.loads(json_time_greater)
  927. for k,v in time_less.items():
  928. if getLength(v)>0:
  929. v1 = time_greater.get(k,"")
  930. if getLength(v1)>0:
  931. if v[:10]!=v1[:10]:
  932. has_diff = True
  933. else:
  934. has_same = True
  935. if has_same:
  936. if has_diff:
  937. return 1
  938. return 2
  939. if has_diff:
  940. return 0
  941. return 1
  942. def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater="",moneys_less=set(),moneys_greater=set(),moneys_attachment_less=set(),moneys_attachment_greater=set(),page_attachments_less="[]",page_attachments_greater="[]"):
  943. if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
  944. return 1
  945. #一篇要素都在附件,且两篇附件md5有重叠
  946. set_md5_less = set()
  947. set_md5_greater = set()
  948. list_md5_less = []
  949. if page_attachments_less:
  950. try:
  951. list_md5_less = json.loads(page_attachments_less)
  952. except Exception as e:
  953. pass
  954. list_md5_greater = []
  955. if page_attachments_greater:
  956. try:
  957. list_md5_greater = json.loads(page_attachments_greater)
  958. except Exception as e:
  959. pass
  960. for _l in list_md5_less:
  961. _md5 = _l.get("fileMd5")
  962. if _md5 is not None:
  963. set_md5_less.add(_md5)
  964. for _l in list_md5_greater:
  965. _md5 = _l.get("fileMd5")
  966. if _md5 is not None:
  967. set_md5_greater.add(_md5)
  968. if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
  969. one_in_attach = False
  970. dict_enterprise_less = json.loads(nlp_enterprise_less)
  971. dict_enterprise_greater = json.loads(nlp_enterprise_greater)
  972. indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
  973. notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
  974. indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
  975. notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
  976. if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
  977. one_in_attach = True
  978. if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
  979. one_in_attach = True
  980. if one_in_attach:
  981. if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
  982. return 1
  983. if isinstance(project_codes_less,str):
  984. project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
  985. elif project_codes_less is None:
  986. project_codes_less = []
  987. if isinstance(project_codes_greater,str):
  988. project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
  989. elif project_codes_greater is None:
  990. project_codes_greater = []
  991. same_count = 0
  992. all_count = 8
  993. if len(set(project_codes_less) & set(project_codes_greater))>0:
  994. same_count += 1
  995. if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
  996. same_count += 1
  997. if getLength(agency_less)>0 and agency_less==agency_greater:
  998. same_count += 1
  999. if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
  1000. same_count += 1
  1001. if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
  1002. same_count += 1
  1003. if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
  1004. same_count += 1
  1005. if getLength(project_name_less)>0 and project_name_less==project_name_greater:
  1006. same_count += 1
  1007. if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
  1008. same_count += 1
  1009. base_prob = 0
  1010. if min_counts<3:
  1011. base_prob = 0.9
  1012. elif min_counts<5:
  1013. base_prob = 0.8
  1014. elif min_counts<8:
  1015. base_prob = 0.7
  1016. else:
  1017. base_prob = 0.6
  1018. _prob = base_prob*same_count/all_count
  1019. if min(extract_count_less,extract_count_greater)<=3:
  1020. if _prob<0.1:
  1021. _prob = 0.15
  1022. if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
  1023. return 0
  1024. if _prob<0.1:
  1025. return _prob
  1026. check_result = {"pass":1}
  1027. if docchannel_less in (51,102,103,104,115,116,117):
  1028. if doctitle_refine_less!=doctitle_refine_greater:
  1029. if page_time_less!=page_time_greater:
  1030. check_result["docchannel"] = 0
  1031. check_result["pass"] = 0
  1032. else:
  1033. check_result["docchannel"] = 2
  1034. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
  1035. check_result["doctitle"] = 0
  1036. check_result["pass"] = 0
  1037. if b_log:
  1038. logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
  1039. else:
  1040. check_result["doctitle"] = 2
  1041. #added check
  1042. if not check_codes(project_codes_less,project_codes_greater):
  1043. check_result["code"] = 0
  1044. check_result["pass"] = 0
  1045. if b_log:
  1046. logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
  1047. else:
  1048. if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
  1049. check_result["code"] = 2
  1050. else:
  1051. check_result["code"] = 1
  1052. if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
  1053. check_result["product"] = 0
  1054. check_result["pass"] = 0
  1055. if b_log:
  1056. logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
  1057. else:
  1058. if getLength(product_less)>0 and getLength(product_greater)>0:
  1059. check_result["product"] = 2
  1060. else:
  1061. check_result["product"] = 1
  1062. if not check_demand():
  1063. check_result["pass"] = 0
  1064. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  1065. tenderee_less,tenderee_greater,
  1066. agency_less,agency_greater,
  1067. win_tenderer_less,win_tenderer_greater):
  1068. check_result["entity"] = 0
  1069. check_result["pass"] = 0
  1070. if b_log:
  1071. logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
  1072. else:
  1073. if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
  1074. check_result["entity"] = 2
  1075. elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
  1076. check_result["entity"] = 2
  1077. else:
  1078. check_result["entity"] = 1
  1079. logging.info("moneys_less"+str(moneys_less)+"---"+str(moneys_attachment_less))
  1080. logging.info("moneys_less"+str(moneys_greater)+"---"+str(moneys_attachment_greater))
  1081. if not check_money(bidding_budget_less,bidding_budget_greater,
  1082. win_bid_price_less,win_bid_price_greater,
  1083. moneys_less,moneys_greater,
  1084. moneys_attachment_less,moneys_attachment_greater):
  1085. if b_log:
  1086. logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
  1087. check_result["money"] = 0
  1088. check_result["pass"] = 0
  1089. else:
  1090. if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  1091. check_result["money"] = 2
  1092. elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  1093. check_result["money"] = 2
  1094. else:
  1095. check_result["money"] = 1
  1096. #added check
  1097. if not check_package(package_less,package_greater):
  1098. if b_log:
  1099. logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
  1100. check_result["package"] = 0
  1101. check_result["pass"] = 0
  1102. else:
  1103. if getLength(package_less)>0 and getLength(package_greater)>0:
  1104. check_result["package"] = 2
  1105. else:
  1106. check_result["package"] = 1
  1107. #added check
  1108. _time_check = check_time(json_time_less,json_time_greater)
  1109. if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
  1110. if b_log:
  1111. logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
  1112. if isinstance(json_time_less,dict):
  1113. time_less = json_time_less
  1114. else:
  1115. time_less = json.loads(json_time_less)
  1116. if isinstance(json_time_greater,dict):
  1117. time_greater = json_time_greater
  1118. else:
  1119. time_greater = json.loads(json_time_greater)
  1120. for k,v in time_less.items():
  1121. if getLength(v)>0:
  1122. v1 = time_greater.get(k,"")
  1123. if getLength(v1)>0:
  1124. if v!=v1:
  1125. logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
  1126. check_result["time"] = 0
  1127. check_result["pass"] = 0
  1128. else:
  1129. if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
  1130. check_result["time"] = 2
  1131. else:
  1132. check_result["time"] = 1
  1133. if hard_level==2 and check_result["product"]<=1:
  1134. return 0
  1135. if check_result.get("pass",0)==0:
  1136. if b_log:
  1137. logging.info(str(check_result))
  1138. if check_result.get("money",1)==0:
  1139. return 0
  1140. if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
  1141. return _prob
  1142. else:
  1143. return 0
  1144. return _prob
  1145. def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
  1146. if web_source_no_less==web_source_no_greater:
  1147. if fingerprint_less==fingerprint_greater:
  1148. return 1
  1149. else:
  1150. return 0
  1151. if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
  1152. return 1
  1153. if isinstance(project_codes_less,str):
  1154. project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
  1155. elif project_codes_less is None:
  1156. project_codes_less = []
  1157. if isinstance(project_codes_greater,str):
  1158. project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
  1159. elif project_codes_greater is None:
  1160. project_codes_greater = []
  1161. same_count = 0
  1162. all_count = 8
  1163. if len(set(project_codes_less) & set(project_codes_greater))>0:
  1164. same_count += 1
  1165. if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
  1166. same_count += 1
  1167. if getLength(agency_less)>0 and agency_less==agency_greater:
  1168. same_count += 1
  1169. if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
  1170. same_count += 1
  1171. if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
  1172. same_count += 1
  1173. if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
  1174. same_count += 1
  1175. if getLength(project_name_less)>0 and project_name_less==project_name_greater:
  1176. same_count += 1
  1177. if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
  1178. same_count += 1
  1179. base_prob = 0
  1180. if min_counts<3:
  1181. base_prob = 0.9
  1182. elif min_counts<5:
  1183. base_prob = 0.8
  1184. elif min_counts<8:
  1185. base_prob = 0.7
  1186. else:
  1187. base_prob = 0.6
  1188. _prob = base_prob*same_count/all_count
  1189. if min(extract_count_less,extract_count_greater)<=3:
  1190. if _prob<0.1:
  1191. _prob = 0.15
  1192. if province_less!=province_greater:
  1193. return 0
  1194. if _prob<0.1:
  1195. return _prob
  1196. check_result = {"pass":1}
  1197. if docchannel_less in (51,102,103,104,115,116,117):
  1198. if doctitle_refine_less!=doctitle_refine_greater:
  1199. if page_time_less!=page_time_greater:
  1200. check_result["docchannel"] = 0
  1201. check_result["pass"] = 0
  1202. else:
  1203. check_result["docchannel"] = 2
  1204. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
  1205. check_result["doctitle"] = 0
  1206. check_result["pass"] = 0
  1207. if b_log:
  1208. logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
  1209. else:
  1210. check_result["doctitle"] = 2
  1211. #added check
  1212. if not check_codes(project_codes_less,project_codes_greater):
  1213. check_result["code"] = 0
  1214. check_result["pass"] = 0
  1215. if b_log:
  1216. logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
  1217. else:
  1218. if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
  1219. check_result["code"] = 2
  1220. else:
  1221. check_result["code"] = 1
  1222. if not check_product(product_less,product_greater):
  1223. check_result["product"] = 0
  1224. check_result["pass"] = 0
  1225. if b_log:
  1226. logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
  1227. else:
  1228. if getLength(product_less)>0 and getLength(product_greater)>0:
  1229. check_result["product"] = 2
  1230. else:
  1231. check_result["product"] = 1
  1232. if not check_demand():
  1233. check_result["pass"] = 0
  1234. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  1235. tenderee_less,tenderee_greater,
  1236. agency_less,agency_greater,
  1237. win_tenderer_less,win_tenderer_greater):
  1238. check_result["entity"] = 0
  1239. check_result["pass"] = 0
  1240. if b_log:
  1241. logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
  1242. else:
  1243. if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
  1244. check_result["entity"] = 2
  1245. elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
  1246. check_result["entity"] = 2
  1247. else:
  1248. check_result["entity"] = 1
  1249. if not check_money(bidding_budget_less,bidding_budget_greater,
  1250. win_bid_price_less,win_bid_price_greater):
  1251. if b_log:
  1252. logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
  1253. check_result["money"] = 0
  1254. check_result["pass"] = 0
  1255. else:
  1256. if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  1257. check_result["money"] = 2
  1258. elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  1259. check_result["money"] = 2
  1260. else:
  1261. check_result["money"] = 1
  1262. #added check
  1263. if not check_package(package_less,package_greater):
  1264. if b_log:
  1265. logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
  1266. check_result["package"] = 0
  1267. check_result["pass"] = 0
  1268. else:
  1269. if getLength(package_less)>0 and getLength(package_greater)>0:
  1270. check_result["package"] = 2
  1271. else:
  1272. check_result["package"] = 1
  1273. #added check
  1274. if not check_time(json_time_less,json_time_greater):
  1275. if b_log:
  1276. logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
  1277. if isinstance(json_time_less,dict):
  1278. time_less = json_time_less
  1279. else:
  1280. time_less = json.loads(json_time_less)
  1281. if isinstance(json_time_greater,dict):
  1282. time_greater = json_time_greater
  1283. else:
  1284. time_greater = json.loads(json_time_greater)
  1285. for k,v in time_less.items():
  1286. if getLength(v)>0:
  1287. v1 = time_greater.get(k,"")
  1288. if getLength(v1)>0:
  1289. if v!=v1:
  1290. logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
  1291. check_result["time"] = 0
  1292. check_result["pass"] = 0
  1293. else:
  1294. if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
  1295. check_result["time"] = 2
  1296. else:
  1297. check_result["time"] = 1
  1298. if hard_level==2 and check_result["product"]<=1:
  1299. return 0
  1300. if check_result.get("pass",0)==0:
  1301. if b_log:
  1302. logging.info(str(check_result))
  1303. if check_result.get("money",1)==0:
  1304. return 0
  1305. if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
  1306. return _prob
  1307. else:
  1308. return 0
  1309. if check_result.get("time",1)==0:
  1310. return 0
  1311. return _prob
@annotate("bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->double")
class f_dumplicate_check(BaseUDTF):
    """UDTF scoring one candidate duplicate pair of documents.

    All comparison logic is delegated to check_dumplicate_rule; this class
    only normalizes the inputs (context counts, extracted money sets,
    attachment JSON) and forwards the resulting probability (0.0 means
    "not a duplicate").
    """
    def __init__(self):
        import logging
        import json
        # ODPS UDF idiom used throughout this file: hoist the modules to
        # globals so process() can reach them.
        global logging,json
    def process(self,docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,
                tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,
                bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,
                project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,
                extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,
                page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,
                package_less,package_greater,json_time_less,json_time_greater,json_context,
                province_less,province_greater,city_less,city_greater,district_less,district_greater,
                web_source_no_less,web_source_no_greater,
                extract_json_less,extract_json_greater,page_attachments_less,page_attachments_greater):
        # min_counts: smallest positive "counts" value found in the grouping
        # context; check_dumplicate_rule uses smaller counts to grant a
        # higher base probability.  100 acts as "no context" sentinel.
        min_counts = 100
        if json_context is not None:
            _context = json.loads(json_context)
            for item in _context:
                if item.get("counts",0)>0 and item.get("counts",0)<min_counts:
                    min_counts = item["counts"]
        # Parse each side's extract_json to pull out the money values found
        # in the body and in the attachments (compared as sets downstream).
        _extract_less = {}
        if extract_json_less is not None:
            _extract_less = json.loads(extract_json_less)
        _extract_greater = {}
        if extract_json_greater is not None:
            _extract_greater = json.loads(extract_json_greater)
        moneys_less = set(_extract_less.get("moneys",[]))
        moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
        moneys_greater = set(_extract_greater.get("moneys",[]))
        moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
        # Normalize NULL attachment columns to an empty JSON list.
        if page_attachments_less is None:
            page_attachments_less = '[]'
        if page_attachments_greater is None:
            page_attachments_greater = '[]'
        # b_log=False: the rule function's diagnostic logging is disabled in
        # this production UDTF path.
        _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
        self.forward(_prob)
  1350. @annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
  1351. class f_dumplicate_featureMatrix(BaseUDTF):
  1352. def __init__(self):
  1353. import logging
  1354. import json
  1355. global logging,json
  1356. def process(self,json_context,docchannel_less,docchannel_greater,page_time_less,page_time_greater,nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,
  1357. agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
  1358. win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
  1359. bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater,product_less,product_greater):
  1360. #check the page_time by special docchannel
  1361. if docchannel_less in (51,102,103,104,115,116,117):
  1362. if doctitle_refine_less!=doctitle_refine_greater:
  1363. if page_time_less!=page_time_greater:
  1364. self.forward("[1-%s]"%(str(docchannel_less)),0)
  1365. return
  1366. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
  1367. self.forward("[2-%s]"%(str(doctitle_refine_less)+"=="+str(doctitle_refine_greater)),0)
  1368. return
  1369. # if not check_codes([project_code_less],[project_code_greater]):
  1370. # self.forward("[3-%s]"%(str(project_code_less)+"=="+str(project_code_greater)),0)
  1371. # return
  1372. if not check_demand():
  1373. self.forward("[4-]",0)
  1374. return
  1375. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  1376. tenderee_less,tenderee_greater,
  1377. agency_less,agency_greater,
  1378. win_tenderer_less,win_tenderer_greater):
  1379. _error = ""
  1380. for a in [nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater]:
  1381. _error += str(a)
  1382. self.forward("[5-%s]"%_error,0)
  1383. return
  1384. if not check_money(bidding_budget_less,bidding_budget_greater,
  1385. win_bid_price_less,win_bid_price_greater):
  1386. _error = ""
  1387. for a in [bidding_budget_less,bidding_budget_greater,
  1388. win_bid_price_less,win_bid_price_greater]:
  1389. _error += str(a)
  1390. self.forward("[6-%s]"%_error,0)
  1391. return
  1392. if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
  1393. _error = "%s=%s"%(str(product_less),str(product_greater))
  1394. self.forward("7-%s"%_error,0)
  1395. return
  1396. _context = json.loads(json_context)
  1397. min_counts = 100
  1398. dict_context = {}
  1399. for item in _context:
  1400. if item["counts"]<min_counts:
  1401. min_counts = item["counts"]
  1402. dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
  1403. context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
  1404. list_matrix = []
  1405. #get the featurn of the context into matrix
  1406. # for index_i in range(len(context_key)):
  1407. # for index_j in range(index_i+1,len(context_key)):
  1408. # _key = "%s&%s"%(context_key[index_i],context_key[index_j])
  1409. # _v = featurnCount(dict_context.get(_key,[0,0])[1])
  1410. # list_matrix.append(_v)
  1411. # context3_key = ["tenderee","agency","win_tenderer","win_bid_price","bidding_budget"]
  1412. # for index_i in range(len(context3_key)):
  1413. # for index_j in range(index_i+1,len(context3_key)):
  1414. # for index_k in range(index_j+1,len(context3_key)):
  1415. # _key = "%s&%s&%s"%(context3_key[index_i],context3_key[index_j],context3_key[index_k])
  1416. # _v = featurnCount(dict_context.get(_key,[0,0])[1])
  1417. # list_matrix.append(_v)
  1418. # list_matrix.append(getSimLevel(tenderee_less,tenderee_greater)/10)
  1419. # list_matrix.append(getSimLevel(agency_less,agency_greater)/10)
  1420. # list_matrix.append(getSimilarityOfString(project_code_less,project_code_greater))
  1421. # list_matrix.append(getSimilarityOfString(project_name_less,project_name_greater))
  1422. # list_matrix.append(getSimLevel(win_tenderer_less,win_tenderer_greater)/10)
  1423. # list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
  1424. # list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
  1425. # list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
  1426. json_matrix = json.dumps(list_matrix)
  1427. same_count = 0
  1428. all_count = 8
  1429. if getSimilarityOfString(project_code_less,project_code_greater)==1:
  1430. same_count += 1
  1431. if getSimilarityOfString(tenderee_less,tenderee_greater)==1:
  1432. same_count += 1
  1433. if getSimilarityOfString(agency_less,agency_greater)==1:
  1434. same_count += 1
  1435. if getSimilarityOfString(win_tenderer_less,win_tenderer_greater)==1:
  1436. same_count += 1
  1437. if getSimilarityOfString(bidding_budget_less,bidding_budget_greater)==1:
  1438. same_count += 1
  1439. if getSimilarityOfString(win_bid_price_less,win_bid_price_greater)==1:
  1440. same_count += 1
  1441. if getSimilarityOfString(project_name_less,project_name_greater)==1:
  1442. same_count += 1
  1443. if getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater)==1:
  1444. same_count += 1
  1445. base_prob = 0
  1446. if min_counts<3:
  1447. base_prob = 0.9
  1448. elif min_counts<5:
  1449. base_prob = 0.8
  1450. elif min_counts<8:
  1451. base_prob = 0.7
  1452. else:
  1453. base_prob = 0.6
  1454. _prob = base_prob*same_count/all_count
  1455. json_matrix = "[==%s]"%(str(base_prob)+"="+str(same_count)+"="+str(all_count)+str(product_less)+str(product_greater))
  1456. self.forward(json_matrix,_prob)
  1457. return
@annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double,string,string,string,string,string,string->string')
class f_redump_probability_final_check(BaseUDAF):
    '''
    Re-check a merged duplicate group (translated from the original Chinese
    docstring): when the group has more than 5 members, doctitle / tenderee /
    win_tenderer / bidding_budget may each take only one value inside the
    group; with 5 or fewer members, tenderee / win_tenderer / bidding_budget
    may each take only one value.
    NOTE(review): the body below actually re-scores members pair-wise with
    check_dumplicate_rule; the docstring may be stale -- confirm.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        # Single-element buffer holding the list of document dicts.
        return [list()]
    def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_codes,project_name,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence,
                province,city,district,web_source_no,extract_json,page_attachments):
        # Collect one dict per document.  Note: "newly" is received but not
        # stored, and no "fingerprint" key is stored (see terminate()).
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
                          "project_codes":project_codes,"project_name":project_name,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
                          "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence,
                          "province":province,"city":city,"district":district,"web_source_no":web_source_no,"extract_json":extract_json,"page_attachments":page_attachments})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        # Greedy re-validation: walk the group in descending confidence and
        # keep a document only if it scores >= 0.1 against every document
        # already kept; stop at the first rejected document.
        list_group = []  # NOTE(review): unused in this method
        the_group = buffer[0]
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        _index = 0
        final_group = []
        if len(the_group)>0:
            _index = 0
            while _index<len(the_group):
                document_greater = the_group[_index]
                docid_greater = document_greater["docid"]
                docchannel_greater = document_greater["docchannel"]
                page_time_greater = document_greater["page_time"]
                doctitle_refine_greater = document_greater["doctitle_refine"]
                project_codes_greater = document_greater["project_codes"]
                nlp_enterprise_greater = document_greater["nlp_enterprise"]
                tenderee_greater = document_greater["tenderee"]
                agency_greater = document_greater["agency"]
                win_tenderer_greater = document_greater["win_tenderer"]
                bidding_budget_greater = document_greater["bidding_budget"]
                win_bid_price_greater = document_greater["win_bid_price"]
                product_greater = document_greater["product"]
                package_greater = document_greater["package"]
                json_time_greater = document_greater["json_dicttime"]
                # iterate() never stores "fingerprint", so this is always "".
                fingerprint_greater = document_greater.get("fingerprint","")
                project_name_greater = document_greater["project_name"]
                extract_count_greater = document_greater["extract_count"]
                province_greater = document_greater["province"]
                city_greater = document_greater["city"]
                district_greater = document_greater["district"]
                web_source_no_greater = document_greater["web_source_no"]
                extract_json_greater = document_greater["extract_json"]
                page_attachments_greater = document_greater["page_attachments"]
                _pass = True
                for document_less in final_group:
                    docid_less = document_less["docid"]
                    docchannel_less = document_less["docchannel"]
                    page_time_less = document_less["page_time"]
                    doctitle_refine_less = document_less["doctitle_refine"]
                    project_codes_less = document_less["project_codes"]
                    nlp_enterprise_less = document_less["nlp_enterprise"]
                    tenderee_less = document_less["tenderee"]
                    agency_less = document_less["agency"]
                    win_tenderer_less = document_less["win_tenderer"]
                    bidding_budget_less = document_less["bidding_budget"]
                    win_bid_price_less = document_less["win_bid_price"]
                    product_less = document_less["product"]
                    package_less = document_less["package"]
                    json_time_less = document_less["json_dicttime"]
                    fingerprint_less = document_less.get("fingerprint","")
                    project_name_less = document_less["project_name"]
                    extract_count_less = document_less["extract_count"]
                    province_less = document_less["province"]
                    city_less = document_less["city"]
                    district_less = document_less["district"]
                    web_source_no_less = document_less["web_source_no"]
                    extract_json_less = document_less["extract_json"]
                    page_attachments_less = document_less["page_attachments"]
                    # Money sets are re-parsed from extract_json for every
                    # pair (repeated json.loads of the same document).
                    _extract_less = {}
                    if extract_json_less is not None:
                        _extract_less = json.loads(extract_json_less)
                    _extract_greater = {}
                    if extract_json_greater is not None:
                        _extract_greater = json.loads(extract_json_greater)
                    moneys_less = set(_extract_less.get("moneys",[]))
                    moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
                    moneys_greater = set(_extract_greater.get("moneys",[]))
                    moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
                    if page_attachments_less is None:
                        page_attachments_less = '[]'
                    if page_attachments_greater is None:
                        page_attachments_greater = '[]'
                    # Group size is passed as the "min_counts" argument.
                    _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,len(the_group),b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
                    if _prob<0.1:
                        _pass = False
                        break
                if _pass:
                    final_group.append(document_greater)
                else:
                    # First rejected document ends the greedy scan entirely.
                    break
                _index += 1
        dumplicates = ""
        # Emit a comma-joined docid list only when at least two documents
        # survived re-validation; otherwise return "".
        # NOTE(review): indentation reconstructed -- the sort/join below may
        # originally have run unconditionally; confirm against the source.
        if _index>1:
            logging.info("index/whole:%d/%d"%(_index,len(the_group)))
            # Order by docid first, then (stable sort) by extract_count desc.
            final_group.sort(key=lambda x:x["docid"])
            final_group.sort(key=lambda x:x["extract_count"],reverse=True)
            _set = set()
            for _d in final_group:
                _docid = _d["docid"]
                if _docid in _set:
                    continue
                dumplicates += "%d,"%_docid
                _set.add(_docid)
            # Drop the trailing comma.
            dumplicates = dumplicates[:-1]
        return dumplicates
@annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double->string')
class f_redump_probability_final_check_bak(BaseUDAF):
    '''
    Re-check a merged duplicate group (translated from the original Chinese
    docstring): when the group has more than 5 members, doctitle / tenderee /
    win_tenderer / bidding_budget may each take only one value inside the
    group; with 5 or fewer members, tenderee / win_tenderer / bidding_budget
    may each take only one value.
    NOTE(review): superseded ("_bak") variant of
    f_redump_probability_final_check with inlined pair checks.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        # Single-element buffer holding the list of document dicts.
        return [list()]
    def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_code,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence):
        # Collect one dict per document; "newly" is received but not stored.
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
                          "project_code":project_code,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
                          "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        # Walk the group in descending confidence; document _index is kept
        # only if it is pair-compatible with every earlier (kept) document.
        # The scan stops at the first incompatible document.
        list_group = []  # NOTE(review): unused in this method
        the_group = buffer[0]
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        _index = 0
        if len(the_group)>0:
            _index = 1
            while _index<len(the_group):
                document_greater = the_group[_index]
                docchannel_greater = document_greater["docchannel"]
                page_time_greater = document_greater["page_time"]
                doctitle_refine_greater = document_greater["doctitle_refine"]
                project_code_greater = document_greater["project_code"]
                nlp_enterprise_greater = document_greater["nlp_enterprise"]
                tenderee_greater = document_greater["tenderee"]
                agency_greater = document_greater["agency"]
                win_tenderer_greater = document_greater["win_tenderer"]
                bidding_budget_greater = document_greater["bidding_budget"]
                win_bid_price_greater = document_greater["win_bid_price"]
                product_greater = document_greater["product"]
                package_greater = document_greater["package"]
                json_time_greater = document_greater["json_dicttime"]
                _less_index = 0
                while _less_index<_index:
                    document_less = the_group[_less_index]
                    docchannel_less = document_less["docchannel"]
                    page_time_less = document_less["page_time"]
                    doctitle_refine_less = document_less["doctitle_refine"]
                    project_code_less = document_less["project_code"]
                    nlp_enterprise_less = document_less["nlp_enterprise"]
                    tenderee_less = document_less["tenderee"]
                    agency_less = document_less["agency"]
                    win_tenderer_less = document_less["win_tenderer"]
                    bidding_budget_less = document_less["bidding_budget"]
                    win_bid_price_less = document_less["win_bid_price"]
                    product_less = document_less["product"]
                    package_less = document_less["package"]
                    json_time_less = document_less["json_dicttime"]
                    # Per-field verdicts: 0 = failed, 1 = passed weakly
                    # (missing data), 2 = passed with data on both sides.
                    check_result = {"pass":1}
                    # Result-like channels: differing titles fail unless the
                    # page_time matches.
                    # NOTE(review): reconstructed indentation -- the "else"
                    # is attached to the page_time test here; confirm.
                    if docchannel_less in (51,102,103,104,115,116,117):
                        if doctitle_refine_less!=doctitle_refine_greater:
                            if page_time_less!=page_time_greater:
                                check_result["docchannel"] = 0
                                check_result["pass"] = 0
                            else:
                                check_result["docchannel"] = 2
                    if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
                        check_result["doctitle"] = 0
                        check_result["pass"] = 0
                        logging.info("check_doctitle_failed:%s==%s"%(str(doctitle_refine_less),str(doctitle_refine_greater)))
                    else:
                        check_result["doctitle"] = 2
                    #added check
                    if not check_codes([project_code_less],[project_code_greater]):
                        check_result["code"] = 0
                        check_result["pass"] = 0
                        logging.info("check_code_failed:%s==%s"%(str(project_code_less),str(project_code_greater)))
                    else:
                        if getLength(project_code_less)>0 and getLength(project_code_greater)>0 and project_code_less==project_code_greater:
                            check_result["code"] = 2
                        else:
                            check_result["code"] = 1
                    if not check_product(product_less,product_greater):
                        check_result["product"] = 0
                        check_result["pass"] = 0
                        logging.info("check_product_failed:%s==%s"%(str(product_less),str(product_greater)))
                    else:
                        if getLength(product_less)>0 and getLength(product_greater)>0:
                            check_result["product"] = 2
                        else:
                            check_result["product"] = 1
                    if not check_demand():
                        check_result["pass"] = 0
                    if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                                        tenderee_less,tenderee_greater,
                                        agency_less,agency_greater,
                                        win_tenderer_less,win_tenderer_greater):
                        check_result["entity"] = 0
                        check_result["pass"] = 0
                        logging.info("check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
                    else:
                        # Strong pass requires the channel's key entity to be
                        # present on both sides.
                        if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
                            check_result["entity"] = 2
                        elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
                            check_result["entity"] = 2
                        else:
                            check_result["entity"] = 1
                    if not check_money(bidding_budget_less,bidding_budget_greater,
                                       win_bid_price_less,win_bid_price_greater):
                        logging.info("check_money_failed:%s==%s==%s==%s"%(str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
                        check_result["money"] = 0
                        check_result["pass"] = 0
                    else:
                        if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
                            check_result["money"] = 2
                        elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
                            check_result["money"] = 2
                        else:
                            check_result["money"] = 1
                    #added check
                    if not check_package(package_less,package_greater):
                        logging.info("check_package_failed:%s==%s"%(str(package_less),str(package_greater)))
                        check_result["package"] = 0
                        check_result["pass"] = 0
                    else:
                        if getLength(package_less)>0 and getLength(package_greater)>0:
                            check_result["package"] = 2
                        else:
                            check_result["package"] = 1
                    #added check
                    if not check_time(json_time_less,json_time_greater):
                        logging.info("check_time_failed:%s==%s"%(str(json_time_less),str(json_time_greater)))
                        check_result["time"] = 0
                        check_result["pass"] = 0
                    else:
                        if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
                            check_result["time"] = 2
                        else:
                            check_result["time"] = 1
                    # A failed overall pass can still be tolerated when the
                    # strong-evidence combination below holds.
                    if check_result.get("pass",0)==0:
                        logging.info(str(check_result))
                        if check_result.get("time",1)==0:
                            break
                        if check_result.get("money",1)==0:
                            break
                        if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2:
                            pass
                        else:
                            break
                    _less_index += 1
                # Inner loop broke early -> document _index is incompatible;
                # stop extending the accepted prefix.
                if _less_index!=_index:
                    break
                _index += 1
        dumplicates = ""
        # Emit a comma-joined docid list only when at least two documents
        # were accepted; otherwise return "".
        # NOTE(review): indentation reconstructed -- confirm the guard scope.
        if _index>1:
            logging.info("index/whole:%d/%d"%(_index,len(the_group)))
            final_group = the_group[:_index]
            # Order by docid first, then (stable sort) by extract_count desc.
            final_group.sort(key=lambda x:x["docid"])
            final_group.sort(key=lambda x:x["extract_count"],reverse=True)
            _set = set()
            for _d in final_group:
                _docid = _d["docid"]
                if _docid in _set:
                    continue
                dumplicates += "%d,"%_docid
                _set.add(_docid)
            # Drop the trailing comma.
            dumplicates = dumplicates[:-1]
        return dumplicates
@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,string->string')
class f_set_docid_binaryChart(BaseUDAF):
    '''
    (Translated from the original Chinese docstring:) project code, winning
    bidder, len(project code) > 7, winning bidder <> "".
    Pairs "empty" documents (no code/budget/winner/price/agency) with a
    compatible non-empty document inside a 7-day time window and returns the
    resulting groups as a JSON list.
    '''
    def __init__(self):
        import json
        global json
    def new_buffer(self):
        # Single-element buffer holding the list of document dicts.
        return [[]]
    def iterate(self, buffer,docid, page_time_stamp,extract_count,project_code,project_name,tenderee,bidding_budget,win_tenderer,win_bid_price,agency,web_source_no):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,
                          "bidding_budget":bidding_budget,"win_tenderer":win_tenderer,"win_bid_price":win_bid_price,
                          "agency":agency,"web_source_no":web_source_no})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        list_docs = buffer[0]
        # Split the documents into 7-day windows by page_time_stamp.
        list_timeGroups = split_with_time(list_docs,"page_time_stamp",86400*7)
        list_group = []
        # A document is "empty" when all of these key fields are empty.
        empty_key = ["project_code","bidding_budget","win_tenderer","win_bid_price","agency"]
        for _timeGroups in list_timeGroups:
            list_empty = []
            list_notEmpty = []
            for _item in _timeGroups:
                empty_flag = True
                for _key in empty_key:
                    if not isEmpty(_item[_key]):
                        empty_flag = False
                        break
                if empty_flag:
                    list_empty.append(_item)
                else:
                    list_notEmpty.append(_item)
            # Try to attach each empty document to the first compatible
            # non-empty document in the same window.
            for _e in list_empty:
                _group = [{"docid":_e["docid"],"extract_count":_e["extract_count"]}]
                _e_tenderee = _e["tenderee"]
                for _ne in list_notEmpty:
                    # Lazily track which web sources are already merged into
                    # this non-empty document (mutates _ne across _e items).
                    if "set_webSource" not in _ne:
                        _ne["set_webSource"] = set()
                        _ne["set_webSource"].add(_ne["web_source_no"])
                    # Compatible when the tenderees match, or when the empty
                    # document has no tenderee at all.
                    _suit = False
                    if not isEmpty(_e_tenderee) and _e_tenderee==_ne["tenderee"]:
                        _suit = True
                    elif isEmpty(_e_tenderee):
                        _suit = True
                    if _suit:
                        # Pair only across different web sources (one doc
                        # per source in a pairing).
                        # NOTE(review): reconstructed indentation -- the
                        # break is placed after a successful pairing here;
                        # confirm against the original formatting.
                        if _e["web_source_no"] not in _ne["set_webSource"]:
                            _ne["set_webSource"].add(_e["web_source_no"])
                            _group.append({"docid":_ne["docid"],"extract_count":_ne["extract_count"]})
                            break
                if len(_group)>1:
                    list_group.append(_group)
        return json.dumps(list_group)
  1796. def split_with_time(list_dict,sort_key,timedelta=86400*7):
  1797. if len(list_dict)>0:
  1798. if sort_key in list_dict[0]:
  1799. list_dict.sort(key=lambda x:x[sort_key])
  1800. list_group = []
  1801. _begin = 0
  1802. for i in range(len(list_dict)-1):
  1803. if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<=timedelta:
  1804. continue
  1805. else:
  1806. _group = []
  1807. for j in range(_begin,i+1):
  1808. _group.append(list_dict[j])
  1809. if len(_group)>1:
  1810. list_group.append(_group)
  1811. _begin = i + 1
  1812. if len(list_dict)>1:
  1813. _group = []
  1814. for j in range(_begin,len(list_dict)):
  1815. _group.append(list_dict[j])
  1816. if len(_group)>1:
  1817. list_group.append(_group)
  1818. return list_group
  1819. return [list_dict]
@annotate('bigint,bigint,bigint,string,string,string,string,string->string')
class f_set_docid_limitNum_contain(BaseUDAF):
    '''
    Group documents that fall into the same 7-day window, agree on all four
    limit columns, and whose contain_column texts form a containment chain
    (every longer text must contain the shorter ones).
    Original rule note: project code / win tenderer / len(code)>7 / win tenderer
    not empty / merged distinct tenderee count < 2 / same channel with identical
    non-empty amounts.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer,docid,page_time_stamp,extract_count,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,"set_limit_column1":set_limit_column1,
                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
                          "contain_column":contain_column})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        # split collected docs into 7-day windows first
        list_split = split_with_time(buffer[0],"page_time_stamp")
        list_group = []
        for _split in list_split:
            flag = True
            # each limit column must hold a single distinct value within the window
            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
            for _key in keys:
                logging.info(_key+str(getSet(_split,_key)))
                if len(getSet(_split,_key))>1:
                    flag = False
                    break
            MAX_CONTAIN_COLUMN = None
            # check whether every doc's contain_column is nested in the longest one
            if flag:
                for _d in _split:
                    contain_column = _d["contain_column"]
                    if contain_column is not None and contain_column !="":
                        if MAX_CONTAIN_COLUMN is None:
                            MAX_CONTAIN_COLUMN = contain_column
                        else:
                            if len(MAX_CONTAIN_COLUMN)<len(contain_column):
                                # longer text must contain the current maximum
                                if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
                                    flag = False
                                    break
                                MAX_CONTAIN_COLUMN = contain_column
                            else:
                                # shorter text must be contained in the current maximum
                                if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
                                    flag = False
                                    break
            if flag:
                if len(_split)>1:
                    _group = []
                    for _item in _split:
                        _group.append({"docid":_item["docid"],"extract_count":_item["extract_count"]})
                    list_group.append(_group)
        return json.dumps(list_group)
  1874. @annotate('bigint->string')
  1875. class f_stamp_squence(BaseUDAF):
  1876. '''
  1877. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  1878. '''
  1879. def __init__(self):
  1880. import json
  1881. global json
  1882. import logging
  1883. global logging
  1884. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1885. def new_buffer(self):
  1886. return [set()]
  1887. def iterate(self, buffer,page_time_stamp):
  1888. buffer[0].add(page_time_stamp)
  1889. def merge(self, buffer, pbuffer):
  1890. buffer[0] |= pbuffer[0]
  1891. def terminate(self, buffer):
  1892. if 0 in buffer[0]:
  1893. buffer[0].remove(0)
  1894. list_stamp = list(buffer[0])
  1895. list_stamp.sort(key=lambda x:x)
  1896. list_stamp_final = []
  1897. _begin = 0
  1898. _time_decase = 86400*7
  1899. logging.info(str(list_stamp))
  1900. for _index in range(len(list_stamp)-1):
  1901. if list_stamp[_index+1]-list_stamp[_index]<_time_decase:
  1902. continue
  1903. else:
  1904. list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[_index]+_time_decase])
  1905. _begin = _index+1
  1906. if len(list_stamp)>0:
  1907. list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[-1]+_time_decase])
  1908. return json.dumps(list_stamp_final)
  1909. @annotate("bigint,string->bigint")
  1910. class in_stamp(object):
  1911. def __init__(self):
  1912. import logging
  1913. import re
  1914. import json
  1915. global logging,re,json
  1916. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1917. def evaluate(self, page_time_stamp,json_stamp):
  1918. list_stamp = json.loads(json_stamp)
  1919. int_flag = 0
  1920. for item in list_stamp:
  1921. if page_time_stamp <item[0]:
  1922. break
  1923. if page_time_stamp>item[0] and page_time_stamp<item[1]:
  1924. int_flag = 1
  1925. break
  1926. return int_flag
  1927. def getConfidence(rule_id):
  1928. if rule_id ==0:
  1929. return 30
  1930. elif rule_id >=1 and rule_id <30:
  1931. return 20
  1932. else:
  1933. return 10
  1934. @annotate('string,string -> string')
  1935. class f_splitStr(BaseUDTF):
  1936. '''
  1937. 将多个组拆解成多条记录
  1938. '''
  1939. def __init__(self):
  1940. import logging
  1941. import json
  1942. global json,logging
  1943. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1944. def process(self, str_split,_split):
  1945. try:
  1946. for _s in str_split.split(_split):
  1947. self.forward(_s)
  1948. except Exception as e:
  1949. pass
@annotate('string,bigint -> bigint,bigint,bigint,bigint,bigint')
class f_split_group_single(BaseUDTF):
    '''
    Explode each dedup group into docid pairs, forwarding
    (docid1, docid2, extract_count1, extract_count2, confidence).
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def process(self, json_set_docid,rule_id):
        list_group = json.loads(json_set_docid)
        for item in list_group:
            if len(item)>100:
                # large group: avoid the O(n^2) pair product; pair the largest
                # docid with every other member only
                item.sort(key=lambda x:x["docid"],reverse=True)
                index_i = 0
                for index_j in range(1,len(item)):
                    if item[index_i]["docid"]!=item[index_j]["docid"]:
                        self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
            else:
                # small group: emit every ordered pair of distinct docids
                for index_i in range(len(item)):
                    for index_j in range(len(item)):
                        if index_i!=index_j and item[index_i]["docid"]!=item[index_j]["docid"]:
                            self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
  1974. @annotate('bigint,string->string')
  1975. class group_document(BaseUDAF):
  1976. '''
  1977. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  1978. '''
  1979. def __init__(self):
  1980. import json
  1981. global json
  1982. def new_buffer(self):
  1983. return [[]]
  1984. def iterate(self, buffer,id,json_set_docid):
  1985. buffer[0].append({"id":id,"json_set_docid":json.loads(json_set_docid)})
  1986. def merge(self, buffer, pbuffer):
  1987. buffer[0].extend(pbuffer[0])
  1988. def terminate(self, buffer):
  1989. return json.dumps(buffer[0])
  1990. @annotate('bigint,string,bigint,string -> bigint,bigint,string')
  1991. class decare_document(BaseUDTF):
  1992. '''
  1993. 将多个组拆解成多条记录
  1994. '''
  1995. def __init__(self):
  1996. import logging
  1997. import json
  1998. global json,logging
  1999. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2000. def process(self,group_id1, json_list_doc1,group_id2,json_list_doc2):
  2001. #y=x,少掉近一半的数据
  2002. if group_id1>=group_id2:
  2003. list_doc1 = json.loads(json_list_doc1)
  2004. list_doc2 = json.loads(json_list_doc2)
  2005. for _doc1 in list_doc1:
  2006. for _doc2 in list_doc2:
  2007. #同一个重复group不做判断
  2008. if _doc1["id"]!=_doc2["id"]:
  2009. #判断两个group是否有重复
  2010. _set1 = set()
  2011. for _item1 in _doc1["json_set_docid"]:
  2012. _set1.add(_item1["docid"])
  2013. _set2 = set()
  2014. for _item2 in _doc2["json_set_docid"]:
  2015. _set2.add(_item2["docid"])
  2016. if len(_set1&_set2)>0:
  2017. new_json_set_docid = _doc1["json_set_docid"]
  2018. for _item2 in _doc2["json_set_docid"]:
  2019. if _item2["docid"] not in _set1:
  2020. new_json_set_docid.append(_item2)
  2021. self.forward(_doc1["id"],_doc2["id"],json.dumps(new_json_set_docid))
  2022. def getBestDocid(list_pair):
  2023. # [docid1,extract_count1,docid2,extract_count2]
  2024. # list_pair.sort(key=lambda x:x[3],reverse=True)
  2025. # _max_count = max(list_pair[0][3],list_pair[0][1])
  2026. # set_candidate = set()
  2027. # if list_pair[0][1]==_max_count:
  2028. # set_candidate.add(list_pair[0][0])
  2029. # for item in list_pair:
  2030. # if item[3]==_max_count:
  2031. # set_candidate.add(item[2])
  2032. # else:
  2033. # break
  2034. # list_candidate = list(set_candidate)
  2035. # list_candidate.sort(key=lambda x:x)
  2036. new_pair = []
  2037. new_pair.append([list_pair[0][0],list_pair[0][0],list_pair[0][1]])
  2038. for item in list_pair:
  2039. new_pair.append([item[0],item[2],item[3]])
  2040. new_pair.sort(key=lambda x:x[1])
  2041. new_pair.sort(key=lambda x:x[2],reverse=True)
  2042. return new_pair[0][1]
  2043. @annotate('bigint,bigint,bigint,bigint->string')
  2044. class choose_document(BaseUDAF):
  2045. '''
  2046. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2047. '''
  2048. def __init__(self):
  2049. import json
  2050. global json
  2051. def new_buffer(self):
  2052. return [[]]
  2053. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  2054. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  2055. def merge(self, buffer, pbuffer):
  2056. buffer[0].extend(pbuffer[0])
  2057. def terminate(self, buffer):
  2058. list_pair = buffer[0]
  2059. _set = set()
  2060. for item in buffer[0]:
  2061. _set.add(str(item[2]))
  2062. list_dumplicate = list(_set)
  2063. best_docid = getBestDocid(list_pair)
  2064. if best_docid==list_pair[0][0]:
  2065. save_flag = 1
  2066. else:
  2067. save_flag = 0
  2068. return json.dumps({"save_flag":save_flag,"dumplicates":list_dumplicate})
  2069. @annotate('string -> bigint,string')
  2070. class f_get_choose_document(BaseUDTF):
  2071. '''
  2072. 将多个组拆解成多条记录
  2073. '''
  2074. def __init__(self):
  2075. import logging
  2076. import json
  2077. global json,logging
  2078. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2079. def process(self,json_choose):
  2080. if json_choose is None:
  2081. self.forward(1,None)
  2082. else:
  2083. _choose = json.loads(json_choose)
  2084. self.forward(_choose["save_flag"],",".join(_choose["dumplicates"]))
  2085. @annotate('string->bigint')
  2086. class f_get_codes_count(object):
  2087. def evaluate(self,extract_json):
  2088. if extract_json is None or extract_json=="":
  2089. extract_json = "{}"
  2090. _extract = json.loads(extract_json)
  2091. _codes = _extract.get("code",[])
  2092. return len(_codes)
  2093. @annotate('string->string')
  2094. class f_get_codes(object):
  2095. def evaluate(self,extract_json):
  2096. if extract_json is None or extract_json=="":
  2097. extract_json = "{}"
  2098. _extract = json.loads(extract_json)
  2099. _codes = _extract.get("code",[])
  2100. return ",".join(_codes)
  2101. @annotate('bigint,bigint,bigint,bigint->string')
  2102. class group_document_bestFirst(BaseUDAF):
  2103. '''
  2104. 将组里面最优的放在前面
  2105. '''
  2106. def __init__(self):
  2107. import json
  2108. global json
  2109. def new_buffer(self):
  2110. return [[]]
  2111. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  2112. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  2113. def merge(self, buffer, pbuffer):
  2114. buffer[0].extend(pbuffer[0])
  2115. def terminate(self, buffer):
  2116. list_pair = buffer[0]
  2117. _set = set()
  2118. for item in buffer[0]:
  2119. _set.add(item[2])
  2120. _set.add(list_pair[0][0])
  2121. best_docid = getBestDocid(list_pair)
  2122. _set.remove(best_docid)
  2123. list_dumplicate = list(_set)
  2124. list_dumplicate.sort(key=lambda x:x)
  2125. list_dumplicate.insert(0,best_docid)
  2126. list_dumplicate_str = []
  2127. for item in list_dumplicate:
  2128. list_dumplicate_str.append(str(item))
  2129. return ",".join(list_dumplicate_str)
  2130. @annotate('string -> bigint,string')
  2131. class f_get_best_dumplicates(BaseUDTF):
  2132. '''
  2133. 得到每个分组中最优的那一条及其重复记录
  2134. '''
  2135. def __init__(self):
  2136. import logging
  2137. import json
  2138. global json,logging
  2139. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2140. def process(self,list_dumplicate_str):
  2141. if list_dumplicate_str is None or list_dumplicate_str=='':
  2142. pass
  2143. else:
  2144. list_dumplicate = list_dumplicate_str.split(",")
  2145. if len(list_dumplicate)>0:
  2146. self.forward(int(list_dumplicate[0]),",".join(list_dumplicate[1:]))
  2147. else:
  2148. pass
  2149. @annotate('bigint,bigint->string')
  2150. class bridge2group(BaseUDAF):
  2151. '''
  2152. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2153. '''
  2154. def __init__(self):
  2155. import json
  2156. global json
  2157. def new_buffer(self):
  2158. return [set()]
  2159. def iterate(self, buffer,docid1,docid2):
  2160. buffer[0].add(docid1)
  2161. buffer[0].add(docid2)
  2162. def merge(self, buffer, pbuffer):
  2163. buffer[0] |= pbuffer[0]
  2164. def terminate(self, buffer):
  2165. list_pair = list(buffer[0])
  2166. list_pair.sort(key=lambda x:x,reverse=True)
  2167. return json.dumps(list_pair)
  2168. @annotate('string -> bigint,bigint')
  2169. class group2bridge(BaseUDTF):
  2170. '''
  2171. 将多个组拆解成多条记录
  2172. '''
  2173. def __init__(self):
  2174. import logging
  2175. import json
  2176. global json,logging
  2177. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2178. def process(self,json_list_docid):
  2179. list_docid = json.loads(json_list_docid)
  2180. for _docid in list_docid:
  2181. self.forward(list_docid[-1],_docid)
  2182. @annotate('string->string')
  2183. class to_url(object):
  2184. def evaluate(self,_s):
  2185. if _s is None or _s=="":
  2186. return
  2187. else:
  2188. list_l = []
  2189. for l in _s.split(","):
  2190. list_l.append("http://www.bidizhaobiao.com/info-%s.html"%l)
  2191. return ",".join(list_l)
  2192. @annotate('bigint,bigint,string -> bigint')
  2193. class f_get_dump_docid(BaseUDTF):
  2194. '''
  2195. 将多个组拆解成多条记录
  2196. '''
  2197. def __init__(self):
  2198. import logging
  2199. import json
  2200. global json,logging
  2201. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2202. def process(self,docid,save_flag,dumplicates):
  2203. if save_flag==0:
  2204. self.forward(docid)
  2205. if dumplicates is not None:
  2206. list_docid = dumplicates.split(",")
  2207. if len(list_docid)>0:
  2208. for _docid in list_docid[1:]:
  2209. self.forward(int(_docid))
  2210. else:
  2211. if dumplicates is not None:
  2212. list_docid = dumplicates.split(",")
  2213. if len(list_docid)>0:
  2214. for _docid in list_docid:
  2215. self.forward(int(_docid))
  2216. @annotate('string -> bigint,bigint')
  2217. class f_get_docid(BaseUDTF):
  2218. '''
  2219. 将多个组拆解成多条记录
  2220. '''
  2221. def __init__(self):
  2222. import logging
  2223. import json
  2224. global json,logging
  2225. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2226. def process(self,json_set_docid):
  2227. team_id = 0
  2228. if json_set_docid is not None:
  2229. list_docses = json.loads(json_set_docid)
  2230. for list_docs in list_docses:
  2231. team_id += 1
  2232. for item in list_docs:
  2233. self.forward(team_id,item["docid"])
  2234. @annotate("string->bigint")
  2235. class get_count_dump(object):
  2236. def __init__(self):
  2237. import logging
  2238. import re
  2239. global logging,re
  2240. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2241. def evaluate(self, title):
  2242. _count = 0
  2243. if title is not None:
  2244. _count = len(title.split(","))
  2245. return _count
  2246. def getSet(list_dict,key):
  2247. _set = set()
  2248. for item in list_dict:
  2249. if key in item:
  2250. if item[key]!='' and item[key] is not None:
  2251. if re.search("^\d[\d\.]*$",item[key]) is not None:
  2252. _set.add(str(float(item[key])))
  2253. else:
  2254. _set.add(str(item[key]))
  2255. return _set
def getDiffIndex(list_dict,key,confidence=100):
    '''
    Optimised to use similarity checks: scan list_dict (sorted by confidence
    descending by the caller) and return the index of the first item whose
    value for `key` diverges from the values seen so far; returns
    len(list_dict) when nothing diverges.

    Numeric values above 100000 are bucketed to the nearest 10000 before the
    distinct-count test; non-numeric values are compared against the first
    value seen with a string-similarity threshold of 0.8.

    :param list_dict: list of dicts, each carrying `key` and "confidence"
    :param key: the field to test
    :param confidence: items with confidence >= this value are trusted and skipped
    :return: index of the first diverging item, or len(list_dict)
    '''
    # _set = set()
    # for _i in range(len(list_dict)):
    #     item = list_dict[_i]
    #     if item["confidence"]>=confidence:
    #         continue
    #     if key in item:
    #         if item[key]!='' and item[key] is not None:
    #             if re.search("^\d+(\.\d+)?$",item[key]) is not None:
    #                 _set.add(str(float(item[key])))
    #             else:
    #                 _set.add(str(item[key]))
    #     if len(_set)>1:
    #         return _i
    # ==============================
    _set = set()
    _set_m = set()
    base_s = ""
    for _i in range(len(list_dict)):
        item = list_dict[_i]
        # trusted items do not participate in the divergence test
        if item["confidence"]>=confidence:
            continue
        if key in item:
            if item[key]!='' and item[key] is not None:
                if re.search("^\d+(\.\d+)?$",item[key]) is not None:
                    _m = float(item[key])
                    # bucket large amounts so near-equal money values compare equal
                    if _m>100000:
                        _m = _m//10000*10000
                    _set_m.add(str(_m))
                else:
                    _s = str(item[key])
                    if base_s=="":
                        base_s = _s
                    else:
                        simi = getSimilarityOfString(base_s,_s)
                        if simi<0.8:
                            return _i
        # NOTE(review): the exact nesting of this check is ambiguous in the
        # flattened source; behaviour is the same either way because _set_m
        # only grows in the numeric branch above.
        if len(_set_m)>1:
            return _i
    return len(list_dict)
  2303. @annotate('bigint,string -> bigint,bigint')
  2304. class f_getGroup_dumpFinal(BaseUDTF):
  2305. '''
  2306. 从最后的结果中获取组
  2307. '''
  2308. def __init__(self):
  2309. import logging
  2310. import json
  2311. global json,logging
  2312. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2313. def process(self,docid,dumplicates):
  2314. self.forward(int(docid),int(docid))
  2315. if dumplicates is not None:
  2316. list_docids = dumplicates.split(",")
  2317. for _docid in list_docids:
  2318. self.forward(int(docid),int(_docid))
@annotate('bigint,bigint,string,string,string,string,bigint,bigint,bigint->string')
class f_redump_limit_num(BaseUDAF):
    '''
    Re-check a merged dedup group. For groups larger than 5, doctitle,
    tenderee, win_tenderer and bidding_budget must each hold a single value
    inside the group; for smaller groups the doctitle constraint is dropped.
    Returns the surviving (docid1, docid2) links as JSON.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer,main_docid,docid,doctitle,set_limit_column2,set_limit_column3,set_limit_column4,extract_count1,extract_count2,confidence):
        buffer[0].append({"main_docid":main_docid,"docid":docid,"doctitle":doctitle,"set_limit_column2":set_limit_column2,
                          "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,
                          "extract_count2":extract_count2,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        list_group = []
        the_group = buffer[0]
        # highest-confidence links first: getDiffIndex relies on this order
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        if len(the_group)>5:
            keys = ["doctitle","set_limit_column2","set_limit_column3","set_limit_column4"]
        else:
            keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
        final_group = []
        # confidence-aware divergence scan: find the first index where any
        # key stops being consistent within the group
        list_key_index = []
        for _k in keys:
            if _k=="doctitle":
                list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
            else:
                list_key_index.append(getDiffIndex(the_group,_k))
        _index = min(list_key_index)
        if _index>1:
            # keep only the consistent prefix, linked back to the main docid
            main_docid = the_group[0]["main_docid"]
            for item in the_group[:_index]:
                if item["docid"]!=main_docid:
                    final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"],"confidence":item["confidence"]})
        # stay = True
        # for _key in keys:
        #     if len(getSet(the_group,_key))>1:
        #         stay = False
        #         break
        #
        # if stay:
        #     main_docid = the_group[0]["main_docid"]
        #     for item in the_group:
        #         if item["docid"]!=main_docid:
        #             final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"]})
        return json.dumps(final_group)
  2372. @annotate('string -> bigint,bigint,bigint,bigint,bigint')
  2373. class f_get_dumpFinal_checked(BaseUDTF):
  2374. '''
  2375. 从最后的结果中获取组
  2376. '''
  2377. def __init__(self):
  2378. import logging
  2379. import json
  2380. global json,logging
  2381. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2382. def process(self,list_group):
  2383. if list_group is not None:
  2384. final_group = json.loads(list_group)
  2385. for _group in final_group:
  2386. self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"],_group["confidence"])
  2387. @annotate('string -> bigint')
  2388. class f_getDumplicateDocids(BaseUDTF):
  2389. '''
  2390. 从最后的结果中获取组
  2391. '''
  2392. def __init__(self):
  2393. import logging
  2394. import json
  2395. global json,logging
  2396. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2397. def process(self,dumplicates):
  2398. list_docids = dumplicates.split(",")
  2399. for _d in list_docids:
  2400. self.forward(int(_d))
  2401. def jaccard_score(source,target):
  2402. source_set = set([s for s in source])
  2403. target_set = set([s for s in target])
  2404. if len(source_set)==0 or len(target_set)==0:
  2405. return 0
  2406. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  2407. def getSimilarityOfString(str1,str2):
  2408. _set1 = set()
  2409. _set2 = set()
  2410. if str1 is not None:
  2411. for i in range(1,len(str1)):
  2412. _set1.add(str1[i-1:i+1])
  2413. for i in range(2,len(str1)):
  2414. _set1.add(str1[i-2:i+1])
  2415. if str2 is not None:
  2416. for i in range(1,len(str2)):
  2417. _set2.add(str2[i-1:i+1])
  2418. for i in range(2,len(str2)):
  2419. _set2.add(str2[i-2:i+1])
  2420. _len = max(1,min(len(_set1),len(_set2)))
  2421. return len(_set1&_set2)/_len
  2422. @annotate("string,string,string,string,string,string,string,string,string,string->bigint")
  2423. class f_is_legal(object):
  2424. def __init__(self):
  2425. import logging
  2426. import re
  2427. global logging,re
  2428. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2429. def evaluate(self, tenderee1,tenderee2,bidding_budget1,budding_budget2,win_tenderee1,win_tenderee2,win_bid_price1,win_bid_price2,project_code1,project_code2):
  2430. if tenderee1 is not None and tenderee1!="" and tenderee2 is not None and tenderee2!="" and tenderee1!=tenderee2:
  2431. return 0
  2432. if bidding_budget1 is not None and bidding_budget1!="" and budding_budget2 is not None and budding_budget2!="" and bidding_budget1!=budding_budget2:
  2433. return 0
  2434. if win_tenderee1 is not None and win_tenderee1!="" and win_tenderee2 is not None and win_tenderee2!="" and win_tenderee1!=win_tenderee2:
  2435. return 0
  2436. if win_bid_price1 is not None and win_bid_price1!="" and win_bid_price2 is not None and win_bid_price2!="" and win_bid_price1!=win_bid_price2:
  2437. return 0
  2438. _sim = getSimilarityOfString(project_code1,project_code2)
  2439. if _sim>0.7 and _sim<1:
  2440. return 0
  2441. return 1
@annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint->string')
class f_autorule_group(BaseUDAF):
    '''
    Re-check a merged dedup group and mine "rules": for every pair of surviving
    documents, record which fields they agree on. For groups larger than 5,
    doctitle, tenderee, win_tenderer, bidding_budget and win_bid_price must
    each be consistent inside the group; smaller groups drop the doctitle
    constraint. Returns [[rule, docid1, docid2], ...] as JSON.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer,main_docid,docid,docchannel,doctitle,doctitle_refine,area,province,city,district,web_source_no,fingerprint,
                project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count1,extract_count2,confidence):
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"doctitle":doctitle,
                          "doctitle_refine":doctitle_refine,"area":area,"province":province,
                          "city":city,"district":district,"web_source_no":web_source_no,"fingerprint":fingerprint,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,"agency":agency,
                          "win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price,
                          "extract_count1":extract_count1,"extract_count2":extract_count2,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        # cap the buffer at 100 rows to bound memory during partial merges
        buffer[0].extend(pbuffer[0][:100])
        buffer[0] = buffer[0][:100]
    def getSameKeys(self,_dict1,_dict2):
        """Return a '='-joined, sorted list of the field names on which both
        documents carry the same non-empty value (bookkeeping fields excluded)."""
        list_keys = []
        for k,v in _dict1.items():
            if k in ["area","city","confidence","district","extract_count1","extract_count2","main_docid","province"]:
                continue
            v2 = _dict2.get(k,"")
            if v is not None and v!="" and v2 is not None and v2!="" and v==v2:
                list_keys.append(k)
        list_keys.sort(key=lambda x:x)
        return "=".join(list_keys)
    def terminate(self, buffer):
        list_group = []
        the_group = buffer[0]
        # highest-confidence rows first: getDiffIndex relies on this order
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        if len(the_group)>5:
            keys = ["doctitle","tenderee","win_tenderer","bidding_budget","win_bid_price"]
        else:
            keys = ["tenderee","win_tenderer","bidding_budget","win_bid_price"]
        # confidence-aware divergence scan over every constrained key
        list_key_index = []
        for _k in keys:
            if _k=="doctitle":
                list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
            else:
                list_key_index.append(getDiffIndex(the_group,_k))
        final_group = []
        _index = min(list_key_index)
        if _index>1:
            # keep only the consistent prefix of the group
            for item in the_group[:_index]:
                final_group.append(item)
        # emit one rule per surviving document pair
        list_rules = []
        for i in range(len(final_group)):
            for j in range(i+1,len(final_group)):
                _dict1 = final_group[i]
                _dict2 = final_group[j]
                _rule = self.getSameKeys(_dict1,_dict2)
                list_rules.append([_rule,_dict1.get("docid"),_dict2.get("docid")])
        return json.dumps(list_rules)
  2504. @annotate('string -> string,bigint,bigint')
  2505. class f_autorule_group_extract(BaseUDTF):
  2506. '''
  2507. 从最后的结果中获取组
  2508. '''
  2509. def __init__(self):
  2510. import logging
  2511. import json
  2512. global json,logging
  2513. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2514. def process(self,rules_json):
  2515. list_rules = json.loads(rules_json)
  2516. for _rule in list_rules:
  2517. self.forward(_rule[0],_rule[1],_rule[2])
  2518. if __name__ == '__main__':
  2519. # f = f_decode_for_dumplicate()
  2520. # b = f.process('[{}]','{ "attachmentTypes": "", "bidway": "", "candidate": "", "code": [], "cost_time": { "attrs": 0.0, "codename": 0.03, "deposit": 0.0, "district": 0.03, "moneygrade": 0.0, "nerToken": 0.06, "person": 0.0, "prem": 0.02, "preprocess": 0.1, "product": 0.04, "product_attrs": 0.01, "roleRuleFinal": 0.0, "rolegrade": 0.0, "rule": 0.0, "rule_channel": 0.05, "tableToText": 0.030002145767211913, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "district": { "area": "华东", "city": "厦门", "district": "未知", "is_in_text": false, "province": "福建" }, "docchannel": { "docchannel": "招标公告", "doctype": "采招数据", "life_docchannel": "招标公告" }, "docid": "", "doctitle_refine": "C70U264COM6项目所需直流屏", "exist_table": 1, "extract_count": 1, "fail_reason": "", "fingerprint": "md5=3da15e8c6f69a1d766bfe155092b1638", "industry": { "class": "零售批发", "class_name": "广播、电视、电影设备", "subclass": "通用设备" }, "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "C70U264COM6项目所需直流屏", "nlp_enterprise": [], "nlp_enterprise_attachment": [], "person_review": [], "prem": {}, "process_time": "2022-12-08 04:43:18", "product": [ "直流屏" ], "product_attrs": { "data": [ { "brand": "", "product": "直流屏65AH", "quantity": "1.0", "quantity_unit": "台", "specs": "带逆变,蓄电池采用原装进口免维护蓄电池(必须是原产地进口,注明电池进口产地)等,由供应商负责采购,使用寿命10年及以上", "unitPrice": "" } ], "header": [ "产品名称_产品数量____产品规格" ], "header_col": [ "产品名称_产品编号_产品规格_产品材质_产品数量_备注" ] }, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "", "time_getFileStart": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", 
"version_date": "2022-11-24" }','')
  2521. # print(b)
  2522. print(check_doctitle(doctitle_refind_less="山西银行晋城分行对A公司清算处置审计服务项目供应商征集公告",doctitle_refind_greater="山西银行晋城分行对B公司清算处置审计服务项目供应商征集公告"))
  2523. # f = f_get_extractCount()
  2524. # j = '''{ "attachmentTypes": "", "bidway": "", "candidate": "湖南省金达工程建设有限公司", "code": [ "丰汇-YCYZ2022-001-1" ], "cost_time": { "attrs": 0.33, "codename": 0.14, "deposit": 0.0, "district": 0.02, "moneygrade": 0.0, "nerToken": 0.27, "person": 0.01, "prem": 0.06, "preprocess": 0.71, "product": 0.15, "product_attrs": 0.02, "roleRuleFinal": 0.0, "rolegrade": 0.0, "rule": 0.0, "rule_channel": 0.26, "tableToText": 0.11000882148742676, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "district": { "area": "华东", "city": "宜春", "district": "袁州", "is_in_text": false, "province": "江西" }, "docchannel": { "docchannel": "中标信息", "doctype": "采招数据", "life_docchannel": "中标信息" }, "docid": "", "doctitle_refine": "2022年宜春市袁州区县乡村道安全生命防护项目(二)(第二次)", "exist_table": 1, "extract_count": 6, "fail_reason": "", "fingerprint": "md5=23e9e56f2a6ec0c73e1838670e630948", "industry": { "class": "建筑业", "class_name": "其他土木工程建筑", "subclass": "土木工程建筑业" }, "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "2022年宜春市袁州区县乡村道安全生命防护工程采购项目", "nlp_enterprise": [ "湖南省金达工程建设有限公司", "丰汇国际项目管理有限公司" ], "nlp_enterprise_attachment": [], "person_review": [ "宋明勇", "刘定良", "张来弟", "许卫秀", "宋明勇", "刘定良", "张来弟", "许卫秀" ], "prem": { "Project": { "code": "", "roleList": [ { "address": "宜春市袁州区明月袁山中路356号", "linklist": [ [ "胡柯", "13766445188" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "丰汇国际项目管理有限公司", "serviceTime": "" }, { "address": "湖南省长沙市开福区中山路589号开福万达广场C区2号写字楼", "linklist": [ [ "刘华夏", "18570640155" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": "4351680.70", "money_unit": "元" }, "role_name": "win_tenderer", "role_text": "湖南省金达工程建设有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" } }, 
# "process_time": "2023-02-28 02:04:42", "product": [ "安全生命防护工程" ], "product_attrs": { "data": [ { "brand": "详见开标一览表明细", "product": "2022年宜春市袁州区县乡村道安全生命防护工程采购项目", "quantity": "1", "quantity_unit": "", "specs": "详见开标一览表明细", "unitPrice": "4351680.7" } ], "header": [ "名称_数量__单价_品牌_规格型号" ], "header_col": [ "名称_品牌_规格型号_数量_单价" ] }, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "", "time_getFileStart": "", "time_listingEnd": "", "time_listingStart": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2023-02-28", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", "version_date": "2023-02-20" }'''
  2525. # print(f.evaluate(j))
  2526. # _str1 = "PMJJ-202211030004001"
  2527. # _str2 = "PMJJ-202211030001001"
  2528. # print(getSimilarityOfString(_str1,_str2))
  2529. # print(check_doctitle("强化桂城街道工地扬尘防控监管巡查第三方(二次)","广东省强化桂城街道工地扬尘防控监管巡查第三方(二次)"))
  2530. # print(check_codes(["F-2022-027(MASCG-2-F-F-2022-0462)"],["F-2022-027(MASCG-2-F-F-2022-0462)"]))
  2531. # print(check_product(None,None))
  2532. # print(check_code("4451020073383382206021325","4451020073383382206021322"))
  2533. # print(check_money("550.0","440.0","",""))
  2534. # for i in range(0,2):
  2535. # print(i)
  2536. # location_pattern = re.compile(".{1,2}市|.{1,2}区|.{1,2}镇|.{1,2}县|.{1,2}村")
  2537. # print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇前路富代都村示范点农用塑料薄膜棚)"))
  2538. # print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇天湖村粮蔬基地农用塑料薄膜棚)"))
  2539. # package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))") # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
  2540. # _match = re.search(package_number_pattern,"2021年盘山县高标准农田建设项目三标段(高升街道)开标记录")
  2541. # if _match is not None:
  2542. # print(_match.groupdict()["name"])
  2543. # print(re.findall("((标[段号的包])[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4})","[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ"))
  2544. # print(check_doctitle("[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ","桂林银行南宁办公大楼装修工程标段ⅡGXYLG20182005-N中标公告"))
  2545. # c = f_get_extractCount()
  2546. # _json = '''
  2547. # { "attachmentTypes": "", "bidway": "", "code": [ "LCQTCG-2022-313" ], "cost_time": { "attrs": 0.02, "codename": 0.16, "deposit": 0.0, "nerToken": 0.8400000000000001, "person": 0.01, "prem": 0.02, "preprocess": 0.96, "product": 0.12, "product_attrs": 0.01, "punish": 0.11, "roleRuleFinal": 0.0, "rule": 0.0, "rule_channel": 0.0, "tableToText": 0.09000381469726562, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "docchannel": { "docchannel": "招标公告", "doctype": "采招数据" }, "docid": "", "doctitle_refine": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设(一标段)膨胀剂(暂估价)项目", "exist_table": 1, "extract_count": 5, "fail_reason": "", "fingerprint": "md5=b1ab0ee9cf9e1c5acc17477b9c0433cc", "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设工程(一标段)膨胀剂(暂估价)采购项目", "nlp_enterprise": [ "中建八局第一建设有限公司", "山东东岳项目管理有限公司", "聊城市公共资源交易中心", "江苏国泰新点软件有限公司" ], "person_review": [], "prem": { "Project": { "code": "", "roleList": [ { "linklist": [ [ "", "15540110649" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "tenderee", "role_text": "中建八局第一建设有限公司", "serviceTime": "" }, { "linklist": [ [ "武工", "0635-2992305" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "山东东岳项目管理有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" }, "一": { "code": "", "roleList": [], "tendereeMoney": 3267000.0, "tendereeMoneyUnit": "万元" } }, "process_time": "2022-05-30 14:31:13", "product": [ "枢纽功能区建设工程", "膨胀剂", "配套基础设施建设" ], "product_attrs": { "data": [], "header": [], "header_col": [] }, "serviceTime": "", "success": true, "time_bidclose": "2022-06-16", "time_bidopen": "2022-06-16", "time_bidstart": "", "time_commencement": "", "time_completion": 
# "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "2022-06-01", "time_getFileStart": "2022-05-26", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2022-05-25", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "" }
  2548. # '''
  2549. # c = f_get_nlp_enterprise()
  2550. # print(c.evaluate("山东东岳项目管理有限公司",_json))
  2551. # print(c.evaluate(_json))
  2552. # c = f_set_docid()
  2553. # _s = '''
  2554. # 154064190 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2555. # 154064188 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2556. # 154064175 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2557. # 30201228 1512489600 4 04111-1 1 大连市妇女儿童医疗中心
  2558. # 154064160 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2559. # 154064168 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2560. # '''
  2561. # buffer = c.new_buffer()
  2562. # for _line in _s.split("\n"):
  2563. # _line = _line.strip()
  2564. # if _line=="":
  2565. # continue
  2566. # l_column = _line.split("\t")
  2567. # print(l_column)
  2568. # docid,page_time_stamp,extract_count,web_source_no,num,tenderee = l_column
  2569. # page_time_stamp = int(page_time_stamp)
  2570. # extract_count = int(extract_count)
  2571. # num = 1
  2572. # c.iterate(buffer,docid,page_time_stamp,extract_count,web_source_no,num,tenderee)
  2573. # print(c.terminate(buffer))