# documentDumplicate.py
#coding:UTF8
from odps.udf import annotate
from odps.udf import BaseUDAF
from odps.udf import BaseUDTF
import re
  6. @annotate('string,string -> string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string')
  7. class f_decode_extract(BaseUDTF):
  8. def __init__(self):
  9. import logging
  10. import json
  11. import time,re
  12. global json,logging,time,re
  13. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  14. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  15. self.dict_channel = {"公告变更":51,
  16. "招标公告":52,
  17. "中标信息":101,
  18. "招标预告":102,
  19. "招标答疑":103,
  20. "资审结果":105,
  21. "法律法规":106,
  22. "新闻资讯":107,
  23. "采购意向":114,
  24. "拍卖出让":115,
  25. "土地矿产":116,
  26. "产权交易":117,
  27. "废标公告":118,
  28. "候选人公示":119,
  29. "合同公告":120}
  30. def process(self, extractjson,otherjson):
  31. if extractjson is not None:
  32. _extract = json.loads(extractjson)
  33. else:
  34. _extract = {}
  35. if otherjson is not None:
  36. _other = json.loads(otherjson)
  37. else:
  38. _other = {}
  39. project_code = ""
  40. project_name = ""
  41. tenderee = ""
  42. agency = ""
  43. win_tenderer = ""
  44. bidding_budget = ""
  45. win_bid_price = ""
  46. fingerprint = ""
  47. page_time_stamp = 0
  48. docchannel = 0
  49. extract_count = 0
  50. page_time = _other.get("pageTime",time.strftime('%Y-%m-%d',time.localtime()))
  51. doctitle = _other.get("doctitle","")
  52. doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', doctitle)
  53. area = _other.get("area","")
  54. province = _other.get("province","")
  55. city = _other.get("city","")
  56. district = _other.get("district","")
  57. web_source_no = _other.get("webSourceNo","")
  58. time_bidclose = _extract.get("time_bidclose")
  59. time_bidopen = _extract.get("time_bidopen")
  60. time_bidstart = _extract.get("time_bidstart")
  61. time_commencement = _extract.get("time_commencement")
  62. time_completion = _extract.get("time_completion")
  63. time_earnest_money_end = _extract.get("time_earnestMoneyEnd")
  64. time_earnest_money_start = _extract.get("time_earnestMoneyStart")
  65. time_get_file_end = _extract.get("time_getFileEnd")
  66. time_get_file_start = _extract.get("time_getFileStart")
  67. time_publicity_end = _extract.get("time_publicityEnd")
  68. time_publicity_start = _extract.get("time_publicityStart")
  69. time_registration_end = _extract.get("time_registrationEnd")
  70. time_registration_start = _extract.get("time_registrationStart")
  71. time_release = _extract.get("time_release")
  72. # docchannel = _other.get("docchannel",0)
  73. docchannel_name = _extract.get("docchannel",{}).get("docchannel")
  74. doctype_name = _extract.get("docchannel",{}).get("doctype")
  75. if doctype_name in ["法律法规","新闻资讯","拍卖出让","土地矿产"]:
  76. docchannel_name = doctype_name
  77. docchannel = self.dict_channel.get(docchannel_name,0)
  78. if re.search(self.time_pattern,page_time) is not None:
  79. try:
  80. timeArray = time.strptime(page_time[:11], "%Y-%m-%d")
  81. page_time_stamp = int(time.mktime(timeArray))
  82. except Exception as e:
  83. pass
  84. list_code = _extract.get("code",[])
  85. if len(list_code)>0:
  86. project_code = list_code[0]
  87. project_name = _extract.get("name","")
  88. fingerprint = _extract.get("fingerprint","")
  89. dict_pack = _extract.get("prem",{})
  90. logging.info(dict_pack)
  91. for _key in dict_pack.keys():
  92. if dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
  93. extract_count += 1
  94. if bidding_budget=="":
  95. bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
  96. for _role in dict_pack[_key]["roleList"]:
  97. if isinstance(_role,list):
  98. extract_count += 1
  99. if _role[2]!='' and float(_role[2])>0:
  100. extract_count += 1
  101. if _role[0]=="tenderee":
  102. tenderee = _role[1]
  103. if _role[0]=="win_tenderer":
  104. if win_tenderer=="":
  105. win_tenderer = _role[1]
  106. if _role[2]!='' and float(_role[2])>0:
  107. extract_count += 1
  108. if win_bid_price=="":
  109. win_bid_price = str(float(_role[2]))
  110. if _role[0]=="agency":
  111. agency = _role[1]
  112. if isinstance(_role,dict):
  113. extract_count += 1
  114. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  115. extract_count += 1
  116. if _role["role_name"]=="tenderee":
  117. tenderee = _role["role_text"]
  118. if _role["role_name"]=="win_tenderer":
  119. if win_tenderer=="":
  120. win_tenderer = _role["role_text"]
  121. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  122. extract_count += 1
  123. if win_bid_price=="":
  124. win_bid_price = str(float(_role["role_money"]["money"]))
  125. if _role["role_name"]=="agency":
  126. agency = _role["role_text"]
  127. if project_code!="":
  128. extract_count += 1
  129. if project_name!="":
  130. extract_count += 1
  131. logging.info(page_time+doctitle+doctitle_refine+area+province+city+
  132. district+web_source_no+project_code+project_name+tenderee+agency+win_tenderer+bidding_budget+win_bid_price)
  133. self.forward(page_time,page_time_stamp,docchannel,doctitle,doctitle_refine,area,province,city,
  134. district,web_source_no,fingerprint,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,
  135. time_bidclose,time_bidopen,time_bidstart,time_commencement,time_completion,time_earnest_money_end,time_earnest_money_start,
  136. time_get_file_end,time_get_file_start,time_publicity_end,time_publicity_start,time_registration_end,time_registration_start,time_release)
  137. @annotate("string->string")
  138. class f_get_product(object):
  139. def __init__(self):
  140. import time
  141. global time
  142. import logging
  143. import json
  144. import re
  145. global json,logging,re
  146. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  147. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  148. def evaluate(self, extractjson):
  149. if extractjson is None or extractjson=="":
  150. extractjson = "{}"
  151. _extract = json.loads(extractjson)
  152. return ",".join(_extract.get("product",[]))
  153. @annotate("string->string")
  154. class f_get_package(object):
  155. def __init__(self):
  156. import time
  157. global time
  158. import logging
  159. import json
  160. import re
  161. global json,logging,re
  162. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  163. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  164. def evaluate(self, extractjson):
  165. if extractjson is None or extractjson=="":
  166. extractjson = "{}"
  167. _extract = json.loads(extractjson)
  168. prem = _extract.get("prem",{})
  169. list_pack = []
  170. for k,v in prem.items():
  171. if k!="Project":
  172. list_pack.append(k)
  173. return ",".join(list_pack)
  174. @annotate("string->string")
  175. class f_get_nlp_enterprise(object):
  176. def __init__(self):
  177. import time
  178. global time
  179. import logging
  180. import json
  181. import re
  182. global json,logging,re
  183. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  184. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  185. def evaluate(self, extractjson):
  186. if extractjson is None or extractjson=="":
  187. extractjson = "{}"
  188. _extract = json.loads(extractjson)
  189. nlp_enterprise = _extract.get("nlp_enterprise",[])
  190. nlp_enterprise_attachment = _extract.get("nlp_enterprise_attachment",[])
  191. if len(nlp_enterprise)==0 and len(nlp_enterprise_attachment)==0:
  192. dict_pack = _extract.get("prem",{})
  193. for _key in dict_pack.keys():
  194. for _role in dict_pack[_key]["roleList"]:
  195. if isinstance(_role,list):
  196. _entity = _role[1]
  197. nlp_enterprise.append(_entity)
  198. if isinstance(_role,dict):
  199. _entity = _role["role_text"]
  200. nlp_enterprise.append(_entity)
  201. nlp_enterprise = list(set(nlp_enterprise))
  202. dict_entity = {"indoctextcon":nlp_enterprise,
  203. "notindoctextcon":nlp_enterprise_attachment}
  204. return json.dumps(dict_entity,ensure_ascii=False)
  205. @annotate("string->bigint")
  206. class f_get_extractCount(object):
  207. def __init__(self):
  208. import time
  209. global time
  210. import logging
  211. import json
  212. import re
  213. global json,logging,re
  214. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  215. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  216. def evaluate(self, extractjson):
  217. if extractjson is not None:
  218. _extract = json.loads(extractjson)
  219. else:
  220. _extract = {}
  221. dict_pack = _extract.get("prem",{})
  222. extract_count = 0
  223. list_code = _extract.get("code",[])
  224. if len(list_code)>0:
  225. project_code = list_code[0]
  226. else:
  227. project_code = ""
  228. project_name = _extract.get("name","")
  229. bidding_budget = ""
  230. win_tenderer = ""
  231. win_bid_price = ""
  232. linklist_count = 0
  233. for _key in dict_pack.keys():
  234. if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
  235. extract_count += 1
  236. if bidding_budget=="":
  237. bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
  238. for _role in dict_pack[_key]["roleList"]:
  239. if isinstance(_role,list):
  240. extract_count += 1
  241. if _role[2]!='' and float(_role[2])>0:
  242. extract_count += 1
  243. if _role[0]=="tenderee":
  244. tenderee = _role[1]
  245. if _role[0]=="win_tenderer":
  246. if win_tenderer=="":
  247. win_tenderer = _role[1]
  248. if _role[2]!='' and float(_role[2])>0:
  249. extract_count += 1
  250. if win_bid_price=="":
  251. win_bid_price = str(float(_role[2]))
  252. if _role[0]=="agency":
  253. agency = _role[1]
  254. if isinstance(_role,dict):
  255. extract_count += 1
  256. if "role_money" in _role:
  257. if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
  258. extract_count += 1
  259. if _role.get("role_name")=="tenderee":
  260. tenderee = _role["role_text"]
  261. if _role.get("role_name")=="win_tenderer":
  262. if win_tenderer=="":
  263. win_tenderer = _role["role_text"]
  264. if "role_money" in _role:
  265. if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
  266. extract_count += 1
  267. if win_bid_price=="":
  268. win_bid_price = str(float(_role["role_money"]["money"]))
  269. if _role["role_name"]=="agency":
  270. agency = _role["role_text"]
  271. linklist = _role.get("linklist",[])
  272. for link in linklist:
  273. for l in link:
  274. if l!="":
  275. linklist_count += 1
  276. extract_count += linklist_count//2
  277. if project_code!="":
  278. extract_count += 1
  279. if project_name!="":
  280. extract_count += 1
  281. return extract_count
  282. @annotate('string,string,string,string,string -> string,string,string,bigint')
  283. class f_decode_sub_docs_json(BaseUDTF):
  284. def __init__(self):
  285. import logging
  286. import json
  287. global json,logging
  288. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  289. def process(self, project_code,project_name,tenderee,agency,sub_docs_json):
  290. columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
  291. extract_count = 0
  292. if project_code is not None and project_code!="":
  293. extract_count += 1
  294. if project_name is not None and project_name!="":
  295. extract_count += 1
  296. if tenderee is not None and tenderee!="":
  297. extract_count += 1
  298. if agency is not None and agency!="":
  299. extract_count += 1
  300. if sub_docs_json is not None:
  301. for sub_docs in json.loads(sub_docs_json):
  302. for _key_sub_docs in sub_docs.keys():
  303. extract_count += 1
  304. if _key_sub_docs in columns:
  305. if columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
  306. if _key_sub_docs in ["bidding_budget","win_bid_price"]:
  307. if float(sub_docs[_key_sub_docs])>0:
  308. columns[_key_sub_docs] = str(float(sub_docs[_key_sub_docs]))
  309. else:
  310. columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
  311. self.forward(columns["win_tenderer"],columns["bidding_budget"],columns["win_bid_price"],extract_count)
  312. @annotate('string,string,string -> string,string,string,string,string,string,string')
  313. class f_decode_for_dumplicate(BaseUDTF):
  314. def __init__(self):
  315. import logging
  316. import json
  317. global json,logging
  318. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  319. def process(self,sub_docs_json,extractjson,extract):
  320. if extractjson is None or extractjson=="":
  321. extractjson = "{}"
  322. try:
  323. _extract = json.loads(extractjson)
  324. except Exception as e:
  325. _extract = {}
  326. product = ",".join(_extract.get("product",[]))
  327. list_product = product.split(",")
  328. project_codes = ",".join(_extract.get("code",[]))
  329. list_code = project_codes.split(",")
  330. if sub_docs_json is not None:
  331. list_sub_docs = json.loads(sub_docs_json)
  332. else:
  333. list_sub_docs = [{}]
  334. max_len = max([len(list_product),len(list_code),len(list_sub_docs)])
  335. if extract!="extract":
  336. win_tenderer = ""
  337. bidding_budget = ""
  338. win_bid_price = ""
  339. for _subdoc in list_sub_docs:
  340. win_tenderer = _subdoc.get("win_tenderer","")
  341. bidding_budget = _subdoc.get("bidding_budget","0")
  342. if float(bidding_budget)==0:
  343. bidding_budget = ""
  344. else:
  345. bidding_budget = str(float(bidding_budget))
  346. win_bid_price = _subdoc.get("win_bid_price","0")
  347. if float(win_bid_price)==0:
  348. win_bid_price = ""
  349. else:
  350. win_bid_price = str(float(win_bid_price))
  351. if len(set([win_tenderer,bidding_budget,win_bid_price]))>=3:
  352. break
  353. print(("",product,"",project_codes,win_tenderer,bidding_budget,win_bid_price))
  354. self.forward("",product,"",project_codes,win_tenderer,bidding_budget,win_bid_price)
  355. else:
  356. for _i in range(max_len):
  357. _product = list_product[_i%len(list_product)]
  358. _code = list_code[_i%len(list_code)]
  359. _subdoc = list_sub_docs[_i%len(list_sub_docs)]
  360. win_tenderer = _subdoc.get("win_tenderer","")
  361. bidding_budget = _subdoc.get("bidding_budget","0")
  362. if float(bidding_budget)==0:
  363. bidding_budget = ""
  364. else:
  365. bidding_budget = str(float(bidding_budget))
  366. win_bid_price = _subdoc.get("win_bid_price","0")
  367. if float(win_bid_price)==0:
  368. win_bid_price = ""
  369. else:
  370. win_bid_price = str(float(win_bid_price))
  371. self.forward(_product,product,_code,project_codes,win_tenderer,bidding_budget,win_bid_price)
  372. @annotate("string->bigint")
  373. class totimestamp(object):
  374. def __init__(self):
  375. import time
  376. global time
  377. import logging
  378. import json
  379. import re
  380. global json,logging,re
  381. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  382. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  383. def evaluate(self, str_time):
  384. try:
  385. logging.info(str_time)
  386. if str_time is not None and re.search(self.time_pattern,str_time) is not None:
  387. timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
  388. timeStamp = int(time.mktime(timeArray))
  389. return timeStamp
  390. else:
  391. return 0
  392. except Exception as e:
  393. return 0
  394. @annotate("string->string")
  395. class refind_name(object):
  396. def __init__(self):
  397. import logging
  398. import re
  399. global logging,re
  400. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  401. def evaluate(self, title):
  402. if title is not None:
  403. return re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|\[|\]|【|】', '', title)
  404. return ""
  405. @annotate('bigint,bigint,bigint,string,bigint,string->string')
  406. class f_set_docid(BaseUDAF):
  407. '''
  408. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  409. '''
  410. def __init__(self):
  411. import json
  412. global json
  413. def new_buffer(self):
  414. return [[]]
  415. def iterate(self, buffer,docid, page_time_stamp,extract_count,defind_column,defind_count,tenderee):
  416. buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
  417. "defind_column":defind_column,"defind_count":defind_count,"tenderee":tenderee})
  418. def merge(self, buffer, pbuffer):
  419. buffer[0].extend(pbuffer[0])
  420. def terminate(self, buffer):
  421. list_docs = buffer[0]
  422. list_docs.sort(key=lambda x:x["page_time_stamp"])
  423. list_group = []
  424. _begin = 0
  425. defind_count = 0
  426. if len(list_docs)>0:
  427. defind_count = list_docs[0]["defind_count"]
  428. print(defind_count)
  429. for i in range(len(list_docs)-1):
  430. if abs(list_docs[i]["page_time_stamp"]-list_docs[i+1]["page_time_stamp"])<=86400*7:
  431. continue
  432. else:
  433. _group = []
  434. _set_column = set()
  435. _set_tenderee = set()
  436. for j in range(_begin,i+1):
  437. if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
  438. _set_tenderee.add(list_docs[j]["tenderee"])
  439. _set_column.add(list_docs[j]["defind_column"])
  440. _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
  441. if len(_group)>=3 and len(_set_tenderee)>1:
  442. pass
  443. else:
  444. print(defind_count,len(_set_column))
  445. if len(_group)>1:
  446. if defind_count==2:
  447. if len(_set_column)>=2:
  448. list_group.append(_group)
  449. elif defind_count==1:
  450. if len(_set_column)==1:
  451. list_group.append(_group)
  452. elif defind_count==0:
  453. list_group.append(_group)
  454. _begin = i+1
  455. if len(list_docs)>1:
  456. _set_column = set()
  457. _set_tenderee = set()
  458. _group = []
  459. for j in range(_begin,len(list_docs)):
  460. if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
  461. _set_tenderee.add(list_docs[j]["tenderee"])
  462. _set_column.add(list_docs[j]["defind_column"])
  463. _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
  464. if len(_group)>=3 and len(_set_tenderee)>1:
  465. pass
  466. else:
  467. if len(_group)>1:
  468. if defind_count==2:
  469. if len(_set_column)>=2:
  470. list_group.append(_group)
  471. elif defind_count==1:
  472. if len(_set_column)==1:
  473. list_group.append(_group)
  474. elif defind_count==0:
  475. list_group.append(_group)
  476. return json.dumps(list_group)
  477. # def terminate(self, buffer):
  478. #
  479. #
  480. # list_docs = buffer[0]
  481. # if len(list_docs)>0:
  482. # defind_count = list_docs[0]["defind_count"]
  483. #
  484. # list_time_group = split_with_time(list_docs,"page_time_stamp",86400*2)
  485. #
  486. # list_group = []
  487. # for time_group in list_time_group:
  488. # _group = []
  489. # _set_column = set()
  490. # base_tenderee = ""
  491. # _set_tenderee = set()
  492. # for j in range(len(time_group)):
  493. # if time_group[j]["tenderee"] is not None and time_group[j]["tenderee"]!="":
  494. # # if base_tenderee =="":
  495. # # base_tenderee = time_group[j]["tenderee"]
  496. # # _set_tenderee.add(time_group[j]["tenderee"])
  497. # # simi = getSimilarityOfString(base_tenderee,time_group[j]["tenderee"])
  498. # # if simi<0.8:
  499. # # _set_tenderee.add(time_group[j]["tenderee"])
  500. #
  501. # _set_tenderee.add(time_group[j]["tenderee"])
  502. # _set_column.add(time_group[j]["defind_column"])
  503. # _group.append({"docid":time_group[j]["docid"],"extract_count":time_group[j]["extract_count"]})
  504. #
  505. # if len(_group)>=3 and len(_set_tenderee)>1:
  506. # pass
  507. # else:
  508. # if len(_group)>1:
  509. # if defind_count==2:
  510. # if len(_set_column)>=2:
  511. # list_group.append(_group)
  512. # elif defind_count==1:
  513. # if len(_set_column)==1:
  514. # list_group.append(_group)
  515. # elif defind_count==0:
  516. # list_group.append(_group)
  517. #
  518. # return json.dumps(list_group)
  519. def isEmpty(_str):
  520. if _str is None or _str=="":
  521. return True
  522. return False
  523. @annotate('bigint->string')
  524. class f_group_fingerprint(BaseUDAF):
  525. def __init__(self):
  526. import json
  527. global json
  528. def new_buffer(self):
  529. return [[]]
  530. def iterate(self, buffer,docid):
  531. buffer[0].append(docid)
  532. def merge(self, buffer, pbuffer):
  533. buffer[0].extend(pbuffer[0][:100000])
  534. def terminate(self, buffer):
  535. list_docid = buffer[0][:100000]
  536. list_docid.sort(key=lambda x:x)
  537. return ",".join([str(a) for a in list_docid])
  538. @annotate('string->bigint,string')
  539. class f_ungroup_fingerprint(BaseUDTF):
  540. def process(self,dumplicates):
  541. list_docid = dumplicates.split(",")
  542. self.forward(int(list_docid[0]),",".join(list_docid[1:]))
  543. @annotate('bigint,bigint,string->string')
  544. class f_dump_probability(BaseUDAF):
  545. '''
  546. 合并组为一条记录
  547. '''
  548. def __init__(self):
  549. import json
  550. global json
  551. def new_buffer(self):
  552. return [[]]
  553. def iterate(self, buffer,docid,page_time_stamp,_type):
  554. buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"type":_type})
  555. def merge(self, buffer, pbuffer):
  556. buffer[0].extend(pbuffer[0])
  557. def terminate(self, buffer):
  558. list_dict = buffer[0]
  559. _set = set()
  560. list_data = []
  561. for _dict in list_dict:
  562. docid = _dict["docid"]
  563. if docid in _set:
  564. continue
  565. _set.add(docid)
  566. list_data.append(_dict)
  567. if len(list_data)>10000:
  568. break
  569. list_group = split_with_time(list_data,sort_key="page_time_stamp",timedelta=86400*7)
  570. return json.dumps(list_group)
  571. @annotate('string -> bigint,bigint,bigint,bigint,string')
  572. class f_split_dumplicate_probability(BaseUDTF):
  573. def __init__(self):
  574. import logging
  575. import json
  576. global logging,json
  577. logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  578. def process(self,list_group_str):
  579. logging.info("0")
  580. logging.info(list_group_str)
  581. if list_group_str is not None:
  582. logging.info("1")
  583. try:
  584. list_group = json.loads(list_group_str)
  585. logging.info("2")
  586. for _group in list_group:
  587. if len(_group)>0:
  588. _type = _group[0].get("type","")
  589. logging.info("3%d"%len(list_group))
  590. # _group.sort(key=lambda x:x["page_time_stamp"])
  591. _len = min(100,len(_group))
  592. for _index_i in range(_len):
  593. _count = 0
  594. for _index_j in range(_index_i+1,_len):
  595. if abs(_group[_index_j]["page_time_stamp"]-_group[_index_i]["page_time_stamp"])>86400*120:
  596. break
  597. _count += 1
  598. _docid1 = _group[_index_i]["docid"]
  599. _docid2 = _group[_index_j]["docid"]
  600. if _docid1<_docid2:
  601. self.forward(_docid1,_docid2,1,_len,_type)
  602. elif _docid1>_docid2:
  603. self.forward(_docid2,_docid1,1,_len,_type)
  604. except Exception as e:
  605. logging(str(e))
  606. @annotate('bigint,bigint,string->string')
  607. class f_dumplicate_groupPairs(BaseUDAF):
  608. '''
  609. 合并组为一条记录
  610. '''
  611. def __init__(self):
  612. import json
  613. global json
  614. def new_buffer(self):
  615. return [[]]
  616. def iterate(self, buffer,is_exists,counts,_type):
  617. buffer[0].append({"is_exists":is_exists,"counts":counts,"_type":_type})
  618. def merge(self, buffer, pbuffer):
  619. buffer[0].extend(pbuffer[0])
  620. def terminate(self, buffer):
  621. list_dict = buffer[0]
  622. list_dict = list_dict[:10000]
  623. return json.dumps(list_dict)
  624. def check_columns(tenderee_less,tenderee_greater,
  625. agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
  626. win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
  627. bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
  628. flag = True
  629. _set_tenderee = set()
  630. if tenderee_less is not None and tenderee_less!="":
  631. _set_tenderee.add(tenderee_less)
  632. if tenderee_greater is not None and tenderee_greater!="":
  633. _set_tenderee.add(tenderee_greater)
  634. if len(_set_tenderee)>1:
  635. return False
  636. code_sim = getSimilarityOfString(project_code_less,project_code_greater)
  637. if code_sim>0.6 and code_sim<1:
  638. return False
  639. #同批次不同编号
  640. if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
  641. _split_code_less = project_code_less.split("-")
  642. _split_code_greater = project_code_greater.split("-")
  643. if len(_split_code_less)>1 and len(_split_code_greater)>1:
  644. if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
  645. return False
  646. _set_win_tenderer = set()
  647. if win_tenderer_less is not None and win_tenderer_less!="":
  648. _set_win_tenderer.add(win_tenderer_less)
  649. if win_tenderer_greater is not None and win_tenderer_greater!="":
  650. _set_win_tenderer.add(win_tenderer_greater)
  651. if len(_set_win_tenderer)>1:
  652. return False
  653. _set_win_bid_price = set()
  654. if win_bid_price_less is not None and win_bid_price_less!="":
  655. _set_win_bid_price.add(float(win_bid_price_less))
  656. if win_bid_price_greater is not None and win_bid_price_greater!="":
  657. _set_win_bid_price.add(float(win_bid_price_greater))
  658. if len(_set_win_bid_price)>1:
  659. return False
  660. _set_bidding_budget = set()
  661. if bidding_budget_less is not None and bidding_budget_less!="":
  662. _set_bidding_budget.add(float(bidding_budget_less))
  663. if bidding_budget_greater is not None and bidding_budget_greater!="":
  664. _set_bidding_budget.add(float(bidding_budget_greater))
  665. if len(_set_bidding_budget)>1:
  666. return False
  667. return True
  668. import math
  669. def featurnCount(_count,max_count=100):
  670. return max(0,min(1,_count))*(1/math.sqrt(max(1,_count-1)))
  671. def getSimLevel(str1,str2):
  672. str1_null = False
  673. str2_null = False
  674. _v = 0
  675. if str1 is None or str1=="":
  676. str1_null = True
  677. if str2 is None or str2=="":
  678. str2_null = True
  679. if str1_null and str2_null:
  680. _v = 2
  681. elif str1_null and not str2_null:
  682. _v = 4
  683. elif not str1_null and str2_null:
  684. _v = 6
  685. elif not str1_null and not str2_null:
  686. if str1==str2:
  687. _v = 10
  688. else:
  689. _v = 0
  690. return _v
  691. def getLength(_str):
  692. return len(_str if _str is not None else "")
def check_money(bidding_budget_less,bidding_budget_greater,
                win_bid_price_less,win_bid_price_greater,
                moneys_less,moneys_greater,
                moneys_attachment_less,moneys_attachment_greater):
    """Check whether the budgets and winning-bid prices of two documents agree.

    bidding_budget_* / win_bid_price_* are numeric strings (or empty/None).
    moneys_* / moneys_attachment_* are containers of money values extracted
    from each document's text/attachments; a mismatch is excused when one
    side's amount appears in the other side's container.
    Returns False on an unexcused mismatch, True otherwise.
    """
    # Only the six most significant digits are compared: round to an integer,
    # then round again at ndigits 6-len(...) — negative for values with more
    # than six digits, which zeroes everything past the sixth digit.
    if getLength(bidding_budget_less)>0:
        bidding_budget_less = round(float(bidding_budget_less))
        bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
    if getLength(bidding_budget_greater)>0:
        bidding_budget_greater = round(float(bidding_budget_greater))
        bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
    if getLength(win_bid_price_less)>0:
        win_bid_price_less = round(float(win_bid_price_less))
        win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
    if getLength(win_bid_price_greater)>0:
        win_bid_price_greater = round(float(win_bid_price_greater))
        win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
    # "" acts as an "undecided" sentinel; it becomes True when a mismatch is excused
    budget_is_same = ""
    price_is_same = ""
    if getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
        budget_less = float(bidding_budget_less)
        budget_greater = float(bidding_budget_greater)
        if budget_less!=budget_greater:
            # exact 10000x ratio — presumably a 元 vs 万元 unit mismatch; treated as equal
            if min(budget_less,budget_greater)>0:
                if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
                    budget_is_same = True
            # equal when rounded to 2 decimals of the 万 (10k) unit
            if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
                budget_is_same = True
            # mismatch excused when the amount also occurs in the other doc's money sets
            if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
                budget_is_same = True
            if budget_greater in moneys_less or budget_greater in moneys_attachment_less:
                budget_is_same = True
            if budget_is_same=="":
                return False
    if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
        price_less = float(win_bid_price_less)
        price_greater = float(win_bid_price_greater)
        if price_less!=price_greater:
            if min(price_less,price_greater)>0:
                if max(price_less,price_greater)/min(price_less,price_greater)==10000:
                    price_is_same = True
            if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
                price_is_same = True
            if price_less in moneys_greater or price_less in moneys_attachment_greater:
                price_is_same = True
            if price_greater in moneys_less or price_greater in moneys_attachment_less:
                price_is_same = True
            if price_is_same=="":
                return False
    return True
  744. def check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  745. tenderee_less,tenderee_greater,
  746. agency_less,agency_greater,
  747. win_tenderer_less,win_tenderer_greater,
  748. similarity=0.85):
  749. def get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,entity_less,entity_greater,similarity):
  750. if getLength(entity_less)>0 and getLength(entity_greater)>0:
  751. if entity_less!=entity_greater:
  752. is_same = ''
  753. _sim = jaccard_score(entity_less,entity_greater)
  754. if _sim>similarity:
  755. is_same = True
  756. if is_same=='':
  757. if str(nlp_enterprise_less).find(entity_greater)>0 or str(nlp_enterprise_greater).find(entity_less)>0:
  758. is_same = True
  759. if is_same=='':
  760. return False
  761. return True
  762. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,similarity):
  763. return False
  764. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,agency_less,agency_greater,similarity):
  765. return False
  766. if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,win_tenderer_less,win_tenderer_greater,similarity):
  767. return False
  768. return True
  769. def check_codes(project_codes_less,project_codes_greater):
  770. #check the similarity
  771. is_same = False
  772. is_sim = False
  773. for project_code_less in project_codes_less:
  774. for project_code_greater in project_codes_greater:
  775. code_sim = getSimilarityOfString(project_code_less,project_code_greater)
  776. if project_code_less is not None and project_code_greater is not None:
  777. if code_sim>0.6:
  778. if str(project_code_less).find(str(project_code_greater))>=0 or str(project_code_greater).find(str(project_code_less))>=0:
  779. is_same = True
  780. else:
  781. is_sim = True
  782. if project_code_less!=project_code_greater:
  783. if code_sim>0.4 and len(project_code_less)==len(project_code_greater):
  784. is_sim = True
  785. if is_same:
  786. return True
  787. if is_sim:
  788. return False
  789. return True
  790. def check_demand():
  791. return True
# Patterns used by check_doctitle to normalize and compare refined titles.
# Package/lot numbers such as 包1 / 标段二 / 第3包. (The "?" after 第 was
# removed so titles like "纯木浆8包/箱复印" are not mis-read as a package number.)
package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型|项目)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))")
# Alphanumeric, code-like fragments (letters, digits, dashes, brackets, dots).
code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.-]+")
# A bare integer or decimal number (whole-string match via re.search with anchors).
num_pattern = re.compile("^\d+(?:\.\d+)?$")
# Runs of Chinese numerals or Latin letters.
num1_pattern = re.compile("[一二三四五六七八九A-Za-z]+")
# Short location fragments ending in 市/区/镇/县/村/路.
location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
# Procurement/construction stage keywords; kept as a plain string — re.findall
# compiles it on each use.
building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
# Dates like 2023-01-02, 2023.1.2 or 2023年1月2日.
date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
  799. def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[]):
  800. if code_greater is None:
  801. code_greater = []
  802. doctitle_refind_less = str(doctitle_refind_less).replace("(","(").replace(")",")")
  803. doctitle_refind_greater = str(doctitle_refind_greater).replace("(","(").replace(")",")")
  804. for _c in codes_less:
  805. doctitle_refind_less = str(doctitle_refind_less).replace(_c,"")
  806. for _c in code_greater:
  807. doctitle_refind_greater = str(doctitle_refind_greater).replace(_c,"")
  808. doctitle_refind_less = re.sub(date_pattern,"",doctitle_refind_less)
  809. doctitle_refind_greater = re.sub(date_pattern,"",doctitle_refind_greater)
  810. #check the package
  811. if doctitle_refind_less is None:
  812. doctitle_refind_less = ""
  813. if doctitle_refind_greater is None:
  814. doctitle_refind_greater = ""
  815. _pack1 = None
  816. _pack2 = None
  817. #if contain then pass
  818. if doctitle_refind_less.find(doctitle_refind_greater)>=0 or doctitle_refind_greater.find(doctitle_refind_less)>=0:
  819. return True
  820. #check the package in title
  821. _match = re.search(package_number_pattern,doctitle_refind_less)
  822. if _match is not None:
  823. _pack1 = _match.groupdict()["name"]
  824. _match = re.search(package_number_pattern,doctitle_refind_greater)
  825. if _match is not None:
  826. _pack2 = _match.groupdict()["name"]
  827. if _pack1 is not None and _pack2 is not None:
  828. if _pack1!=_pack2:
  829. return False
  830. #check the nums in title
  831. doctitle_refind_less = re.sub(package_number_pattern,"",doctitle_refind_less)
  832. doctitle_refind_greater = re.sub(package_number_pattern,"",doctitle_refind_greater)
  833. #check the nums,location,building in title
  834. for _p in [code_pattern]:
  835. num_all_l = re.findall(_p,doctitle_refind_less)
  836. num_all_g = re.findall(_p,doctitle_refind_greater)
  837. set_num_l = set()
  838. set_num_g = set()
  839. for _l in num_all_l:
  840. if re.search(num_pattern,_l) is not None:
  841. if _l.find(".")>0:
  842. set_num_l.add(_l)
  843. elif len(_l)<4:
  844. set_num_l.add(_l)
  845. for _g in num_all_g:
  846. if re.search(num_pattern,_g) is not None:
  847. if _g.find(".")>0:
  848. set_num_g.add(_g)
  849. elif len(_g)<4:
  850. set_num_g.add(_g)
  851. if len(set_num_l)>0 and len(set_num_g)>0:
  852. if len(set_num_l&set_num_g)!=len(set_num_l):
  853. return False
  854. #check location and keywords
  855. for _p in [num1_pattern,building_pattern]:
  856. num_all_l = re.findall(_p,doctitle_refind_less)
  857. num_all_g = re.findall(_p,doctitle_refind_greater)
  858. set_num_l = set(num_all_l)
  859. set_num_g = set(num_all_g)
  860. if len(set_num_l)==len(set_num_g):
  861. if len(set_num_l&set_num_g)!=len(set_num_l):
  862. return False
  863. #check the location has conflict
  864. for _p in [location_pattern]:
  865. num_all_l = re.findall(_p,doctitle_refind_less)
  866. num_all_g = re.findall(_p,doctitle_refind_greater)
  867. dict_num_l = {}
  868. dict_num_g = {}
  869. for _l in num_all_l:
  870. if len(_l)>0:
  871. key = _l[-1:]
  872. if key not in dict_num_l:
  873. dict_num_l[key] = set()
  874. dict_num_l[key].add(_l)
  875. for _g in num_all_g:
  876. if len(_g)>0:
  877. key = _g[-1:]
  878. if key not in dict_num_g:
  879. dict_num_g[key] = set()
  880. dict_num_g[key].add(_g)
  881. for k,v in dict_num_l.items():
  882. if k in dict_num_g:
  883. if len(v&dict_num_g[k])==0:
  884. return False
  885. return True
  886. def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
  887. if getLength(product_less)>0 and getLength(product_greater)>0:
  888. _product_l = product_less.split(split_char)
  889. _product_g = product_greater.split(split_char)
  890. same_count = 0
  891. if len(_product_l)>len(_product_g):
  892. a = _product_g
  893. _product_g = _product_l
  894. _product_l = a
  895. for _l in _product_l:
  896. for _g in _product_g:
  897. if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>-0 or doctitle_refine_less.find(_g)>=0:
  898. same_count += 1
  899. break
  900. if same_count/len(_product_l)>=0.5:
  901. return True
  902. return False
  903. return True
  904. def check_package(package_less,package_greater,split_char=","):
  905. if getLength(package_less)>0 and getLength(package_greater)>0:
  906. _product_l = package_less.split(split_char)
  907. _product_g = package_greater.split(split_char)
  908. for _l in _product_l:
  909. for _g in _product_g:
  910. if _l==_g:
  911. return True
  912. return False
  913. return True
  914. def check_time(json_time_less,json_time_greater):
  915. has_same = False
  916. has_diff = False
  917. if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
  918. if isinstance(json_time_less,dict):
  919. time_less = json_time_less
  920. else:
  921. time_less = json.loads(json_time_less)
  922. if isinstance(json_time_greater,dict):
  923. time_greater = json_time_greater
  924. else:
  925. time_greater = json.loads(json_time_greater)
  926. for k,v in time_less.items():
  927. if getLength(v)>0:
  928. v1 = time_greater.get(k,"")
  929. if getLength(v1)>0:
  930. if v[:10]!=v1[:10]:
  931. has_diff = True
  932. else:
  933. has_same = True
  934. if has_same:
  935. if has_diff:
  936. return 1
  937. return 2
  938. if has_diff:
  939. return 0
  940. return 1
def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater="",moneys_less=set(),moneys_greater=set(),moneys_attachment_less=set(),moneys_attachment_greater=set(),page_attachments_less="[]",page_attachments_greater="[]"):
    """Score whether two documents are duplicates.

    Returns 1 for a certain duplicate, 0 for a certain non-duplicate, and
    otherwise a probability `base_prob*same_count/all_count` (possibly 0)
    after the per-field rule checks. `min_counts` is the candidate-group
    size (larger groups lower base_prob); `b_log` enables diagnostics;
    `hard_level==2` additionally requires a positive product match.
    NOTE(review): the `set()` defaults are shared mutable defaults — they are
    only read here, but should not be mutated by callers.
    """
    # identical non-empty fingerprints ⇒ duplicate
    if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
        return 1
    # One doc's key info lives only in attachments and the two docs' attachment
    # md5 sets overlap (covering all of the "less" side) ⇒ compare products only.
    set_md5_less = set()
    set_md5_greater = set()
    list_md5_less = json.loads(page_attachments_less)
    list_md5_greater = json.loads(page_attachments_greater)
    for _l in list_md5_less:
        _md5 = _l.get("fileMd5")
        if _md5 is not None:
            set_md5_less.add(_md5)
    for _l in list_md5_greater:
        _md5 = _l.get("fileMd5")
        if _md5 is not None:
            set_md5_greater.add(_md5)
    if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
        one_in_attach = False
        dict_enterprise_less = json.loads(nlp_enterprise_less)
        dict_enterprise_greater = json.loads(nlp_enterprise_greater)
        indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
        notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
        indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
        notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
        # "one_in_attach": most enterprises were found outside the doc text
        if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
            one_in_attach = True
        if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
            one_in_attach = True
        if one_in_attach:
            if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
                return 1
    # normalize project codes to lists
    if isinstance(project_codes_less,str):
        project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
    elif project_codes_less is None:
        project_codes_less = []
    if isinstance(project_codes_greater,str):
        project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
    elif project_codes_greater is None:
        project_codes_greater = []
    # count exactly-matching fields out of 8
    same_count = 0
    all_count = 8
    if len(set(project_codes_less) & set(project_codes_greater))>0:
        same_count += 1
    if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
        same_count += 1
    if getLength(agency_less)>0 and agency_less==agency_greater:
        same_count += 1
    if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
        same_count += 1
    if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
        same_count += 1
    if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
        same_count += 1
    if getLength(project_name_less)>0 and project_name_less==project_name_greater:
        same_count += 1
    if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
        same_count += 1
    # smaller candidate groups get a higher base probability
    base_prob = 0
    if min_counts<3:
        base_prob = 0.9
    elif min_counts<5:
        base_prob = 0.8
    elif min_counts<8:
        base_prob = 0.7
    else:
        base_prob = 0.6
    _prob = base_prob*same_count/all_count
    # poorly-extracted documents get a floor probability
    if min(extract_count_less,extract_count_greater)<=3:
        if _prob<0.1:
            _prob = 0.15
    # two different, known provinces ⇒ not duplicates
    if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
        return 0
    if _prob<0.1:
        return _prob
    # per-field results: 0 = conflict, 1 = no evidence, 2 = positive match
    check_result = {"pass":1}
    if docchannel_less in (51,102,103,104,115,116,117):
        if doctitle_refine_less!=doctitle_refine_greater:
            if page_time_less!=page_time_greater:
                check_result["docchannel"] = 0
                check_result["pass"] = 0
            else:
                # NOTE(review): indentation reconstructed — this else is read as
                # binding to the page_time check (titles differ, same day ⇒ 2);
                # confirm against upstream history.
                check_result["docchannel"] = 2
    if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
        check_result["doctitle"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
    else:
        check_result["doctitle"] = 2
    #added check
    if not check_codes(project_codes_less,project_codes_greater):
        check_result["code"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
    else:
        if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
            check_result["code"] = 2
        else:
            check_result["code"] = 1
    if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
        check_result["product"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
    else:
        if getLength(product_less)>0 and getLength(product_greater)>0:
            check_result["product"] = 2
        else:
            check_result["product"] = 1
    if not check_demand():
        check_result["pass"] = 0
    if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                        tenderee_less,tenderee_greater,
                        agency_less,agency_greater,
                        win_tenderer_less,win_tenderer_greater):
        check_result["entity"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
    else:
        if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
            check_result["entity"] = 2
        elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
            check_result["entity"] = 2
        else:
            check_result["entity"] = 1
    # NOTE(review): these two logs are not guarded by b_log and run on every
    # call; the second one is labelled "moneys_less" but prints the greater side.
    logging.info("moneys_less"+str(moneys_less)+"---"+str(moneys_attachment_less))
    logging.info("moneys_less"+str(moneys_greater)+"---"+str(moneys_attachment_greater))
    if not check_money(bidding_budget_less,bidding_budget_greater,
                       win_bid_price_less,win_bid_price_greater,
                       moneys_less,moneys_greater,
                       moneys_attachment_less,moneys_attachment_greater):
        if b_log:
            logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
        check_result["money"] = 0
        check_result["pass"] = 0
    else:
        if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
            check_result["money"] = 2
        elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
            check_result["money"] = 2
        else:
            check_result["money"] = 1
    #added check
    if not check_package(package_less,package_greater):
        if b_log:
            logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
        check_result["package"] = 0
        check_result["pass"] = 0
    else:
        if getLength(package_less)>0 and getLength(package_greater)>0:
            check_result["package"] = 2
        else:
            check_result["package"] = 1
    #added check
    _time_check = check_time(json_time_less,json_time_greater)
    # check_time returns 0/1/2; 0 always fails, 1 (mixed) fails only for channels 51/103
    if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
        if b_log:
            logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
            # diagnostic dump of the differing time keys
            if isinstance(json_time_less,dict):
                time_less = json_time_less
            else:
                time_less = json.loads(json_time_less)
            if isinstance(json_time_greater,dict):
                time_greater = json_time_greater
            else:
                time_greater = json.loads(json_time_greater)
            for k,v in time_less.items():
                if getLength(v)>0:
                    v1 = time_greater.get(k,"")
                    if getLength(v1)>0:
                        if v!=v1:
                            logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
        check_result["time"] = 0
        check_result["pass"] = 0
    else:
        if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
            check_result["time"] = 2
        else:
            check_result["time"] = 1
    if hard_level==2 and check_result["product"]<=1:
        return 0
    # when any check failed, allow through only if all strong signals match
    if check_result.get("pass",0)==0:
        if b_log:
            logging.info(str(check_result))
        if check_result.get("money",1)==0:
            return 0
        if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
            return _prob
        else:
            return 0
    return _prob
  1134. def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
  1135. if web_source_no_less==web_source_no_greater:
  1136. if fingerprint_less==fingerprint_greater:
  1137. return 1
  1138. else:
  1139. return 0
  1140. if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
  1141. return 1
  1142. if isinstance(project_codes_less,str):
  1143. project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
  1144. elif project_codes_less is None:
  1145. project_codes_less = []
  1146. if isinstance(project_codes_greater,str):
  1147. project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
  1148. elif project_codes_greater is None:
  1149. project_codes_greater = []
  1150. same_count = 0
  1151. all_count = 8
  1152. if len(set(project_codes_less) & set(project_codes_greater))>0:
  1153. same_count += 1
  1154. if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
  1155. same_count += 1
  1156. if getLength(agency_less)>0 and agency_less==agency_greater:
  1157. same_count += 1
  1158. if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
  1159. same_count += 1
  1160. if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
  1161. same_count += 1
  1162. if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
  1163. same_count += 1
  1164. if getLength(project_name_less)>0 and project_name_less==project_name_greater:
  1165. same_count += 1
  1166. if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
  1167. same_count += 1
  1168. base_prob = 0
  1169. if min_counts<3:
  1170. base_prob = 0.9
  1171. elif min_counts<5:
  1172. base_prob = 0.8
  1173. elif min_counts<8:
  1174. base_prob = 0.7
  1175. else:
  1176. base_prob = 0.6
  1177. _prob = base_prob*same_count/all_count
  1178. if min(extract_count_less,extract_count_greater)<=3:
  1179. if _prob<0.1:
  1180. _prob = 0.15
  1181. if province_less!=province_greater:
  1182. return 0
  1183. if _prob<0.1:
  1184. return _prob
  1185. check_result = {"pass":1}
  1186. if docchannel_less in (51,102,103,104,115,116,117):
  1187. if doctitle_refine_less!=doctitle_refine_greater:
  1188. if page_time_less!=page_time_greater:
  1189. check_result["docchannel"] = 0
  1190. check_result["pass"] = 0
  1191. else:
  1192. check_result["docchannel"] = 2
  1193. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
  1194. check_result["doctitle"] = 0
  1195. check_result["pass"] = 0
  1196. if b_log:
  1197. logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
  1198. else:
  1199. check_result["doctitle"] = 2
  1200. #added check
  1201. if not check_codes(project_codes_less,project_codes_greater):
  1202. check_result["code"] = 0
  1203. check_result["pass"] = 0
  1204. if b_log:
  1205. logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
  1206. else:
  1207. if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
  1208. check_result["code"] = 2
  1209. else:
  1210. check_result["code"] = 1
  1211. if not check_product(product_less,product_greater):
  1212. check_result["product"] = 0
  1213. check_result["pass"] = 0
  1214. if b_log:
  1215. logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
  1216. else:
  1217. if getLength(product_less)>0 and getLength(product_greater)>0:
  1218. check_result["product"] = 2
  1219. else:
  1220. check_result["product"] = 1
  1221. if not check_demand():
  1222. check_result["pass"] = 0
  1223. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  1224. tenderee_less,tenderee_greater,
  1225. agency_less,agency_greater,
  1226. win_tenderer_less,win_tenderer_greater):
  1227. check_result["entity"] = 0
  1228. check_result["pass"] = 0
  1229. if b_log:
  1230. logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
  1231. else:
  1232. if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
  1233. check_result["entity"] = 2
  1234. elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
  1235. check_result["entity"] = 2
  1236. else:
  1237. check_result["entity"] = 1
  1238. if not check_money(bidding_budget_less,bidding_budget_greater,
  1239. win_bid_price_less,win_bid_price_greater):
  1240. if b_log:
  1241. logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
  1242. check_result["money"] = 0
  1243. check_result["pass"] = 0
  1244. else:
  1245. if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  1246. check_result["money"] = 2
  1247. elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  1248. check_result["money"] = 2
  1249. else:
  1250. check_result["money"] = 1
  1251. #added check
  1252. if not check_package(package_less,package_greater):
  1253. if b_log:
  1254. logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
  1255. check_result["package"] = 0
  1256. check_result["pass"] = 0
  1257. else:
  1258. if getLength(package_less)>0 and getLength(package_greater)>0:
  1259. check_result["package"] = 2
  1260. else:
  1261. check_result["package"] = 1
  1262. #added check
  1263. if not check_time(json_time_less,json_time_greater):
  1264. if b_log:
  1265. logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
  1266. if isinstance(json_time_less,dict):
  1267. time_less = json_time_less
  1268. else:
  1269. time_less = json.loads(json_time_less)
  1270. if isinstance(json_time_greater,dict):
  1271. time_greater = json_time_greater
  1272. else:
  1273. time_greater = json.loads(json_time_greater)
  1274. for k,v in time_less.items():
  1275. if getLength(v)>0:
  1276. v1 = time_greater.get(k,"")
  1277. if getLength(v1)>0:
  1278. if v!=v1:
  1279. logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
  1280. check_result["time"] = 0
  1281. check_result["pass"] = 0
  1282. else:
  1283. if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
  1284. check_result["time"] = 2
  1285. else:
  1286. check_result["time"] = 1
  1287. if hard_level==2 and check_result["product"]<=1:
  1288. return 0
  1289. if check_result.get("pass",0)==0:
  1290. if b_log:
  1291. logging.info(str(check_result))
  1292. if check_result.get("money",1)==0:
  1293. return 0
  1294. if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
  1295. return _prob
  1296. else:
  1297. return 0
  1298. if check_result.get("time",1)==0:
  1299. return 0
  1300. return _prob
  1301. @annotate("bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->double")
  1302. class f_dumplicate_check(BaseUDTF):
  1303. def __init__(self):
  1304. import logging
  1305. import json
  1306. global logging,json
  1307. def process(self,docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,
  1308. tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,
  1309. bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,
  1310. project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,
  1311. extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,
  1312. page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,
  1313. package_less,package_greater,json_time_less,json_time_greater,json_context,
  1314. province_less,province_greater,city_less,city_greater,district_less,district_greater,
  1315. web_source_no_less,web_source_no_greater,
  1316. extract_json_less,extract_json_greater,page_attachments_less,page_attachments_greater):
  1317. min_counts = 100
  1318. if json_context is not None:
  1319. _context = json.loads(json_context)
  1320. for item in _context:
  1321. if item.get("counts",0)>0 and item.get("counts",0)<min_counts:
  1322. min_counts = item["counts"]
  1323. _extract_less = {}
  1324. if extract_json_less is not None:
  1325. _extract_less = json.loads(extract_json_less)
  1326. _extract_greater = {}
  1327. if extract_json_greater is not None:
  1328. _extract_greater = json.loads(extract_json_greater)
  1329. moneys_less = set(_extract_less.get("moneys",[]))
  1330. moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
  1331. moneys_greater = set(_extract_greater.get("moneys",[]))
  1332. moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
  1333. if page_attachments_less is None:
  1334. page_attachments_less = '[]'
  1335. if page_attachments_greater is None:
  1336. page_attachments_greater = '[]'
  1337. _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
  1338. self.forward(_prob)
@annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
class f_dumplicate_featureMatrix(BaseUDTF):
    # UDTF: run the pairwise duplicate checks on a (less, greater) document
    # pair and forward (reason_or_debug_string, probability).  Any failed
    # check forwards a tagged reason ("[1-...]" .. "7-...") with probability
    # 0 and returns early; when every check passes, the probability is
    # base_prob * same_count / all_count, where base_prob shrinks as the
    # smallest "counts" value in json_context grows.
    def __init__(self):
        import logging
        import json
        global logging,json
    def process(self,json_context,docchannel_less,docchannel_greater,page_time_less,page_time_greater,nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,
                agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
                win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
                bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater,product_less,product_greater):
        #check the page_time by special docchannel
        # On these channels, differing titles AND differing publish dates
        # mean the pair cannot be a duplicate.
        if docchannel_less in (51,102,103,104,115,116,117):
            if doctitle_refine_less!=doctitle_refine_greater:
                if page_time_less!=page_time_greater:
                    self.forward("[1-%s]"%(str(docchannel_less)),0)
                    return
        if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
            self.forward("[2-%s]"%(str(doctitle_refine_less)+"=="+str(doctitle_refine_greater)),0)
            return
        # if not check_codes([project_code_less],[project_code_greater]):
        #     self.forward("[3-%s]"%(str(project_code_less)+"=="+str(project_code_greater)),0)
        #     return
        if not check_demand():
            self.forward("[4-]",0)
            return
        if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                            tenderee_less,tenderee_greater,
                            agency_less,agency_greater,
                            win_tenderer_less,win_tenderer_greater):
            # Concatenate all entity inputs into the failure reason.
            _error = ""
            for a in [nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater]:
                _error += str(a)
            self.forward("[5-%s]"%_error,0)
            return
        if not check_money(bidding_budget_less,bidding_budget_greater,
                           win_bid_price_less,win_bid_price_greater):
            _error = ""
            for a in [bidding_budget_less,bidding_budget_greater,
                      win_bid_price_less,win_bid_price_greater]:
                _error += str(a)
            self.forward("[6-%s]"%_error,0)
            return
        if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
            _error = "%s=%s"%(str(product_less),str(product_greater))
            self.forward("7-%s"%_error,0)
            return
        # All checks passed: derive min_counts and the per-type context map.
        _context = json.loads(json_context)
        min_counts = 100
        dict_context = {}
        for item in _context:
            if item["counts"]<min_counts:
                min_counts = item["counts"]
            dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
        context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
        list_matrix = []
        #get the featurn of the context into matrix
        # for index_i in range(len(context_key)):
        #     for index_j in range(index_i+1,len(context_key)):
        #         _key = "%s&%s"%(context_key[index_i],context_key[index_j])
        #         _v = featurnCount(dict_context.get(_key,[0,0])[1])
        #         list_matrix.append(_v)
        # context3_key = ["tenderee","agency","win_tenderer","win_bid_price","bidding_budget"]
        # for index_i in range(len(context3_key)):
        #     for index_j in range(index_i+1,len(context3_key)):
        #         for index_k in range(index_j+1,len(context3_key)):
        #             _key = "%s&%s&%s"%(context3_key[index_i],context3_key[index_j],context3_key[index_k])
        #             _v = featurnCount(dict_context.get(_key,[0,0])[1])
        #             list_matrix.append(_v)
        # list_matrix.append(getSimLevel(tenderee_less,tenderee_greater)/10)
        # list_matrix.append(getSimLevel(agency_less,agency_greater)/10)
        # list_matrix.append(getSimilarityOfString(project_code_less,project_code_greater))
        # list_matrix.append(getSimilarityOfString(project_name_less,project_name_greater))
        # list_matrix.append(getSimLevel(win_tenderer_less,win_tenderer_greater)/10)
        # list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
        # list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
        # list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
        json_matrix = json.dumps(list_matrix)
        # Count how many of the 8 key fields match exactly (similarity == 1).
        same_count = 0
        all_count = 8
        if getSimilarityOfString(project_code_less,project_code_greater)==1:
            same_count += 1
        if getSimilarityOfString(tenderee_less,tenderee_greater)==1:
            same_count += 1
        if getSimilarityOfString(agency_less,agency_greater)==1:
            same_count += 1
        if getSimilarityOfString(win_tenderer_less,win_tenderer_greater)==1:
            same_count += 1
        if getSimilarityOfString(bidding_budget_less,bidding_budget_greater)==1:
            same_count += 1
        if getSimilarityOfString(win_bid_price_less,win_bid_price_greater)==1:
            same_count += 1
        if getSimilarityOfString(project_name_less,project_name_greater)==1:
            same_count += 1
        if getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater)==1:
            same_count += 1
        # Rarer contexts (small min_counts) get a higher base probability.
        base_prob = 0
        if min_counts<3:
            base_prob = 0.9
        elif min_counts<5:
            base_prob = 0.8
        elif min_counts<8:
            base_prob = 0.7
        else:
            base_prob = 0.6
        _prob = base_prob*same_count/all_count
        # NOTE(review): the json.dumps(list_matrix) result above is overwritten
        # here by a debug string before being forwarded.
        json_matrix = "[==%s]"%(str(base_prob)+"="+str(same_count)+"="+str(all_count)+str(product_less)+str(product_greater))
        self.forward(json_matrix,_prob)
        return
  1447. @annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double,string,string,string,string,string,string->string')
  1448. class f_redump_probability_final_check(BaseUDAF):
  1449. '''
  1450. 去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1451. 组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1452. '''
  1453. def __init__(self):
  1454. import logging
  1455. import json,re
  1456. global json,logging,re
  1457. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1458. def new_buffer(self):
  1459. return [list()]
  1460. def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_codes,project_name,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence,
  1461. province,city,district,web_source_no,extract_json,page_attachments):
  1462. buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
  1463. "project_codes":project_codes,"project_name":project_name,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
  1464. "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence,
  1465. "province":province,"city":city,"district":district,"web_source_no":web_source_no,"extract_json":extract_json,"page_attachments":page_attachments})
  1466. def merge(self, buffer, pbuffer):
  1467. buffer[0].extend(pbuffer[0])
  1468. def terminate(self, buffer):
  1469. list_group = []
  1470. the_group = buffer[0]
  1471. the_group.sort(key=lambda x:x["confidence"],reverse=True)
  1472. _index = 0
  1473. final_group = []
  1474. if len(the_group)>0:
  1475. _index = 0
  1476. while _index<len(the_group):
  1477. document_greater = the_group[_index]
  1478. docid_greater = document_greater["docid"]
  1479. docchannel_greater = document_greater["docchannel"]
  1480. page_time_greater = document_greater["page_time"]
  1481. doctitle_refine_greater = document_greater["doctitle_refine"]
  1482. project_codes_greater = document_greater["project_codes"]
  1483. nlp_enterprise_greater = document_greater["nlp_enterprise"]
  1484. tenderee_greater = document_greater["tenderee"]
  1485. agency_greater = document_greater["agency"]
  1486. win_tenderer_greater = document_greater["win_tenderer"]
  1487. bidding_budget_greater = document_greater["bidding_budget"]
  1488. win_bid_price_greater = document_greater["win_bid_price"]
  1489. product_greater = document_greater["product"]
  1490. package_greater = document_greater["package"]
  1491. json_time_greater = document_greater["json_dicttime"]
  1492. fingerprint_greater = document_greater.get("fingerprint","")
  1493. project_name_greater = document_greater["project_name"]
  1494. extract_count_greater = document_greater["extract_count"]
  1495. province_greater = document_greater["province"]
  1496. city_greater = document_greater["city"]
  1497. district_greater = document_greater["district"]
  1498. web_source_no_greater = document_greater["web_source_no"]
  1499. extract_json_greater = document_greater["extract_json"]
  1500. page_attachments_greater = document_greater["page_attachments"]
  1501. _pass = True
  1502. for document_less in final_group:
  1503. docid_less = document_less["docid"]
  1504. docchannel_less = document_less["docchannel"]
  1505. page_time_less = document_less["page_time"]
  1506. doctitle_refine_less = document_less["doctitle_refine"]
  1507. project_codes_less = document_less["project_codes"]
  1508. nlp_enterprise_less = document_less["nlp_enterprise"]
  1509. tenderee_less = document_less["tenderee"]
  1510. agency_less = document_less["agency"]
  1511. win_tenderer_less = document_less["win_tenderer"]
  1512. bidding_budget_less = document_less["bidding_budget"]
  1513. win_bid_price_less = document_less["win_bid_price"]
  1514. product_less = document_less["product"]
  1515. package_less = document_less["package"]
  1516. json_time_less = document_less["json_dicttime"]
  1517. fingerprint_less = document_less.get("fingerprint","")
  1518. project_name_less = document_less["project_name"]
  1519. extract_count_less = document_less["extract_count"]
  1520. province_less = document_less["province"]
  1521. city_less = document_less["city"]
  1522. district_less = document_less["district"]
  1523. web_source_no_less = document_less["web_source_no"]
  1524. extract_json_less = document_less["extract_json"]
  1525. page_attachments_less = document_less["page_attachments"]
  1526. if extract_json_less is not None:
  1527. _extract_less = json.loads(extract_json_less)
  1528. _extract_greater = {}
  1529. if extract_json_greater is not None:
  1530. _extract_greater = json.loads(extract_json_greater)
  1531. moneys_less = set(_extract_less.get("moneys",[]))
  1532. moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
  1533. moneys_greater = set(_extract_greater.get("moneys",[]))
  1534. moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
  1535. if page_attachments_less is None:
  1536. page_attachments_less = '[]'
  1537. if page_attachments_greater is None:
  1538. page_attachments_greater = '[]'
  1539. _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,len(the_group),b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
  1540. if _prob<0.1:
  1541. _pass = False
  1542. break
  1543. if _pass:
  1544. final_group.append(document_greater)
  1545. else:
  1546. break
  1547. _index += 1
  1548. dumplicates = ""
  1549. if _index>1:
  1550. logging.info("index/whole:%d/%d"%(_index,len(the_group)))
  1551. final_group.sort(key=lambda x:x["docid"])
  1552. final_group.sort(key=lambda x:x["extract_count"],reverse=True)
  1553. _set = set()
  1554. for _d in final_group:
  1555. _docid = _d["docid"]
  1556. if _docid in _set:
  1557. continue
  1558. dumplicates += "%d,"%_docid
  1559. _set.add(_docid)
  1560. dumplicates = dumplicates[:-1]
  1561. return dumplicates
@annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double->string')
class f_redump_probability_final_check_bak(BaseUDAF):
    '''
    (backup version) Re-check a merged dedup group with inline rule logic.

    (original note, translated: after dedup merge, re-check the group; when
    the group has more than 5 members, doctitle/tenderee/win_tenderer/
    bidding_budget may each take only one value inside the group; with 5 or
    fewer members, tenderee/win_tenderer/bidding_budget may each take only
    one value)
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        # Single-slot buffer holding the list of row dicts of the group.
        return [list()]
    def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_code,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence):
        # Collect one document row per call ("newly" is intentionally not stored).
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
                          "project_code":project_code,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
                          "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        # Sort by confidence desc, then grow a prefix the_group[:_index] in
        # which every pair of documents passes the inline rule checks below;
        # the first incompatible document stops the scan.
        list_group = []
        the_group = buffer[0]
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        _index = 0
        if len(the_group)>0:
            _index = 1
            while _index<len(the_group):
                document_greater = the_group[_index]
                docchannel_greater = document_greater["docchannel"]
                page_time_greater = document_greater["page_time"]
                doctitle_refine_greater = document_greater["doctitle_refine"]
                project_code_greater = document_greater["project_code"]
                nlp_enterprise_greater = document_greater["nlp_enterprise"]
                tenderee_greater = document_greater["tenderee"]
                agency_greater = document_greater["agency"]
                win_tenderer_greater = document_greater["win_tenderer"]
                bidding_budget_greater = document_greater["bidding_budget"]
                win_bid_price_greater = document_greater["win_bid_price"]
                product_greater = document_greater["product"]
                package_greater = document_greater["package"]
                json_time_greater = document_greater["json_dicttime"]
                _less_index = 0
                # Compare the candidate against every document already in the prefix.
                while _less_index<_index:
                    document_less = the_group[_less_index]
                    docchannel_less = document_less["docchannel"]
                    page_time_less = document_less["page_time"]
                    doctitle_refine_less = document_less["doctitle_refine"]
                    project_code_less = document_less["project_code"]
                    nlp_enterprise_less = document_less["nlp_enterprise"]
                    tenderee_less = document_less["tenderee"]
                    agency_less = document_less["agency"]
                    win_tenderer_less = document_less["win_tenderer"]
                    bidding_budget_less = document_less["bidding_budget"]
                    win_bid_price_less = document_less["win_bid_price"]
                    product_less = document_less["product"]
                    package_less = document_less["package"]
                    json_time_less = document_less["json_dicttime"]
                    # Per-check verdicts: 0 = failed, 1 = passed weakly (data
                    # missing), 2 = passed with both sides populated.
                    check_result = {"pass":1}
                    if docchannel_less in (51,102,103,104,115,116,117):
                        if doctitle_refine_less!=doctitle_refine_greater:
                            if page_time_less!=page_time_greater:
                                check_result["docchannel"] = 0
                                check_result["pass"] = 0
                        else:
                            check_result["docchannel"] = 2
                    if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
                        check_result["doctitle"] = 0
                        check_result["pass"] = 0
                        logging.info("check_doctitle_failed:%s==%s"%(str(doctitle_refine_less),str(doctitle_refine_greater)))
                    else:
                        check_result["doctitle"] = 2
                    #added check
                    if not check_codes([project_code_less],[project_code_greater]):
                        check_result["code"] = 0
                        check_result["pass"] = 0
                        logging.info("check_code_failed:%s==%s"%(str(project_code_less),str(project_code_greater)))
                    else:
                        if getLength(project_code_less)>0 and getLength(project_code_greater)>0 and project_code_less==project_code_greater:
                            check_result["code"] = 2
                        else:
                            check_result["code"] = 1
                    if not check_product(product_less,product_greater):
                        check_result["product"] = 0
                        check_result["pass"] = 0
                        logging.info("check_product_failed:%s==%s"%(str(product_less),str(product_greater)))
                    else:
                        if getLength(product_less)>0 and getLength(product_greater)>0:
                            check_result["product"] = 2
                        else:
                            check_result["product"] = 1
                    if not check_demand():
                        check_result["pass"] = 0
                    if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                                        tenderee_less,tenderee_greater,
                                        agency_less,agency_greater,
                                        win_tenderer_less,win_tenderer_greater):
                        check_result["entity"] = 0
                        check_result["pass"] = 0
                        logging.info("check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
                    else:
                        if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
                            check_result["entity"] = 2
                        elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
                            check_result["entity"] = 2
                        else:
                            check_result["entity"] = 1
                    if not check_money(bidding_budget_less,bidding_budget_greater,
                                       win_bid_price_less,win_bid_price_greater):
                        logging.info("check_money_failed:%s==%s==%s==%s"%(str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
                        check_result["money"] = 0
                        check_result["pass"] = 0
                    else:
                        if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
                            check_result["money"] = 2
                        elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
                            check_result["money"] = 2
                        else:
                            check_result["money"] = 1
                    #added check
                    if not check_package(package_less,package_greater):
                        logging.info("check_package_failed:%s==%s"%(str(package_less),str(package_greater)))
                        check_result["package"] = 0
                        check_result["pass"] = 0
                    else:
                        if getLength(package_less)>0 and getLength(package_greater)>0:
                            check_result["package"] = 2
                        else:
                            check_result["package"] = 1
                    #added check
                    if not check_time(json_time_less,json_time_greater):
                        logging.info("check_time_failed:%s==%s"%(str(json_time_less),str(json_time_greater)))
                        check_result["time"] = 0
                        check_result["pass"] = 0
                    else:
                        if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
                            check_result["time"] = 2
                        else:
                            check_result["time"] = 1
                    # A failed overall check is tolerated only when all strong
                    # signals (entity/code/doctitle/product == 2) agree.
                    if check_result.get("pass",0)==0:
                        logging.info(str(check_result))
                        if check_result.get("time",1)==0:
                            break
                        if check_result.get("money",1)==0:
                            break
                        if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2:
                            pass
                        else:
                            break
                    _less_index += 1
                # Inner loop broke early -> candidate incompatible, stop growing.
                if _less_index!=_index:
                    break
                _index += 1
        dumplicates = ""
        if _index>1:
            logging.info("index/whole:%d/%d"%(_index,len(the_group)))
        # NOTE(review): nesting reconstructed from a flattened dump — the lines
        # from here down are taken to run for every group, so a single-element
        # group returns its own docid; confirm against the original layout.
        final_group = the_group[:_index]
        # Order: extract_count desc, with docid asc as tie-break (stable sort).
        final_group.sort(key=lambda x:x["docid"])
        final_group.sort(key=lambda x:x["extract_count"],reverse=True)
        _set = set()
        for _d in final_group:
            _docid = _d["docid"]
            if _docid in _set:
                continue
            dumplicates += "%d,"%_docid
            _set.add(_docid)
        dumplicates = dumplicates[:-1]
        return dumplicates
@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,string->string')
class f_set_docid_binaryChart(BaseUDAF):
    '''
    Pair "empty" announcements (no project_code/budget/win info) with one
    matching non-empty announcement from a different web source within the
    same 7-day time window, emitting the resulting pairs as JSON.

    (original note, translated: project code, winning bidder,
    len(project code) > 7, winning bidder <> "")
    '''
    def __init__(self):
        import json
        global json
    def new_buffer(self):
        # Single-slot buffer holding the list of row dicts.
        return [[]]
    def iterate(self, buffer,docid, page_time_stamp,extract_count,project_code,project_name,tenderee,bidding_budget,win_tenderer,win_bid_price,agency,web_source_no):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,
                          "bidding_budget":bidding_budget,"win_tenderer":win_tenderer,"win_bid_price":win_bid_price,
                          "agency":agency,"web_source_no":web_source_no})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        list_docs = buffer[0]
        # Split the rows into groups whose page_time_stamp gaps stay within 7 days.
        list_timeGroups = split_with_time(list_docs,"page_time_stamp",86400*7)
        list_group = []
        # A doc is "empty" when every one of these key fields is empty.
        empty_key = ["project_code","bidding_budget","win_tenderer","win_bid_price","agency"]
        for _timeGroups in list_timeGroups:
            list_empty = []
            list_notEmpty = []
            for _item in _timeGroups:
                empty_flag = True
                for _key in empty_key:
                    if not isEmpty(_item[_key]):
                        empty_flag = False
                        break
                if empty_flag:
                    list_empty.append(_item)
                else:
                    list_notEmpty.append(_item)
            # Try to attach each empty doc to at most one non-empty doc.
            for _e in list_empty:
                _group = [{"docid":_e["docid"],"extract_count":_e["extract_count"]}]
                _e_tenderee = _e["tenderee"]
                for _ne in list_notEmpty:
                    # Lazily seed set_webSource with the doc's own source, so a
                    # non-empty doc is only paired once per web source.
                    if "set_webSource" not in _ne:
                        _ne["set_webSource"] = set()
                        _ne["set_webSource"].add(_ne["web_source_no"])
                    # Suitable when tenderees match, or the empty doc has none.
                    _suit = False
                    if not isEmpty(_e_tenderee) and _e_tenderee==_ne["tenderee"]:
                        _suit = True
                    elif isEmpty(_e_tenderee):
                        _suit = True
                    if _suit:
                        # Pair only across different web sources; first match wins.
                        if _e["web_source_no"] not in _ne["set_webSource"]:
                            _ne["set_webSource"].add(_e["web_source_no"])
                            _group.append({"docid":_ne["docid"],"extract_count":_ne["extract_count"]})
                            break
                if len(_group)>1:
                    list_group.append(_group)
        return json.dumps(list_group)
  1784. def split_with_time(list_dict,sort_key,timedelta=86400*7):
  1785. if len(list_dict)>0:
  1786. if sort_key in list_dict[0]:
  1787. list_dict.sort(key=lambda x:x[sort_key])
  1788. list_group = []
  1789. _begin = 0
  1790. for i in range(len(list_dict)-1):
  1791. if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<=timedelta:
  1792. continue
  1793. else:
  1794. _group = []
  1795. for j in range(_begin,i+1):
  1796. _group.append(list_dict[j])
  1797. if len(_group)>1:
  1798. list_group.append(_group)
  1799. _begin = i + 1
  1800. if len(list_dict)>1:
  1801. _group = []
  1802. for j in range(_begin,len(list_dict)):
  1803. _group.append(list_dict[j])
  1804. if len(_group)>1:
  1805. list_group.append(_group)
  1806. return list_group
  1807. return [list_dict]
@annotate('bigint,bigint,bigint,string,string,string,string,string->string')
class f_set_docid_limitNum_contain(BaseUDAF):
    '''
    UDAF: group documents for dedup.  Rows are first clustered by
    page_time_stamp (one week windows); a cluster survives only when the
    four limit columns each carry a single distinct value AND the
    contain_column texts form a containment chain (every shorter text
    appears inside the longest one).  Returns a JSON list of groups of
    {"docid", "extract_count"}.
    (Original note: project code / winning bidder / len(code)>7 / winner
    non-empty / merged non-empty tenderee count < 2 / merged same-channel
    non-empty amounts equal.)
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        # single-element buffer: the list of collected rows
        return [list()]
    def iterate(self, buffer,docid,page_time_stamp,extract_count,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,"set_limit_column1":set_limit_column1,
                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
                          "contain_column":contain_column})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        # cluster the rows by publish time first
        list_split = split_with_time(buffer[0],"page_time_stamp")
        list_group = []
        for _split in list_split:
            flag = True
            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
            for _key in keys:
                logging.info(_key+str(getSet(_split,_key)))
                # more than one distinct value in any limit column kills the group
                if len(getSet(_split,_key))>1:
                    flag = False
                    break
            MAX_CONTAIN_COLUMN = None
            # check that every document's contain_column is contained in the
            # longest one seen so far (containment chain)
            if flag:
                for _d in _split:
                    contain_column = _d["contain_column"]
                    if contain_column is not None and contain_column !="":
                        if MAX_CONTAIN_COLUMN is None:
                            MAX_CONTAIN_COLUMN = contain_column
                        else:
                            if len(MAX_CONTAIN_COLUMN)<len(contain_column):
                                if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
                                    flag = False
                                    break
                                MAX_CONTAIN_COLUMN = contain_column
                            else:
                                if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
                                    flag = False
                                    break
            if flag:
                if len(_split)>1:
                    _group = []
                    for _item in _split:
                        _group.append({"docid":_item["docid"],"extract_count":_item["extract_count"]})
                    list_group.append(_group)
        return json.dumps(list_group)
  1862. @annotate('bigint->string')
  1863. class f_stamp_squence(BaseUDAF):
  1864. '''
  1865. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  1866. '''
  1867. def __init__(self):
  1868. import json
  1869. global json
  1870. import logging
  1871. global logging
  1872. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1873. def new_buffer(self):
  1874. return [set()]
  1875. def iterate(self, buffer,page_time_stamp):
  1876. buffer[0].add(page_time_stamp)
  1877. def merge(self, buffer, pbuffer):
  1878. buffer[0] |= pbuffer[0]
  1879. def terminate(self, buffer):
  1880. if 0 in buffer[0]:
  1881. buffer[0].remove(0)
  1882. list_stamp = list(buffer[0])
  1883. list_stamp.sort(key=lambda x:x)
  1884. list_stamp_final = []
  1885. _begin = 0
  1886. _time_decase = 86400*7
  1887. logging.info(str(list_stamp))
  1888. for _index in range(len(list_stamp)-1):
  1889. if list_stamp[_index+1]-list_stamp[_index]<_time_decase:
  1890. continue
  1891. else:
  1892. list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[_index]+_time_decase])
  1893. _begin = _index+1
  1894. if len(list_stamp)>0:
  1895. list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[-1]+_time_decase])
  1896. return json.dumps(list_stamp_final)
  1897. @annotate("bigint,string->bigint")
  1898. class in_stamp(object):
  1899. def __init__(self):
  1900. import logging
  1901. import re
  1902. import json
  1903. global logging,re,json
  1904. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1905. def evaluate(self, page_time_stamp,json_stamp):
  1906. list_stamp = json.loads(json_stamp)
  1907. int_flag = 0
  1908. for item in list_stamp:
  1909. if page_time_stamp <item[0]:
  1910. break
  1911. if page_time_stamp>item[0] and page_time_stamp<item[1]:
  1912. int_flag = 1
  1913. break
  1914. return int_flag
  1915. def getConfidence(rule_id):
  1916. if rule_id ==0:
  1917. return 30
  1918. elif rule_id >=1 and rule_id <30:
  1919. return 20
  1920. else:
  1921. return 10
  1922. @annotate('string,string -> string')
  1923. class f_splitStr(BaseUDTF):
  1924. '''
  1925. 将多个组拆解成多条记录
  1926. '''
  1927. def __init__(self):
  1928. import logging
  1929. import json
  1930. global json,logging
  1931. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1932. def process(self, str_split,_split):
  1933. try:
  1934. for _s in str_split.split(_split):
  1935. self.forward(_s)
  1936. except Exception as e:
  1937. pass
  1938. @annotate('string,bigint -> bigint,bigint,bigint,bigint,bigint')
  1939. class f_split_group_single(BaseUDTF):
  1940. '''
  1941. 将多个组拆解成多条记录
  1942. '''
  1943. def __init__(self):
  1944. import logging
  1945. import json
  1946. global json,logging
  1947. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1948. def process(self, json_set_docid,rule_id):
  1949. list_group = json.loads(json_set_docid)
  1950. for item in list_group:
  1951. if len(item)>100:
  1952. item.sort(key=lambda x:x["docid"],reverse=True)
  1953. index_i = 0
  1954. for index_j in range(1,len(item)):
  1955. if item[index_i]["docid"]!=item[index_j]["docid"]:
  1956. self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
  1957. else:
  1958. for index_i in range(len(item)):
  1959. for index_j in range(len(item)):
  1960. if index_i!=index_j and item[index_i]["docid"]!=item[index_j]["docid"]:
  1961. self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
  1962. @annotate('bigint,string->string')
  1963. class group_document(BaseUDAF):
  1964. '''
  1965. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  1966. '''
  1967. def __init__(self):
  1968. import json
  1969. global json
  1970. def new_buffer(self):
  1971. return [[]]
  1972. def iterate(self, buffer,id,json_set_docid):
  1973. buffer[0].append({"id":id,"json_set_docid":json.loads(json_set_docid)})
  1974. def merge(self, buffer, pbuffer):
  1975. buffer[0].extend(pbuffer[0])
  1976. def terminate(self, buffer):
  1977. return json.dumps(buffer[0])
  1978. @annotate('bigint,string,bigint,string -> bigint,bigint,string')
  1979. class decare_document(BaseUDTF):
  1980. '''
  1981. 将多个组拆解成多条记录
  1982. '''
  1983. def __init__(self):
  1984. import logging
  1985. import json
  1986. global json,logging
  1987. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1988. def process(self,group_id1, json_list_doc1,group_id2,json_list_doc2):
  1989. #y=x,少掉近一半的数据
  1990. if group_id1>=group_id2:
  1991. list_doc1 = json.loads(json_list_doc1)
  1992. list_doc2 = json.loads(json_list_doc2)
  1993. for _doc1 in list_doc1:
  1994. for _doc2 in list_doc2:
  1995. #同一个重复group不做判断
  1996. if _doc1["id"]!=_doc2["id"]:
  1997. #判断两个group是否有重复
  1998. _set1 = set()
  1999. for _item1 in _doc1["json_set_docid"]:
  2000. _set1.add(_item1["docid"])
  2001. _set2 = set()
  2002. for _item2 in _doc2["json_set_docid"]:
  2003. _set2.add(_item2["docid"])
  2004. if len(_set1&_set2)>0:
  2005. new_json_set_docid = _doc1["json_set_docid"]
  2006. for _item2 in _doc2["json_set_docid"]:
  2007. if _item2["docid"] not in _set1:
  2008. new_json_set_docid.append(_item2)
  2009. self.forward(_doc1["id"],_doc2["id"],json.dumps(new_json_set_docid))
  2010. def getBestDocid(list_pair):
  2011. # [docid1,extract_count1,docid2,extract_count2]
  2012. # list_pair.sort(key=lambda x:x[3],reverse=True)
  2013. # _max_count = max(list_pair[0][3],list_pair[0][1])
  2014. # set_candidate = set()
  2015. # if list_pair[0][1]==_max_count:
  2016. # set_candidate.add(list_pair[0][0])
  2017. # for item in list_pair:
  2018. # if item[3]==_max_count:
  2019. # set_candidate.add(item[2])
  2020. # else:
  2021. # break
  2022. # list_candidate = list(set_candidate)
  2023. # list_candidate.sort(key=lambda x:x)
  2024. new_pair = []
  2025. new_pair.append([list_pair[0][0],list_pair[0][0],list_pair[0][1]])
  2026. for item in list_pair:
  2027. new_pair.append([item[0],item[2],item[3]])
  2028. new_pair.sort(key=lambda x:x[1])
  2029. new_pair.sort(key=lambda x:x[2],reverse=True)
  2030. return new_pair[0][1]
  2031. @annotate('bigint,bigint,bigint,bigint->string')
  2032. class choose_document(BaseUDAF):
  2033. '''
  2034. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2035. '''
  2036. def __init__(self):
  2037. import json
  2038. global json
  2039. def new_buffer(self):
  2040. return [[]]
  2041. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  2042. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  2043. def merge(self, buffer, pbuffer):
  2044. buffer[0].extend(pbuffer[0])
  2045. def terminate(self, buffer):
  2046. list_pair = buffer[0]
  2047. _set = set()
  2048. for item in buffer[0]:
  2049. _set.add(str(item[2]))
  2050. list_dumplicate = list(_set)
  2051. best_docid = getBestDocid(list_pair)
  2052. if best_docid==list_pair[0][0]:
  2053. save_flag = 1
  2054. else:
  2055. save_flag = 0
  2056. return json.dumps({"save_flag":save_flag,"dumplicates":list_dumplicate})
  2057. @annotate('string -> bigint,string')
  2058. class f_get_choose_document(BaseUDTF):
  2059. '''
  2060. 将多个组拆解成多条记录
  2061. '''
  2062. def __init__(self):
  2063. import logging
  2064. import json
  2065. global json,logging
  2066. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2067. def process(self,json_choose):
  2068. if json_choose is None:
  2069. self.forward(1,None)
  2070. else:
  2071. _choose = json.loads(json_choose)
  2072. self.forward(_choose["save_flag"],",".join(_choose["dumplicates"]))
  2073. @annotate('string->bigint')
  2074. class f_get_codes_count(object):
  2075. def evaluate(self,extract_json):
  2076. if extract_json is None or extract_json=="":
  2077. extract_json = "{}"
  2078. _extract = json.loads(extract_json)
  2079. _codes = _extract.get("code",[])
  2080. return len(_codes)
  2081. @annotate('string->string')
  2082. class f_get_codes(object):
  2083. def evaluate(self,extract_json):
  2084. if extract_json is None or extract_json=="":
  2085. extract_json = "{}"
  2086. _extract = json.loads(extract_json)
  2087. _codes = _extract.get("code",[])
  2088. return ",".join(_codes)
  2089. @annotate('bigint,bigint,bigint,bigint->string')
  2090. class group_document_bestFirst(BaseUDAF):
  2091. '''
  2092. 将组里面最优的放在前面
  2093. '''
  2094. def __init__(self):
  2095. import json
  2096. global json
  2097. def new_buffer(self):
  2098. return [[]]
  2099. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  2100. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  2101. def merge(self, buffer, pbuffer):
  2102. buffer[0].extend(pbuffer[0])
  2103. def terminate(self, buffer):
  2104. list_pair = buffer[0]
  2105. _set = set()
  2106. for item in buffer[0]:
  2107. _set.add(item[2])
  2108. _set.add(list_pair[0][0])
  2109. best_docid = getBestDocid(list_pair)
  2110. _set.remove(best_docid)
  2111. list_dumplicate = list(_set)
  2112. list_dumplicate.sort(key=lambda x:x)
  2113. list_dumplicate.insert(0,best_docid)
  2114. list_dumplicate_str = []
  2115. for item in list_dumplicate:
  2116. list_dumplicate_str.append(str(item))
  2117. return ",".join(list_dumplicate_str)
  2118. @annotate('string -> bigint,string')
  2119. class f_get_best_dumplicates(BaseUDTF):
  2120. '''
  2121. 得到每个分组中最优的那一条及其重复记录
  2122. '''
  2123. def __init__(self):
  2124. import logging
  2125. import json
  2126. global json,logging
  2127. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2128. def process(self,list_dumplicate_str):
  2129. if list_dumplicate_str is None or list_dumplicate_str=='':
  2130. pass
  2131. else:
  2132. list_dumplicate = list_dumplicate_str.split(",")
  2133. if len(list_dumplicate)>0:
  2134. self.forward(int(list_dumplicate[0]),",".join(list_dumplicate[1:]))
  2135. else:
  2136. pass
  2137. @annotate('bigint,bigint->string')
  2138. class bridge2group(BaseUDAF):
  2139. '''
  2140. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2141. '''
  2142. def __init__(self):
  2143. import json
  2144. global json
  2145. def new_buffer(self):
  2146. return [set()]
  2147. def iterate(self, buffer,docid1,docid2):
  2148. buffer[0].add(docid1)
  2149. buffer[0].add(docid2)
  2150. def merge(self, buffer, pbuffer):
  2151. buffer[0] |= pbuffer[0]
  2152. def terminate(self, buffer):
  2153. list_pair = list(buffer[0])
  2154. list_pair.sort(key=lambda x:x,reverse=True)
  2155. return json.dumps(list_pair)
  2156. @annotate('string -> bigint,bigint')
  2157. class group2bridge(BaseUDTF):
  2158. '''
  2159. 将多个组拆解成多条记录
  2160. '''
  2161. def __init__(self):
  2162. import logging
  2163. import json
  2164. global json,logging
  2165. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2166. def process(self,json_list_docid):
  2167. list_docid = json.loads(json_list_docid)
  2168. for _docid in list_docid:
  2169. self.forward(list_docid[-1],_docid)
  2170. @annotate('string->string')
  2171. class to_url(object):
  2172. def evaluate(self,_s):
  2173. if _s is None or _s=="":
  2174. return
  2175. else:
  2176. list_l = []
  2177. for l in _s.split(","):
  2178. list_l.append("http://www.bidizhaobiao.com/info-%s.html"%l)
  2179. return ",".join(list_l)
  2180. @annotate('bigint,bigint,string -> bigint')
  2181. class f_get_dump_docid(BaseUDTF):
  2182. '''
  2183. 将多个组拆解成多条记录
  2184. '''
  2185. def __init__(self):
  2186. import logging
  2187. import json
  2188. global json,logging
  2189. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2190. def process(self,docid,save_flag,dumplicates):
  2191. if save_flag==0:
  2192. self.forward(docid)
  2193. if dumplicates is not None:
  2194. list_docid = dumplicates.split(",")
  2195. if len(list_docid)>0:
  2196. for _docid in list_docid[1:]:
  2197. self.forward(int(_docid))
  2198. else:
  2199. if dumplicates is not None:
  2200. list_docid = dumplicates.split(",")
  2201. if len(list_docid)>0:
  2202. for _docid in list_docid:
  2203. self.forward(int(_docid))
  2204. @annotate('string -> bigint,bigint')
  2205. class f_get_docid(BaseUDTF):
  2206. '''
  2207. 将多个组拆解成多条记录
  2208. '''
  2209. def __init__(self):
  2210. import logging
  2211. import json
  2212. global json,logging
  2213. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2214. def process(self,json_set_docid):
  2215. team_id = 0
  2216. if json_set_docid is not None:
  2217. list_docses = json.loads(json_set_docid)
  2218. for list_docs in list_docses:
  2219. team_id += 1
  2220. for item in list_docs:
  2221. self.forward(team_id,item["docid"])
  2222. @annotate("string->bigint")
  2223. class get_count_dump(object):
  2224. def __init__(self):
  2225. import logging
  2226. import re
  2227. global logging,re
  2228. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2229. def evaluate(self, title):
  2230. _count = 0
  2231. if title is not None:
  2232. _count = len(title.split(","))
  2233. return _count
  2234. def getSet(list_dict,key):
  2235. _set = set()
  2236. for item in list_dict:
  2237. if key in item:
  2238. if item[key]!='' and item[key] is not None:
  2239. if re.search("^\d[\d\.]*$",item[key]) is not None:
  2240. _set.add(str(float(item[key])))
  2241. else:
  2242. _set.add(str(item[key]))
  2243. return _set
  2244. def getDiffIndex(list_dict,key,confidence=100):
  2245. '''
  2246. 优化为相似度判断
  2247. :param list_dict:
  2248. :param key:
  2249. :param confidence:
  2250. :return:
  2251. '''
  2252. # _set = set()
  2253. # for _i in range(len(list_dict)):
  2254. # item = list_dict[_i]
  2255. # if item["confidence"]>=confidence:
  2256. # continue
  2257. # if key in item:
  2258. # if item[key]!='' and item[key] is not None:
  2259. # if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  2260. # _set.add(str(float(item[key])))
  2261. # else:
  2262. # _set.add(str(item[key]))
  2263. # if len(_set)>1:
  2264. # return _i
  2265. # ==============================
  2266. _set = set()
  2267. _set_m = set()
  2268. base_s = ""
  2269. for _i in range(len(list_dict)):
  2270. item = list_dict[_i]
  2271. if item["confidence"]>=confidence:
  2272. continue
  2273. if key in item:
  2274. if item[key]!='' and item[key] is not None:
  2275. if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  2276. _m = float(item[key])
  2277. if _m>100000:
  2278. _m = _m//10000*10000
  2279. _set_m.add(str(_m))
  2280. else:
  2281. _s = str(item[key])
  2282. if base_s=="":
  2283. base_s = _s
  2284. else:
  2285. simi = getSimilarityOfString(base_s,_s)
  2286. if simi<0.8:
  2287. return _i
  2288. if len(_set_m)>1:
  2289. return _i
  2290. return len(list_dict)
  2291. @annotate('bigint,string -> bigint,bigint')
  2292. class f_getGroup_dumpFinal(BaseUDTF):
  2293. '''
  2294. 从最后的结果中获取组
  2295. '''
  2296. def __init__(self):
  2297. import logging
  2298. import json
  2299. global json,logging
  2300. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2301. def process(self,docid,dumplicates):
  2302. self.forward(int(docid),int(docid))
  2303. if dumplicates is not None:
  2304. list_docids = dumplicates.split(",")
  2305. for _docid in list_docids:
  2306. self.forward(int(docid),int(_docid))
  2307. @annotate('bigint,bigint,string,string,string,string,bigint,bigint,bigint->string')
  2308. class f_redump_limit_num(BaseUDAF):
  2309. '''
  2310. 去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
  2311. 组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
  2312. '''
  2313. def __init__(self):
  2314. import logging
  2315. import json,re
  2316. global json,logging,re
  2317. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2318. def new_buffer(self):
  2319. return [list()]
  2320. def iterate(self, buffer,main_docid,docid,doctitle,set_limit_column2,set_limit_column3,set_limit_column4,extract_count1,extract_count2,confidence):
  2321. buffer[0].append({"main_docid":main_docid,"docid":docid,"doctitle":doctitle,"set_limit_column2":set_limit_column2,
  2322. "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,
  2323. "extract_count2":extract_count2,"confidence":confidence})
  2324. def merge(self, buffer, pbuffer):
  2325. buffer[0].extend(pbuffer[0])
  2326. def terminate(self, buffer):
  2327. list_group = []
  2328. the_group = buffer[0]
  2329. the_group.sort(key=lambda x:x["confidence"],reverse=True)
  2330. if len(the_group)>5:
  2331. keys = ["doctitle","set_limit_column2","set_limit_column3","set_limit_column4"]
  2332. else:
  2333. keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
  2334. final_group = []
  2335. #置信度
  2336. list_key_index = []
  2337. for _k in keys:
  2338. if _k=="doctitle":
  2339. list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
  2340. else:
  2341. list_key_index.append(getDiffIndex(the_group,_k))
  2342. _index = min(list_key_index)
  2343. if _index>1:
  2344. main_docid = the_group[0]["main_docid"]
  2345. for item in the_group[:_index]:
  2346. if item["docid"]!=main_docid:
  2347. final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"],"confidence":item["confidence"]})
  2348. # stay = True
  2349. # for _key in keys:
  2350. # if len(getSet(the_group,_key))>1:
  2351. # stay = False
  2352. # break
  2353. #
  2354. # if stay:
  2355. # main_docid = the_group[0]["main_docid"]
  2356. # for item in the_group:
  2357. # if item["docid"]!=main_docid:
  2358. # final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"]})
  2359. return json.dumps(final_group)
  2360. @annotate('string -> bigint,bigint,bigint,bigint,bigint')
  2361. class f_get_dumpFinal_checked(BaseUDTF):
  2362. '''
  2363. 从最后的结果中获取组
  2364. '''
  2365. def __init__(self):
  2366. import logging
  2367. import json
  2368. global json,logging
  2369. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2370. def process(self,list_group):
  2371. if list_group is not None:
  2372. final_group = json.loads(list_group)
  2373. for _group in final_group:
  2374. self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"],_group["confidence"])
  2375. @annotate('string -> bigint')
  2376. class f_getDumplicateDocids(BaseUDTF):
  2377. '''
  2378. 从最后的结果中获取组
  2379. '''
  2380. def __init__(self):
  2381. import logging
  2382. import json
  2383. global json,logging
  2384. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2385. def process(self,dumplicates):
  2386. list_docids = dumplicates.split(",")
  2387. for _d in list_docids:
  2388. self.forward(int(_d))
  2389. def jaccard_score(source,target):
  2390. source_set = set([s for s in source])
  2391. target_set = set([s for s in target])
  2392. if len(source_set)==0 or len(target_set)==0:
  2393. return 0
  2394. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  2395. def getSimilarityOfString(str1,str2):
  2396. _set1 = set()
  2397. _set2 = set()
  2398. if str1 is not None:
  2399. for i in range(1,len(str1)):
  2400. _set1.add(str1[i-1:i+1])
  2401. for i in range(2,len(str1)):
  2402. _set1.add(str1[i-2:i+1])
  2403. if str2 is not None:
  2404. for i in range(1,len(str2)):
  2405. _set2.add(str2[i-1:i+1])
  2406. for i in range(2,len(str2)):
  2407. _set2.add(str2[i-2:i+1])
  2408. _len = max(1,min(len(_set1),len(_set2)))
  2409. return len(_set1&_set2)/_len
  2410. @annotate("string,string,string,string,string,string,string,string,string,string->bigint")
  2411. class f_is_legal(object):
  2412. def __init__(self):
  2413. import logging
  2414. import re
  2415. global logging,re
  2416. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2417. def evaluate(self, tenderee1,tenderee2,bidding_budget1,budding_budget2,win_tenderee1,win_tenderee2,win_bid_price1,win_bid_price2,project_code1,project_code2):
  2418. if tenderee1 is not None and tenderee1!="" and tenderee2 is not None and tenderee2!="" and tenderee1!=tenderee2:
  2419. return 0
  2420. if bidding_budget1 is not None and bidding_budget1!="" and budding_budget2 is not None and budding_budget2!="" and bidding_budget1!=budding_budget2:
  2421. return 0
  2422. if win_tenderee1 is not None and win_tenderee1!="" and win_tenderee2 is not None and win_tenderee2!="" and win_tenderee1!=win_tenderee2:
  2423. return 0
  2424. if win_bid_price1 is not None and win_bid_price1!="" and win_bid_price2 is not None and win_bid_price2!="" and win_bid_price1!=win_bid_price2:
  2425. return 0
  2426. _sim = getSimilarityOfString(project_code1,project_code2)
  2427. if _sim>0.7 and _sim<1:
  2428. return 0
  2429. return 1
@annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint->string')
class f_autorule_group(BaseUDAF):
    '''
    UDAF: re-check a merged dedup group with the same consistency rules
    as f_redump_limit_num (groups larger than 5 also require a
    consistent doctitle), then for every surviving pair of documents
    record which fields they share as an "auto rule" string.  Returns a
    JSON list of [rule, docid1, docid2].
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        # single-element buffer: the list of collected rows
        return [list()]
    def iterate(self, buffer,main_docid,docid,docchannel,doctitle,doctitle_refine,area,province,city,district,web_source_no,fingerprint,
                project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count1,extract_count2,confidence):
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"doctitle":doctitle,
                          "doctitle_refine":doctitle_refine,"area":area,"province":province,
                          "city":city,"district":district,"web_source_no":web_source_no,"fingerprint":fingerprint,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,"agency":agency,
                          "win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price,
                          "extract_count1":extract_count1,"extract_count2":extract_count2,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        # cap the group at 100 rows to bound the pairwise work in terminate
        buffer[0].extend(pbuffer[0][:100])
        buffer[0] = buffer[0][:100]
    def getSameKeys(self,_dict1,_dict2):
        # Names of the fields (excluding location/bookkeeping columns) on
        # which both documents carry the same non-empty value, sorted and
        # joined with "=".
        list_keys = []
        for k,v in _dict1.items():
            if k in ["area","city","confidence","district","extract_count1","extract_count2","main_docid","province"]:
                continue
            v2 = _dict2.get(k,"")
            if v is not None and v!="" and v2 is not None and v2!="" and v==v2:
                list_keys.append(k)
        list_keys.sort(key=lambda x:x)
        return "=".join(list_keys)
    def terminate(self, buffer):
        list_group = []
        the_group = buffer[0]
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        if len(the_group)>5:
            keys = ["doctitle","tenderee","win_tenderer","bidding_budget","win_bid_price"]
        else:
            keys = ["tenderee","win_tenderer","bidding_budget","win_bid_price"]
        # confidence-aware conflict scan: per key, find where values start
        # to disagree (doctitle uses a relaxed threshold of 30)
        list_key_index = []
        for _k in keys:
            if _k=="doctitle":
                list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
            else:
                list_key_index.append(getDiffIndex(the_group,_k))
        final_group = []
        _index = min(list_key_index)
        if _index>1:
            for item in the_group[:_index]:
                final_group.append(item)
        # record the shared-field "rule" for every surviving document pair
        list_rules = []
        for i in range(len(final_group)):
            for j in range(i+1,len(final_group)):
                _dict1 = final_group[i]
                _dict2 = final_group[j]
                _rule = self.getSameKeys(_dict1,_dict2)
                list_rules.append([_rule,_dict1.get("docid"),_dict2.get("docid")])
        return json.dumps(list_rules)
  2492. @annotate('string -> string,bigint,bigint')
  2493. class f_autorule_group_extract(BaseUDTF):
  2494. '''
  2495. 从最后的结果中获取组
  2496. '''
  2497. def __init__(self):
  2498. import logging
  2499. import json
  2500. global json,logging
  2501. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2502. def process(self,rules_json):
  2503. list_rules = json.loads(rules_json)
  2504. for _rule in list_rules:
  2505. self.forward(_rule[0],_rule[1],_rule[2])
  2506. if __name__ == '__main__':
  2507. # f = f_decode_for_dumplicate()
  2508. # b = f.process('[{}]','{ "attachmentTypes": "", "bidway": "", "candidate": "", "code": [], "cost_time": { "attrs": 0.0, "codename": 0.03, "deposit": 0.0, "district": 0.03, "moneygrade": 0.0, "nerToken": 0.06, "person": 0.0, "prem": 0.02, "preprocess": 0.1, "product": 0.04, "product_attrs": 0.01, "roleRuleFinal": 0.0, "rolegrade": 0.0, "rule": 0.0, "rule_channel": 0.05, "tableToText": 0.030002145767211913, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "district": { "area": "华东", "city": "厦门", "district": "未知", "is_in_text": false, "province": "福建" }, "docchannel": { "docchannel": "招标公告", "doctype": "采招数据", "life_docchannel": "招标公告" }, "docid": "", "doctitle_refine": "C70U264COM6项目所需直流屏", "exist_table": 1, "extract_count": 1, "fail_reason": "", "fingerprint": "md5=3da15e8c6f69a1d766bfe155092b1638", "industry": { "class": "零售批发", "class_name": "广播、电视、电影设备", "subclass": "通用设备" }, "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "C70U264COM6项目所需直流屏", "nlp_enterprise": [], "nlp_enterprise_attachment": [], "person_review": [], "prem": {}, "process_time": "2022-12-08 04:43:18", "product": [ "直流屏" ], "product_attrs": { "data": [ { "brand": "", "product": "直流屏65AH", "quantity": "1.0", "quantity_unit": "台", "specs": "带逆变,蓄电池采用原装进口免维护蓄电池(必须是原产地进口,注明电池进口产地)等,由供应商负责采购,使用寿命10年及以上", "unitPrice": "" } ], "header": [ "产品名称_产品数量____产品规格" ], "header_col": [ "产品名称_产品编号_产品规格_产品材质_产品数量_备注" ] }, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "", "time_getFileStart": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", 
"version_date": "2022-11-24" }','')
  2509. # print(b)
  2510. print(check_doctitle(doctitle_refind_less="山西银行晋城分行对A公司清算处置审计服务项目供应商征集公告",doctitle_refind_greater="山西银行晋城分行对B公司清算处置审计服务项目供应商征集公告"))
  2511. # f = f_get_extractCount()
  2512. # j = '''{ "attachmentTypes": "", "bidway": "", "candidate": "湖南省金达工程建设有限公司", "code": [ "丰汇-YCYZ2022-001-1" ], "cost_time": { "attrs": 0.33, "codename": 0.14, "deposit": 0.0, "district": 0.02, "moneygrade": 0.0, "nerToken": 0.27, "person": 0.01, "prem": 0.06, "preprocess": 0.71, "product": 0.15, "product_attrs": 0.02, "roleRuleFinal": 0.0, "rolegrade": 0.0, "rule": 0.0, "rule_channel": 0.26, "tableToText": 0.11000882148742676, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "district": { "area": "华东", "city": "宜春", "district": "袁州", "is_in_text": false, "province": "江西" }, "docchannel": { "docchannel": "中标信息", "doctype": "采招数据", "life_docchannel": "中标信息" }, "docid": "", "doctitle_refine": "2022年宜春市袁州区县乡村道安全生命防护项目(二)(第二次)", "exist_table": 1, "extract_count": 6, "fail_reason": "", "fingerprint": "md5=23e9e56f2a6ec0c73e1838670e630948", "industry": { "class": "建筑业", "class_name": "其他土木工程建筑", "subclass": "土木工程建筑业" }, "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "2022年宜春市袁州区县乡村道安全生命防护工程采购项目", "nlp_enterprise": [ "湖南省金达工程建设有限公司", "丰汇国际项目管理有限公司" ], "nlp_enterprise_attachment": [], "person_review": [ "宋明勇", "刘定良", "张来弟", "许卫秀", "宋明勇", "刘定良", "张来弟", "许卫秀" ], "prem": { "Project": { "code": "", "roleList": [ { "address": "宜春市袁州区明月袁山中路356号", "linklist": [ [ "胡柯", "13766445188" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "丰汇国际项目管理有限公司", "serviceTime": "" }, { "address": "湖南省长沙市开福区中山路589号开福万达广场C区2号写字楼", "linklist": [ [ "刘华夏", "18570640155" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": "4351680.70", "money_unit": "元" }, "role_name": "win_tenderer", "role_text": "湖南省金达工程建设有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" } }, 
"process_time": "2023-02-28 02:04:42", "product": [ "安全生命防护工程" ], "product_attrs": { "data": [ { "brand": "详见开标一览表明细", "product": "2022年宜春市袁州区县乡村道安全生命防护工程采购项目", "quantity": "1", "quantity_unit": "", "specs": "详见开标一览表明细", "unitPrice": "4351680.7" } ], "header": [ "名称_数量__单价_品牌_规格型号" ], "header_col": [ "名称_品牌_规格型号_数量_单价" ] }, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "", "time_getFileStart": "", "time_listingEnd": "", "time_listingStart": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2023-02-28", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", "version_date": "2023-02-20" }'''
  2513. # print(f.evaluate(j))
  2514. # _str1 = "PMJJ-202211030004001"
  2515. # _str2 = "PMJJ-202211030001001"
  2516. # print(getSimilarityOfString(_str1,_str2))
  2517. # print(check_doctitle("强化桂城街道工地扬尘防控监管巡查第三方(二次)","广东省强化桂城街道工地扬尘防控监管巡查第三方(二次)"))
  2518. # print(check_codes(["F-2022-027(MASCG-2-F-F-2022-0462)"],["F-2022-027(MASCG-2-F-F-2022-0462)"]))
  2519. # print(check_product(None,None))
  2520. # print(check_code("4451020073383382206021325","4451020073383382206021322"))
  2521. # print(check_money("550.0","440.0","",""))
  2522. # for i in range(0,2):
  2523. # print(i)
  2524. # location_pattern = re.compile(".{1,2}市|.{1,2}区|.{1,2}镇|.{1,2}县|.{1,2}村")
  2525. # print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇前路富代都村示范点农用塑料薄膜棚)"))
  2526. # print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇天湖村粮蔬基地农用塑料薄膜棚)"))
  2527. # package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))") # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
  2528. # _match = re.search(package_number_pattern,"2021年盘山县高标准农田建设项目三标段(高升街道)开标记录")
  2529. # if _match is not None:
  2530. # print(_match.groupdict()["name"])
  2531. # print(re.findall("((标[段号的包])[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4})","[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ"))
  2532. # print(check_doctitle("[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ","桂林银行南宁办公大楼装修工程标段ⅡGXYLG20182005-N中标公告"))
  2533. # c = f_get_extractCount()
  2534. # _json = '''
  2535. # { "attachmentTypes": "", "bidway": "", "code": [ "LCQTCG-2022-313" ], "cost_time": { "attrs": 0.02, "codename": 0.16, "deposit": 0.0, "nerToken": 0.8400000000000001, "person": 0.01, "prem": 0.02, "preprocess": 0.96, "product": 0.12, "product_attrs": 0.01, "punish": 0.11, "roleRuleFinal": 0.0, "rule": 0.0, "rule_channel": 0.0, "tableToText": 0.09000381469726562, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "docchannel": { "docchannel": "招标公告", "doctype": "采招数据" }, "docid": "", "doctitle_refine": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设(一标段)膨胀剂(暂估价)项目", "exist_table": 1, "extract_count": 5, "fail_reason": "", "fingerprint": "md5=b1ab0ee9cf9e1c5acc17477b9c0433cc", "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设工程(一标段)膨胀剂(暂估价)采购项目", "nlp_enterprise": [ "中建八局第一建设有限公司", "山东东岳项目管理有限公司", "聊城市公共资源交易中心", "江苏国泰新点软件有限公司" ], "person_review": [], "prem": { "Project": { "code": "", "roleList": [ { "linklist": [ [ "", "15540110649" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "tenderee", "role_text": "中建八局第一建设有限公司", "serviceTime": "" }, { "linklist": [ [ "武工", "0635-2992305" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "山东东岳项目管理有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" }, "一": { "code": "", "roleList": [], "tendereeMoney": 3267000.0, "tendereeMoneyUnit": "万元" } }, "process_time": "2022-05-30 14:31:13", "product": [ "枢纽功能区建设工程", "膨胀剂", "配套基础设施建设" ], "product_attrs": { "data": [], "header": [], "header_col": [] }, "serviceTime": "", "success": true, "time_bidclose": "2022-06-16", "time_bidopen": "2022-06-16", "time_bidstart": "", "time_commencement": "", "time_completion": 
"", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "2022-06-01", "time_getFileStart": "2022-05-26", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2022-05-25", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "" }
  2536. # '''
  2537. # c = f_get_nlp_enterprise()
  2538. # print(c.evaluate("山东东岳项目管理有限公司",_json))
  2539. # print(c.evaluate(_json))
  2540. # c = f_set_docid()
  2541. # _s = '''
  2542. # 154064190 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2543. # 154064188 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2544. # 154064175 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2545. # 30201228 1512489600 4 04111-1 1 大连市妇女儿童医疗中心
  2546. # 154064160 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2547. # 154064168 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2548. # '''
  2549. # buffer = c.new_buffer()
  2550. # for _line in _s.split("\n"):
  2551. # _line = _line.strip()
  2552. # if _line=="":
  2553. # continue
  2554. # l_column = _line.split("\t")
  2555. # print(l_column)
  2556. # docid,page_time_stamp,extract_count,web_source_no,num,tenderee = l_column
  2557. # page_time_stamp = int(page_time_stamp)
  2558. # extract_count = int(extract_count)
  2559. # num = 1
  2560. # c.iterate(buffer,docid,page_time_stamp,extract_count,web_source_no,num,tenderee)
  2561. # print(c.terminate(buffer))