documentDumplicate.py 138 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031
  1. #coding:UTF8
  2. from odps.udf import annotate
  3. from odps.udf import BaseUDTF
  4. from odps.udf import BaseUDAF
  5. import re
  6. @annotate('string,string -> string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string')
  7. class f_decode_extract(BaseUDTF):
  8. def __init__(self):
  9. import logging
  10. import json
  11. import time,re
  12. global json,logging,time,re
  13. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  14. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  15. self.dict_channel = {"公告变更":51,
  16. "招标公告":52,
  17. "中标信息":101,
  18. "招标预告":102,
  19. "招标答疑":103,
  20. "资审结果":105,
  21. "法律法规":106,
  22. "新闻资讯":107,
  23. "采购意向":114,
  24. "拍卖出让":115,
  25. "土地矿产":116,
  26. "产权交易":117,
  27. "废标公告":118,
  28. "候选人公示":119,
  29. "合同公告":120}
  30. def process(self, extractjson,otherjson):
  31. if extractjson is not None:
  32. _extract = json.loads(extractjson)
  33. else:
  34. _extract = {}
  35. if otherjson is not None:
  36. _other = json.loads(otherjson)
  37. else:
  38. _other = {}
  39. project_code = ""
  40. project_name = ""
  41. tenderee = ""
  42. agency = ""
  43. win_tenderer = ""
  44. bidding_budget = ""
  45. win_bid_price = ""
  46. fingerprint = ""
  47. page_time_stamp = 0
  48. docchannel = 0
  49. extract_count = 0
  50. page_time = _other.get("pageTime",time.strftime('%Y-%m-%d',time.localtime()))
  51. doctitle = _other.get("doctitle","")
  52. doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', doctitle)
  53. area = _other.get("area","")
  54. province = _other.get("province","")
  55. city = _other.get("city","")
  56. district = _other.get("district","")
  57. web_source_no = _other.get("webSourceNo","")
  58. time_bidclose = _extract.get("time_bidclose")
  59. time_bidopen = _extract.get("time_bidopen")
  60. time_bidstart = _extract.get("time_bidstart")
  61. time_commencement = _extract.get("time_commencement")
  62. time_completion = _extract.get("time_completion")
  63. time_earnest_money_end = _extract.get("time_earnestMoneyEnd")
  64. time_earnest_money_start = _extract.get("time_earnestMoneyStart")
  65. time_get_file_end = _extract.get("time_getFileEnd")
  66. time_get_file_start = _extract.get("time_getFileStart")
  67. time_publicity_end = _extract.get("time_publicityEnd")
  68. time_publicity_start = _extract.get("time_publicityStart")
  69. time_registration_end = _extract.get("time_registrationEnd")
  70. time_registration_start = _extract.get("time_registrationStart")
  71. time_release = _extract.get("time_release")
  72. # docchannel = _other.get("docchannel",0)
  73. docchannel_name = _extract.get("docchannel",{}).get("docchannel")
  74. doctype_name = _extract.get("docchannel",{}).get("doctype")
  75. if doctype_name in ["法律法规","新闻资讯","拍卖出让","土地矿产"]:
  76. docchannel_name = doctype_name
  77. docchannel = self.dict_channel.get(docchannel_name,0)
  78. if re.search(self.time_pattern,page_time) is not None:
  79. try:
  80. timeArray = time.strptime(page_time[:11], "%Y-%m-%d")
  81. page_time_stamp = int(time.mktime(timeArray))
  82. except Exception as e:
  83. pass
  84. list_code = _extract.get("code",[])
  85. if len(list_code)>0:
  86. project_code = list_code[0]
  87. project_name = _extract.get("name","")
  88. fingerprint = _extract.get("fingerprint","")
  89. dict_pack = _extract.get("prem",{})
  90. logging.info(dict_pack)
  91. for _key in dict_pack.keys():
  92. if dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
  93. extract_count += 1
  94. if bidding_budget=="":
  95. bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
  96. for _role in dict_pack[_key]["roleList"]:
  97. if isinstance(_role,list):
  98. extract_count += 1
  99. if _role[2]!='' and float(_role[2])>0:
  100. extract_count += 1
  101. if _role[0]=="tenderee":
  102. tenderee = _role[1]
  103. if _role[0]=="win_tenderer":
  104. if win_tenderer=="":
  105. win_tenderer = _role[1]
  106. if _role[2]!='' and float(_role[2])>0:
  107. extract_count += 1
  108. if win_bid_price=="":
  109. win_bid_price = str(float(_role[2]))
  110. if _role[0]=="agency":
  111. agency = _role[1]
  112. if isinstance(_role,dict):
  113. extract_count += 1
  114. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  115. extract_count += 1
  116. if _role["role_name"]=="tenderee":
  117. tenderee = _role["role_text"]
  118. if _role["role_name"]=="win_tenderer":
  119. if win_tenderer=="":
  120. win_tenderer = _role["role_text"]
  121. if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
  122. extract_count += 1
  123. if win_bid_price=="":
  124. win_bid_price = str(float(_role["role_money"]["money"]))
  125. if _role["role_name"]=="agency":
  126. agency = _role["role_text"]
  127. if project_code!="":
  128. extract_count += 1
  129. if project_name!="":
  130. extract_count += 1
  131. logging.info(page_time+doctitle+doctitle_refine+area+province+city+
  132. district+web_source_no+project_code+project_name+tenderee+agency+win_tenderer+bidding_budget+win_bid_price)
  133. self.forward(page_time,page_time_stamp,docchannel,doctitle,doctitle_refine,area,province,city,
  134. district,web_source_no,fingerprint,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,
  135. time_bidclose,time_bidopen,time_bidstart,time_commencement,time_completion,time_earnest_money_end,time_earnest_money_start,
  136. time_get_file_end,time_get_file_start,time_publicity_end,time_publicity_start,time_registration_end,time_registration_start,time_release)
  137. @annotate("string->string")
  138. class f_get_product(object):
  139. def __init__(self):
  140. import time
  141. global time
  142. import logging
  143. import json
  144. import re
  145. global json,logging,re
  146. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  147. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  148. def evaluate(self, extractjson):
  149. if extractjson is None or extractjson=="":
  150. extractjson = "{}"
  151. _extract = json.loads(extractjson)
  152. return ",".join(_extract.get("product",[]))
  153. @annotate("string->string")
  154. class f_get_package(object):
  155. def __init__(self):
  156. import time
  157. global time
  158. import logging
  159. import json
  160. import re
  161. global json,logging,re
  162. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  163. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  164. def evaluate(self, extractjson):
  165. if extractjson is None or extractjson=="":
  166. extractjson = "{}"
  167. _extract = json.loads(extractjson)
  168. prem = _extract.get("prem",{})
  169. list_pack = []
  170. for k,v in prem.items():
  171. if k!="Project":
  172. list_pack.append(k)
  173. return ",".join(list_pack)
  174. @annotate("string->string")
  175. class f_get_nlp_enterprise(object):
  176. def __init__(self):
  177. import time
  178. global time
  179. import logging
  180. import json
  181. import re
  182. global json,logging,re
  183. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  184. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  185. def evaluate(self, extractjson):
  186. if extractjson is None or extractjson=="":
  187. extractjson = "{}"
  188. _extract = json.loads(extractjson)
  189. nlp_enterprise = _extract.get("nlp_enterprise",[])
  190. nlp_enterprise_attachment = _extract.get("nlp_enterprise_attachment",[])
  191. if len(nlp_enterprise)==0 and len(nlp_enterprise_attachment)==0:
  192. dict_pack = _extract.get("prem",{})
  193. for _key in dict_pack.keys():
  194. for _role in dict_pack[_key]["roleList"]:
  195. if isinstance(_role,list):
  196. _entity = _role[1]
  197. nlp_enterprise.append(_entity)
  198. if isinstance(_role,dict):
  199. _entity = _role["role_text"]
  200. nlp_enterprise.append(_entity)
  201. nlp_enterprise = list(set(nlp_enterprise))
  202. dict_entity = {"indoctextcon":nlp_enterprise,
  203. "notindoctextcon":nlp_enterprise_attachment}
  204. return json.dumps(dict_entity,ensure_ascii=False)
@annotate("string->bigint")
class f_get_extractCount(object):
    """UDF: return the number of informative fields extracted from a document.

    If the extraction json is present, the pre-computed "extract_count" is
    returned directly.  The counting code below the early return therefore only
    runs for a NULL input (where it yields 0) — presumably kept as a legacy
    fallback; TODO confirm it is still needed.
    """

    def __init__(self):
        import time
        global time
        import logging
        import json
        import re
        global json,logging,re
        # Matches strings beginning with a YYYY-MM-DD date (unused here).
        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def evaluate(self, extractjson):
        if extractjson is not None:
            _extract = json.loads(extractjson)
            # Normal path: trust the pre-computed count stored in the json.
            return _extract.get("extract_count",0)
        else:
            _extract = {}
        dict_pack = _extract.get("prem",{})
        extract_count = 0
        list_code = _extract.get("code",[])
        if len(list_code)>0:
            project_code = list_code[0]
        else:
            project_code = ""
        project_name = _extract.get("name","")
        bidding_budget = ""
        win_tenderer = ""
        win_bid_price = ""
        # Counts non-empty contact/link entries; two entries ~ one usable contact.
        linklist_count = 0
        for _key in dict_pack.keys():
            # A positive tender budget counts as one fact.
            if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
                extract_count += 1
                if bidding_budget=="":
                    bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
            for _role in dict_pack[_key]["roleList"]:
                # Legacy format: role is a [role_name, role_text, money, ...] list.
                if isinstance(_role,list):
                    extract_count += 1
                    if _role[2]!='' and float(_role[2])>0:
                        extract_count += 1
                    if _role[0]=="tenderee":
                        tenderee = _role[1]
                    if _role[0]=="win_tenderer":
                        if win_tenderer=="":
                            win_tenderer = _role[1]
                        if _role[2]!='' and float(_role[2])>0:
                            extract_count += 1
                            if win_bid_price=="":
                                win_bid_price = str(float(_role[2]))
                    if _role[0]=="agency":
                        agency = _role[1]
                # Current format: role is a dict with role_name/role_text/role_money.
                if isinstance(_role,dict):
                    extract_count += 1
                    if "role_money" in _role:
                        if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
                            extract_count += 1
                    if _role.get("role_name")=="tenderee":
                        tenderee = _role["role_text"]
                    if _role.get("role_name")=="win_tenderer":
                        if win_tenderer=="":
                            win_tenderer = _role["role_text"]
                        if "role_money" in _role:
                            if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
                                extract_count += 1
                                if win_bid_price=="":
                                    win_bid_price = str(float(_role["role_money"]["money"]))
                    # NOTE(review): direct ["role_name"] here, unlike the .get()
                    # used above — raises KeyError if the key is absent; confirm.
                    if _role["role_name"]=="agency":
                        agency = _role["role_text"]
                    linklist = _role.get("linklist",[])
                    for link in linklist:
                        for l in link:
                            if l!="":
                                linklist_count += 1
        # Each pair of non-empty link fields counts as one fact.
        extract_count += linklist_count//2
        if project_code!="":
            extract_count += 1
        if project_name!="":
            extract_count += 1
        return extract_count
  283. @annotate('string,string,string,string,string -> string,string,string,bigint')
  284. class f_decode_sub_docs_json(BaseUDTF):
  285. def __init__(self):
  286. import logging
  287. import json
  288. global json,logging
  289. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  290. def process(self, project_code,project_name,tenderee,agency,sub_docs_json):
  291. columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
  292. extract_count = 0
  293. if project_code is not None and project_code!="":
  294. extract_count += 1
  295. if project_name is not None and project_name!="":
  296. extract_count += 1
  297. if tenderee is not None and tenderee!="":
  298. extract_count += 1
  299. if agency is not None and agency!="":
  300. extract_count += 1
  301. if sub_docs_json is not None:
  302. for sub_docs in json.loads(sub_docs_json):
  303. for _key_sub_docs in sub_docs.keys():
  304. extract_count += 1
  305. if _key_sub_docs in columns:
  306. if columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
  307. if _key_sub_docs in ["bidding_budget","win_bid_price"]:
  308. if float(sub_docs[_key_sub_docs])>0:
  309. columns[_key_sub_docs] = str(float(sub_docs[_key_sub_docs]))
  310. else:
  311. columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
  312. self.forward(columns["win_tenderer"],columns["bidding_budget"],columns["win_bid_price"],extract_count)
@annotate('string,string,string -> string,string,string,string,string,string,string')
class f_decode_for_dumplicate(BaseUDTF):
    """UDTF used by the de-duplication flow.

    Mode "extract": emit one row per (product, code, sub_doc) combination,
    cycling the shorter lists.  Any other mode: emit a single summary row,
    preferring the first sub_doc that fills all three of winner/budget/price.
    """

    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self,sub_docs_json,extractjson,extract):
        if extractjson is None or extractjson=="":
            extractjson = "{}"
        try:
            _extract = json.loads(extractjson)
        except Exception as e:
            # Malformed json degrades to an empty extraction.
            _extract = {}
        product = ",".join(_extract.get("product",[]))
        # split(",") of "" yields [""], so these lists are never empty.
        list_product = product.split(",")
        project_codes = ",".join(_extract.get("code",[]))
        list_code = project_codes.split(",")
        if sub_docs_json is not None:
            list_sub_docs = json.loads(sub_docs_json)
        else:
            list_sub_docs = [{}]
        max_len = max([len(list_product),len(list_code),len(list_sub_docs)])
        if extract!="extract":
            win_tenderer = ""
            bidding_budget = ""
            win_bid_price = ""
            for _subdoc in list_sub_docs:
                win_tenderer = _subdoc.get("win_tenderer","")
                bidding_budget = _subdoc.get("bidding_budget","0")
                # Zero budget means "not extracted"; otherwise normalise via float().
                if float(bidding_budget)==0:
                    bidding_budget = ""
                else:
                    bidding_budget = str(float(bidding_budget))
                win_bid_price = _subdoc.get("win_bid_price","0")
                if float(win_bid_price)==0:
                    win_bid_price = ""
                else:
                    win_bid_price = str(float(win_bid_price))
                # Three distinct non-empty values => this sub_doc has all fields.
                if len(set([win_tenderer,bidding_budget,win_bid_price]))>=3:
                    break
            print(("",product,"",project_codes,win_tenderer,bidding_budget,win_bid_price))
            self.forward("",product,"",project_codes,win_tenderer,bidding_budget,win_bid_price)
        else:
            # Cross-emit every combination, cycling shorter lists modulo their length.
            for _i in range(max_len):
                _product = list_product[_i%len(list_product)]
                _code = list_code[_i%len(list_code)]
                _subdoc = list_sub_docs[_i%len(list_sub_docs)]
                win_tenderer = _subdoc.get("win_tenderer","")
                bidding_budget = _subdoc.get("bidding_budget","0")
                if float(bidding_budget)==0:
                    bidding_budget = ""
                else:
                    bidding_budget = str(float(bidding_budget))
                win_bid_price = _subdoc.get("win_bid_price","0")
                if float(win_bid_price)==0:
                    win_bid_price = ""
                else:
                    win_bid_price = str(float(win_bid_price))
                self.forward(_product,product,_code,project_codes,win_tenderer,bidding_budget,win_bid_price)
  373. @annotate("string->bigint")
  374. class totimestamp(object):
  375. def __init__(self):
  376. import time
  377. global time
  378. import logging
  379. import json
  380. import re
  381. global json,logging,re
  382. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  383. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  384. def evaluate(self, str_time):
  385. try:
  386. logging.info(str_time)
  387. if str_time is not None and re.search(self.time_pattern,str_time) is not None:
  388. timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
  389. timeStamp = int(time.mktime(timeArray))
  390. return timeStamp
  391. else:
  392. return 0
  393. except Exception as e:
  394. return 0
  395. @annotate("string->string")
  396. class refind_name(object):
  397. def __init__(self):
  398. import logging
  399. import re
  400. global logging,re
  401. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  402. def evaluate(self, title):
  403. if title is not None:
  404. return re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|\[|\]|【|】', '', title)
  405. return ""
@annotate('bigint,bigint,bigint,string,bigint,string->string')
class f_set_docid(BaseUDAF):
    '''
    Groups candidate duplicate docids.  Grouping keys (from caller):
    project code, winning bidder, len(project code)>7, winning bidder != "".
    Docs are sorted by page_time_stamp and split wherever consecutive docs are
    more than 7 days apart; each window is accepted as a duplicate group only
    if it passes the defind_count / defind_column consistency checks below.
    Returns the groups as a json list of lists of {docid, extract_count}.
    '''

    def __init__(self):
        import json
        global json

    def new_buffer(self):
        return [[]]

    def iterate(self, buffer,docid, page_time_stamp,extract_count,defind_column,defind_count,tenderee):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
                          "defind_column":defind_column,"defind_count":defind_count,"tenderee":tenderee})

    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])

    def terminate(self, buffer):
        list_docs = buffer[0]
        # Order by publication timestamp so time windows are contiguous runs.
        list_docs.sort(key=lambda x:x["page_time_stamp"])
        list_group = []
        _begin = 0
        defind_count = 0
        # defind_count is constant per aggregation key; take it from any row.
        if len(list_docs)>0:
            defind_count = list_docs[0]["defind_count"]
        print(defind_count)
        for i in range(len(list_docs)-1):
            # Keep extending the window while neighbours are within 7 days.
            if abs(list_docs[i]["page_time_stamp"]-list_docs[i+1]["page_time_stamp"])<=86400*7:
                continue
            else:
                # Gap > 7 days: close the window [_begin, i] and evaluate it.
                _group = []
                _set_column = set()
                _set_tenderee = set()
                for j in range(_begin,i+1):
                    if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
                        _set_tenderee.add(list_docs[j]["tenderee"])
                    _set_column.add(list_docs[j]["defind_column"])
                    _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
                # 3+ docs with conflicting tenderees: too risky, drop the window.
                if len(_group)>=3 and len(_set_tenderee)>1:
                    pass
                else:
                    print(defind_count,len(_set_column))
                    if len(_group)>1:
                        # Column-consistency rules depend on how many grouping
                        # columns were defined (defind_count).
                        if defind_count==2:
                            if len(_set_column)>=2:
                                list_group.append(_group)
                        elif defind_count==1:
                            if len(_set_column)==1:
                                list_group.append(_group)
                        elif defind_count==0:
                            list_group.append(_group)
                _begin = i+1
        # Flush the trailing window with the same acceptance rules.
        if len(list_docs)>1:
            _set_column = set()
            _set_tenderee = set()
            _group = []
            for j in range(_begin,len(list_docs)):
                if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
                    _set_tenderee.add(list_docs[j]["tenderee"])
                _set_column.add(list_docs[j]["defind_column"])
                _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
            if len(_group)>=3 and len(_set_tenderee)>1:
                pass
            else:
                if len(_group)>1:
                    if defind_count==2:
                        if len(_set_column)>=2:
                            list_group.append(_group)
                    elif defind_count==1:
                        if len(_set_column)==1:
                            list_group.append(_group)
                    elif defind_count==0:
                        list_group.append(_group)
        return json.dumps(list_group)

    # Earlier terminate() variant kept for reference; it used split_with_time
    # with a 2-day window instead of the inline 7-day sweep above.
    # def terminate(self, buffer):
    #
    #
    #     list_docs = buffer[0]
    #     if len(list_docs)>0:
    #         defind_count = list_docs[0]["defind_count"]
    #
    #     list_time_group = split_with_time(list_docs,"page_time_stamp",86400*2)
    #
    #     list_group = []
    #     for time_group in list_time_group:
    #         _group = []
    #         _set_column = set()
    #         base_tenderee = ""
    #         _set_tenderee = set()
    #         for j in range(len(time_group)):
    #             if time_group[j]["tenderee"] is not None and time_group[j]["tenderee"]!="":
    #                 # if base_tenderee =="":
    #                 #     base_tenderee = time_group[j]["tenderee"]
    #                 # _set_tenderee.add(time_group[j]["tenderee"])
    #                 # simi = getSimilarityOfString(base_tenderee,time_group[j]["tenderee"])
    #                 # if simi<0.8:
    #                 #     _set_tenderee.add(time_group[j]["tenderee"])
    #
    #             _set_tenderee.add(time_group[j]["tenderee"])
    #             _set_column.add(time_group[j]["defind_column"])
    #             _group.append({"docid":time_group[j]["docid"],"extract_count":time_group[j]["extract_count"]})
    #
    #         if len(_group)>=3 and len(_set_tenderee)>1:
    #             pass
    #         else:
    #             if len(_group)>1:
    #                 if defind_count==2:
    #                     if len(_set_column)>=2:
    #                         list_group.append(_group)
    #                 elif defind_count==1:
    #                     if len(_set_column)==1:
    #                         list_group.append(_group)
    #                 elif defind_count==0:
    #                     list_group.append(_group)
    #
    #     return json.dumps(list_group)
  520. def isEmpty(_str):
  521. if _str is None or _str=="":
  522. return True
  523. return False
  524. @annotate('bigint->string')
  525. class f_group_fingerprint(BaseUDAF):
  526. def __init__(self):
  527. import json
  528. global json
  529. def new_buffer(self):
  530. return [[]]
  531. def iterate(self, buffer,docid):
  532. buffer[0].append(docid)
  533. def merge(self, buffer, pbuffer):
  534. buffer[0].extend(pbuffer[0][:100000])
  535. def terminate(self, buffer):
  536. list_docid = buffer[0][:100000]
  537. list_docid.sort(key=lambda x:x)
  538. return ",".join([str(a) for a in list_docid])
  539. @annotate('string->bigint,string')
  540. class f_ungroup_fingerprint(BaseUDTF):
  541. def process(self,dumplicates):
  542. list_docid = dumplicates.split(",")
  543. self.forward(int(list_docid[0]),",".join(list_docid[1:]))
  544. @annotate('bigint,bigint,string->string')
  545. class f_dump_probability(BaseUDAF):
  546. '''
  547. 合并组为一条记录
  548. '''
  549. def __init__(self):
  550. import json
  551. global json
  552. def new_buffer(self):
  553. return [[]]
  554. def iterate(self, buffer,docid,page_time_stamp,_type):
  555. buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"type":_type})
  556. def merge(self, buffer, pbuffer):
  557. buffer[0].extend(pbuffer[0])
  558. def terminate(self, buffer):
  559. list_dict = buffer[0]
  560. _set = set()
  561. list_data = []
  562. for _dict in list_dict:
  563. docid = _dict["docid"]
  564. if docid in _set:
  565. continue
  566. _set.add(docid)
  567. list_data.append(_dict)
  568. if len(list_data)>10000:
  569. break
  570. list_group = split_with_time(list_data,sort_key="page_time_stamp",timedelta=86400*7)
  571. return json.dumps(list_group)
  572. @annotate('string -> bigint,bigint,bigint,bigint,string')
  573. class f_split_dumplicate_probability(BaseUDTF):
  574. def __init__(self):
  575. import logging
  576. import json
  577. global logging,json
  578. logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  579. def process(self,list_group_str):
  580. logging.info("0")
  581. logging.info(list_group_str)
  582. if list_group_str is not None:
  583. logging.info("1")
  584. try:
  585. list_group = json.loads(list_group_str)
  586. logging.info("2")
  587. for _group in list_group:
  588. if len(_group)>0:
  589. _type = _group[0].get("type","")
  590. logging.info("3%d"%len(list_group))
  591. # _group.sort(key=lambda x:x["page_time_stamp"])
  592. _len = min(100,len(_group))
  593. for _index_i in range(_len):
  594. _count = 0
  595. for _index_j in range(_index_i+1,_len):
  596. if abs(_group[_index_j]["page_time_stamp"]-_group[_index_i]["page_time_stamp"])>86400*120:
  597. break
  598. _count += 1
  599. _docid1 = _group[_index_i]["docid"]
  600. _docid2 = _group[_index_j]["docid"]
  601. if _docid1<_docid2:
  602. self.forward(_docid1,_docid2,1,_len,_type)
  603. elif _docid1>_docid2:
  604. self.forward(_docid2,_docid1,1,_len,_type)
  605. except Exception as e:
  606. logging(str(e))
  607. @annotate('bigint,bigint,string->string')
  608. class f_dumplicate_groupPairs(BaseUDAF):
  609. '''
  610. 合并组为一条记录
  611. '''
  612. def __init__(self):
  613. import json
  614. global json
  615. def new_buffer(self):
  616. return [[]]
  617. def iterate(self, buffer,is_exists,counts,_type):
  618. buffer[0].append({"is_exists":is_exists,"counts":counts,"_type":_type})
  619. def merge(self, buffer, pbuffer):
  620. buffer[0].extend(pbuffer[0])
  621. def terminate(self, buffer):
  622. list_dict = buffer[0]
  623. list_dict = list_dict[:10000]
  624. return json.dumps(list_dict)
  625. def check_columns(tenderee_less,tenderee_greater,
  626. agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
  627. win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
  628. bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
  629. flag = True
  630. _set_tenderee = set()
  631. if tenderee_less is not None and tenderee_less!="":
  632. _set_tenderee.add(tenderee_less)
  633. if tenderee_greater is not None and tenderee_greater!="":
  634. _set_tenderee.add(tenderee_greater)
  635. if len(_set_tenderee)>1:
  636. return False
  637. code_sim = getSimilarityOfString(project_code_less,project_code_greater)
  638. if code_sim>0.6 and code_sim<1:
  639. return False
  640. #同批次不同编号
  641. if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
  642. _split_code_less = project_code_less.split("-")
  643. _split_code_greater = project_code_greater.split("-")
  644. if len(_split_code_less)>1 and len(_split_code_greater)>1:
  645. if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
  646. return False
  647. _set_win_tenderer = set()
  648. if win_tenderer_less is not None and win_tenderer_less!="":
  649. _set_win_tenderer.add(win_tenderer_less)
  650. if win_tenderer_greater is not None and win_tenderer_greater!="":
  651. _set_win_tenderer.add(win_tenderer_greater)
  652. if len(_set_win_tenderer)>1:
  653. return False
  654. _set_win_bid_price = set()
  655. if win_bid_price_less is not None and win_bid_price_less!="":
  656. _set_win_bid_price.add(float(win_bid_price_less))
  657. if win_bid_price_greater is not None and win_bid_price_greater!="":
  658. _set_win_bid_price.add(float(win_bid_price_greater))
  659. if len(_set_win_bid_price)>1:
  660. return False
  661. _set_bidding_budget = set()
  662. if bidding_budget_less is not None and bidding_budget_less!="":
  663. _set_bidding_budget.add(float(bidding_budget_less))
  664. if bidding_budget_greater is not None and bidding_budget_greater!="":
  665. _set_bidding_budget.add(float(bidding_budget_greater))
  666. if len(_set_bidding_budget)>1:
  667. return False
  668. return True
  669. import math
  670. def featurnCount(_count,max_count=100):
  671. return max(0,min(1,_count))*(1/math.sqrt(max(1,_count-1)))
  672. def getSimLevel(str1,str2):
  673. str1_null = False
  674. str2_null = False
  675. _v = 0
  676. if str1 is None or str1=="":
  677. str1_null = True
  678. if str2 is None or str2=="":
  679. str2_null = True
  680. if str1_null and str2_null:
  681. _v = 2
  682. elif str1_null and not str2_null:
  683. _v = 4
  684. elif not str1_null and str2_null:
  685. _v = 6
  686. elif not str1_null and not str2_null:
  687. if str1==str2:
  688. _v = 10
  689. else:
  690. _v = 0
  691. return _v
  692. def getLength(_str):
  693. return len(str(_str) if _str is not None else "")
def check_money(bidding_budget_less,bidding_budget_greater,
                win_bid_price_less,win_bid_price_greater,
                moneys_less,moneys_greater,
                moneys_attachment_less,moneys_attachment_greater):
    """Return False when the budget / winning-bid amounts of the two
    documents conflict, else True.

    bidding_budget_* / win_bid_price_* are amounts as strings or numbers
    (may be empty); moneys_* / moneys_attachment_* are collections of
    float amounts extracted from each document's body / attachments
    (assumed sets of floats — TODO confirm against callers).

    Two differing amounts are still treated as "same" when one is exactly
    10000x the other (万 vs yuan unit mix-up), when they agree after
    dividing by 10000, or when one amount occurs in the other document's
    extracted money collections.
    """
    # only the most significant six digits are compared
    if getLength(bidding_budget_less)>0:
        bidding_budget_less = round(float(bidding_budget_less))
        # round() with a negative ndigits zeroes everything below the
        # 6th significant digit of the integer amount
        bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
    if getLength(bidding_budget_greater)>0:
        bidding_budget_greater = round(float(bidding_budget_greater))
        bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
    if getLength(win_bid_price_less)>0:
        win_bid_price_less = round(float(win_bid_price_less))
        win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
    if getLength(win_bid_price_greater)>0:
        win_bid_price_greater = round(float(win_bid_price_greater))
        win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
    #check saming
    budget_is_same = ""
    price_is_same = ""
    if getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
        budget_less = float(bidding_budget_less)
        budget_greater = float(bidding_budget_greater)
        if budget_less!=budget_greater:
            if min(budget_less,budget_greater)>0:
                # exact 10000x ratio: one side recorded in 万, the other in yuan
                if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
                    budget_is_same = True
            # equal when both are expressed in 万 (2 decimal places)
            if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
                budget_is_same = True
            # the amount appears among the other document's extracted moneys
            if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
                budget_is_same = True
            if budget_greater in moneys_less or budget_greater in moneys_attachment_less:
                budget_is_same = True
            if budget_is_same=="":
                return False
    if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
        price_less = float(win_bid_price_less)
        price_greater = float(win_bid_price_greater)
        if price_less!=price_greater:
            if min(price_less,price_greater)>0:
                if max(price_less,price_greater)/min(price_less,price_greater)==10000:
                    price_is_same = True
            if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
                price_is_same = True
            if price_less in moneys_greater or price_less in moneys_attachment_greater:
                price_is_same = True
            if price_greater in moneys_less or price_greater in moneys_attachment_less:
                price_is_same = True
            if price_is_same=="":
                return False
    return True
def check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                 tenderee_less,tenderee_greater,
                 agency_less,agency_greater,
                 win_tenderer_less,win_tenderer_greater,
                 similarity=0.85):
    """Return False when the tenderee, agency or win_tenderer of the two
    documents conflict, else True.

    Two entity names are accepted as the same when they are equal, their
    jaccard similarity exceeds `similarity`, or one name occurs inside the
    other document's nlp_enterprise string.
    """
    def get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,entity_less,entity_greater,similarity):
        # only comparable when both sides are non-empty; otherwise pass
        if getLength(entity_less)>0 and getLength(entity_greater)>0:
            if entity_less!=entity_greater:
                is_same = ''
                _sim = jaccard_score(entity_less,entity_greater)
                if _sim>similarity:
                    is_same = True
                if is_same=='':
                    # NOTE(review): find(...)>0 ignores a match at position 0 —
                    # looks like it should be >=0; confirm before changing
                    if str(nlp_enterprise_less).find(entity_greater)>0 or str(nlp_enterprise_greater).find(entity_less)>0:
                        is_same = True
                if is_same=='':
                    return False
        return True
    if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,similarity):
        return False
    if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,agency_less,agency_greater,similarity):
        return False
    if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,win_tenderer_less,win_tenderer_greater,similarity):
        return False
    return True
  770. def check_punish(punish_less,punish_greater):
  771. same_count = 0
  772. not_same_count = 0
  773. _flag = True
  774. keys = list(set(list(punish_less.keys())) | set(list(punish_greater.keys())))
  775. for k in keys:
  776. v1 = punish_less.get(k)
  777. v2 = punish_greater.get(k)
  778. if getLength(v1)>0 and getLength(v2)>0:
  779. if k=="punish_code":
  780. if not check_codes([v1],[v2]):
  781. not_same_count += 1
  782. _flag = False
  783. else:
  784. same_count += 1
  785. if k=="punishDecision":
  786. if getSimilarityOfString(v1,v2)>0.8:
  787. same_count += 1
  788. if k in ("complainants","punishPeople","institutions"):
  789. if v1==v2:
  790. same_count += 1
  791. else:
  792. not_same_count == 1
  793. _flag = False
  794. return _flag,same_count,not_same_count
  795. def check_source_type(source_type_less,source_type_greater):
  796. if getLength(source_type_less)>0 and getLength(source_type_greater)>0:
  797. if source_type_less!=source_type_greater:
  798. return False
  799. return True
def check_approval(approval_less,approval_greater,b_log):
    """Compare two lists of approval records.

    Each record of one side is compared against each record of the other;
    the first pair that has no conflicting field and more than one matching
    field short-circuits with (True, same_count, not_same_count). If no
    pair matches and both lists are non-empty, returns (False, 0, 0);
    otherwise (True, 0, 0). b_log enables diagnostic logging.

    NOTE(review): `not_same_count -= 1` decrements where the sibling
    check_punish increments — possibly intentional weighting, possibly a
    typo for `+= 1`; confirm before changing.
    """
    if b_log:
        logging.info("approval_less %s==approval_greater %s"%(approval_less,approval_greater))
    for _less in approval_less:
        for _greater in approval_greater:
            same_count = 0
            not_same_count = 0
            flag = True
            keys = ["source_stage","source_type","doc_num","project_code","project_name","approval_items","approval_result","approver","construct_company","construction_scale","declare_company","evaluation_agency","legal_person","compilation_unit","time_approval"]
            for k in keys:
                v1 = _less.get(k)
                v2 = _greater.get(k)
                # only fields present and non-empty on both sides are compared
                if getLength(v1)>0 and getLength(v2)>0:
                    if k in ("source_stage","source_type"):
                        if v1!=v2:
                            flag = False
                    if k in ("project_code","doc_num"):
                        if check_codes([v1],[v2]):
                            same_count += 1
                        else:
                            not_same_count -= 1
                            if b_log:
                                logging.info("check approval %s false %s-%s"%(k,v1,v2))
                            flag = False
                    if k in ("approval_items","approval_result","project_name"):
                        # fuzzy text fields: similarity above 0.8 counts as a match
                        if getSimilarityOfString(v1,v2)>0.8:
                            same_count += 1
                        else:
                            not_same_count -= 1
                    if k in ("approver","construct_company","declare_company","evaluation_agency","legal_person","compilation_unit"):
                        if v1==v2:
                            same_count += 1
                        else:
                            not_same_count -= 1
                            if b_log:
                                logging.info("check approval %s false %s-%s"%(k,v1,v2))
                            flag = False
            # a clean pair with at least two matching fields decides the result
            if flag and same_count>1:
                return flag,same_count,not_same_count
    flag = True
    # both sides carry approvals but no pair matched -> conflict
    if len(approval_less)>0 and len(approval_greater)>0:
        flag = False
    return flag,0,0
  843. def check_codes(project_codes_less,project_codes_greater):
  844. #check the similarity
  845. is_same = False
  846. is_sim = False
  847. for project_code_less in project_codes_less:
  848. for project_code_greater in project_codes_greater:
  849. code_sim = getSimilarityOfString(project_code_less,project_code_greater)
  850. if project_code_less is not None and project_code_greater is not None:
  851. if code_sim>0.6:
  852. if str(project_code_less).find(str(project_code_greater))>=0 or str(project_code_greater).find(str(project_code_less))>=0:
  853. is_same = True
  854. else:
  855. is_sim = True
  856. if project_code_less!=project_code_greater:
  857. if code_sim>0.4 and len(project_code_less)==len(project_code_greater):
  858. is_sim = True
  859. if is_same:
  860. return True
  861. if is_sim:
  862. return False
  863. return True
  864. def check_demand():
  865. return True
# Regex patterns used by check_doctitle to compare refined document titles.
# Package/lot number, e.g. "包1", "标段2", "第Ⅲ包"; the named group "name"
# captures the package identifier.
package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型|项目)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))") # "第?" — question mark adjusted so strings like 纯木浆8包/箱复印 are not taken as a package number
# alphanumeric code fragments (letters, digits, hyphens, ASCII/CJK brackets)
code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.-]+")
# a pure number, optionally with a decimal part
num_pattern = re.compile("^\d+(?:\.\d+)?$")
# runs of Chinese numerals / latin letters (keyword-like tokens)
num1_pattern = re.compile("[一二三四五六七八九A-Za-z]+")
# short place names ending in a 市/区/镇/县/村/路 suffix
location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
# construction/service phase keywords (plain string; passed to re.findall)
building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
# dates like 2023-01-02 / 2023.1.2 / 2023年1月2日
date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[]):
    """Return True when the two refined titles look like the same notice.

    Project codes (codes_less / code_greater) and dates are stripped first;
    then the package numbers, numeric tokens, keywords and locations
    extracted from both titles must not conflict.

    NOTE(review): the mutable default arguments are shared across calls —
    safe only because they are never mutated here; confirm before refactoring.
    """
    if code_greater is None:
        code_greater = []
    # normalize full-width parentheses to ASCII
    doctitle_refind_less = str(doctitle_refind_less).replace("(","(").replace(")",")")
    doctitle_refind_greater = str(doctitle_refind_greater).replace("(","(").replace(")",")")
    # remove each document's own project codes from the titles
    for _c in codes_less:
        doctitle_refind_less = str(doctitle_refind_less).replace(_c,"")
    for _c in code_greater:
        doctitle_refind_greater = str(doctitle_refind_greater).replace(_c,"")
    # remove dates
    doctitle_refind_less = re.sub(date_pattern,"",doctitle_refind_less)
    doctitle_refind_greater = re.sub(date_pattern,"",doctitle_refind_greater)
    #check the package
    if doctitle_refind_less is None:
        doctitle_refind_less = ""
    if doctitle_refind_greater is None:
        doctitle_refind_greater = ""
    _pack1 = None
    _pack2 = None
    #if contain then pass
    if doctitle_refind_less.find(doctitle_refind_greater)>=0 or doctitle_refind_greater.find(doctitle_refind_less)>=0:
        return True
    #check the package in title
    _match = re.search(package_number_pattern,doctitle_refind_less)
    if _match is not None:
        _pack1 = _match.groupdict()["name"]
    _match = re.search(package_number_pattern,doctitle_refind_greater)
    if _match is not None:
        _pack2 = _match.groupdict()["name"]
    # conflicting package numbers -> different notices
    if _pack1 is not None and _pack2 is not None:
        if _pack1!=_pack2:
            return False
    #check the nums in title
    doctitle_refind_less = re.sub(package_number_pattern,"",doctitle_refind_less)
    doctitle_refind_greater = re.sub(package_number_pattern,"",doctitle_refind_greater)
    #check the nums,location,building in title
    for _p in [code_pattern]:
        num_all_l = re.findall(_p,doctitle_refind_less)
        num_all_g = re.findall(_p,doctitle_refind_greater)
        set_num_l = set()
        set_num_g = set()
        for _l in num_all_l:
            if re.search(num_pattern,_l) is not None:
                # keep decimals and short (<4 char) integers as
                # distinguishing tokens; long numbers are likely ids/noise
                if _l.find(".")>0:
                    set_num_l.add(_l)
                elif len(_l)<4:
                    set_num_l.add(_l)
        for _g in num_all_g:
            if re.search(num_pattern,_g) is not None:
                if _g.find(".")>0:
                    set_num_g.add(_g)
                elif len(_g)<4:
                    set_num_g.add(_g)
        # every numeric token of the "less" title must appear in the "greater"
        if len(set_num_l)>0 and len(set_num_g)>0:
            if len(set_num_l&set_num_g)!=len(set_num_l):
                return False
    #check location and keywords
    for _p in [num1_pattern,building_pattern]:
        num_all_l = re.findall(_p,doctitle_refind_less)
        num_all_g = re.findall(_p,doctitle_refind_greater)
        set_num_l = set(num_all_l)
        set_num_g = set(num_all_g)
        # equal-sized keyword sets must be identical sets
        if len(set_num_l)==len(set_num_g):
            if len(set_num_l&set_num_g)!=len(set_num_l):
                return False
    #check the location has conflict
    for _p in [location_pattern]:
        num_all_l = re.findall(_p,doctitle_refind_less)
        num_all_g = re.findall(_p,doctitle_refind_greater)
        dict_num_l = {}
        dict_num_g = {}
        # bucket place names by their suffix character (市/区/镇/县/村/路)
        for _l in num_all_l:
            if len(_l)>0:
                key = _l[-1:]
                if key not in dict_num_l:
                    dict_num_l[key] = set()
                dict_num_l[key].add(_l)
        for _g in num_all_g:
            if len(_g)>0:
                key = _g[-1:]
                if key not in dict_num_g:
                    dict_num_g[key] = set()
                dict_num_g[key].add(_g)
        # same-suffix buckets sharing no place name -> conflicting locations
        for k,v in dict_num_l.items():
            if k in dict_num_g:
                if len(v&dict_num_g[k])==0:
                    return False
    return True
  960. def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
  961. if getLength(product_less)>0 and getLength(product_greater)>0:
  962. _product_l = product_less.split(split_char)
  963. _product_g = product_greater.split(split_char)
  964. same_count = 0
  965. if len(_product_l)>len(_product_g):
  966. a = _product_g
  967. _product_g = _product_l
  968. _product_l = a
  969. for _l in _product_l:
  970. for _g in _product_g:
  971. if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0 or doctitle_refine_less.find(_g)>=0:
  972. same_count += 1
  973. break
  974. if same_count/len(_product_l)>=0.5:
  975. return True
  976. return False
  977. return True
  978. def check_package(package_less,package_greater,split_char=","):
  979. if getLength(package_less)>0 and getLength(package_greater)>0:
  980. _product_l = package_less.split(split_char)
  981. _product_g = package_greater.split(split_char)
  982. for _l in _product_l:
  983. for _g in _product_g:
  984. if _l==_g:
  985. return True
  986. return False
  987. return True
  988. def check_time(json_time_less,json_time_greater):
  989. has_same = False
  990. has_diff = False
  991. if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
  992. if isinstance(json_time_less,dict):
  993. time_less = json_time_less
  994. else:
  995. time_less = json.loads(json_time_less)
  996. if isinstance(json_time_greater,dict):
  997. time_greater = json_time_greater
  998. else:
  999. time_greater = json.loads(json_time_greater)
  1000. for k,v in time_less.items():
  1001. if getLength(v)>0:
  1002. v1 = time_greater.get(k,"")
  1003. if getLength(v1)>0:
  1004. if v[:10]!=v1[:10]:
  1005. has_diff = True
  1006. else:
  1007. has_same = True
  1008. if has_same:
  1009. if has_diff:
  1010. return 1
  1011. return 2
  1012. if has_diff:
  1013. return 0
  1014. return 1
def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater="",moneys_less=set(),moneys_greater=set(),moneys_attachment_less=set(),moneys_attachment_greater=set(),page_attachments_less="[]",page_attachments_greater="[]",punish_less = {},punish_greater = {},approval_less = [],approval_greater = [],source_type_less = None,source_type_greater=None):
    """Main duplicate-detection rule for a pair of documents.

    Returns a duplicate probability: 1 for hard matches (same fingerprint,
    or shared attachments with one document's fields in the attachment),
    0 for hard rejections (conflicting fields), otherwise a score derived
    from field agreement scaled by min_counts and gated by the check_*
    helpers.

    NOTE(review): the mutable/shared default arguments (set(), {}, []) are
    shared across calls — safe only while never mutated; confirm before
    refactoring.
    """
    # identical non-empty fingerprints are an immediate duplicate
    if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
        return 1
    # one document's fields live in its attachment, and the two documents'
    # attachment md5 sets overlap
    set_md5_less = set()
    set_md5_greater = set()
    list_md5_less = []
    if page_attachments_less:
        try:
            list_md5_less = json.loads(page_attachments_less)
        except Exception as e:
            pass
    list_md5_greater = []
    if page_attachments_greater:
        try:
            list_md5_greater = json.loads(page_attachments_greater)
        except Exception as e:
            pass
    for _l in list_md5_less:
        _md5 = _l.get("fileMd5")
        if _md5 is not None:
            set_md5_less.add(_md5)
    for _l in list_md5_greater:
        _md5 = _l.get("fileMd5")
        if _md5 is not None:
            set_md5_greater.add(_md5)
    # all of the "less" document's attachments also belong to "greater"
    if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
        one_in_attach = False
        dict_enterprise_less = json.loads(nlp_enterprise_less)
        dict_enterprise_greater = json.loads(nlp_enterprise_greater)
        indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
        notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
        indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
        notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
        # few entities in the body but several outside it -> the real
        # content sits in the attachment
        if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
            one_in_attach = True
        if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
            one_in_attach = True
        if one_in_attach:
            if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
                return 1
    # same web source: both carry attachments but share none -> do not dedupe
    if web_source_no_less==web_source_no_greater and len(set_md5_less)>0 and len(set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==0:
        return 0
    # normalize project codes to lists
    if isinstance(project_codes_less,str):
        project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
    elif project_codes_less is None:
        project_codes_less = []
    if isinstance(project_codes_greater,str):
        project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
    elif project_codes_greater is None:
        project_codes_greater = []
    # count agreeing fields out of 8 candidates
    same_count = 0
    all_count = 8
    if len(set(project_codes_less) & set(project_codes_greater))>0:
        same_count += 1
    if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
        same_count += 1
    if getLength(agency_less)>0 and agency_less==agency_greater:
        same_count += 1
    if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
        same_count += 1
    if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
        same_count += 1
    if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
        same_count += 1
    if getLength(project_name_less)>0 and project_name_less==project_name_greater:
        same_count += 1
    if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
        same_count += 1
    # punishment fields must not conflict; matches add to same_count
    _flag,_c1,_c2 = check_punish(punish_less,punish_greater)
    if not _flag:
        if b_log:
            logging.info("check_punish failed")
        return 0
    else:
        if b_log:
            logging.info("check_punish true %d"%(_c1))
        same_count += _c1
    # approval records must not conflict; matches add to same_count
    _flag,_c1,_c2 = check_approval(approval_less,approval_greater,b_log)
    if not _flag:
        if b_log:
            logging.info("check approval failed")
        return 0
    else:
        if b_log:
            logging.info("check approval true %d"%(_c1))
        same_count += _c1
    _flag = check_source_type(source_type_less,source_type_greater)
    if not _flag:
        if b_log:
            logging.info("check source type failed")
        return 0
    # base probability shrinks as the candidate group (min_counts) grows
    base_prob = 0
    if min_counts<3:
        base_prob = 0.9
    elif min_counts<5:
        base_prob = 0.8
    elif min_counts<8:
        base_prob = 0.7
    else:
        base_prob = 0.6
    _prob = base_prob*same_count/all_count
    # sparsely-extracted documents get a floor score
    if min(extract_count_less,extract_count_greater)<=3:
        if _prob<0.1:
            _prob = 0.15
    # different known provinces -> different notices
    if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
        return 0
    if _prob<0.1:
        return _prob
    # detailed gate checks; each sets 0 (conflict) / 1 (unknown) / 2 (match)
    check_result = {"pass":1}
    if docchannel_less in (51,102,103,104,115,116,117):
        if doctitle_refine_less!=doctitle_refine_greater:
            if page_time_less!=page_time_greater:
                check_result["docchannel"] = 0
                check_result["pass"] = 0
            else:
                check_result["docchannel"] = 2
    if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
        check_result["doctitle"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
    else:
        check_result["doctitle"] = 2
    #added check
    if not check_codes(project_codes_less,project_codes_greater):
        check_result["code"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
    else:
        if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
            check_result["code"] = 2
        else:
            check_result["code"] = 1
    if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
        check_result["product"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
    else:
        if getLength(product_less)>0 and getLength(product_greater)>0:
            check_result["product"] = 2
        else:
            check_result["product"] = 1
    if not check_demand():
        check_result["pass"] = 0
    if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                        tenderee_less,tenderee_greater,
                        agency_less,agency_greater,
                        win_tenderer_less,win_tenderer_greater):
        check_result["entity"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
    else:
        # channel-dependent confidence: tender channels need tenderee,
        # award channels need win_tenderer
        if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
            check_result["entity"] = 2
        elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
            check_result["entity"] = 2
        else:
            check_result["entity"] = 1
    if not check_money(bidding_budget_less,bidding_budget_greater,
                       win_bid_price_less,win_bid_price_greater,
                       moneys_less,moneys_greater,
                       moneys_attachment_less,moneys_attachment_greater):
        if b_log:
            logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
        check_result["money"] = 0
        check_result["pass"] = 0
    else:
        if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
            check_result["money"] = 2
        elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
            check_result["money"] = 2
        else:
            check_result["money"] = 1
    #added check
    if not check_package(package_less,package_greater):
        if b_log:
            logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
        check_result["package"] = 0
        check_result["pass"] = 0
    else:
        if getLength(package_less)>0 and getLength(package_greater)>0:
            check_result["package"] = 2
        else:
            check_result["package"] = 1
    #added check
    # check_time: 0 = only diffs, 1 = mixed/unknown, 2 = clean match;
    # channels 51/103 also reject the mixed case
    _time_check = check_time(json_time_less,json_time_greater)
    if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
        if b_log:
            logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
            # debug-only dump of which time key differs
            if isinstance(json_time_less,dict):
                time_less = json_time_less
            else:
                time_less = json.loads(json_time_less)
            if isinstance(json_time_greater,dict):
                time_greater = json_time_greater
            else:
                time_greater = json.loads(json_time_greater)
            for k,v in time_less.items():
                if getLength(v)>0:
                    v1 = time_greater.get(k,"")
                    if getLength(v1)>0:
                        if v!=v1:
                            logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
        check_result["time"] = 0
        check_result["pass"] = 0
    else:
        if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
            check_result["time"] = 2
        else:
            check_result["time"] = 1
    # strict mode requires a positive product match
    if hard_level==2 and check_result["product"]<=1:
        return 0
    if check_result.get("pass",0)==0:
        if b_log:
            logging.info(str(check_result))
        if check_result.get("money",1)==0:
            return 0
        # rescue: strong agreement on entity/code/title/product/money
        # overrides a single failed gate
        if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
            return _prob
        else:
            return 0
    return _prob
  1242. def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
  1243. if web_source_no_less==web_source_no_greater:
  1244. if fingerprint_less==fingerprint_greater:
  1245. return 1
  1246. else:
  1247. return 0
  1248. if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
  1249. return 1
  1250. if isinstance(project_codes_less,str):
  1251. project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
  1252. elif project_codes_less is None:
  1253. project_codes_less = []
  1254. if isinstance(project_codes_greater,str):
  1255. project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
  1256. elif project_codes_greater is None:
  1257. project_codes_greater = []
  1258. same_count = 0
  1259. all_count = 8
  1260. if len(set(project_codes_less) & set(project_codes_greater))>0:
  1261. same_count += 1
  1262. if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
  1263. same_count += 1
  1264. if getLength(agency_less)>0 and agency_less==agency_greater:
  1265. same_count += 1
  1266. if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
  1267. same_count += 1
  1268. if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
  1269. same_count += 1
  1270. if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
  1271. same_count += 1
  1272. if getLength(project_name_less)>0 and project_name_less==project_name_greater:
  1273. same_count += 1
  1274. if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
  1275. same_count += 1
  1276. base_prob = 0
  1277. if min_counts<3:
  1278. base_prob = 0.9
  1279. elif min_counts<5:
  1280. base_prob = 0.8
  1281. elif min_counts<8:
  1282. base_prob = 0.7
  1283. else:
  1284. base_prob = 0.6
  1285. _prob = base_prob*same_count/all_count
  1286. if min(extract_count_less,extract_count_greater)<=3:
  1287. if _prob<0.1:
  1288. _prob = 0.15
  1289. if province_less!=province_greater:
  1290. return 0
  1291. if _prob<0.1:
  1292. return _prob
  1293. check_result = {"pass":1}
  1294. if docchannel_less in (51,102,103,104,115,116,117):
  1295. if doctitle_refine_less!=doctitle_refine_greater:
  1296. if page_time_less!=page_time_greater:
  1297. check_result["docchannel"] = 0
  1298. check_result["pass"] = 0
  1299. else:
  1300. check_result["docchannel"] = 2
  1301. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
  1302. check_result["doctitle"] = 0
  1303. check_result["pass"] = 0
  1304. if b_log:
  1305. logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
  1306. else:
  1307. check_result["doctitle"] = 2
  1308. #added check
  1309. if not check_codes(project_codes_less,project_codes_greater):
  1310. check_result["code"] = 0
  1311. check_result["pass"] = 0
  1312. if b_log:
  1313. logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
  1314. else:
  1315. if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
  1316. check_result["code"] = 2
  1317. else:
  1318. check_result["code"] = 1
  1319. if not check_product(product_less,product_greater):
  1320. check_result["product"] = 0
  1321. check_result["pass"] = 0
  1322. if b_log:
  1323. logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
  1324. else:
  1325. if getLength(product_less)>0 and getLength(product_greater)>0:
  1326. check_result["product"] = 2
  1327. else:
  1328. check_result["product"] = 1
  1329. if not check_demand():
  1330. check_result["pass"] = 0
  1331. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  1332. tenderee_less,tenderee_greater,
  1333. agency_less,agency_greater,
  1334. win_tenderer_less,win_tenderer_greater):
  1335. check_result["entity"] = 0
  1336. check_result["pass"] = 0
  1337. if b_log:
  1338. logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
  1339. else:
  1340. if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
  1341. check_result["entity"] = 2
  1342. elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
  1343. check_result["entity"] = 2
  1344. else:
  1345. check_result["entity"] = 1
  1346. if not check_money(bidding_budget_less,bidding_budget_greater,
  1347. win_bid_price_less,win_bid_price_greater):
  1348. if b_log:
  1349. logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
  1350. check_result["money"] = 0
  1351. check_result["pass"] = 0
  1352. else:
  1353. if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  1354. check_result["money"] = 2
  1355. elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  1356. check_result["money"] = 2
  1357. else:
  1358. check_result["money"] = 1
  1359. #added check
  1360. if not check_package(package_less,package_greater):
  1361. if b_log:
  1362. logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
  1363. check_result["package"] = 0
  1364. check_result["pass"] = 0
  1365. else:
  1366. if getLength(package_less)>0 and getLength(package_greater)>0:
  1367. check_result["package"] = 2
  1368. else:
  1369. check_result["package"] = 1
  1370. #added check
  1371. if not check_time(json_time_less,json_time_greater):
  1372. if b_log:
  1373. logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
  1374. if isinstance(json_time_less,dict):
  1375. time_less = json_time_less
  1376. else:
  1377. time_less = json.loads(json_time_less)
  1378. if isinstance(json_time_greater,dict):
  1379. time_greater = json_time_greater
  1380. else:
  1381. time_greater = json.loads(json_time_greater)
  1382. for k,v in time_less.items():
  1383. if getLength(v)>0:
  1384. v1 = time_greater.get(k,"")
  1385. if getLength(v1)>0:
  1386. if v!=v1:
  1387. logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
  1388. check_result["time"] = 0
  1389. check_result["pass"] = 0
  1390. else:
  1391. if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
  1392. check_result["time"] = 2
  1393. else:
  1394. check_result["time"] = 1
  1395. if hard_level==2 and check_result["product"]<=1:
  1396. return 0
  1397. if check_result.get("pass",0)==0:
  1398. if b_log:
  1399. logging.info(str(check_result))
  1400. if check_result.get("money",1)==0:
  1401. return 0
  1402. if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
  1403. return _prob
  1404. else:
  1405. return 0
  1406. if check_result.get("time",1)==0:
  1407. return 0
  1408. return _prob
  1409. @annotate("bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->double")
  1410. class f_dumplicate_check(BaseUDTF):
  1411. def __init__(self):
  1412. import logging
  1413. import json
  1414. global logging,json
  1415. def process(self,docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,
  1416. tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,
  1417. bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,
  1418. project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,
  1419. extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,
  1420. page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,
  1421. package_less,package_greater,json_time_less,json_time_greater,json_context,
  1422. province_less,province_greater,city_less,city_greater,district_less,district_greater,
  1423. web_source_no_less,web_source_no_greater,
  1424. extract_json_less,extract_json_greater,page_attachments_less,page_attachments_greater):
  1425. min_counts = 100
  1426. if json_context is not None:
  1427. _context = json.loads(json_context)
  1428. for item in _context:
  1429. if item.get("counts",0)>0 and item.get("counts",0)<min_counts:
  1430. min_counts = item["counts"]
  1431. _extract_less = {}
  1432. if extract_json_less is not None:
  1433. _extract_less = json.loads(extract_json_less)
  1434. _extract_greater = {}
  1435. if extract_json_greater is not None:
  1436. _extract_greater = json.loads(extract_json_greater)
  1437. moneys_less = set(_extract_less.get("moneys",[]))
  1438. moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
  1439. moneys_greater = set(_extract_greater.get("moneys",[]))
  1440. moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
  1441. if page_attachments_less is None:
  1442. page_attachments_less = '[]'
  1443. if page_attachments_greater is None:
  1444. page_attachments_greater = '[]'
  1445. _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
  1446. self.forward(_prob)
  1447. @annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
  1448. class f_dumplicate_featureMatrix(BaseUDTF):
  1449. def __init__(self):
  1450. import logging
  1451. import json
  1452. global logging,json
  1453. def process(self,json_context,docchannel_less,docchannel_greater,page_time_less,page_time_greater,nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,
  1454. agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
  1455. win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
  1456. bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater,product_less,product_greater):
  1457. #check the page_time by special docchannel
  1458. if docchannel_less in (51,102,103,104,115,116,117):
  1459. if doctitle_refine_less!=doctitle_refine_greater:
  1460. if page_time_less!=page_time_greater:
  1461. self.forward("[1-%s]"%(str(docchannel_less)),0)
  1462. return
  1463. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
  1464. self.forward("[2-%s]"%(str(doctitle_refine_less)+"=="+str(doctitle_refine_greater)),0)
  1465. return
  1466. # if not check_codes([project_code_less],[project_code_greater]):
  1467. # self.forward("[3-%s]"%(str(project_code_less)+"=="+str(project_code_greater)),0)
  1468. # return
  1469. if not check_demand():
  1470. self.forward("[4-]",0)
  1471. return
  1472. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  1473. tenderee_less,tenderee_greater,
  1474. agency_less,agency_greater,
  1475. win_tenderer_less,win_tenderer_greater):
  1476. _error = ""
  1477. for a in [nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater]:
  1478. _error += str(a)
  1479. self.forward("[5-%s]"%_error,0)
  1480. return
  1481. if not check_money(bidding_budget_less,bidding_budget_greater,
  1482. win_bid_price_less,win_bid_price_greater):
  1483. _error = ""
  1484. for a in [bidding_budget_less,bidding_budget_greater,
  1485. win_bid_price_less,win_bid_price_greater]:
  1486. _error += str(a)
  1487. self.forward("[6-%s]"%_error,0)
  1488. return
  1489. if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
  1490. _error = "%s=%s"%(str(product_less),str(product_greater))
  1491. self.forward("7-%s"%_error,0)
  1492. return
  1493. _context = json.loads(json_context)
  1494. min_counts = 100
  1495. dict_context = {}
  1496. for item in _context:
  1497. if item["counts"]<min_counts:
  1498. min_counts = item["counts"]
  1499. dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
  1500. context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
  1501. list_matrix = []
  1502. #get the featurn of the context into matrix
  1503. # for index_i in range(len(context_key)):
  1504. # for index_j in range(index_i+1,len(context_key)):
  1505. # _key = "%s&%s"%(context_key[index_i],context_key[index_j])
  1506. # _v = featurnCount(dict_context.get(_key,[0,0])[1])
  1507. # list_matrix.append(_v)
  1508. # context3_key = ["tenderee","agency","win_tenderer","win_bid_price","bidding_budget"]
  1509. # for index_i in range(len(context3_key)):
  1510. # for index_j in range(index_i+1,len(context3_key)):
  1511. # for index_k in range(index_j+1,len(context3_key)):
  1512. # _key = "%s&%s&%s"%(context3_key[index_i],context3_key[index_j],context3_key[index_k])
  1513. # _v = featurnCount(dict_context.get(_key,[0,0])[1])
  1514. # list_matrix.append(_v)
  1515. # list_matrix.append(getSimLevel(tenderee_less,tenderee_greater)/10)
  1516. # list_matrix.append(getSimLevel(agency_less,agency_greater)/10)
  1517. # list_matrix.append(getSimilarityOfString(project_code_less,project_code_greater))
  1518. # list_matrix.append(getSimilarityOfString(project_name_less,project_name_greater))
  1519. # list_matrix.append(getSimLevel(win_tenderer_less,win_tenderer_greater)/10)
  1520. # list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
  1521. # list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
  1522. # list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
  1523. json_matrix = json.dumps(list_matrix)
  1524. same_count = 0
  1525. all_count = 8
  1526. if getSimilarityOfString(project_code_less,project_code_greater)==1:
  1527. same_count += 1
  1528. if getSimilarityOfString(tenderee_less,tenderee_greater)==1:
  1529. same_count += 1
  1530. if getSimilarityOfString(agency_less,agency_greater)==1:
  1531. same_count += 1
  1532. if getSimilarityOfString(win_tenderer_less,win_tenderer_greater)==1:
  1533. same_count += 1
  1534. if getSimilarityOfString(bidding_budget_less,bidding_budget_greater)==1:
  1535. same_count += 1
  1536. if getSimilarityOfString(win_bid_price_less,win_bid_price_greater)==1:
  1537. same_count += 1
  1538. if getSimilarityOfString(project_name_less,project_name_greater)==1:
  1539. same_count += 1
  1540. if getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater)==1:
  1541. same_count += 1
  1542. base_prob = 0
  1543. if min_counts<3:
  1544. base_prob = 0.9
  1545. elif min_counts<5:
  1546. base_prob = 0.8
  1547. elif min_counts<8:
  1548. base_prob = 0.7
  1549. else:
  1550. base_prob = 0.6
  1551. _prob = base_prob*same_count/all_count
  1552. json_matrix = "[==%s]"%(str(base_prob)+"="+str(same_count)+"="+str(all_count)+str(product_less)+str(product_greater))
  1553. self.forward(json_matrix,_prob)
  1554. return
  1555. @annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double,string,string,string,string,string,string->string')
  1556. class f_redump_probability_final_check(BaseUDAF):
  1557. '''
  1558. 去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1559. 组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1560. '''
  1561. def __init__(self):
  1562. import logging
  1563. import json,re
  1564. global json,logging,re
  1565. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1566. def new_buffer(self):
  1567. return [list()]
  1568. def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_codes,project_name,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence,
  1569. province,city,district,web_source_no,extract_json,page_attachments):
  1570. buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
  1571. "project_codes":project_codes,"project_name":project_name,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
  1572. "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence,
  1573. "province":province,"city":city,"district":district,"web_source_no":web_source_no,"extract_json":extract_json,"page_attachments":page_attachments})
  1574. def merge(self, buffer, pbuffer):
  1575. buffer[0].extend(pbuffer[0])
  1576. def terminate(self, buffer):
  1577. list_group = []
  1578. the_group = buffer[0]
  1579. the_group.sort(key=lambda x:x["confidence"],reverse=True)
  1580. _index = 0
  1581. final_group = []
  1582. if len(the_group)>0:
  1583. _index = 0
  1584. while _index<len(the_group):
  1585. document_greater = the_group[_index]
  1586. docid_greater = document_greater["docid"]
  1587. docchannel_greater = document_greater["docchannel"]
  1588. page_time_greater = document_greater["page_time"]
  1589. doctitle_refine_greater = document_greater["doctitle_refine"]
  1590. project_codes_greater = document_greater["project_codes"]
  1591. nlp_enterprise_greater = document_greater["nlp_enterprise"]
  1592. tenderee_greater = document_greater["tenderee"]
  1593. agency_greater = document_greater["agency"]
  1594. win_tenderer_greater = document_greater["win_tenderer"]
  1595. bidding_budget_greater = document_greater["bidding_budget"]
  1596. win_bid_price_greater = document_greater["win_bid_price"]
  1597. product_greater = document_greater["product"]
  1598. package_greater = document_greater["package"]
  1599. json_time_greater = document_greater["json_dicttime"]
  1600. fingerprint_greater = document_greater.get("fingerprint","")
  1601. project_name_greater = document_greater["project_name"]
  1602. extract_count_greater = document_greater["extract_count"]
  1603. province_greater = document_greater["province"]
  1604. city_greater = document_greater["city"]
  1605. district_greater = document_greater["district"]
  1606. web_source_no_greater = document_greater["web_source_no"]
  1607. extract_json_greater = document_greater["extract_json"]
  1608. page_attachments_greater = document_greater["page_attachments"]
  1609. _pass = True
  1610. for document_less in final_group:
  1611. docid_less = document_less["docid"]
  1612. docchannel_less = document_less["docchannel"]
  1613. page_time_less = document_less["page_time"]
  1614. doctitle_refine_less = document_less["doctitle_refine"]
  1615. project_codes_less = document_less["project_codes"]
  1616. nlp_enterprise_less = document_less["nlp_enterprise"]
  1617. tenderee_less = document_less["tenderee"]
  1618. agency_less = document_less["agency"]
  1619. win_tenderer_less = document_less["win_tenderer"]
  1620. bidding_budget_less = document_less["bidding_budget"]
  1621. win_bid_price_less = document_less["win_bid_price"]
  1622. product_less = document_less["product"]
  1623. package_less = document_less["package"]
  1624. json_time_less = document_less["json_dicttime"]
  1625. fingerprint_less = document_less.get("fingerprint","")
  1626. project_name_less = document_less["project_name"]
  1627. extract_count_less = document_less["extract_count"]
  1628. province_less = document_less["province"]
  1629. city_less = document_less["city"]
  1630. district_less = document_less["district"]
  1631. web_source_no_less = document_less["web_source_no"]
  1632. extract_json_less = document_less["extract_json"]
  1633. page_attachments_less = document_less["page_attachments"]
  1634. _extract_less = {}
  1635. if extract_json_less is not None:
  1636. _extract_less = json.loads(extract_json_less)
  1637. _extract_greater = {}
  1638. if extract_json_greater is not None:
  1639. _extract_greater = json.loads(extract_json_greater)
  1640. moneys_less = set(_extract_less.get("moneys",[]))
  1641. moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
  1642. moneys_greater = set(_extract_greater.get("moneys",[]))
  1643. moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
  1644. if page_attachments_less is None:
  1645. page_attachments_less = '[]'
  1646. if page_attachments_greater is None:
  1647. page_attachments_greater = '[]'
  1648. _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,len(the_group),b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
  1649. if _prob<0.1:
  1650. _pass = False
  1651. break
  1652. if _pass:
  1653. final_group.append(document_greater)
  1654. else:
  1655. break
  1656. _index += 1
  1657. dumplicates = ""
  1658. if _index>1:
  1659. logging.info("index/whole:%d/%d"%(_index,len(the_group)))
  1660. final_group.sort(key=lambda x:x["docid"])
  1661. final_group.sort(key=lambda x:x["extract_count"],reverse=True)
  1662. _set = set()
  1663. for _d in final_group:
  1664. _docid = _d["docid"]
  1665. if _docid in _set:
  1666. continue
  1667. dumplicates += "%d,"%_docid
  1668. _set.add(_docid)
  1669. dumplicates = dumplicates[:-1]
  1670. return dumplicates
  1671. @annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double->string')
  1672. class f_redump_probability_final_check_bak(BaseUDAF):
  1673. '''
  1674. 去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1675. 组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
  1676. '''
  1677. def __init__(self):
  1678. import logging
  1679. import json,re
  1680. global json,logging,re
  1681. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1682. def new_buffer(self):
  1683. return [list()]
  1684. def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_code,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence):
  1685. buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
  1686. "project_code":project_code,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
  1687. "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence})
  1688. def merge(self, buffer, pbuffer):
  1689. buffer[0].extend(pbuffer[0])
  1690. def terminate(self, buffer):
  1691. list_group = []
  1692. the_group = buffer[0]
  1693. the_group.sort(key=lambda x:x["confidence"],reverse=True)
  1694. _index = 0
  1695. if len(the_group)>0:
  1696. _index = 1
  1697. while _index<len(the_group):
  1698. document_greater = the_group[_index]
  1699. docchannel_greater = document_greater["docchannel"]
  1700. page_time_greater = document_greater["page_time"]
  1701. doctitle_refine_greater = document_greater["doctitle_refine"]
  1702. project_code_greater = document_greater["project_code"]
  1703. nlp_enterprise_greater = document_greater["nlp_enterprise"]
  1704. tenderee_greater = document_greater["tenderee"]
  1705. agency_greater = document_greater["agency"]
  1706. win_tenderer_greater = document_greater["win_tenderer"]
  1707. bidding_budget_greater = document_greater["bidding_budget"]
  1708. win_bid_price_greater = document_greater["win_bid_price"]
  1709. product_greater = document_greater["product"]
  1710. package_greater = document_greater["package"]
  1711. json_time_greater = document_greater["json_dicttime"]
  1712. _less_index = 0
  1713. while _less_index<_index:
  1714. document_less = the_group[_less_index]
  1715. docchannel_less = document_less["docchannel"]
  1716. page_time_less = document_less["page_time"]
  1717. doctitle_refine_less = document_less["doctitle_refine"]
  1718. project_code_less = document_less["project_code"]
  1719. nlp_enterprise_less = document_less["nlp_enterprise"]
  1720. tenderee_less = document_less["tenderee"]
  1721. agency_less = document_less["agency"]
  1722. win_tenderer_less = document_less["win_tenderer"]
  1723. bidding_budget_less = document_less["bidding_budget"]
  1724. win_bid_price_less = document_less["win_bid_price"]
  1725. product_less = document_less["product"]
  1726. package_less = document_less["package"]
  1727. json_time_less = document_less["json_dicttime"]
  1728. check_result = {"pass":1}
  1729. if docchannel_less in (51,102,103,104,115,116,117):
  1730. if doctitle_refine_less!=doctitle_refine_greater:
  1731. if page_time_less!=page_time_greater:
  1732. check_result["docchannel"] = 0
  1733. check_result["pass"] = 0
  1734. else:
  1735. check_result["docchannel"] = 2
  1736. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
  1737. check_result["doctitle"] = 0
  1738. check_result["pass"] = 0
  1739. logging.info("check_doctitle_failed:%s==%s"%(str(doctitle_refine_less),str(doctitle_refine_greater)))
  1740. else:
  1741. check_result["doctitle"] = 2
  1742. #added check
  1743. if not check_codes([project_code_less],[project_code_greater]):
  1744. check_result["code"] = 0
  1745. check_result["pass"] = 0
  1746. logging.info("check_code_failed:%s==%s"%(str(project_code_less),str(project_code_greater)))
  1747. else:
  1748. if getLength(project_code_less)>0 and getLength(project_code_greater)>0 and project_code_less==project_code_greater:
  1749. check_result["code"] = 2
  1750. else:
  1751. check_result["code"] = 1
  1752. if not check_product(product_less,product_greater):
  1753. check_result["product"] = 0
  1754. check_result["pass"] = 0
  1755. logging.info("check_product_failed:%s==%s"%(str(product_less),str(product_greater)))
  1756. else:
  1757. if getLength(product_less)>0 and getLength(product_greater)>0:
  1758. check_result["product"] = 2
  1759. else:
  1760. check_result["product"] = 1
  1761. if not check_demand():
  1762. check_result["pass"] = 0
  1763. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  1764. tenderee_less,tenderee_greater,
  1765. agency_less,agency_greater,
  1766. win_tenderer_less,win_tenderer_greater):
  1767. check_result["entity"] = 0
  1768. check_result["pass"] = 0
  1769. logging.info("check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
  1770. else:
  1771. if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
  1772. check_result["entity"] = 2
  1773. elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
  1774. check_result["entity"] = 2
  1775. else:
  1776. check_result["entity"] = 1
  1777. if not check_money(bidding_budget_less,bidding_budget_greater,
  1778. win_bid_price_less,win_bid_price_greater):
  1779. logging.info("check_money_failed:%s==%s==%s==%s"%(str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
  1780. check_result["money"] = 0
  1781. check_result["pass"] = 0
  1782. else:
  1783. if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  1784. check_result["money"] = 2
  1785. elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  1786. check_result["money"] = 2
  1787. else:
  1788. check_result["money"] = 1
  1789. #added check
  1790. if not check_package(package_less,package_greater):
  1791. logging.info("check_package_failed:%s==%s"%(str(package_less),str(package_greater)))
  1792. check_result["package"] = 0
  1793. check_result["pass"] = 0
  1794. else:
  1795. if getLength(package_less)>0 and getLength(package_greater)>0:
  1796. check_result["package"] = 2
  1797. else:
  1798. check_result["package"] = 1
  1799. #added check
  1800. if not check_time(json_time_less,json_time_greater):
  1801. logging.info("check_time_failed:%s==%s"%(str(json_time_less),str(json_time_greater)))
  1802. check_result["time"] = 0
  1803. check_result["pass"] = 0
  1804. else:
  1805. if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
  1806. check_result["time"] = 2
  1807. else:
  1808. check_result["time"] = 1
  1809. if check_result.get("pass",0)==0:
  1810. logging.info(str(check_result))
  1811. if check_result.get("time",1)==0:
  1812. break
  1813. if check_result.get("money",1)==0:
  1814. break
  1815. if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2:
  1816. pass
  1817. else:
  1818. break
  1819. _less_index += 1
  1820. if _less_index!=_index:
  1821. break
  1822. _index += 1
  1823. dumplicates = ""
  1824. if _index>1:
  1825. logging.info("index/whole:%d/%d"%(_index,len(the_group)))
  1826. final_group = the_group[:_index]
  1827. final_group.sort(key=lambda x:x["docid"])
  1828. final_group.sort(key=lambda x:x["extract_count"],reverse=True)
  1829. _set = set()
  1830. for _d in final_group:
  1831. _docid = _d["docid"]
  1832. if _docid in _set:
  1833. continue
  1834. dumplicates += "%d,"%_docid
  1835. _set.add(_docid)
  1836. dumplicates = dumplicates[:-1]
  1837. return dumplicates
@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,string->string')
class f_set_docid_binaryChart(BaseUDAF):
    '''
    Pair each "empty" announcement (one whose project_code, bidding_budget,
    win_tenderer, win_bid_price and agency are all blank) with at most one
    non-empty announcement from the same 7-day window, matched on tenderee
    and required to come from a different web source.
    Returns the groups as a JSON array of [{"docid","extract_count"}, ...] lists.
    '''
    def __init__(self):
        import json
        global json
    def new_buffer(self):
        # single accumulator: a flat list of document dicts
        return [[]]
    def iterate(self, buffer, docid, page_time_stamp, extract_count, project_code, project_name, tenderee, bidding_budget, win_tenderer, win_bid_price, agency, web_source_no):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,
                          "bidding_budget":bidding_budget,"win_tenderer":win_tenderer,"win_bid_price":win_bid_price,
                          "agency":agency,"web_source_no":web_source_no})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        list_docs = buffer[0]
        # bucket the documents into 7-day windows by publish timestamp
        list_timeGroups = split_with_time(list_docs, "page_time_stamp", 86400*7)
        list_group = []
        # a document counts as "empty" when every one of these fields is blank
        empty_key = ["project_code","bidding_budget","win_tenderer","win_bid_price","agency"]
        for _timeGroups in list_timeGroups:
            list_empty = []
            list_notEmpty = []
            for _item in _timeGroups:
                empty_flag = True
                for _key in empty_key:
                    if not isEmpty(_item[_key]):
                        empty_flag = False
                        break
                if empty_flag:
                    list_empty.append(_item)
                else:
                    list_notEmpty.append(_item)
            for _e in list_empty:
                _group = [{"docid":_e["docid"],"extract_count":_e["extract_count"]}]
                _e_tenderee = _e["tenderee"]
                for _ne in list_notEmpty:
                    # lazily attach the set of web sources this non-empty doc
                    # has already been paired with (seeded with its own source)
                    if "set_webSource" not in _ne:
                        _ne["set_webSource"] = set()
                        _ne["set_webSource"].add(_ne["web_source_no"])
                    _suit = False
                    # tenderee must match, or the empty doc has no tenderee at all
                    if not isEmpty(_e_tenderee) and _e_tenderee==_ne["tenderee"]:
                        _suit = True
                    elif isEmpty(_e_tenderee):
                        _suit = True
                    if _suit:
                        # pair only across different web sources, once per source
                        if _e["web_source_no"] not in _ne["set_webSource"]:
                            _ne["set_webSource"].add(_e["web_source_no"])
                            _group.append({"docid":_ne["docid"],"extract_count":_ne["extract_count"]})
                            break
                if len(_group)>1:
                    list_group.append(_group)
        return json.dumps(list_group)
  1893. def split_with_time(list_dict,sort_key,timedelta=86400*7):
  1894. if len(list_dict)>0:
  1895. if sort_key in list_dict[0]:
  1896. list_dict.sort(key=lambda x:x[sort_key])
  1897. list_group = []
  1898. _begin = 0
  1899. for i in range(len(list_dict)-1):
  1900. if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<=timedelta:
  1901. continue
  1902. else:
  1903. _group = []
  1904. for j in range(_begin,i+1):
  1905. _group.append(list_dict[j])
  1906. if len(_group)>1:
  1907. list_group.append(_group)
  1908. _begin = i + 1
  1909. if len(list_dict)>1:
  1910. _group = []
  1911. for j in range(_begin,len(list_dict)):
  1912. _group.append(list_dict[j])
  1913. if len(_group)>1:
  1914. list_group.append(_group)
  1915. return list_group
  1916. return [list_dict]
@annotate('bigint,bigint,bigint,string,string,string,string,string->string')
class f_set_docid_limitNum_contain(BaseUDAF):
    '''
    Group documents whose four limit columns are single-valued within a 7-day
    window and whose contain_column texts form a containment chain (each longer
    text must contain the previously longest one).
    Returns the qualifying groups as a JSON array.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer, docid, page_time_stamp, extract_count, set_limit_column1, set_limit_column2, set_limit_column3, set_limit_column4, contain_column):
        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,"set_limit_column1":set_limit_column1,
                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
                          "contain_column":contain_column})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        # first split into 7-day time windows
        list_split = split_with_time(buffer[0], "page_time_stamp")
        list_group = []
        for _split in list_split:
            flag = True
            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
            # every limit column must hold a single distinct (normalized) value
            for _key in keys:
                logging.info(_key+str(getSet(_split,_key)))
                if len(getSet(_split,_key))>1:
                    flag = False
                    break
            MAX_CONTAIN_COLUMN = None
            # check that each announcement's text is contained in the group's longest text
            if flag:
                for _d in _split:
                    contain_column = _d["contain_column"]
                    if contain_column is not None and contain_column !="":
                        if MAX_CONTAIN_COLUMN is None:
                            MAX_CONTAIN_COLUMN = contain_column
                        else:
                            if len(MAX_CONTAIN_COLUMN)<len(contain_column):
                                # longer text must contain the current champion, then replaces it
                                if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
                                    flag = False
                                    break
                                MAX_CONTAIN_COLUMN = contain_column
                            else:
                                # shorter text must be contained in the champion
                                if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
                                    flag = False
                                    break
            if flag:
                if len(_split)>1:
                    _group = []
                    for _item in _split:
                        _group.append({"docid":_item["docid"],"extract_count":_item["extract_count"]})
                    list_group.append(_group)
        return json.dumps(list_group)
@annotate('bigint->string')
class f_stamp_squence(BaseUDAF):
    '''
    Collapse the distinct page_time stamps of a partition into a JSON list of
    [start, end] windows: timestamps closer than 7 days fall into one cluster,
    and every cluster is padded by 7 days on both sides.
    '''
    def __init__(self):
        import json
        global json
        import logging
        global logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        # set of distinct timestamps
        return [set()]
    def iterate(self, buffer, page_time_stamp):
        buffer[0].add(page_time_stamp)
    def merge(self, buffer, pbuffer):
        buffer[0] |= pbuffer[0]
    def terminate(self, buffer):
        # 0 is the missing-timestamp placeholder; drop it
        if 0 in buffer[0]:
            buffer[0].remove(0)
        list_stamp = list(buffer[0])
        list_stamp.sort(key=lambda x:x)
        list_stamp_final = []
        _begin = 0
        _time_decase = 86400*7  # 7 days: both the cluster-split gap and the padding
        logging.info(str(list_stamp))
        for _index in range(len(list_stamp)-1):
            if list_stamp[_index+1]-list_stamp[_index]<_time_decase:
                continue
            else:
                # a gap of >= 7 days closes the current cluster, padded on both sides
                list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[_index]+_time_decase])
                _begin = _index+1
        if len(list_stamp)>0:
            # flush the final (or only) cluster
            list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[-1]+_time_decase])
        return json.dumps(list_stamp_final)
  2006. @annotate("bigint,string->bigint")
  2007. class in_stamp(object):
  2008. def __init__(self):
  2009. import logging
  2010. import re
  2011. import json
  2012. global logging,re,json
  2013. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2014. def evaluate(self, page_time_stamp,json_stamp):
  2015. list_stamp = json.loads(json_stamp)
  2016. int_flag = 0
  2017. for item in list_stamp:
  2018. if page_time_stamp <item[0]:
  2019. break
  2020. if page_time_stamp>item[0] and page_time_stamp<item[1]:
  2021. int_flag = 1
  2022. break
  2023. return int_flag
  2024. def getConfidence(rule_id):
  2025. if rule_id ==0:
  2026. return 30
  2027. elif rule_id >=1 and rule_id <30:
  2028. return 20
  2029. else:
  2030. return 10
  2031. @annotate('string,string -> string')
  2032. class f_splitStr(BaseUDTF):
  2033. '''
  2034. 将多个组拆解成多条记录
  2035. '''
  2036. def __init__(self):
  2037. import logging
  2038. import json
  2039. global json,logging
  2040. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2041. def process(self, str_split,_split):
  2042. try:
  2043. for _s in str_split.split(_split):
  2044. self.forward(_s)
  2045. except Exception as e:
  2046. pass
@annotate('string,bigint -> bigint,bigint,bigint,bigint,bigint')
class f_split_group_single(BaseUDTF):
    '''
    Expand JSON duplicate-groups into pair rows
    (docid1, docid2, extract_count1, extract_count2, confidence).
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def process(self, json_set_docid, rule_id):
        list_group = json.loads(json_set_docid)
        for item in list_group:
            if len(item)>100:
                # oversized group: avoid the O(n^2) pairing — sort descending by
                # docid and emit only pairs anchored on the largest docid
                item.sort(key=lambda x:x["docid"],reverse=True)
                index_i = 0
                for index_j in range(1,len(item)):
                    if item[index_i]["docid"]!=item[index_j]["docid"]:
                        self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
            else:
                # small group: emit every ordered pair of distinct docids
                for index_i in range(len(item)):
                    for index_j in range(len(item)):
                        if index_i!=index_j and item[index_i]["docid"]!=item[index_j]["docid"]:
                            self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
  2071. @annotate('bigint,string->string')
  2072. class group_document(BaseUDAF):
  2073. '''
  2074. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2075. '''
  2076. def __init__(self):
  2077. import json
  2078. global json
  2079. def new_buffer(self):
  2080. return [[]]
  2081. def iterate(self, buffer,id,json_set_docid):
  2082. buffer[0].append({"id":id,"json_set_docid":json.loads(json_set_docid)})
  2083. def merge(self, buffer, pbuffer):
  2084. buffer[0].extend(pbuffer[0])
  2085. def terminate(self, buffer):
  2086. return json.dumps(buffer[0])
@annotate('bigint,string,bigint,string -> bigint,bigint,string')
class decare_document(BaseUDTF):
    '''
    Cartesian join of two grouped-document lists: for each pair of distinct
    duplicate-groups that share at least one docid, emit the union of their
    docid sets.
    '''
    def __init__(self):
        import logging
        import json
        global json,logging
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def process(self, group_id1, json_list_doc1, group_id2, json_list_doc2):
        # keep only one triangle of the product (y=x): cuts nearly half the data
        if group_id1>=group_id2:
            list_doc1 = json.loads(json_list_doc1)
            list_doc2 = json.loads(json_list_doc2)
            for _doc1 in list_doc1:
                for _doc2 in list_doc2:
                    # the same duplicate-group is never compared with itself
                    if _doc1["id"]!=_doc2["id"]:
                        # do the two groups overlap on any docid?
                        _set1 = set()
                        for _item1 in _doc1["json_set_docid"]:
                            _set1.add(_item1["docid"])
                        _set2 = set()
                        for _item2 in _doc2["json_set_docid"]:
                            _set2.add(_item2["docid"])
                        if len(_set1&_set2)>0:
                            # merge: doc1's records plus doc2's unseen docids
                            # (note: appends into _doc1's list in place)
                            new_json_set_docid = _doc1["json_set_docid"]
                            for _item2 in _doc2["json_set_docid"]:
                                if _item2["docid"] not in _set1:
                                    new_json_set_docid.append(_item2)
                            self.forward(_doc1["id"],_doc2["id"],json.dumps(new_json_set_docid))
  2119. def getBestDocid(list_pair):
  2120. # [docid1,extract_count1,docid2,extract_count2]
  2121. # list_pair.sort(key=lambda x:x[3],reverse=True)
  2122. # _max_count = max(list_pair[0][3],list_pair[0][1])
  2123. # set_candidate = set()
  2124. # if list_pair[0][1]==_max_count:
  2125. # set_candidate.add(list_pair[0][0])
  2126. # for item in list_pair:
  2127. # if item[3]==_max_count:
  2128. # set_candidate.add(item[2])
  2129. # else:
  2130. # break
  2131. # list_candidate = list(set_candidate)
  2132. # list_candidate.sort(key=lambda x:x)
  2133. new_pair = []
  2134. new_pair.append([list_pair[0][0],list_pair[0][0],list_pair[0][1]])
  2135. for item in list_pair:
  2136. new_pair.append([item[0],item[2],item[3]])
  2137. new_pair.sort(key=lambda x:x[1])
  2138. new_pair.sort(key=lambda x:x[2],reverse=True)
  2139. return new_pair[0][1]
  2140. @annotate('bigint,bigint,bigint,bigint->string')
  2141. class choose_document(BaseUDAF):
  2142. '''
  2143. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2144. '''
  2145. def __init__(self):
  2146. import json
  2147. global json
  2148. def new_buffer(self):
  2149. return [[]]
  2150. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  2151. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  2152. def merge(self, buffer, pbuffer):
  2153. buffer[0].extend(pbuffer[0])
  2154. def terminate(self, buffer):
  2155. list_pair = buffer[0]
  2156. _set = set()
  2157. for item in buffer[0]:
  2158. _set.add(str(item[2]))
  2159. list_dumplicate = list(_set)
  2160. best_docid = getBestDocid(list_pair)
  2161. if best_docid==list_pair[0][0]:
  2162. save_flag = 1
  2163. else:
  2164. save_flag = 0
  2165. return json.dumps({"save_flag":save_flag,"dumplicates":list_dumplicate})
  2166. @annotate('string -> bigint,string')
  2167. class f_get_choose_document(BaseUDTF):
  2168. '''
  2169. 将多个组拆解成多条记录
  2170. '''
  2171. def __init__(self):
  2172. import logging
  2173. import json
  2174. global json,logging
  2175. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2176. def process(self,json_choose):
  2177. if json_choose is None:
  2178. self.forward(1,None)
  2179. else:
  2180. _choose = json.loads(json_choose)
  2181. self.forward(_choose["save_flag"],",".join(_choose["dumplicates"]))
  2182. @annotate('string->bigint')
  2183. class f_get_codes_count(object):
  2184. def evaluate(self,extract_json):
  2185. if extract_json is None or extract_json=="":
  2186. extract_json = "{}"
  2187. _extract = json.loads(extract_json)
  2188. _codes = _extract.get("code",[])
  2189. return len(_codes)
  2190. @annotate('string->string')
  2191. class f_get_codes(object):
  2192. def evaluate(self,extract_json):
  2193. if extract_json is None or extract_json=="":
  2194. extract_json = "{}"
  2195. _extract = json.loads(extract_json)
  2196. _codes = _extract.get("code",[])
  2197. return ",".join(_codes)
  2198. @annotate('bigint,bigint,bigint,bigint->string')
  2199. class group_document_bestFirst(BaseUDAF):
  2200. '''
  2201. 将组里面最优的放在前面
  2202. '''
  2203. def __init__(self):
  2204. import json
  2205. global json
  2206. def new_buffer(self):
  2207. return [[]]
  2208. def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
  2209. buffer[0].append([docid1,extract_count1,docid2,extract_count2])
  2210. def merge(self, buffer, pbuffer):
  2211. buffer[0].extend(pbuffer[0])
  2212. def terminate(self, buffer):
  2213. list_pair = buffer[0]
  2214. _set = set()
  2215. for item in buffer[0]:
  2216. _set.add(item[2])
  2217. _set.add(list_pair[0][0])
  2218. best_docid = getBestDocid(list_pair)
  2219. _set.remove(best_docid)
  2220. list_dumplicate = list(_set)
  2221. list_dumplicate.sort(key=lambda x:x)
  2222. list_dumplicate.insert(0,best_docid)
  2223. list_dumplicate_str = []
  2224. for item in list_dumplicate:
  2225. list_dumplicate_str.append(str(item))
  2226. return ",".join(list_dumplicate_str)
  2227. @annotate('string -> bigint,string')
  2228. class f_get_best_dumplicates(BaseUDTF):
  2229. '''
  2230. 得到每个分组中最优的那一条及其重复记录
  2231. '''
  2232. def __init__(self):
  2233. import logging
  2234. import json
  2235. global json,logging
  2236. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2237. def process(self,list_dumplicate_str):
  2238. if list_dumplicate_str is None or list_dumplicate_str=='':
  2239. pass
  2240. else:
  2241. list_dumplicate = list_dumplicate_str.split(",")
  2242. if len(list_dumplicate)>0:
  2243. self.forward(int(list_dumplicate[0]),",".join(list_dumplicate[1:]))
  2244. else:
  2245. pass
  2246. @annotate('bigint,bigint->string')
  2247. class bridge2group(BaseUDAF):
  2248. '''
  2249. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""
  2250. '''
  2251. def __init__(self):
  2252. import json
  2253. global json
  2254. def new_buffer(self):
  2255. return [set()]
  2256. def iterate(self, buffer,docid1,docid2):
  2257. buffer[0].add(docid1)
  2258. buffer[0].add(docid2)
  2259. def merge(self, buffer, pbuffer):
  2260. buffer[0] |= pbuffer[0]
  2261. def terminate(self, buffer):
  2262. list_pair = list(buffer[0])
  2263. list_pair.sort(key=lambda x:x,reverse=True)
  2264. return json.dumps(list_pair)
  2265. @annotate('string -> bigint,bigint')
  2266. class group2bridge(BaseUDTF):
  2267. '''
  2268. 将多个组拆解成多条记录
  2269. '''
  2270. def __init__(self):
  2271. import logging
  2272. import json
  2273. global json,logging
  2274. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2275. def process(self,json_list_docid):
  2276. list_docid = json.loads(json_list_docid)
  2277. for _docid in list_docid:
  2278. self.forward(list_docid[-1],_docid)
  2279. @annotate('string->string')
  2280. class to_url(object):
  2281. def evaluate(self,_s):
  2282. if _s is None or _s=="":
  2283. return
  2284. else:
  2285. list_l = []
  2286. for l in _s.split(","):
  2287. list_l.append("http://www.bidizhaobiao.com/info-%s.html"%l)
  2288. return ",".join(list_l)
  2289. @annotate('bigint,bigint,string -> bigint')
  2290. class f_get_dump_docid(BaseUDTF):
  2291. '''
  2292. 将多个组拆解成多条记录
  2293. '''
  2294. def __init__(self):
  2295. import logging
  2296. import json
  2297. global json,logging
  2298. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2299. def process(self,docid,save_flag,dumplicates):
  2300. if save_flag==0:
  2301. self.forward(docid)
  2302. if dumplicates is not None:
  2303. list_docid = dumplicates.split(",")
  2304. if len(list_docid)>0:
  2305. for _docid in list_docid[1:]:
  2306. self.forward(int(_docid))
  2307. else:
  2308. if dumplicates is not None:
  2309. list_docid = dumplicates.split(",")
  2310. if len(list_docid)>0:
  2311. for _docid in list_docid:
  2312. self.forward(int(_docid))
  2313. @annotate('string -> bigint,bigint')
  2314. class f_get_docid(BaseUDTF):
  2315. '''
  2316. 将多个组拆解成多条记录
  2317. '''
  2318. def __init__(self):
  2319. import logging
  2320. import json
  2321. global json,logging
  2322. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2323. def process(self,json_set_docid):
  2324. team_id = 0
  2325. if json_set_docid is not None:
  2326. list_docses = json.loads(json_set_docid)
  2327. for list_docs in list_docses:
  2328. team_id += 1
  2329. for item in list_docs:
  2330. self.forward(team_id,item["docid"])
  2331. @annotate("string->bigint")
  2332. class get_count_dump(object):
  2333. def __init__(self):
  2334. import logging
  2335. import re
  2336. global logging,re
  2337. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2338. def evaluate(self, title):
  2339. _count = 0
  2340. if title is not None:
  2341. _count = len(title.split(","))
  2342. return _count
  2343. def getSet(list_dict,key):
  2344. _set = set()
  2345. for item in list_dict:
  2346. if key in item:
  2347. if item[key]!='' and item[key] is not None:
  2348. if re.search("^\d[\d\.]*$",item[key]) is not None:
  2349. _set.add(str(float(item[key])))
  2350. else:
  2351. _set.add(str(item[key]))
  2352. return _set
  2353. def getDiffIndex(list_dict,key,confidence=100):
  2354. '''
  2355. 优化为相似度判断
  2356. :param list_dict:
  2357. :param key:
  2358. :param confidence:
  2359. :return:
  2360. '''
  2361. # _set = set()
  2362. # for _i in range(len(list_dict)):
  2363. # item = list_dict[_i]
  2364. # if item["confidence"]>=confidence:
  2365. # continue
  2366. # if key in item:
  2367. # if item[key]!='' and item[key] is not None:
  2368. # if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  2369. # _set.add(str(float(item[key])))
  2370. # else:
  2371. # _set.add(str(item[key]))
  2372. # if len(_set)>1:
  2373. # return _i
  2374. # ==============================
  2375. _set = set()
  2376. _set_m = set()
  2377. base_s = ""
  2378. for _i in range(len(list_dict)):
  2379. item = list_dict[_i]
  2380. if item["confidence"]>=confidence:
  2381. continue
  2382. if key in item:
  2383. if item[key]!='' and item[key] is not None:
  2384. if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  2385. _m = float(item[key])
  2386. if _m>100000:
  2387. _m = _m//10000*10000
  2388. _set_m.add(str(_m))
  2389. else:
  2390. _s = str(item[key])
  2391. if base_s=="":
  2392. base_s = _s
  2393. else:
  2394. simi = getSimilarityOfString(base_s,_s)
  2395. if simi<0.8:
  2396. return _i
  2397. if len(_set_m)>1:
  2398. return _i
  2399. return len(list_dict)
  2400. @annotate('bigint,string -> bigint,bigint')
  2401. class f_getGroup_dumpFinal(BaseUDTF):
  2402. '''
  2403. 从最后的结果中获取组
  2404. '''
  2405. def __init__(self):
  2406. import logging
  2407. import json
  2408. global json,logging
  2409. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2410. def process(self,docid,dumplicates):
  2411. self.forward(int(docid),int(docid))
  2412. if dumplicates is not None:
  2413. list_docids = dumplicates.split(",")
  2414. for _docid in list_docids:
  2415. self.forward(int(docid),int(_docid))
@annotate('bigint,bigint,string,string,string,string,bigint,bigint,bigint->string')
class f_redump_limit_num(BaseUDAF):
    '''
    Re-validate a merged duplicate group. With more than 5 members, doctitle
    and the three limit columns (tenderee / win_tenderer / bidding_budget)
    must each be single-valued; with 5 or fewer, only the limit columns are
    checked. Returns the surviving (main, member) pairs as JSON.
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer, main_docid, docid, doctitle, set_limit_column2, set_limit_column3, set_limit_column4, extract_count1, extract_count2, confidence):
        buffer[0].append({"main_docid":main_docid,"docid":docid,"doctitle":doctitle,"set_limit_column2":set_limit_column2,
                          "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,
                          "extract_count2":extract_count2,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        buffer[0].extend(pbuffer[0])
    def terminate(self, buffer):
        list_group = []
        the_group = buffer[0]
        # highest-confidence rows first; getDiffIndex scans in this order
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        if len(the_group)>5:
            keys = ["doctitle","set_limit_column2","set_limit_column3","set_limit_column4"]
        else:
            keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
        final_group = []
        # per-key divergence index (confidence-aware)
        list_key_index = []
        for _k in keys:
            if _k=="doctitle":
                # titles on rows with confidence >= 30 are accepted unchecked
                list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
            else:
                list_key_index.append(getDiffIndex(the_group,_k))
        # keep only the prefix on which every key is still consistent
        _index = min(list_key_index)
        if _index>1:
            main_docid = the_group[0]["main_docid"]
            for item in the_group[:_index]:
                if item["docid"]!=main_docid:
                    final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"],"confidence":item["confidence"]})
        return json.dumps(final_group)
  2469. @annotate('string -> bigint,bigint,bigint,bigint,bigint')
  2470. class f_get_dumpFinal_checked(BaseUDTF):
  2471. '''
  2472. 从最后的结果中获取组
  2473. '''
  2474. def __init__(self):
  2475. import logging
  2476. import json
  2477. global json,logging
  2478. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2479. def process(self,list_group):
  2480. if list_group is not None:
  2481. final_group = json.loads(list_group)
  2482. for _group in final_group:
  2483. self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"],_group["confidence"])
  2484. @annotate('string -> bigint')
  2485. class f_getDumplicateDocids(BaseUDTF):
  2486. '''
  2487. 从最后的结果中获取组
  2488. '''
  2489. def __init__(self):
  2490. import logging
  2491. import json
  2492. global json,logging
  2493. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2494. def process(self,dumplicates):
  2495. list_docids = dumplicates.split(",")
  2496. for _d in list_docids:
  2497. self.forward(int(_d))
  2498. def jaccard_score(source,target):
  2499. source_set = set([s for s in source])
  2500. target_set = set([s for s in target])
  2501. if len(source_set)==0 or len(target_set)==0:
  2502. return 0
  2503. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  2504. def getSimilarityOfString(str1,str2):
  2505. _set1 = set()
  2506. _set2 = set()
  2507. if str1 is not None:
  2508. for i in range(1,len(str1)):
  2509. _set1.add(str1[i-1:i+1])
  2510. for i in range(2,len(str1)):
  2511. _set1.add(str1[i-2:i+1])
  2512. if str2 is not None:
  2513. for i in range(1,len(str2)):
  2514. _set2.add(str2[i-1:i+1])
  2515. for i in range(2,len(str2)):
  2516. _set2.add(str2[i-2:i+1])
  2517. _len = max(1,min(len(_set1),len(_set2)))
  2518. return len(_set1&_set2)/_len
  2519. @annotate("string,string,string,string,string,string,string,string,string,string->bigint")
  2520. class f_is_legal(object):
  2521. def __init__(self):
  2522. import logging
  2523. import re
  2524. global logging,re
  2525. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2526. def evaluate(self, tenderee1,tenderee2,bidding_budget1,budding_budget2,win_tenderee1,win_tenderee2,win_bid_price1,win_bid_price2,project_code1,project_code2):
  2527. if tenderee1 is not None and tenderee1!="" and tenderee2 is not None and tenderee2!="" and tenderee1!=tenderee2:
  2528. return 0
  2529. if bidding_budget1 is not None and bidding_budget1!="" and budding_budget2 is not None and budding_budget2!="" and bidding_budget1!=budding_budget2:
  2530. return 0
  2531. if win_tenderee1 is not None and win_tenderee1!="" and win_tenderee2 is not None and win_tenderee2!="" and win_tenderee1!=win_tenderee2:
  2532. return 0
  2533. if win_bid_price1 is not None and win_bid_price1!="" and win_bid_price2 is not None and win_bid_price2!="" and win_bid_price1!=win_bid_price2:
  2534. return 0
  2535. _sim = getSimilarityOfString(project_code1,project_code2)
  2536. if _sim>0.7 and _sim<1:
  2537. return 0
  2538. return 1
@annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint->string')
class f_autorule_group(BaseUDAF):
    '''
    After dedup merging, re-validate the group and mine "auto rules": for every
    surviving pair of documents, record which fields share a non-empty equal
    value. Groups larger than 5 additionally require doctitle consistency;
    smaller groups only check tenderee/win_tenderer/bidding_budget/win_bid_price.
    Returns JSON rows of [rule_key, docid1, docid2].
    '''
    def __init__(self):
        import logging
        import json,re
        global json,logging,re
        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    def new_buffer(self):
        return [list()]
    def iterate(self, buffer, main_docid, docid, docchannel, doctitle, doctitle_refine, area, province, city, district, web_source_no, fingerprint,
                project_code, project_name, tenderee, agency, win_tenderer, bidding_budget, win_bid_price, extract_count1, extract_count2, confidence):
        buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"doctitle":doctitle,
                          "doctitle_refine":doctitle_refine,"area":area,"province":province,
                          "city":city,"district":district,"web_source_no":web_source_no,"fingerprint":fingerprint,
                          "project_code":project_code,"project_name":project_name,"tenderee":tenderee,"agency":agency,
                          "win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price,
                          "extract_count1":extract_count1,"extract_count2":extract_count2,"confidence":confidence})
    def merge(self, buffer, pbuffer):
        # cap the buffer at 100 rows to bound memory and the O(n^2) pairing below
        buffer[0].extend(pbuffer[0][:100])
        buffer[0] = buffer[0][:100]
    def getSameKeys(self, _dict1, _dict2):
        # keys whose values are non-empty and equal in both documents,
        # excluding location/bookkeeping fields; joined with "=" as the rule key
        list_keys = []
        for k,v in _dict1.items():
            if k in ["area","city","confidence","district","extract_count1","extract_count2","main_docid","province"]:
                continue
            v2 = _dict2.get(k,"")
            if v is not None and v!="" and v2 is not None and v2!="" and v==v2:
                list_keys.append(k)
        list_keys.sort(key=lambda x:x)
        return "=".join(list_keys)
    def terminate(self, buffer):
        list_group = []
        the_group = buffer[0]
        # highest-confidence rows first; getDiffIndex scans in this order
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        if len(the_group)>5:
            keys = ["doctitle","tenderee","win_tenderer","bidding_budget","win_bid_price"]
        else:
            keys = ["tenderee","win_tenderer","bidding_budget","win_bid_price"]
        # per-key divergence index (confidence-aware)
        list_key_index = []
        for _k in keys:
            if _k=="doctitle":
                # titles on rows with confidence >= 30 are accepted unchecked
                list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
            else:
                list_key_index.append(getDiffIndex(the_group,_k))
        final_group = []
        # keep only the prefix on which every key is still consistent
        _index = min(list_key_index)
        if _index>1:
            for item in the_group[:_index]:
                final_group.append(item)
        # mine a rule for every surviving pair
        list_rules = []
        for i in range(len(final_group)):
            for j in range(i+1,len(final_group)):
                _dict1 = final_group[i]
                _dict2 = final_group[j]
                _rule = self.getSameKeys(_dict1,_dict2)
                list_rules.append([_rule,_dict1.get("docid"),_dict2.get("docid")])
        return json.dumps(list_rules)
  2601. @annotate('string -> string,bigint,bigint')
  2602. class f_autorule_group_extract(BaseUDTF):
  2603. '''
  2604. 从最后的结果中获取组
  2605. '''
  2606. def __init__(self):
  2607. import logging
  2608. import json
  2609. global json,logging
  2610. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  2611. def process(self,rules_json):
  2612. list_rules = json.loads(rules_json)
  2613. for _rule in list_rules:
  2614. self.forward(_rule[0],_rule[1],_rule[2])
  2615. if __name__ == '__main__':
  2616. # f = f_decode_for_dumplicate()
  2617. # b = f.process('[{}]','{ "attachmentTypes": "", "bidway": "", "candidate": "", "code": [], "cost_time": { "attrs": 0.0, "codename": 0.03, "deposit": 0.0, "district": 0.03, "moneygrade": 0.0, "nerToken": 0.06, "person": 0.0, "prem": 0.02, "preprocess": 0.1, "product": 0.04, "product_attrs": 0.01, "roleRuleFinal": 0.0, "rolegrade": 0.0, "rule": 0.0, "rule_channel": 0.05, "tableToText": 0.030002145767211913, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "district": { "area": "华东", "city": "厦门", "district": "未知", "is_in_text": false, "province": "福建" }, "docchannel": { "docchannel": "招标公告", "doctype": "采招数据", "life_docchannel": "招标公告" }, "docid": "", "doctitle_refine": "C70U264COM6项目所需直流屏", "exist_table": 1, "extract_count": 1, "fail_reason": "", "fingerprint": "md5=3da15e8c6f69a1d766bfe155092b1638", "industry": { "class": "零售批发", "class_name": "广播、电视、电影设备", "subclass": "通用设备" }, "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "C70U264COM6项目所需直流屏", "nlp_enterprise": [], "nlp_enterprise_attachment": [], "person_review": [], "prem": {}, "process_time": "2022-12-08 04:43:18", "product": [ "直流屏" ], "product_attrs": { "data": [ { "brand": "", "product": "直流屏65AH", "quantity": "1.0", "quantity_unit": "台", "specs": "带逆变,蓄电池采用原装进口免维护蓄电池(必须是原产地进口,注明电池进口产地)等,由供应商负责采购,使用寿命10年及以上", "unitPrice": "" } ], "header": [ "产品名称_产品数量____产品规格" ], "header_col": [ "产品名称_产品编号_产品规格_产品材质_产品数量_备注" ] }, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "", "time_getFileStart": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", 
"version_date": "2022-11-24" }','')
  2618. # print(b)
  2619. print(check_doctitle(doctitle_refind_less="山西银行晋城分行对A公司清算处置审计服务项目供应商征集公告",doctitle_refind_greater="山西银行晋城分行对B公司清算处置审计服务项目供应商征集公告"))
  2620. # f = f_get_extractCount()
  2621. # j = '''{ "attachmentTypes": "", "bidway": "", "candidate": "湖南省金达工程建设有限公司", "code": [ "丰汇-YCYZ2022-001-1" ], "cost_time": { "attrs": 0.33, "codename": 0.14, "deposit": 0.0, "district": 0.02, "moneygrade": 0.0, "nerToken": 0.27, "person": 0.01, "prem": 0.06, "preprocess": 0.71, "product": 0.15, "product_attrs": 0.02, "roleRuleFinal": 0.0, "rolegrade": 0.0, "rule": 0.0, "rule_channel": 0.26, "tableToText": 0.11000882148742676, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "district": { "area": "华东", "city": "宜春", "district": "袁州", "is_in_text": false, "province": "江西" }, "docchannel": { "docchannel": "中标信息", "doctype": "采招数据", "life_docchannel": "中标信息" }, "docid": "", "doctitle_refine": "2022年宜春市袁州区县乡村道安全生命防护项目(二)(第二次)", "exist_table": 1, "extract_count": 6, "fail_reason": "", "fingerprint": "md5=23e9e56f2a6ec0c73e1838670e630948", "industry": { "class": "建筑业", "class_name": "其他土木工程建筑", "subclass": "土木工程建筑业" }, "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "2022年宜春市袁州区县乡村道安全生命防护工程采购项目", "nlp_enterprise": [ "湖南省金达工程建设有限公司", "丰汇国际项目管理有限公司" ], "nlp_enterprise_attachment": [], "person_review": [ "宋明勇", "刘定良", "张来弟", "许卫秀", "宋明勇", "刘定良", "张来弟", "许卫秀" ], "prem": { "Project": { "code": "", "roleList": [ { "address": "宜春市袁州区明月袁山中路356号", "linklist": [ [ "胡柯", "13766445188" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "丰汇国际项目管理有限公司", "serviceTime": "" }, { "address": "湖南省长沙市开福区中山路589号开福万达广场C区2号写字楼", "linklist": [ [ "刘华夏", "18570640155" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": "4351680.70", "money_unit": "元" }, "role_name": "win_tenderer", "role_text": "湖南省金达工程建设有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" } }, 
"process_time": "2023-02-28 02:04:42", "product": [ "安全生命防护工程" ], "product_attrs": { "data": [ { "brand": "详见开标一览表明细", "product": "2022年宜春市袁州区县乡村道安全生命防护工程采购项目", "quantity": "1", "quantity_unit": "", "specs": "详见开标一览表明细", "unitPrice": "4351680.7" } ], "header": [ "名称_数量__单价_品牌_规格型号" ], "header_col": [ "名称_品牌_规格型号_数量_单价" ] }, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "", "time_getFileStart": "", "time_listingEnd": "", "time_listingStart": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2023-02-28", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", "version_date": "2023-02-20" }'''
  2622. # print(f.evaluate(j))
  2623. # _str1 = "PMJJ-202211030004001"
  2624. # _str2 = "PMJJ-202211030001001"
  2625. # print(getSimilarityOfString(_str1,_str2))
  2626. # print(check_doctitle("强化桂城街道工地扬尘防控监管巡查第三方(二次)","广东省强化桂城街道工地扬尘防控监管巡查第三方(二次)"))
  2627. # print(check_codes(["F-2022-027(MASCG-2-F-F-2022-0462)"],["F-2022-027(MASCG-2-F-F-2022-0462)"]))
  2628. # print(check_product(None,None))
  2629. # print(check_code("4451020073383382206021325","4451020073383382206021322"))
  2630. # print(check_money("550.0","440.0","",""))
  2631. # for i in range(0,2):
  2632. # print(i)
  2633. # location_pattern = re.compile(".{1,2}市|.{1,2}区|.{1,2}镇|.{1,2}县|.{1,2}村")
  2634. # print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇前路富代都村示范点农用塑料薄膜棚)"))
  2635. # print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇天湖村粮蔬基地农用塑料薄膜棚)"))
  2636. # package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))") # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
  2637. # _match = re.search(package_number_pattern,"2021年盘山县高标准农田建设项目三标段(高升街道)开标记录")
  2638. # if _match is not None:
  2639. # print(_match.groupdict()["name"])
  2640. # print(re.findall("((标[段号的包])[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4})","[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ"))
  2641. # print(check_doctitle("[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ","桂林银行南宁办公大楼装修工程标段ⅡGXYLG20182005-N中标公告"))
  2642. # c = f_get_extractCount()
  2643. # _json = '''
  2644. # { "attachmentTypes": "", "bidway": "", "code": [ "LCQTCG-2022-313" ], "cost_time": { "attrs": 0.02, "codename": 0.16, "deposit": 0.0, "nerToken": 0.8400000000000001, "person": 0.01, "prem": 0.02, "preprocess": 0.96, "product": 0.12, "product_attrs": 0.01, "punish": 0.11, "roleRuleFinal": 0.0, "rule": 0.0, "rule_channel": 0.0, "tableToText": 0.09000381469726562, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "docchannel": { "docchannel": "招标公告", "doctype": "采招数据" }, "docid": "", "doctitle_refine": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设(一标段)膨胀剂(暂估价)项目", "exist_table": 1, "extract_count": 5, "fail_reason": "", "fingerprint": "md5=b1ab0ee9cf9e1c5acc17477b9c0433cc", "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设工程(一标段)膨胀剂(暂估价)采购项目", "nlp_enterprise": [ "中建八局第一建设有限公司", "山东东岳项目管理有限公司", "聊城市公共资源交易中心", "江苏国泰新点软件有限公司" ], "person_review": [], "prem": { "Project": { "code": "", "roleList": [ { "linklist": [ [ "", "15540110649" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "tenderee", "role_text": "中建八局第一建设有限公司", "serviceTime": "" }, { "linklist": [ [ "武工", "0635-2992305" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "山东东岳项目管理有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" }, "一": { "code": "", "roleList": [], "tendereeMoney": 3267000.0, "tendereeMoneyUnit": "万元" } }, "process_time": "2022-05-30 14:31:13", "product": [ "枢纽功能区建设工程", "膨胀剂", "配套基础设施建设" ], "product_attrs": { "data": [], "header": [], "header_col": [] }, "serviceTime": "", "success": true, "time_bidclose": "2022-06-16", "time_bidopen": "2022-06-16", "time_bidstart": "", "time_commencement": "", "time_completion": 
"", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "2022-06-01", "time_getFileStart": "2022-05-26", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2022-05-25", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "" }
  2645. # '''
  2646. # c = f_get_nlp_enterprise()
  2647. # print(c.evaluate("山东东岳项目管理有限公司",_json))
  2648. # print(c.evaluate(_json))
  2649. # c = f_set_docid()
  2650. # _s = '''
  2651. # 154064190 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2652. # 154064188 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2653. # 154064175 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2654. # 30201228 1512489600 4 04111-1 1 大连市妇女儿童医疗中心
  2655. # 154064160 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2656. # 154064168 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
  2657. # '''
  2658. # buffer = c.new_buffer()
  2659. # for _line in _s.split("\n"):
  2660. # _line = _line.strip()
  2661. # if _line=="":
  2662. # continue
  2663. # l_column = _line.split("\t")
  2664. # print(l_column)
  2665. # docid,page_time_stamp,extract_count,web_source_no,num,tenderee = l_column
  2666. # page_time_stamp = int(page_time_stamp)
  2667. # extract_count = int(extract_count)
  2668. # num = 1
  2669. # c.iterate(buffer,docid,page_time_stamp,extract_count,web_source_no,num,tenderee)
  2670. # print(c.terminate(buffer))