- #coding:UTF8
- from odps.udf import annotate
- from odps.udf import BaseUDTF
- from odps.udf import BaseUDAF
- import re
- @annotate('string,string -> string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string')
- class f_decode_extract(BaseUDTF):
- def __init__(self):
- import logging
- import json
- import time,re
- global json,logging,time,re
- self.time_pattern = r"\d{4}-\d{2}-\d{2}.*"
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- self.dict_channel = {"公告变更":51,
- "招标公告":52,
- "中标信息":101,
- "招标预告":102,
- "招标答疑":103,
- "资审结果":105,
- "法律法规":106,
- "新闻资讯":107,
- "采购意向":114,
- "拍卖出让":115,
- "土地矿产":116,
- "产权交易":117,
- "废标公告":118,
- "候选人公示":119,
- "合同公告":120}
- def process(self, extractjson,otherjson):
- if extractjson is not None:
- _extract = json.loads(extractjson)
- else:
- _extract = {}
- if otherjson is not None:
- _other = json.loads(otherjson)
- else:
- _other = {}
- project_code = ""
- project_name = ""
- tenderee = ""
- agency = ""
- win_tenderer = ""
- bidding_budget = ""
- win_bid_price = ""
- fingerprint = ""
- page_time_stamp = 0
- docchannel = 0
- extract_count = 0
- page_time = _other.get("pageTime",time.strftime('%Y-%m-%d',time.localtime()))
- doctitle = _other.get("doctitle","")
- doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|竞价|合同', '', doctitle)
- area = _other.get("area","")
- province = _other.get("province","")
- city = _other.get("city","")
- district = _other.get("district","")
- web_source_no = _other.get("webSourceNo","")
- time_bidclose = _extract.get("time_bidclose")
- time_bidopen = _extract.get("time_bidopen")
- time_bidstart = _extract.get("time_bidstart")
- time_commencement = _extract.get("time_commencement")
- time_completion = _extract.get("time_completion")
- time_earnest_money_end = _extract.get("time_earnestMoneyEnd")
- time_earnest_money_start = _extract.get("time_earnestMoneyStart")
- time_get_file_end = _extract.get("time_getFileEnd")
- time_get_file_start = _extract.get("time_getFileStart")
- time_publicity_end = _extract.get("time_publicityEnd")
- time_publicity_start = _extract.get("time_publicityStart")
- time_registration_end = _extract.get("time_registrationEnd")
- time_registration_start = _extract.get("time_registrationStart")
- time_release = _extract.get("time_release")
- # docchannel = _other.get("docchannel",0)
- docchannel_name = _extract.get("docchannel",{}).get("docchannel")
- doctype_name = _extract.get("docchannel",{}).get("doctype")
- if doctype_name in ["法律法规","新闻资讯","拍卖出让","土地矿产"]:
- docchannel_name = doctype_name
- docchannel = self.dict_channel.get(docchannel_name,0)
- if re.search(self.time_pattern,page_time) is not None:
- try:
- timeArray = time.strptime(page_time[:10], "%Y-%m-%d")
- page_time_stamp = int(time.mktime(timeArray))
- except Exception as e:
- pass
- list_code = _extract.get("code",[])
- if len(list_code)>0:
- project_code = list_code[0]
- project_name = _extract.get("name","")
- fingerprint = _extract.get("fingerprint","")
- dict_pack = _extract.get("prem",{})
- logging.info(dict_pack)
- for _key in dict_pack.keys():
- if dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
- extract_count += 1
- if bidding_budget=="":
- bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
- for _role in dict_pack[_key]["roleList"]:
- if isinstance(_role,list):
- extract_count += 1
- if _role[2]!='' and float(_role[2])>0:
- extract_count += 1
- if _role[0]=="tenderee":
- tenderee = _role[1]
- if _role[0]=="win_tenderer":
- if win_tenderer=="":
- win_tenderer = _role[1]
- if _role[2]!='' and float(_role[2])>0:
- extract_count += 1
- if win_bid_price=="":
- win_bid_price = str(float(_role[2]))
- if _role[0]=="agency":
- agency = _role[1]
- if isinstance(_role,dict):
- extract_count += 1
- if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
- extract_count += 1
- if _role["role_name"]=="tenderee":
- tenderee = _role["role_text"]
- if _role["role_name"]=="win_tenderer":
- if win_tenderer=="":
- win_tenderer = _role["role_text"]
- if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
- extract_count += 1
- if win_bid_price=="":
- win_bid_price = str(float(_role["role_money"]["money"]))
- if _role["role_name"]=="agency":
- agency = _role["role_text"]
- if project_code!="":
- extract_count += 1
- if project_name!="":
- extract_count += 1
- logging.info(page_time+doctitle+doctitle_refine+area+province+city+
- district+web_source_no+project_code+project_name+tenderee+agency+win_tenderer+bidding_budget+win_bid_price)
- self.forward(page_time,page_time_stamp,docchannel,doctitle,doctitle_refine,area,province,city,
- district,web_source_no,fingerprint,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,
- time_bidclose,time_bidopen,time_bidstart,time_commencement,time_completion,time_earnest_money_end,time_earnest_money_start,
- time_get_file_end,time_get_file_start,time_publicity_end,time_publicity_start,time_registration_end,time_registration_start,time_release)
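- # Hedged usage sketch (comment only, not executed): the input shapes below are
- # assumptions inferred from the keys read above.
- # extractjson ~ '{"code":["ZB-2023-001"],"name":"...","fingerprint":"...",
- #                "prem":{"Project":{"tendereeMoney":"100.0","roleList":[...]}},
- #                "docchannel":{"docchannel":"中标信息","doctype":"..."}}'
- # otherjson   ~ '{"pageTime":"2023-01-01","doctitle":"...","area":"华东",...}'
- # One input row yields one forward() call carrying the 33 output columns of the
- # @annotate signature above.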
- @annotate("string->string")
- class f_get_product(object):
- def __init__(self):
- import time
- global time
- import logging
- import json
- import re
- global json,logging,re
- self.time_pattern = r"\d{4}-\d{2}-\d{2}.*"
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def evaluate(self, extractjson):
- if extractjson is None or extractjson=="":
- extractjson = "{}"
- _extract = json.loads(extractjson)
- return ",".join(_extract.get("product",[]))
- @annotate("string->string")
- class f_get_package(object):
- def __init__(self):
- import time
- global time
- import logging
- import json
- import re
- global json,logging,re
- self.time_pattern = r"\d{4}-\d{2}-\d{2}.*"
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def evaluate(self, extractjson):
- if extractjson is None or extractjson=="":
- extractjson = "{}"
- _extract = json.loads(extractjson)
- prem = _extract.get("prem",{})
- list_pack = []
- for k,v in prem.items():
- if k!="Project":
- list_pack.append(k)
- return ",".join(list_pack)
- @annotate("string->string")
- class f_get_nlp_enterprise(object):
- def __init__(self):
- import time
- global time
- import logging
- import json
- import re
- global json,logging,re
- self.time_pattern = r"\d{4}-\d{2}-\d{2}.*"
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def evaluate(self, extractjson):
- if extractjson is None or extractjson=="":
- extractjson = "{}"
- _extract = json.loads(extractjson)
- nlp_enterprise = _extract.get("nlp_enterprise",[])
- nlp_enterprise_attachment = _extract.get("nlp_enterprise_attachment",[])
- if len(nlp_enterprise)==0 and len(nlp_enterprise_attachment)==0:
- dict_pack = _extract.get("prem",{})
- for _key in dict_pack.keys():
- for _role in dict_pack[_key]["roleList"]:
- if isinstance(_role,list):
- _entity = _role[1]
- nlp_enterprise.append(_entity)
- if isinstance(_role,dict):
- _entity = _role["role_text"]
- nlp_enterprise.append(_entity)
- nlp_enterprise = list(set(nlp_enterprise))
- dict_entity = {"indoctextcon":nlp_enterprise,
- "notindoctextcon":nlp_enterprise_attachment}
- return json.dumps(dict_entity,ensure_ascii=False)
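- # Example (hedged sketch): entity lists are passed through when present, and
- # only rebuilt from the "prem" role lists when both lists are empty.
- # >>> f_get_nlp_enterprise().evaluate('{"nlp_enterprise":["A公司"],"nlp_enterprise_attachment":[]}')
- # '{"indoctextcon": ["A公司"], "notindoctextcon": []}'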
- @annotate("string->bigint")
- class f_get_extractCount(object):
- def __init__(self):
- import time
- global time
- import logging
- import json
- import re
- global json,logging,re
- self.time_pattern = r"\d{4}-\d{2}-\d{2}.*"
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def evaluate(self, extractjson):
- if extractjson is not None:
- _extract = json.loads(extractjson)
- else:
- _extract = {}
- dict_pack = _extract.get("prem",{})
- extract_count = 0
- list_code = _extract.get("code",[])
- if len(list_code)>0:
- project_code = list_code[0]
- else:
- project_code = ""
- project_name = _extract.get("name","")
- bidding_budget = ""
- win_tenderer = ""
- win_bid_price = ""
- linklist_count = 0
- for _key in dict_pack.keys():
- if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
- extract_count += 1
- if bidding_budget=="":
- bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
- for _role in dict_pack[_key]["roleList"]:
- if isinstance(_role,list):
- extract_count += 1
- if _role[2]!='' and float(_role[2])>0:
- extract_count += 1
- if _role[0]=="tenderee":
- tenderee = _role[1]
- if _role[0]=="win_tenderer":
- if win_tenderer=="":
- win_tenderer = _role[1]
- if _role[2]!='' and float(_role[2])>0:
- extract_count += 1
- if win_bid_price=="":
- win_bid_price = str(float(_role[2]))
- if _role[0]=="agency":
- agency = _role[1]
- if isinstance(_role,dict):
- extract_count += 1
- if "role_money" in _role:
- if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
- extract_count += 1
- if _role.get("role_name")=="tenderee":
- tenderee = _role["role_text"]
- if _role.get("role_name")=="win_tenderer":
- if win_tenderer=="":
- win_tenderer = _role["role_text"]
- if "role_money" in _role:
- if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
- extract_count += 1
- if win_bid_price=="":
- win_bid_price = str(float(_role["role_money"]["money"]))
- if _role["role_name"]=="agency":
- agency = _role["role_text"]
- linklist = _role.get("linklist",[])
- for link in linklist:
- for l in link:
- if l!="":
- linklist_count += 1
- extract_count += linklist_count//2
- if project_code!="":
- extract_count += 1
- if project_name!="":
- extract_count += 1
- return extract_count
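- # Worked example (hedged, input shape assumed): with code=["C1"], name="N" and
- # one package {"tendereeMoney":"100","roleList":[{"role_name":"win_tenderer",
- # "role_text":"B公司","role_money":{"money":"50"}}]} the tally is
- # 1 (budget>0) + 1 (role) + 1 (money>0) + 1 (win price>0) + 1 (code) + 1 (name) = 6.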
- @annotate('string,string,string,string,string -> string,string,string,bigint')
- class f_decode_sub_docs_json(BaseUDTF):
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self, project_code,project_name,tenderee,agency,sub_docs_json):
- columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
- extract_count = 0
- if project_code is not None and project_code!="":
- extract_count += 1
- if project_name is not None and project_name!="":
- extract_count += 1
- if tenderee is not None and tenderee!="":
- extract_count += 1
- if agency is not None and agency!="":
- extract_count += 1
- if sub_docs_json is not None:
- for sub_docs in json.loads(sub_docs_json):
- for _key_sub_docs in sub_docs.keys():
- extract_count += 1
- if _key_sub_docs in columns:
- if columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
- if _key_sub_docs in ["bidding_budget","win_bid_price"]:
- if float(sub_docs[_key_sub_docs])>0:
- columns[_key_sub_docs] = str(float(sub_docs[_key_sub_docs]))
- else:
- columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- self.forward(columns["win_tenderer"],columns["bidding_budget"],columns["win_bid_price"],extract_count)
- @annotate('string,string,string -> string,string,string,string,string,string,string')
- class f_decode_for_dumplicate(BaseUDTF):
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self,sub_docs_json,extractjson,extract):
- if extractjson is None or extractjson=="":
- extractjson = "{}"
- try:
- _extract = json.loads(extractjson)
- except Exception as e:
- _extract = {}
- product = ",".join(_extract.get("product",[]))
- list_product = product.split(",")
- project_codes = ",".join(_extract.get("code",[]))
- list_code = project_codes.split(",")
- if sub_docs_json is not None:
- list_sub_docs = json.loads(sub_docs_json)
- else:
- list_sub_docs = [{}]
- max_len = max([len(list_product),len(list_code),len(list_sub_docs)])
- if extract!="extract":
- win_tenderer = ""
- bidding_budget = ""
- win_bid_price = ""
- for _subdoc in list_sub_docs:
- win_tenderer = _subdoc.get("win_tenderer","")
- bidding_budget = _subdoc.get("bidding_budget","0") or "0"
- if float(bidding_budget)==0:
- bidding_budget = ""
- else:
- bidding_budget = str(float(bidding_budget))
- win_bid_price = _subdoc.get("win_bid_price","0") or "0"
- if float(win_bid_price)==0:
- win_bid_price = ""
- else:
- win_bid_price = str(float(win_bid_price))
- if len(set([win_tenderer,bidding_budget,win_bid_price]))>=3:
- break
- print(("",product,"",project_codes,win_tenderer,bidding_budget,win_bid_price))
- self.forward("",product,"",project_codes,win_tenderer,bidding_budget,win_bid_price)
- else:
- for _i in range(max_len):
- _product = list_product[_i%len(list_product)]
- _code = list_code[_i%len(list_code)]
- _subdoc = list_sub_docs[_i%len(list_sub_docs)]
- win_tenderer = _subdoc.get("win_tenderer","")
- bidding_budget = _subdoc.get("bidding_budget","0") or "0"
- if float(bidding_budget)==0:
- bidding_budget = ""
- else:
- bidding_budget = str(float(bidding_budget))
- win_bid_price = _subdoc.get("win_bid_price","0") or "0"
- if float(win_bid_price)==0:
- win_bid_price = ""
- else:
- win_bid_price = str(float(win_bid_price))
- self.forward(_product,product,_code,project_codes,win_tenderer,bidding_budget,win_bid_price)
- @annotate("string->bigint")
- class totimestamp(object):
- def __init__(self):
- import time
- global time
- import logging
- import json
- import re
- global json,logging,re
- self.time_pattern = r"\d{4}-\d{2}-\d{2}.*"
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def evaluate(self, str_time):
- try:
- logging.info(str_time)
- if str_time is not None and re.search(self.time_pattern,str_time) is not None:
- timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
- timeStamp = int(time.mktime(timeArray))
- return timeStamp
- else:
- return 0
- except Exception as e:
- return 0
- @annotate("string->string")
- class refind_name(object):
- def __init__(self):
- import logging
- import re
- global logging,re
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def evaluate(self, title):
- if title is not None:
- return re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|竞价|\[|\]|【|】', '', title)
- return ""
- @annotate('bigint,bigint,bigint,string,bigint,string->string')
- class f_set_docid(BaseUDAF):
- '''
- group key: project code and winning bidder, with len(project code)>7 and winning bidder <> ""
- '''
- def __init__(self):
- import json
- global json
- def new_buffer(self):
- return [[]]
- def iterate(self, buffer,docid, page_time_stamp,extract_count,defind_column,defind_count,tenderee):
- buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
- "defind_column":defind_column,"defind_count":defind_count,"tenderee":tenderee})
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0])
- def terminate(self, buffer):
- list_docs = buffer[0]
- list_docs.sort(key=lambda x:x["page_time_stamp"])
- list_group = []
- _begin = 0
- defind_count = 0
- if len(list_docs)>0:
- defind_count = list_docs[0]["defind_count"]
- print(defind_count)
- for i in range(len(list_docs)-1):
- if abs(list_docs[i]["page_time_stamp"]-list_docs[i+1]["page_time_stamp"])<=86400*7:
- continue
- else:
- _group = []
- _set_column = set()
- _set_tenderee = set()
- for j in range(_begin,i+1):
- if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
- _set_tenderee.add(list_docs[j]["tenderee"])
- _set_column.add(list_docs[j]["defind_column"])
- _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
- if len(_group)>=3 and len(_set_tenderee)>1:
- pass
- else:
- print(defind_count,len(_set_column))
- if len(_group)>1:
- if defind_count==2:
- if len(_set_column)>=2:
- list_group.append(_group)
- elif defind_count==1:
- if len(_set_column)==1:
- list_group.append(_group)
- elif defind_count==0:
- list_group.append(_group)
- _begin = i+1
- if len(list_docs)>1:
- _set_column = set()
- _set_tenderee = set()
- _group = []
- for j in range(_begin,len(list_docs)):
- if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
- _set_tenderee.add(list_docs[j]["tenderee"])
- _set_column.add(list_docs[j]["defind_column"])
- _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
- if len(_group)>=3 and len(_set_tenderee)>1:
- pass
- else:
- if len(_group)>1:
- if defind_count==2:
- if len(_set_column)>=2:
- list_group.append(_group)
- elif defind_count==1:
- if len(_set_column)==1:
- list_group.append(_group)
- elif defind_count==0:
- list_group.append(_group)
- return json.dumps(list_group)
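- # Grouping rule restated (comment only): docs are sorted by page_time_stamp and
- # the stream is cut wherever two neighbours are more than 7 days apart; a
- # window is emitted as a duplicate group only if it holds more than one doc,
- # does not combine 3+ docs with more than one distinct tenderee, and its
- # defind_column values satisfy the defind_count==2/1/0 branching above.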
- # def terminate(self, buffer):
- #
- #
- # list_docs = buffer[0]
- # if len(list_docs)>0:
- # defind_count = list_docs[0]["defind_count"]
- #
- # list_time_group = split_with_time(list_docs,"page_time_stamp",86400*2)
- #
- # list_group = []
- # for time_group in list_time_group:
- # _group = []
- # _set_column = set()
- # base_tenderee = ""
- # _set_tenderee = set()
- # for j in range(len(time_group)):
- # if time_group[j]["tenderee"] is not None and time_group[j]["tenderee"]!="":
- # # if base_tenderee =="":
- # # base_tenderee = time_group[j]["tenderee"]
- # # _set_tenderee.add(time_group[j]["tenderee"])
- # # simi = getSimilarityOfString(base_tenderee,time_group[j]["tenderee"])
- # # if simi<0.8:
- # # _set_tenderee.add(time_group[j]["tenderee"])
- #
- # _set_tenderee.add(time_group[j]["tenderee"])
- # _set_column.add(time_group[j]["defind_column"])
- # _group.append({"docid":time_group[j]["docid"],"extract_count":time_group[j]["extract_count"]})
- #
- # if len(_group)>=3 and len(_set_tenderee)>1:
- # pass
- # else:
- # if len(_group)>1:
- # if defind_count==2:
- # if len(_set_column)>=2:
- # list_group.append(_group)
- # elif defind_count==1:
- # if len(_set_column)==1:
- # list_group.append(_group)
- # elif defind_count==0:
- # list_group.append(_group)
- #
- # return json.dumps(list_group)
- def isEmpty(_str):
- if _str is None or _str=="":
- return True
- return False
- @annotate('bigint->string')
- class f_group_fingerprint(BaseUDAF):
- def __init__(self):
- import json
- global json
- def new_buffer(self):
- return [[]]
- def iterate(self, buffer,docid):
- buffer[0].append(docid)
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0][:100000])
- def terminate(self, buffer):
- list_docid = buffer[0][:100000]
- list_docid.sort(key=lambda x:x)
- return ",".join([str(a) for a in list_docid])
- @annotate('string->bigint,string')
- class f_ungroup_fingerprint(BaseUDTF):
- def process(self,dumplicates):
- list_docid = dumplicates.split(",")
- self.forward(int(list_docid[0]),",".join(list_docid[1:]))
- @annotate('bigint,bigint,string->string')
- class f_dump_probability(BaseUDAF):
- '''
- merge the group into a single record
- '''
- def __init__(self):
- import json
- global json
- def new_buffer(self):
- return [[]]
- def iterate(self, buffer,docid,page_time_stamp,_type):
- buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"type":_type})
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0])
- def terminate(self, buffer):
- list_dict = buffer[0]
- _set = set()
- list_data = []
- for _dict in list_dict:
- docid = _dict["docid"]
- if docid in _set:
- continue
- _set.add(docid)
- list_data.append(_dict)
- if len(list_data)>10000:
- break
- list_group = split_with_time(list_data,sort_key="page_time_stamp",timedelta=86400*7)
- return json.dumps(list_group)
- @annotate('string -> bigint,bigint,bigint,bigint,string')
- class f_split_dumplicate_probability(BaseUDTF):
- def __init__(self):
- import logging
- import json
- global logging,json
- logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self,list_group_str):
- logging.info("0")
- logging.info(list_group_str)
- if list_group_str is not None:
- logging.info("1")
- try:
- list_group = json.loads(list_group_str)
- logging.info("2")
- for _group in list_group:
- if len(_group)>0:
- _type = _group[0].get("type","")
- logging.info("3%d"%len(list_group))
- # _group.sort(key=lambda x:x["page_time_stamp"])
- _len = min(100,len(_group))
- for _index_i in range(_len):
- _count = 0
- for _index_j in range(_index_i+1,_len):
- if abs(_group[_index_j]["page_time_stamp"]-_group[_index_i]["page_time_stamp"])>86400*120:
- break
- _count += 1
- _docid1 = _group[_index_i]["docid"]
- _docid2 = _group[_index_j]["docid"]
- if _docid1<_docid2:
- self.forward(_docid1,_docid2,1,_len,_type)
- elif _docid1>_docid2:
- self.forward(_docid2,_docid1,1,_len,_type)
- except Exception as e:
- logging.error(str(e))
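- # Pairing sketch (hedged): within each group, up to the first 100 docs are
- # emitted as ordered pairs (smaller docid first) unless two docs lie more than
- # 120 days apart; e.g. docids [1001,1003,1002] with close timestamps yield
- # (1001,1003), (1001,1002) and (1002,1003).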
- @annotate('bigint,bigint,string->string')
- class f_dumplicate_groupPairs(BaseUDAF):
- '''
- merge the group into a single record
- '''
- def __init__(self):
- import json
- global json
- def new_buffer(self):
- return [[]]
- def iterate(self, buffer,is_exists,counts,_type):
- buffer[0].append({"is_exists":is_exists,"counts":counts,"_type":_type})
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0])
- def terminate(self, buffer):
- list_dict = buffer[0]
- list_dict = list_dict[:10000]
- return json.dumps(list_dict)
- def check_columns(tenderee_less,tenderee_greater,
- agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
- win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
- bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
- flag = True
- _set_tenderee = set()
- if tenderee_less is not None and tenderee_less!="":
- _set_tenderee.add(tenderee_less)
- if tenderee_greater is not None and tenderee_greater!="":
- _set_tenderee.add(tenderee_greater)
- if len(_set_tenderee)>1:
- return False
- code_sim = getSimilarityOfString(project_code_less,project_code_greater)
- if code_sim>0.6 and code_sim<1:
- return False
- #same batch but with different project codes
- if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
- _split_code_less = project_code_less.split("-")
- _split_code_greater = project_code_greater.split("-")
- if len(_split_code_less)>1 and len(_split_code_greater)>1:
- if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
- return False
- _set_win_tenderer = set()
- if win_tenderer_less is not None and win_tenderer_less!="":
- _set_win_tenderer.add(win_tenderer_less)
- if win_tenderer_greater is not None and win_tenderer_greater!="":
- _set_win_tenderer.add(win_tenderer_greater)
- if len(_set_win_tenderer)>1:
- return False
- _set_win_bid_price = set()
- if win_bid_price_less is not None and win_bid_price_less!="":
- _set_win_bid_price.add(float(win_bid_price_less))
- if win_bid_price_greater is not None and win_bid_price_greater!="":
- _set_win_bid_price.add(float(win_bid_price_greater))
- if len(_set_win_bid_price)>1:
- return False
- _set_bidding_budget = set()
- if bidding_budget_less is not None and bidding_budget_less!="":
- _set_bidding_budget.add(float(bidding_budget_less))
- if bidding_budget_greater is not None and bidding_budget_greater!="":
- _set_bidding_budget.add(float(bidding_budget_greater))
- if len(_set_bidding_budget)>1:
- return False
- return True
- import math
- def featurnCount(_count,max_count=100):
- return max(0,min(1,_count))*(1/math.sqrt(max(1,_count-1)))
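- # featurnCount (sic) maps a match count to a diminishing weight, e.g.
- # featurnCount(0)=0.0, featurnCount(1)=featurnCount(2)=1.0,
- # featurnCount(5)=1/sqrt(4)=0.5; additional matches contribute less and less.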
- def getSimLevel(str1,str2):
- str1_null = False
- str2_null = False
- _v = 0
- if str1 is None or str1=="":
- str1_null = True
- if str2 is None or str2=="":
- str2_null = True
- if str1_null and str2_null:
- _v = 2
- elif str1_null and not str2_null:
- _v = 4
- elif not str1_null and str2_null:
- _v = 6
- elif not str1_null and not str2_null:
- if str1==str2:
- _v = 10
- else:
- _v = 0
- return _v
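- # getSimLevel encodes the null/equality state of two strings:
- # both empty -> 2, only str1 empty -> 4, only str2 empty -> 6,
- # both present and equal -> 10, both present but different -> 0.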
- def getLength(_str):
- return len(_str if _str is not None else "")
- def check_money(bidding_budget_less,bidding_budget_greater,
- win_bid_price_less,win_bid_price_greater,
- moneys_less,moneys_greater,
- moneys_attachment_less,moneys_attachment_greater):
- #only compare the six most significant digits
- if getLength(bidding_budget_less)>0:
- bidding_budget_less = round(float(bidding_budget_less))
- bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
- if getLength(bidding_budget_greater)>0:
- bidding_budget_greater = round(float(bidding_budget_greater))
- bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
- if getLength(win_bid_price_less)>0:
- win_bid_price_less = round(float(win_bid_price_less))
- win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
- if getLength(win_bid_price_greater)>0:
- win_bid_price_greater = round(float(win_bid_price_greater))
- win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
- #check whether the amounts can still be treated as the same
- budget_is_same = ""
- price_is_same = ""
- if getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
- budget_less = float(bidding_budget_less)
- budget_greater = float(bidding_budget_greater)
- if budget_less!=budget_greater:
- if min(budget_less,budget_greater)>0:
- if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
- budget_is_same = True
- if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
- budget_is_same = True
- if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
- budget_is_same = True
- if budget_greater in moneys_less or budget_greater in moneys_attachment_less:
- budget_is_same = True
- if budget_is_same=="":
- return False
- if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
- price_less = float(win_bid_price_less)
- price_greater = float(win_bid_price_greater)
- if price_less!=price_greater:
- if min(price_less,price_greater)>0:
- if max(price_less,price_greater)/min(price_less,price_greater)==10000:
- price_is_same = True
- if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
- price_is_same = True
- if price_less in moneys_greater or price_less in moneys_attachment_greater:
- price_is_same = True
- if price_greater in moneys_less or price_greater in moneys_attachment_less:
- price_is_same = True
- if price_is_same=="":
- return False
- return True
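- # Example of the leading-digits rounding above (hedged): 1234567.89 becomes
- # round(1234568, 6-7) = 1234570, i.e. amounts are compared on their six most
- # significant digits; a 10000x ratio or a hit in the other document's money
- # sets is tolerated as a unit (yuan vs 万元) difference.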
- def check_entity(nlp_enterprise_less,nlp_enterprise_greater,
- tenderee_less,tenderee_greater,
- agency_less,agency_greater,
- win_tenderer_less,win_tenderer_greater,
- similarity=0.85):
- def get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,entity_less,entity_greater,similarity):
- if getLength(entity_less)>0 and getLength(entity_greater)>0:
- if entity_less!=entity_greater:
- is_same = ''
- _sim = jaccard_score(entity_less,entity_greater)
- if _sim>similarity:
- is_same = True
- if is_same=='':
- if str(nlp_enterprise_less).find(entity_greater)>0 or str(nlp_enterprise_greater).find(entity_less)>0:
- is_same = True
- if is_same=='':
- return False
- return True
- if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,similarity):
- return False
- if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,agency_less,agency_greater,similarity):
- return False
- if not get_same_of_entity(nlp_enterprise_less,nlp_enterprise_greater,win_tenderer_less,win_tenderer_greater,similarity):
- return False
- return True
- def check_codes(project_codes_less,project_codes_greater):
- #check the similarity
- is_same = False
- is_sim = False
- for project_code_less in project_codes_less:
- for project_code_greater in project_codes_greater:
- code_sim = getSimilarityOfString(project_code_less,project_code_greater)
- if project_code_less is not None and project_code_greater is not None:
- if code_sim>0.6:
- if str(project_code_less).find(str(project_code_greater))>=0 or str(project_code_greater).find(str(project_code_less))>=0:
- is_same = True
- else:
- is_sim = True
- if project_code_less!=project_code_greater:
- if code_sim>0.4 and len(project_code_less)==len(project_code_greater):
- is_sim = True
- if is_same:
- return True
- if is_sim:
- return False
- return True
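- # Example (hedged; assumes getSimilarityOfString scores these pairs >0.6 and
- # >0.4 respectively): "A-001" vs "A-001-2" where one contains the other counts
- # as the same; "A-001" vs "A-002" (similar, same length, but different) rejects
- # the pair; with no sufficiently similar pair at all the check passes.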
- def check_demand():
- return True
- package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型|项目)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))") # question-mark handling around "第?" adjusted so titles like "纯木浆8包/箱复印" are not treated as package numbers
- code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.-]+")
- num_pattern = re.compile("^\d+(?:\.\d+)?$")
- num1_pattern = re.compile("[一二三四五六七八九A-Za-z]+")
- location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
- building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
- date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
- def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=None, code_greater=None):
- if codes_less is None:
- codes_less = []
- if code_greater is None:
- code_greater = []
- doctitle_refind_less = str(doctitle_refind_less).replace("(","(").replace(")",")")
- doctitle_refind_greater = str(doctitle_refind_greater).replace("(","(").replace(")",")")
- for _c in codes_less:
- doctitle_refind_less = str(doctitle_refind_less).replace(_c,"")
- for _c in code_greater:
- doctitle_refind_greater = str(doctitle_refind_greater).replace(_c,"")
- doctitle_refind_less = re.sub(date_pattern,"",doctitle_refind_less)
- doctitle_refind_greater = re.sub(date_pattern,"",doctitle_refind_greater)
- #check the package
- if doctitle_refind_less is None:
- doctitle_refind_less = ""
- if doctitle_refind_greater is None:
- doctitle_refind_greater = ""
- _pack1 = None
- _pack2 = None
- #if contain then pass
- if doctitle_refind_less.find(doctitle_refind_greater)>=0 or doctitle_refind_greater.find(doctitle_refind_less)>=0:
- return True
- #check the package in title
- _match = re.search(package_number_pattern,doctitle_refind_less)
- if _match is not None:
- _pack1 = _match.groupdict()["name"]
- _match = re.search(package_number_pattern,doctitle_refind_greater)
- if _match is not None:
- _pack2 = _match.groupdict()["name"]
- if _pack1 is not None and _pack2 is not None:
- if _pack1!=_pack2:
- return False
- #check the nums in title
- doctitle_refind_less = re.sub(package_number_pattern,"",doctitle_refind_less)
- doctitle_refind_greater = re.sub(package_number_pattern,"",doctitle_refind_greater)
- #check the nums,location,building in title
- for _p in [code_pattern]:
- num_all_l = re.findall(_p,doctitle_refind_less)
- num_all_g = re.findall(_p,doctitle_refind_greater)
- set_num_l = set()
- set_num_g = set()
- for _l in num_all_l:
- if re.search(num_pattern,_l) is not None:
- if _l.find(".")>0:
- set_num_l.add(_l)
- elif len(_l)<4:
- set_num_l.add(_l)
- for _g in num_all_g:
- if re.search(num_pattern,_g) is not None:
- if _g.find(".")>0:
- set_num_g.add(_g)
- elif len(_g)<4:
- set_num_g.add(_g)
- if len(set_num_l)>0 and len(set_num_g)>0:
- if len(set_num_l&set_num_g)!=len(set_num_l):
- return False
- #check location and keywords
- for _p in [num1_pattern,building_pattern]:
- num_all_l = re.findall(_p,doctitle_refind_less)
- num_all_g = re.findall(_p,doctitle_refind_greater)
- set_num_l = set(num_all_l)
- set_num_g = set(num_all_g)
- if len(set_num_l)==len(set_num_g):
- if len(set_num_l&set_num_g)!=len(set_num_l):
- return False
- #check the location has conflict
- for _p in [location_pattern]:
- num_all_l = re.findall(_p,doctitle_refind_less)
- num_all_g = re.findall(_p,doctitle_refind_greater)
- dict_num_l = {}
- dict_num_g = {}
- for _l in num_all_l:
- if len(_l)>0:
- key = _l[-1:]
- if key not in dict_num_l:
- dict_num_l[key] = set()
- dict_num_l[key].add(_l)
- for _g in num_all_g:
- if len(_g)>0:
- key = _g[-1:]
- if key not in dict_num_g:
- dict_num_g[key] = set()
- dict_num_g[key].add(_g)
- for k,v in dict_num_l.items():
- if k in dict_num_g:
- if len(v&dict_num_g[k])==0:
- return False
- return True
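- # Title check sketch (hedged): titles that contain one another pass at once;
- # otherwise conflicting package numbers in the titles (e.g. "...(二标段)" vs
- # "...(一标段)") likely fail the package_number_pattern comparison, and the
- # leftover numbers, keywords and place names are compared set-wise.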
- def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
- if getLength(product_less)>0 and getLength(product_greater)>0:
- _product_l = product_less.split(split_char)
- _product_g = product_greater.split(split_char)
- same_count = 0
- if len(_product_l)>len(_product_g):
- a = _product_g
- _product_g = _product_l
- _product_l = a
- for _l in _product_l:
- for _g in _product_g:
- if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0 or doctitle_refine_less.find(_g)>=0:
- same_count += 1
- break
- if same_count/len(_product_l)>=0.5:
- return True
- return False
- return True
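- # Example (hedged; assumes getSimilarityOfString("电脑","电脑")>=0.8): products
- # "电脑,打印机" vs "打印机,电脑,显示器" compare the shorter list against the
- # longer; both items match, so 2/2 >= 0.5 and the pair passes. An empty product
- # list on either side always passes.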
- def check_package(package_less,package_greater,split_char=","):
- if getLength(package_less)>0 and getLength(package_greater)>0:
- _product_l = package_less.split(split_char)
- _product_g = package_greater.split(split_char)
- for _l in _product_l:
- for _g in _product_g:
- if _l==_g:
- return True
- return False
- return True
- def check_time(json_time_less,json_time_greater):
- has_same = False
- has_diff = False
- if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
- if isinstance(json_time_less,dict):
- time_less = json_time_less
- else:
- time_less = json.loads(json_time_less)
- if isinstance(json_time_greater,dict):
- time_greater = json_time_greater
- else:
- time_greater = json.loads(json_time_greater)
- for k,v in time_less.items():
- if getLength(v)>0:
- v1 = time_greater.get(k,"")
- if getLength(v1)>0:
- if v[:10]!=v1[:10]:
- has_diff = True
- else:
- has_same = True
- if has_same:
- if has_diff:
- return 1
- return 2
- if has_diff:
- return 0
- return 1
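- # check_time is tri-state: 2 = some date field agrees (first 10 chars) and none
- # conflict, 1 = mixed or nothing comparable, 0 = conflicts only. E.g.
- # {"time_bidopen":"2023-01-01"} vs {"time_bidopen":"2023-01-01 09:00"} -> 2,
- # vs {"time_bidopen":"2023-01-02"} -> 0.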
- def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater="",moneys_less=set(),moneys_greater=set(),moneys_attachment_less=set(),moneys_attachment_greater=set(),page_attachments_less="[]",page_attachments_greater="[]"):
- if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
- return 1
- #one doc's key fields are all in the attachment, and the two docs' attachment md5 sets overlap
- set_md5_less = set()
- set_md5_greater = set()
- list_md5_less = json.loads(page_attachments_less)
- list_md5_greater = json.loads(page_attachments_greater)
- for _l in list_md5_less:
- _md5 = _l.get("fileMd5")
- if _md5 is not None:
- set_md5_less.add(_md5)
- for _l in list_md5_greater:
- _md5 = _l.get("fileMd5")
- if _md5 is not None:
- set_md5_greater.add(_md5)
- if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
- one_in_attach = False
- dict_enterprise_less = json.loads(nlp_enterprise_less)
- dict_enterprise_greater = json.loads(nlp_enterprise_greater)
- indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
- notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
- indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
- notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
- if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
- one_in_attach = True
- if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
- one_in_attach = True
- if one_in_attach:
- if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
- return 1
- if isinstance(project_codes_less,str):
- project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
- elif project_codes_less is None:
- project_codes_less = []
- if isinstance(project_codes_greater,str):
- project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
- elif project_codes_greater is None:
- project_codes_greater = []
- same_count = 0
- all_count = 8
- if len(set(project_codes_less) & set(project_codes_greater))>0:
- same_count += 1
- if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
- same_count += 1
- if getLength(agency_less)>0 and agency_less==agency_greater:
- same_count += 1
- if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
- same_count += 1
- if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
- same_count += 1
- if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
- same_count += 1
- if getLength(project_name_less)>0 and project_name_less==project_name_greater:
- same_count += 1
- if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
- same_count += 1
- base_prob = 0
- if min_counts<3:
- base_prob = 0.9
- elif min_counts<5:
- base_prob = 0.8
- elif min_counts<8:
- base_prob = 0.7
- else:
- base_prob = 0.6
- _prob = base_prob*same_count/all_count
- if min(extract_count_less,extract_count_greater)<=3:
- if _prob<0.1:
- _prob = 0.15
- if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
- return 0
- if _prob<0.1:
- return _prob
- check_result = {"pass":1}
- if docchannel_less in (51,102,103,104,115,116,117):
- if doctitle_refine_less!=doctitle_refine_greater:
- if page_time_less!=page_time_greater:
- check_result["docchannel"] = 0
- check_result["pass"] = 0
- else:
- check_result["docchannel"] = 2
- if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
- check_result["doctitle"] = 0
- check_result["pass"] = 0
- if b_log:
- logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
- else:
- check_result["doctitle"] = 2
- #added check
- if not check_codes(project_codes_less,project_codes_greater):
- check_result["code"] = 0
- check_result["pass"] = 0
- if b_log:
- logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
- else:
- if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
- check_result["code"] = 2
- else:
- check_result["code"] = 1
- if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
- check_result["product"] = 0
- check_result["pass"] = 0
- if b_log:
- logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
- else:
- if getLength(product_less)>0 and getLength(product_greater)>0:
- check_result["product"] = 2
- else:
- check_result["product"] = 1
- if not check_demand():
- check_result["pass"] = 0
- if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
- tenderee_less,tenderee_greater,
- agency_less,agency_greater,
- win_tenderer_less,win_tenderer_greater):
- check_result["entity"] = 0
- check_result["pass"] = 0
- if b_log:
- logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
- else:
- if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
- check_result["entity"] = 2
- elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
- check_result["entity"] = 2
- else:
- check_result["entity"] = 1
- logging.info("moneys_less"+str(moneys_less)+"---"+str(moneys_attachment_less))
- logging.info("moneys_less"+str(moneys_greater)+"---"+str(moneys_attachment_greater))
- if not check_money(bidding_budget_less,bidding_budget_greater,
- win_bid_price_less,win_bid_price_greater,
- moneys_less,moneys_greater,
- moneys_attachment_less,moneys_attachment_greater):
- if b_log:
- logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
- check_result["money"] = 0
- check_result["pass"] = 0
- else:
- if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
- check_result["money"] = 2
- elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
- check_result["money"] = 2
- else:
- check_result["money"] = 1
- #added check
- if not check_package(package_less,package_greater):
- if b_log:
- logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
- check_result["package"] = 0
- check_result["pass"] = 0
- else:
- if getLength(package_less)>0 and getLength(package_greater)>0:
- check_result["package"] = 2
- else:
- check_result["package"] = 1
- #added check
- _time_check = check_time(json_time_less,json_time_greater)
- if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
- if b_log:
- logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
- if isinstance(json_time_less,dict):
- time_less = json_time_less
- else:
- time_less = json.loads(json_time_less)
- if isinstance(json_time_greater,dict):
- time_greater = json_time_greater
- else:
- time_greater = json.loads(json_time_greater)
- for k,v in time_less.items():
- if getLength(v)>0:
- v1 = time_greater.get(k,"")
- if getLength(v1)>0:
- if v!=v1:
- logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
- check_result["time"] = 0
- check_result["pass"] = 0
- else:
- if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
- check_result["time"] = 2
- else:
- check_result["time"] = 1
- if hard_level==2 and check_result["product"]<=1:
- return 0
- if check_result.get("pass",0)==0:
- if b_log:
- logging.info(str(check_result))
- if check_result.get("money",1)==0:
- return 0
- if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
- return _prob
- else:
- return 0
- return _prob
- def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
- if web_source_no_less==web_source_no_greater:
- if fingerprint_less==fingerprint_greater:
- return 1
- else:
- return 0
- if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
- return 1
- if isinstance(project_codes_less,str):
- project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
- elif project_codes_less is None:
- project_codes_less = []
- if isinstance(project_codes_greater,str):
- project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
- elif project_codes_greater is None:
- project_codes_greater = []
- same_count = 0
- all_count = 8
- if len(set(project_codes_less) & set(project_codes_greater))>0:
- same_count += 1
- if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
- same_count += 1
- if getLength(agency_less)>0 and agency_less==agency_greater:
- same_count += 1
- if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
- same_count += 1
- if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
- same_count += 1
- if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
- same_count += 1
- if getLength(project_name_less)>0 and project_name_less==project_name_greater:
- same_count += 1
- if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
- same_count += 1
- base_prob = 0
- if min_counts<3:
- base_prob = 0.9
- elif min_counts<5:
- base_prob = 0.8
- elif min_counts<8:
- base_prob = 0.7
- else:
- base_prob = 0.6
- _prob = base_prob*same_count/all_count
- if min(extract_count_less,extract_count_greater)<=3:
- if _prob<0.1:
- _prob = 0.15
- if province_less!=province_greater:
- return 0
- if _prob<0.1:
- return _prob
- check_result = {"pass":1}
- if docchannel_less in (51,102,103,104,115,116,117):
- if doctitle_refine_less!=doctitle_refine_greater:
- if page_time_less!=page_time_greater:
- check_result["docchannel"] = 0
- check_result["pass"] = 0
- else:
- check_result["docchannel"] = 2
- if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
- check_result["doctitle"] = 0
- check_result["pass"] = 0
- if b_log:
- logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
- else:
- check_result["doctitle"] = 2
- #added check
- if not check_codes(project_codes_less,project_codes_greater):
- check_result["code"] = 0
- check_result["pass"] = 0
- if b_log:
- logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
- else:
- if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
- check_result["code"] = 2
- else:
- check_result["code"] = 1
- if not check_product(product_less,product_greater):
- check_result["product"] = 0
- check_result["pass"] = 0
- if b_log:
- logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
- else:
- if getLength(product_less)>0 and getLength(product_greater)>0:
- check_result["product"] = 2
- else:
- check_result["product"] = 1
- if not check_demand():
- check_result["pass"] = 0
- if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
- tenderee_less,tenderee_greater,
- agency_less,agency_greater,
- win_tenderer_less,win_tenderer_greater):
- check_result["entity"] = 0
- check_result["pass"] = 0
- if b_log:
- logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
- else:
- if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
- check_result["entity"] = 2
- elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
- check_result["entity"] = 2
- else:
- check_result["entity"] = 1
- if not check_money(bidding_budget_less,bidding_budget_greater,
- win_bid_price_less,win_bid_price_greater):
- if b_log:
- logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
- check_result["money"] = 0
- check_result["pass"] = 0
- else:
- if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
- check_result["money"] = 2
- elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
- check_result["money"] = 2
- else:
- check_result["money"] = 1
- #added check
- if not check_package(package_less,package_greater):
- if b_log:
- logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
- check_result["package"] = 0
- check_result["pass"] = 0
- else:
- if getLength(package_less)>0 and getLength(package_greater)>0:
- check_result["package"] = 2
- else:
- check_result["package"] = 1
- #added check
- if not check_time(json_time_less,json_time_greater):
- if b_log:
- logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
- if isinstance(json_time_less,dict):
- time_less = json_time_less
- else:
- time_less = json.loads(json_time_less)
- if isinstance(json_time_greater,dict):
- time_greater = json_time_greater
- else:
- time_greater = json.loads(json_time_greater)
- for k,v in time_less.items():
- if getLength(v)>0:
- v1 = time_greater.get(k,"")
- if getLength(v1)>0:
- if v!=v1:
- logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
- check_result["time"] = 0
- check_result["pass"] = 0
- else:
- if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
- check_result["time"] = 2
- else:
- check_result["time"] = 1
- if hard_level==2 and check_result["product"]<=1:
- return 0
- if check_result.get("pass",0)==0:
- if b_log:
- logging.info(str(check_result))
- if check_result.get("money",1)==0:
- return 0
- if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
- return _prob
- else:
- return 0
- if check_result.get("time",1)==0:
- return 0
- return _prob
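- # A hedged usage note (illustrative, not from production data): the rule returns a
- # duplicate probability in [0,1]. Equal fingerprints short-circuit to 1, a hard conflict
- # (different province, failed money/title/entity check) yields 0, and otherwise the score
- # is base_prob*same_count/all_count over the 8 key fields compared above. For example,
- # passing identical non-empty fingerprints with the default web_source_no arguments returns 1.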
- @annotate("bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->double")
- class f_dumplicate_check(BaseUDTF):
- def __init__(self):
- import logging
- import json
- global logging,json
- def process(self,docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,
- tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,
- bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,
- project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,
- extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,
- page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,
- package_less,package_greater,json_time_less,json_time_greater,json_context,
- province_less,province_greater,city_less,city_greater,district_less,district_greater,
- web_source_no_less,web_source_no_greater,
- extract_json_less,extract_json_greater,page_attachments_less,page_attachments_greater):
- min_counts = 100
- if json_context is not None:
- _context = json.loads(json_context)
- for item in _context:
- if item.get("counts",0)>0 and item.get("counts",0)<min_counts:
- min_counts = item["counts"]
- _extract_less = {}
- if extract_json_less is not None:
- _extract_less = json.loads(extract_json_less)
- _extract_greater = {}
- if extract_json_greater is not None:
- _extract_greater = json.loads(extract_json_greater)
- moneys_less = set(_extract_less.get("moneys",[]))
- moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
- moneys_greater = set(_extract_greater.get("moneys",[]))
- moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
- if page_attachments_less is None:
- page_attachments_less = '[]'
- if page_attachments_greater is None:
- page_attachments_greater = '[]'
- _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
- self.forward(_prob)
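- # A minimal sketch of calling this UDTF from MaxCompute SQL, assuming it has been
- # registered under the name f_dumplicate_check (the table and column names below are
- # hypothetical, shown only to illustrate the call shape):
- # select f_dumplicate_check(docid_less, docid_greater, ..., page_attachments_greater) as prob
- # from t_dumplicate_pairs;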
- @annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
- class f_dumplicate_featureMatrix(BaseUDTF):
- def __init__(self):
- import logging
- import json
- global logging,json
- def process(self,json_context,docchannel_less,docchannel_greater,page_time_less,page_time_greater,nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,
- agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
- win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
- bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater,product_less,product_greater):
- #check the page_time for special docchannels
- if docchannel_less in (51,102,103,104,115,116,117):
- if doctitle_refine_less!=doctitle_refine_greater:
- if page_time_less!=page_time_greater:
- self.forward("[1-%s]"%(str(docchannel_less)),0)
- return
- if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
- self.forward("[2-%s]"%(str(doctitle_refine_less)+"=="+str(doctitle_refine_greater)),0)
- return
- # if not check_codes([project_code_less],[project_code_greater]):
- # self.forward("[3-%s]"%(str(project_code_less)+"=="+str(project_code_greater)),0)
- # return
- if not check_demand():
- self.forward("[4-]",0)
- return
- if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
- tenderee_less,tenderee_greater,
- agency_less,agency_greater,
- win_tenderer_less,win_tenderer_greater):
- _error = ""
- for a in [nlp_enterprise_less,nlp_enterprise_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater]:
- _error += str(a)
- self.forward("[5-%s]"%_error,0)
- return
- if not check_money(bidding_budget_less,bidding_budget_greater,
- win_bid_price_less,win_bid_price_greater):
- _error = ""
- for a in [bidding_budget_less,bidding_budget_greater,
- win_bid_price_less,win_bid_price_greater]:
- _error += str(a)
- self.forward("[6-%s]"%_error,0)
- return
- if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
- _error = "%s=%s"%(str(product_less),str(product_greater))
- self.forward("7-%s"%_error,0)
- return
- _context = json.loads(json_context)
- min_counts = 100
- dict_context = {}
- for item in _context:
- if item["counts"]<min_counts:
- min_counts = item["counts"]
- dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
- context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
- list_matrix = []
- #get the features of the context into the matrix
- # for index_i in range(len(context_key)):
- # for index_j in range(index_i+1,len(context_key)):
- # _key = "%s&%s"%(context_key[index_i],context_key[index_j])
- # _v = featurnCount(dict_context.get(_key,[0,0])[1])
- # list_matrix.append(_v)
- # context3_key = ["tenderee","agency","win_tenderer","win_bid_price","bidding_budget"]
- # for index_i in range(len(context3_key)):
- # for index_j in range(index_i+1,len(context3_key)):
- # for index_k in range(index_j+1,len(context3_key)):
- # _key = "%s&%s&%s"%(context3_key[index_i],context3_key[index_j],context3_key[index_k])
- # _v = featurnCount(dict_context.get(_key,[0,0])[1])
- # list_matrix.append(_v)
- # list_matrix.append(getSimLevel(tenderee_less,tenderee_greater)/10)
- # list_matrix.append(getSimLevel(agency_less,agency_greater)/10)
- # list_matrix.append(getSimilarityOfString(project_code_less,project_code_greater))
- # list_matrix.append(getSimilarityOfString(project_name_less,project_name_greater))
- # list_matrix.append(getSimLevel(win_tenderer_less,win_tenderer_greater)/10)
- # list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
- # list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
- # list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
- json_matrix = json.dumps(list_matrix)
- same_count = 0
- all_count = 8
- if getSimilarityOfString(project_code_less,project_code_greater)==1:
- same_count += 1
- if getSimilarityOfString(tenderee_less,tenderee_greater)==1:
- same_count += 1
- if getSimilarityOfString(agency_less,agency_greater)==1:
- same_count += 1
- if getSimilarityOfString(win_tenderer_less,win_tenderer_greater)==1:
- same_count += 1
- if getSimilarityOfString(bidding_budget_less,bidding_budget_greater)==1:
- same_count += 1
- if getSimilarityOfString(win_bid_price_less,win_bid_price_greater)==1:
- same_count += 1
- if getSimilarityOfString(project_name_less,project_name_greater)==1:
- same_count += 1
- if getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater)==1:
- same_count += 1
- base_prob = 0
- if min_counts<3:
- base_prob = 0.9
- elif min_counts<5:
- base_prob = 0.8
- elif min_counts<8:
- base_prob = 0.7
- else:
- base_prob = 0.6
- _prob = base_prob*same_count/all_count
- json_matrix = "[==%s]"%(str(base_prob)+"="+str(same_count)+"="+str(all_count)+str(product_less)+str(product_greater))
- self.forward(json_matrix,_prob)
- return
- @annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double,string,string,string,string,string,string->string')
- class f_redump_probability_final_check(BaseUDAF):
- '''
- Re-check after dedup merging. When a group has more than 5 members, doctitle, tenderee, win_tenderer and bidding_budget may each take only one value within the group;
- when a group has 5 or fewer members, tenderee, win_tenderer and bidding_budget may each take only one value within the group.
- '''
- def __init__(self):
- import logging
- import json,re
- global json,logging,re
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def new_buffer(self):
- return [list()]
- def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_codes,project_name,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence,
- province,city,district,web_source_no,extract_json,page_attachments):
- buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
- "project_codes":project_codes,"project_name":project_name,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
- "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence,
- "province":province,"city":city,"district":district,"web_source_no":web_source_no,"extract_json":extract_json,"page_attachments":page_attachments})
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0])
- def terminate(self, buffer):
- list_group = []
- the_group = buffer[0]
- the_group.sort(key=lambda x:x["confidence"],reverse=True)
- _index = 0
- final_group = []
- if len(the_group)>0:
- _index = 0
- while _index<len(the_group):
- document_greater = the_group[_index]
- docid_greater = document_greater["docid"]
- docchannel_greater = document_greater["docchannel"]
- page_time_greater = document_greater["page_time"]
- doctitle_refine_greater = document_greater["doctitle_refine"]
- project_codes_greater = document_greater["project_codes"]
- nlp_enterprise_greater = document_greater["nlp_enterprise"]
- tenderee_greater = document_greater["tenderee"]
- agency_greater = document_greater["agency"]
- win_tenderer_greater = document_greater["win_tenderer"]
- bidding_budget_greater = document_greater["bidding_budget"]
- win_bid_price_greater = document_greater["win_bid_price"]
- product_greater = document_greater["product"]
- package_greater = document_greater["package"]
- json_time_greater = document_greater["json_dicttime"]
- fingerprint_greater = document_greater.get("fingerprint","")
- project_name_greater = document_greater["project_name"]
- extract_count_greater = document_greater["extract_count"]
- province_greater = document_greater["province"]
- city_greater = document_greater["city"]
- district_greater = document_greater["district"]
- web_source_no_greater = document_greater["web_source_no"]
- extract_json_greater = document_greater["extract_json"]
- page_attachments_greater = document_greater["page_attachments"]
- _pass = True
- for document_less in final_group:
- docid_less = document_less["docid"]
- docchannel_less = document_less["docchannel"]
- page_time_less = document_less["page_time"]
- doctitle_refine_less = document_less["doctitle_refine"]
- project_codes_less = document_less["project_codes"]
- nlp_enterprise_less = document_less["nlp_enterprise"]
- tenderee_less = document_less["tenderee"]
- agency_less = document_less["agency"]
- win_tenderer_less = document_less["win_tenderer"]
- bidding_budget_less = document_less["bidding_budget"]
- win_bid_price_less = document_less["win_bid_price"]
- product_less = document_less["product"]
- package_less = document_less["package"]
- json_time_less = document_less["json_dicttime"]
- fingerprint_less = document_less.get("fingerprint","")
- project_name_less = document_less["project_name"]
- extract_count_less = document_less["extract_count"]
- province_less = document_less["province"]
- city_less = document_less["city"]
- district_less = document_less["district"]
- web_source_no_less = document_less["web_source_no"]
- extract_json_less = document_less["extract_json"]
- page_attachments_less = document_less["page_attachments"]
- # initialize first to avoid a NameError when extract_json_less is None
- _extract_less = {}
- if extract_json_less is not None:
- _extract_less = json.loads(extract_json_less)
- _extract_greater = {}
- if extract_json_greater is not None:
- _extract_greater = json.loads(extract_json_greater)
- moneys_less = set(_extract_less.get("moneys",[]))
- moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
- moneys_greater = set(_extract_greater.get("moneys",[]))
- moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
- if page_attachments_less is None:
- page_attachments_less = '[]'
- if page_attachments_greater is None:
- page_attachments_greater = '[]'
- _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,len(the_group),b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
- if _prob<0.1:
- _pass = False
- break
- if _pass:
- final_group.append(document_greater)
- else:
- break
- _index += 1
- dumplicates = ""
- if _index>1:
- logging.info("index/whole:%d/%d"%(_index,len(the_group)))
- final_group.sort(key=lambda x:x["docid"])
- final_group.sort(key=lambda x:x["extract_count"],reverse=True)
- _set = set()
- for _d in final_group:
- _docid = _d["docid"]
- if _docid in _set:
- continue
- dumplicates += "%d,"%_docid
- _set.add(_docid)
- dumplicates = dumplicates[:-1]
- return dumplicates
- @annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double->string')
- class f_redump_probability_final_check_bak(BaseUDAF):
- '''
- Re-check after dedup merging. When a group has more than 5 members, doctitle, tenderee, win_tenderer and bidding_budget may each take only one value within the group;
- when a group has 5 or fewer members, tenderee, win_tenderer and bidding_budget may each take only one value within the group.
- '''
- def __init__(self):
- import logging
- import json,re
- global json,logging,re
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def new_buffer(self):
- return [list()]
- def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_code,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence):
- buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
- "project_code":project_code,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
- "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence})
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0])
- def terminate(self, buffer):
- list_group = []
- the_group = buffer[0]
- the_group.sort(key=lambda x:x["confidence"],reverse=True)
- _index = 0
- if len(the_group)>0:
- _index = 1
- while _index<len(the_group):
- document_greater = the_group[_index]
- docchannel_greater = document_greater["docchannel"]
- page_time_greater = document_greater["page_time"]
- doctitle_refine_greater = document_greater["doctitle_refine"]
- project_code_greater = document_greater["project_code"]
- nlp_enterprise_greater = document_greater["nlp_enterprise"]
- tenderee_greater = document_greater["tenderee"]
- agency_greater = document_greater["agency"]
- win_tenderer_greater = document_greater["win_tenderer"]
- bidding_budget_greater = document_greater["bidding_budget"]
- win_bid_price_greater = document_greater["win_bid_price"]
- product_greater = document_greater["product"]
- package_greater = document_greater["package"]
- json_time_greater = document_greater["json_dicttime"]
- _less_index = 0
- while _less_index<_index:
- document_less = the_group[_less_index]
- docchannel_less = document_less["docchannel"]
- page_time_less = document_less["page_time"]
- doctitle_refine_less = document_less["doctitle_refine"]
- project_code_less = document_less["project_code"]
- nlp_enterprise_less = document_less["nlp_enterprise"]
- tenderee_less = document_less["tenderee"]
- agency_less = document_less["agency"]
- win_tenderer_less = document_less["win_tenderer"]
- bidding_budget_less = document_less["bidding_budget"]
- win_bid_price_less = document_less["win_bid_price"]
- product_less = document_less["product"]
- package_less = document_less["package"]
- json_time_less = document_less["json_dicttime"]
- check_result = {"pass":1}
- if docchannel_less in (51,102,103,104,115,116,117):
- if doctitle_refine_less!=doctitle_refine_greater:
- if page_time_less!=page_time_greater:
- check_result["docchannel"] = 0
- check_result["pass"] = 0
- else:
- check_result["docchannel"] = 2
- if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,[str(project_code_less)],[str(project_code_greater)]):
- check_result["doctitle"] = 0
- check_result["pass"] = 0
- logging.info("check_doctitle_failed:%s==%s"%(str(doctitle_refine_less),str(doctitle_refine_greater)))
- else:
- check_result["doctitle"] = 2
- #added check
- if not check_codes([project_code_less],[project_code_greater]):
- check_result["code"] = 0
- check_result["pass"] = 0
- logging.info("check_code_failed:%s==%s"%(str(project_code_less),str(project_code_greater)))
- else:
- if getLength(project_code_less)>0 and getLength(project_code_greater)>0 and project_code_less==project_code_greater:
- check_result["code"] = 2
- else:
- check_result["code"] = 1
- if not check_product(product_less,product_greater):
- check_result["product"] = 0
- check_result["pass"] = 0
- logging.info("check_product_failed:%s==%s"%(str(product_less),str(product_greater)))
- else:
- if getLength(product_less)>0 and getLength(product_greater)>0:
- check_result["product"] = 2
- else:
- check_result["product"] = 1
- if not check_demand():
- check_result["pass"] = 0
- if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
- tenderee_less,tenderee_greater,
- agency_less,agency_greater,
- win_tenderer_less,win_tenderer_greater):
- check_result["entity"] = 0
- check_result["pass"] = 0
- logging.info("check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
- else:
- if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
- check_result["entity"] = 2
- elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
- check_result["entity"] = 2
- else:
- check_result["entity"] = 1
- if not check_money(bidding_budget_less,bidding_budget_greater,
- win_bid_price_less,win_bid_price_greater):
- logging.info("check_money_failed:%s==%s==%s==%s"%(str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
- check_result["money"] = 0
- check_result["pass"] = 0
- else:
- if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
- check_result["money"] = 2
- elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
- check_result["money"] = 2
- else:
- check_result["money"] = 1
- #added check
- if not check_package(package_less,package_greater):
- logging.info("check_package_failed:%s==%s"%(str(package_less),str(package_greater)))
- check_result["package"] = 0
- check_result["pass"] = 0
- else:
- if getLength(package_less)>0 and getLength(package_greater)>0:
- check_result["package"] = 2
- else:
- check_result["package"] = 1
- #added check
- if not check_time(json_time_less,json_time_greater):
- logging.info("check_time_failed:%s==%s"%(str(json_time_less),str(json_time_greater)))
- check_result["time"] = 0
- check_result["pass"] = 0
- else:
- if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
- check_result["time"] = 2
- else:
- check_result["time"] = 1
- if check_result.get("pass",0)==0:
- logging.info(str(check_result))
- if check_result.get("time",1)==0:
- break
- if check_result.get("money",1)==0:
- break
- if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2:
- pass
- else:
- break
- _less_index += 1
- if _less_index!=_index:
- break
- _index += 1
- dumplicates = ""
- if _index>1:
- logging.info("index/whole:%d/%d"%(_index,len(the_group)))
- final_group = the_group[:_index]
- final_group.sort(key=lambda x:x["docid"])
- final_group.sort(key=lambda x:x["extract_count"],reverse=True)
- _set = set()
- for _d in final_group:
- _docid = _d["docid"]
- if _docid in _set:
- continue
- dumplicates += "%d,"%_docid
- _set.add(_docid)
- dumplicates = dumplicates[:-1]
- return dumplicates
- @annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,string->string')
- class f_set_docid_binaryChart(BaseUDAF):
- '''
- project_code, win_tenderer, len(project_code)>7, win_tenderer <> ""
- '''
- def __init__(self):
- import json
- global json
- def new_buffer(self):
- return [[]]
- def iterate(self, buffer,docid, page_time_stamp,extract_count,project_code,project_name,tenderee,bidding_budget,win_tenderer,win_bid_price,agency,web_source_no):
- buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,
- "project_code":project_code,"project_name":project_name,"tenderee":tenderee,
- "bidding_budget":bidding_budget,"win_tenderer":win_tenderer,"win_bid_price":win_bid_price,
- "agency":agency,"web_source_no":web_source_no})
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0])
- def terminate(self, buffer):
- list_docs = buffer[0]
- list_timeGroups = split_with_time(list_docs,"page_time_stamp",86400*7)
- list_group = []
- empty_key = ["project_code","bidding_budget","win_tenderer","win_bid_price","agency"]
- for _timeGroups in list_timeGroups:
- list_empty = []
- list_notEmpty = []
- for _item in _timeGroups:
- empty_flag = True
- for _key in empty_key:
- if not isEmpty(_item[_key]):
- empty_flag = False
- break
- if empty_flag:
- list_empty.append(_item)
- else:
- list_notEmpty.append(_item)
- for _e in list_empty:
- _group = [{"docid":_e["docid"],"extract_count":_e["extract_count"]}]
- _e_tenderee = _e["tenderee"]
- for _ne in list_notEmpty:
- if "set_webSource" not in _ne:
- _ne["set_webSource"] = set()
- _ne["set_webSource"].add(_ne["web_source_no"])
- _suit = False
- if not isEmpty(_e_tenderee) and _e_tenderee==_ne["tenderee"]:
- _suit = True
- elif isEmpty(_e_tenderee):
- _suit = True
- if _suit:
- if _e["web_source_no"] not in _ne["set_webSource"]:
- _ne["set_webSource"].add(_e["web_source_no"])
- _group.append({"docid":_ne["docid"],"extract_count":_ne["extract_count"]})
- break
- if len(_group)>1:
- list_group.append(_group)
- return json.dumps(list_group)
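- # Output sketch (illustrative, not from the source): terminate returns a JSON string of
- # groups such as [[{"docid":1,"extract_count":5},{"docid":2,"extract_count":3}]], pairing
- # an "empty" announcement with one matching non-empty announcement from another web source.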
- def split_with_time(list_dict,sort_key,timedelta=86400*7):
- if len(list_dict)>0:
- if sort_key in list_dict[0]:
- list_dict.sort(key=lambda x:x[sort_key])
- list_group = []
- _begin = 0
- for i in range(len(list_dict)-1):
- if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<=timedelta:
- continue
- else:
- _group = []
- for j in range(_begin,i+1):
- _group.append(list_dict[j])
- if len(_group)>1:
- list_group.append(_group)
- _begin = i + 1
- if len(list_dict)>1:
- _group = []
- for j in range(_begin,len(list_dict)):
- _group.append(list_dict[j])
- if len(_group)>1:
- list_group.append(_group)
- return list_group
- return [list_dict]
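- # A hedged example of split_with_time (illustrative values): records whose sort_key values
- # are within `timedelta` of their neighbor fall into the same group, and singleton groups
- # are dropped:
- # docs = [{"page_time_stamp":0},{"page_time_stamp":86400},{"page_time_stamp":86400*30}]
- # split_with_time(docs,"page_time_stamp",86400*7) # -> [[{"page_time_stamp":0},{"page_time_stamp":86400}]]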
- @annotate('bigint,bigint,bigint,string,string,string,string,string->string')
- class f_set_docid_limitNum_contain(BaseUDAF):
- '''
- project_code, win_tenderer, len(project_code)>7, win_tenderer <> "", fewer than 2 distinct non-empty tenderees after merging, identical non-empty amounts within the same announcement type after merging
- '''
- def __init__(self):
- import logging
- import json,re
- global json,logging,re
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def new_buffer(self):
- return [list()]
- def iterate(self, buffer,docid,page_time_stamp,extract_count,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column):
- buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"extract_count":extract_count,"set_limit_column1":set_limit_column1,
- "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
- "contain_column":contain_column})
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0])
- def terminate(self, buffer):
- list_split = split_with_time(buffer[0],"page_time_stamp")
- list_group = []
- for _split in list_split:
- flag = True
- keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
- for _key in keys:
- logging.info(_key+str(getSet(_split,_key)))
- if len(getSet(_split,_key))>1:
- flag = False
- break
- MAX_CONTAIN_COLUMN = None
- #check whether each announcement in the group is contained by the longest one
- if flag:
- for _d in _split:
- contain_column = _d["contain_column"]
- if contain_column is not None and contain_column !="":
- if MAX_CONTAIN_COLUMN is None:
- MAX_CONTAIN_COLUMN = contain_column
- else:
- if len(MAX_CONTAIN_COLUMN)<len(contain_column):
- if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
- flag = False
- break
- MAX_CONTAIN_COLUMN = contain_column
- else:
- if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
- flag = False
- break
- if flag:
- if len(_split)>1:
- _group = []
- for _item in _split:
- _group.append({"docid":_item["docid"],"extract_count":_item["extract_count"]})
- list_group.append(_group)
- return json.dumps(list_group)
- @annotate('bigint->string')
- class f_stamp_squence(BaseUDAF):
- '''
- Merge collected page_time stamps into sorted intervals, padding each interval by one week on both sides
- '''
- def __init__(self):
- import json
- global json
- import logging
- global logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def new_buffer(self):
- return [set()]
- def iterate(self, buffer,page_time_stamp):
- buffer[0].add(page_time_stamp)
- def merge(self, buffer, pbuffer):
- buffer[0] |= pbuffer[0]
- def terminate(self, buffer):
- if 0 in buffer[0]:
- buffer[0].remove(0)
- list_stamp = list(buffer[0])
- list_stamp.sort(key=lambda x:x)
- list_stamp_final = []
- _begin = 0
- _time_decase = 86400*7
- logging.info(str(list_stamp))
- for _index in range(len(list_stamp)-1):
- if list_stamp[_index+1]-list_stamp[_index]<_time_decase:
- continue
- else:
- list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[_index]+_time_decase])
- _begin = _index+1
- if len(list_stamp)>0:
- list_stamp_final.append([list_stamp[_begin]-_time_decase,list_stamp[-1]+_time_decase])
- return json.dumps(list_stamp_final)
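- # Output sketch (illustrative): stamps {0, 86400, 86400*30} merge into week-padded
- # intervals, serialized as JSON: [[-604800, 691200], [1987200, 3196800]]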
- @annotate("bigint,string->bigint")
- class in_stamp(object):
- def __init__(self):
- import logging
- import re
- import json
- global logging,re,json
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def evaluate(self, page_time_stamp,json_stamp):
- list_stamp = json.loads(json_stamp)
- int_flag = 0
- for item in list_stamp:
- if page_time_stamp <item[0]:
- break
- if page_time_stamp>item[0] and page_time_stamp<item[1]:
- int_flag = 1
- break
- return int_flag
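- # Example (illustrative): in_stamp tests strict containment in any interval.
- # evaluate(100, "[[0,200],[300,400]]") -> 1; evaluate(250, "[[0,200],[300,400]]") -> 0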
- def getConfidence(rule_id):
- if rule_id ==0:
- return 30
- elif rule_id >=1 and rule_id <30:
- return 20
- else:
- return 10
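- # Confidence mapping implemented above: rule_id 0 -> 30, 1..29 -> 20, otherwise 10;
- # e.g. getConfidence(12) -> 20.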
- @annotate('string,string -> string')
- class f_splitStr(BaseUDTF):
- '''
- Split a string by the given separator and emit one record per part
- '''
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self, str_split,_split):
- try:
- for _s in str_split.split(_split):
- self.forward(_s)
- except Exception as e:
- pass
- @annotate('string,bigint -> bigint,bigint,bigint,bigint,bigint')
- class f_split_group_single(BaseUDTF):
- '''
- Explode each JSON group into pairwise (docid,docid) records with extract counts and confidence
- '''
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self, json_set_docid,rule_id):
- list_group = json.loads(json_set_docid)
- for item in list_group:
- if len(item)>100:
- item.sort(key=lambda x:x["docid"],reverse=True)
- index_i = 0
- for index_j in range(1,len(item)):
- if item[index_i]["docid"]!=item[index_j]["docid"]:
- self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
- else:
- for index_i in range(len(item)):
- for index_j in range(len(item)):
- if index_i!=index_j and item[index_i]["docid"]!=item[index_j]["docid"]:
- self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
- @annotate('bigint,string->string')
- class group_document(BaseUDAF):
- '''
- Collect (id, json_set_docid) records into one JSON list per group
- '''
- def __init__(self):
- import json
- global json
- def new_buffer(self):
- return [[]]
- def iterate(self, buffer,id,json_set_docid):
- buffer[0].append({"id":id,"json_set_docid":json.loads(json_set_docid)})
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0])
- def terminate(self, buffer):
- return json.dumps(buffer[0])
- @annotate('bigint,string,bigint,string -> bigint,bigint,string')
- class decare_document(BaseUDTF):
- '''
- Cartesian-compare document groups and merge the docid sets of groups that overlap
- '''
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self,group_id1, json_list_doc1,group_id2,json_list_doc2):
- #only compare pairs on one side of the y=x diagonal, cutting nearly half the data
- if group_id1>=group_id2:
- list_doc1 = json.loads(json_list_doc1)
- list_doc2 = json.loads(json_list_doc2)
- for _doc1 in list_doc1:
- for _doc2 in list_doc2:
- #skip comparison within the same duplicate group
- if _doc1["id"]!=_doc2["id"]:
- #check whether the two groups share any docid
- _set1 = set()
- for _item1 in _doc1["json_set_docid"]:
- _set1.add(_item1["docid"])
- _set2 = set()
- for _item2 in _doc2["json_set_docid"]:
- _set2.add(_item2["docid"])
- if len(_set1&_set2)>0:
- new_json_set_docid = _doc1["json_set_docid"]
- for _item2 in _doc2["json_set_docid"]:
- if _item2["docid"] not in _set1:
- new_json_set_docid.append(_item2)
- self.forward(_doc1["id"],_doc2["id"],json.dumps(new_json_set_docid))
- def getBestDocid(list_pair):
- # [docid1,extract_count1,docid2,extract_count2]
- # list_pair.sort(key=lambda x:x[3],reverse=True)
- # _max_count = max(list_pair[0][3],list_pair[0][1])
- # set_candidate = set()
- # if list_pair[0][1]==_max_count:
- # set_candidate.add(list_pair[0][0])
- # for item in list_pair:
- # if item[3]==_max_count:
- # set_candidate.add(item[2])
- # else:
- # break
- # list_candidate = list(set_candidate)
- # list_candidate.sort(key=lambda x:x)
- new_pair = []
- new_pair.append([list_pair[0][0],list_pair[0][0],list_pair[0][1]])
- for item in list_pair:
- new_pair.append([item[0],item[2],item[3]])
- new_pair.sort(key=lambda x:x[1])
- new_pair.sort(key=lambda x:x[2],reverse=True)
- return new_pair[0][1]
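- # A hedged example (illustrative pairs; each row is [docid1,extract_count1,docid2,extract_count2]):
- # the docid with the highest extract_count wins, ties broken by the smaller docid.
- # getBestDocid([[1,5,2,7],[1,5,3,4]]) # -> 2 (docid 2 carries the highest count, 7)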
- @annotate('bigint,bigint,bigint,bigint->string')
- class choose_document(BaseUDAF):
- '''
- Pick the best docid from the collected duplicate pairs; save_flag=1 means the current docid is the one to keep
- '''
- def __init__(self):
- import json
- global json
- def new_buffer(self):
- return [[]]
- def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
- buffer[0].append([docid1,extract_count1,docid2,extract_count2])
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0])
- def terminate(self, buffer):
- list_pair = buffer[0]
- _set = set()
- for item in buffer[0]:
- _set.add(str(item[2]))
- list_dumplicate = list(_set)
- best_docid = getBestDocid(list_pair)
- if best_docid==list_pair[0][0]:
- save_flag = 1
- else:
- save_flag = 0
- return json.dumps({"save_flag":save_flag,"dumplicates":list_dumplicate})
- @annotate('string -> bigint,string')
- class f_get_choose_document(BaseUDTF):
- '''
- Unpack the JSON produced by choose_document into (save_flag, dumplicates)
- '''
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self,json_choose):
- if json_choose is None:
- self.forward(1,None)
- else:
- _choose = json.loads(json_choose)
- self.forward(_choose["save_flag"],",".join(_choose["dumplicates"]))
- @annotate('string->bigint')
- class f_get_codes_count(object):
- def evaluate(self,extract_json):
- if extract_json is None or extract_json=="":
- extract_json = "{}"
- _extract = json.loads(extract_json)
- _codes = _extract.get("code",[])
- return len(_codes)
- @annotate('string->string')
- class f_get_codes(object):
- def evaluate(self,extract_json):
- if extract_json is None or extract_json=="":
- extract_json = "{}"
- _extract = json.loads(extract_json)
- _codes = _extract.get("code",[])
- return ",".join(_codes)
- @annotate('bigint,bigint,bigint,bigint->string')
- class group_document_bestFirst(BaseUDAF):
- '''
- Put the best docid of the group first
- '''
- def __init__(self):
- import json
- global json
- def new_buffer(self):
- return [[]]
- def iterate(self, buffer,docid1,extract_count1,docid2,extract_count2):
- buffer[0].append([docid1,extract_count1,docid2,extract_count2])
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0])
- def terminate(self, buffer):
- list_pair = buffer[0]
- _set = set()
- for item in buffer[0]:
- _set.add(item[2])
- _set.add(list_pair[0][0])
- best_docid = getBestDocid(list_pair)
- _set.remove(best_docid)
- list_dumplicate = list(_set)
- list_dumplicate.sort(key=lambda x:x)
- list_dumplicate.insert(0,best_docid)
- list_dumplicate_str = []
- for item in list_dumplicate:
- list_dumplicate_str.append(str(item))
- return ",".join(list_dumplicate_str)
- @annotate('string -> bigint,string')
- class f_get_best_dumplicates(BaseUDTF):
- '''
- Get the best record of each group along with its duplicate records
- '''
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self,list_dumplicate_str):
- if list_dumplicate_str is None or list_dumplicate_str=='':
- pass
- else:
- list_dumplicate = list_dumplicate_str.split(",")
- if len(list_dumplicate)>0:
- self.forward(int(list_dumplicate[0]),",".join(list_dumplicate[1:]))
- else:
- pass
- @annotate('bigint,bigint->string')
- class bridge2group(BaseUDAF):
- '''
- Collect the docids of duplicate pairs into one descending-sorted JSON list
- '''
- def __init__(self):
- import json
- global json
- def new_buffer(self):
- return [set()]
- def iterate(self, buffer,docid1,docid2):
- buffer[0].add(docid1)
- buffer[0].add(docid2)
- def merge(self, buffer, pbuffer):
- buffer[0] |= pbuffer[0]
- def terminate(self, buffer):
- list_pair = list(buffer[0])
- list_pair.sort(key=lambda x:x,reverse=True)
- return json.dumps(list_pair)
- @annotate('string -> bigint,bigint')
- class group2bridge(BaseUDTF):
- '''
- Emit (bridge_docid, docid) pairs, using the last docid of the list as the bridge
- '''
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self,json_list_docid):
- list_docid = json.loads(json_list_docid)
- for _docid in list_docid:
- self.forward(list_docid[-1],_docid)
- @annotate('string->string')
- class to_url(object):
- def evaluate(self,_s):
- if _s is None or _s=="":
- return
- else:
- list_l = []
- for l in _s.split(","):
- list_l.append("http://www.bidizhaobiao.com/info-%s.html"%l)
- return ",".join(list_l)
- @annotate('bigint,bigint,string -> bigint')
- class f_get_dump_docid(BaseUDTF):
- '''
- Emit the docids that should be dropped as duplicates
- '''
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self,docid,save_flag,dumplicates):
- if save_flag==0:
- self.forward(docid)
- if dumplicates is not None:
- list_docid = dumplicates.split(",")
- if len(list_docid)>0:
- for _docid in list_docid[1:]:
- self.forward(int(_docid))
- else:
- if dumplicates is not None:
- list_docid = dumplicates.split(",")
- if len(list_docid)>0:
- for _docid in list_docid:
- self.forward(int(_docid))
- @annotate('string -> bigint,bigint')
- class f_get_docid(BaseUDTF):
- '''
- Emit (team_id, docid) pairs from the grouped JSON
- '''
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self,json_set_docid):
- team_id = 0
- if json_set_docid is not None:
- list_docses = json.loads(json_set_docid)
- for list_docs in list_docses:
- team_id += 1
- for item in list_docs:
- self.forward(team_id,item["docid"])
- @annotate("string->bigint")
- class get_count_dump(object):
- def __init__(self):
- import logging
- import re
- global logging,re
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def evaluate(self, title):
- _count = 0
- if title is not None:
- _count = len(title.split(","))
- return _count
- def getSet(list_dict,key):
- _set = set()
- for item in list_dict:
- if key in item:
- if item[key]!='' and item[key] is not None:
- if re.search("^\d[\d\.]*$",item[key]) is not None:
- _set.add(str(float(item[key])))
- else:
- _set.add(str(item[key]))
- return _set
- def getDiffIndex(list_dict,key,confidence=100):
- '''
- Optimized to use similarity comparison instead of strict equality
- :param list_dict:
- :param key:
- :param confidence:
- :return:
- '''
- # _set = set()
- # for _i in range(len(list_dict)):
- # item = list_dict[_i]
- # if item["confidence"]>=confidence:
- # continue
- # if key in item:
- # if item[key]!='' and item[key] is not None:
- # if re.search("^\d+(\.\d+)?$",item[key]) is not None:
- # _set.add(str(float(item[key])))
- # else:
- # _set.add(str(item[key]))
- # if len(_set)>1:
- # return _i
- # ==============================
- _set = set()
- _set_m = set()
- base_s = ""
- for _i in range(len(list_dict)):
- item = list_dict[_i]
- if item["confidence"]>=confidence:
- continue
- if key in item:
- if item[key]!='' and item[key] is not None:
- if re.search("^\d+(\.\d+)?$",item[key]) is not None:
- _m = float(item[key])
- if _m>100000:
- _m = _m//10000*10000
- _set_m.add(str(_m))
- else:
- _s = str(item[key])
- if base_s=="":
- base_s = _s
- else:
- simi = getSimilarityOfString(base_s,_s)
- if simi<0.8:
- return _i
- if len(_set_m)>1:
- return _i
- return len(list_dict)
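- # Hedged examples (illustrative): numbers above 100000 are bucketed down to the nearest
- # 10000 before comparison, and strings diverge when their similarity drops below 0.8.
- # getDiffIndex([{"confidence":0,"m":"200000"},{"confidence":0,"m":"201000"}],"m") # -> 2 (same bucket, no diff)
- # getDiffIndex([{"confidence":0,"m":"200000"},{"confidence":0,"m":"300000"}],"m") # -> 1 (diverges at index 1)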
- @annotate('bigint,string -> bigint,bigint')
- class f_getGroup_dumpFinal(BaseUDTF):
- '''
- Extract groups from the final result
- '''
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self,docid,dumplicates):
- self.forward(int(docid),int(docid))
- if dumplicates is not None:
- list_docids = dumplicates.split(",")
- for _docid in list_docids:
- self.forward(int(docid),int(_docid))
- @annotate('bigint,bigint,string,string,string,string,bigint,bigint,bigint->string')
- class f_redump_limit_num(BaseUDAF):
- '''
- Re-check after dedup merging. When a group has more than 5 members, doctitle, tenderee, win_tenderer and bidding_budget may each take only one value within the group;
- when a group has 5 or fewer members, tenderee, win_tenderer and bidding_budget may each take only one value within the group.
- '''
- def __init__(self):
- import logging
- import json,re
- global json,logging,re
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def new_buffer(self):
- return [list()]
- def iterate(self, buffer,main_docid,docid,doctitle,set_limit_column2,set_limit_column3,set_limit_column4,extract_count1,extract_count2,confidence):
- buffer[0].append({"main_docid":main_docid,"docid":docid,"doctitle":doctitle,"set_limit_column2":set_limit_column2,
- "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,
- "extract_count2":extract_count2,"confidence":confidence})
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0])
- def terminate(self, buffer):
- list_group = []
- the_group = buffer[0]
- the_group.sort(key=lambda x:x["confidence"],reverse=True)
- if len(the_group)>5:
- keys = ["doctitle","set_limit_column2","set_limit_column3","set_limit_column4"]
- else:
- keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
- final_group = []
- #confidence
- list_key_index = []
- for _k in keys:
- if _k=="doctitle":
- list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
- else:
- list_key_index.append(getDiffIndex(the_group,_k))
- _index = min(list_key_index)
- if _index>1:
- main_docid = the_group[0]["main_docid"]
- for item in the_group[:_index]:
- if item["docid"]!=main_docid:
- final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"],"confidence":item["confidence"]})
- # stay = True
- # for _key in keys:
- # if len(getSet(the_group,_key))>1:
- # stay = False
- # break
- #
- # if stay:
- # main_docid = the_group[0]["main_docid"]
- # for item in the_group:
- # if item["docid"]!=main_docid:
- # final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"]})
- return json.dumps(final_group)
- @annotate('string -> bigint,bigint,bigint,bigint,bigint')
- class f_get_dumpFinal_checked(BaseUDTF):
- '''
- Extract groups from the final result
- '''
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self,list_group):
- if list_group is not None:
- final_group = json.loads(list_group)
- for _group in final_group:
- self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"],_group["confidence"])
- @annotate('string -> bigint')
- class f_getDumplicateDocids(BaseUDTF):
- '''
- Emit each docid from the comma-separated dumplicates string
- '''
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self,dumplicates):
- list_docids = dumplicates.split(",")
- for _d in list_docids:
- self.forward(int(_d))
- def jaccard_score(source,target):
- source_set = set([s for s in source])
- target_set = set([s for s in target])
- if len(source_set)==0 or len(target_set)==0:
- return 0
- return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
- def getSimilarityOfString(str1,str2):
- _set1 = set()
- _set2 = set()
- if str1 is not None:
- for i in range(1,len(str1)):
- _set1.add(str1[i-1:i+1])
- for i in range(2,len(str1)):
- _set1.add(str1[i-2:i+1])
- if str2 is not None:
- for i in range(1,len(str2)):
- _set2.add(str2[i-1:i+1])
- for i in range(2,len(str2)):
- _set2.add(str2[i-2:i+1])
- _len = max(1,min(len(_set1),len(_set2)))
- return len(_set1&_set2)/_len
- @annotate("string,string,string,string,string,string,string,string,string,string->bigint")
- class f_is_legal(object):
- def __init__(self):
- import logging
- import re
- global logging,re
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def evaluate(self, tenderee1,tenderee2,bidding_budget1,bidding_budget2,win_tenderer1,win_tenderer2,win_bid_price1,win_bid_price2,project_code1,project_code2):
- if tenderee1 is not None and tenderee1!="" and tenderee2 is not None and tenderee2!="" and tenderee1!=tenderee2:
- return 0
- if bidding_budget1 is not None and bidding_budget1!="" and bidding_budget2 is not None and bidding_budget2!="" and bidding_budget1!=bidding_budget2:
- return 0
- if win_tenderer1 is not None and win_tenderer1!="" and win_tenderer2 is not None and win_tenderer2!="" and win_tenderer1!=win_tenderer2:
- return 0
- if win_bid_price1 is not None and win_bid_price1!="" and win_bid_price2 is not None and win_bid_price2!="" and win_bid_price1!=win_bid_price2:
- return 0
- _sim = getSimilarityOfString(project_code1,project_code2)
- if _sim>0.7 and _sim<1:
- return 0
- return 1
- @annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint->string')
- class f_autorule_group(BaseUDAF):
- '''
- Re-check after dedup merging. When a group has more than 5 members, doctitle, tenderee, win_tenderer and bidding_budget may each take only one value within the group;
- when a group has 5 or fewer members, tenderee, win_tenderer and bidding_budget may each take only one value within the group.
- '''
- def __init__(self):
- import logging
- import json,re
- global json,logging,re
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def new_buffer(self):
- return [list()]
- def iterate(self, buffer,main_docid,docid,docchannel,doctitle,doctitle_refine,area,province,city,district,web_source_no,fingerprint,
- project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count1,extract_count2,confidence):
- buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"doctitle":doctitle,
- "doctitle_refine":doctitle_refine,"area":area,"province":province,
- "city":city,"district":district,"web_source_no":web_source_no,"fingerprint":fingerprint,
- "project_code":project_code,"project_name":project_name,"tenderee":tenderee,"agency":agency,
- "win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price,
- "extract_count1":extract_count1,"extract_count2":extract_count2,"confidence":confidence})
- def merge(self, buffer, pbuffer):
- buffer[0].extend(pbuffer[0][:100])
- buffer[0] = buffer[0][:100]
- def getSameKeys(self,_dict1,_dict2):
- list_keys = []
- for k,v in _dict1.items():
- if k in ["area","city","confidence","district","extract_count1","extract_count2","main_docid","province"]:
- continue
- v2 = _dict2.get(k,"")
- if v is not None and v!="" and v2 is not None and v2!="" and v==v2:
- list_keys.append(k)
- list_keys.sort(key=lambda x:x)
- return "=".join(list_keys)
- def terminate(self, buffer):
- list_group = []
- the_group = buffer[0]
- the_group.sort(key=lambda x:x["confidence"],reverse=True)
- if len(the_group)>5:
- keys = ["doctitle","tenderee","win_tenderer","bidding_budget","win_bid_price"]
- else:
- keys = ["tenderee","win_tenderer","bidding_budget","win_bid_price"]
- #confidence
- list_key_index = []
- for _k in keys:
- if _k=="doctitle":
- list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
- else:
- list_key_index.append(getDiffIndex(the_group,_k))
- final_group = []
- _index = min(list_key_index)
- if _index>1:
- for item in the_group[:_index]:
- final_group.append(item)
- list_rules = []
- for i in range(len(final_group)):
- for j in range(i+1,len(final_group)):
- _dict1 = final_group[i]
- _dict2 = final_group[j]
- _rule = self.getSameKeys(_dict1,_dict2)
- list_rules.append([_rule,_dict1.get("docid"),_dict2.get("docid")])
- return json.dumps(list_rules)
- @annotate('string -> string,bigint,bigint')
- class f_autorule_group_extract(BaseUDTF):
- '''
- Emit (rule, docid1, docid2) tuples from the rules JSON
- '''
- def __init__(self):
- import logging
- import json
- global json,logging
- logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- def process(self,rules_json):
- list_rules = json.loads(rules_json)
- for _rule in list_rules:
- self.forward(_rule[0],_rule[1],_rule[2])
- if __name__ == '__main__':
- # f = f_decode_for_dumplicate()
- # b = f.process('[{}]','{ "attachmentTypes": "", "bidway": "", "candidate": "", "code": [], "cost_time": { "attrs": 0.0, "codename": 0.03, "deposit": 0.0, "district": 0.03, "moneygrade": 0.0, "nerToken": 0.06, "person": 0.0, "prem": 0.02, "preprocess": 0.1, "product": 0.04, "product_attrs": 0.01, "roleRuleFinal": 0.0, "rolegrade": 0.0, "rule": 0.0, "rule_channel": 0.05, "tableToText": 0.030002145767211913, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "district": { "area": "华东", "city": "厦门", "district": "未知", "is_in_text": false, "province": "福建" }, "docchannel": { "docchannel": "招标公告", "doctype": "采招数据", "life_docchannel": "招标公告" }, "docid": "", "doctitle_refine": "C70U264COM6项目所需直流屏", "exist_table": 1, "extract_count": 1, "fail_reason": "", "fingerprint": "md5=3da15e8c6f69a1d766bfe155092b1638", "industry": { "class": "零售批发", "class_name": "广播、电视、电影设备", "subclass": "通用设备" }, "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "C70U264COM6项目所需直流屏", "nlp_enterprise": [], "nlp_enterprise_attachment": [], "person_review": [], "prem": {}, "process_time": "2022-12-08 04:43:18", "product": [ "直流屏" ], "product_attrs": { "data": [ { "brand": "", "product": "直流屏65AH", "quantity": "1.0", "quantity_unit": "台", "specs": "带逆变,蓄电池采用原装进口免维护蓄电池(必须是原产地进口,注明电池进口产地)等,由供应商负责采购,使用寿命10年及以上", "unitPrice": "" } ], "header": [ "产品名称_产品数量____产品规格" ], "header_col": [ "产品名称_产品编号_产品规格_产品材质_产品数量_备注" ] }, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "", "time_getFileStart": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", "version_date": "2022-11-24" }','')
- # print(b)
- print(check_doctitle(doctitle_refind_less="山西银行晋城分行对A公司清算处置审计服务项目供应商征集公告",doctitle_refind_greater="山西银行晋城分行对B公司清算处置审计服务项目供应商征集公告"))
- # f = f_get_extractCount()
- # j = '''{ "attachmentTypes": "", "bidway": "", "candidate": "湖南省金达工程建设有限公司", "code": [ "丰汇-YCYZ2022-001-1" ], "cost_time": { "attrs": 0.33, "codename": 0.14, "deposit": 0.0, "district": 0.02, "moneygrade": 0.0, "nerToken": 0.27, "person": 0.01, "prem": 0.06, "preprocess": 0.71, "product": 0.15, "product_attrs": 0.02, "roleRuleFinal": 0.0, "rolegrade": 0.0, "rule": 0.0, "rule_channel": 0.26, "tableToText": 0.11000882148742676, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "district": { "area": "华东", "city": "宜春", "district": "袁州", "is_in_text": false, "province": "江西" }, "docchannel": { "docchannel": "中标信息", "doctype": "采招数据", "life_docchannel": "中标信息" }, "docid": "", "doctitle_refine": "2022年宜春市袁州区县乡村道安全生命防护项目(二)(第二次)", "exist_table": 1, "extract_count": 6, "fail_reason": "", "fingerprint": "md5=23e9e56f2a6ec0c73e1838670e630948", "industry": { "class": "建筑业", "class_name": "其他土木工程建筑", "subclass": "土木工程建筑业" }, "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "2022年宜春市袁州区县乡村道安全生命防护工程采购项目", "nlp_enterprise": [ "湖南省金达工程建设有限公司", "丰汇国际项目管理有限公司" ], "nlp_enterprise_attachment": [], "person_review": [ "宋明勇", "刘定良", "张来弟", "许卫秀", "宋明勇", "刘定良", "张来弟", "许卫秀" ], "prem": { "Project": { "code": "", "roleList": [ { "address": "宜春市袁州区明月袁山中路356号", "linklist": [ [ "胡柯", "13766445188" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "丰汇国际项目管理有限公司", "serviceTime": "" }, { "address": "湖南省长沙市开福区中山路589号开福万达广场C区2号写字楼", "linklist": [ [ "刘华夏", "18570640155" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": "4351680.70", "money_unit": "元" }, "role_name": "win_tenderer", "role_text": "湖南省金达工程建设有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" } }, "process_time": "2023-02-28 02:04:42", "product": [ "安全生命防护工程" ], "product_attrs": { "data": [ { "brand": "详见开标一览表明细", "product": "2022年宜春市袁州区县乡村道安全生命防护工程采购项目", "quantity": "1", "quantity_unit": "", "specs": "详见开标一览表明细", "unitPrice": "4351680.7" } ], "header": [ "名称_数量__单价_品牌_规格型号" ], "header_col": [ "名称_品牌_规格型号_数量_单价" ] }, "serviceTime": "", "success": true, "time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "", "time_getFileStart": "", "time_listingEnd": "", "time_listingStart": "", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2023-02-28", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "", "version_date": "2023-02-20" }'''
- # print(f.evaluate(j))
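- # Two project codes that differ only in the trailing digits — a near-duplicate
- # pair for the string-similarity check below: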
- # _str1 = "PMJJ-202211030004001"
- # _str2 = "PMJJ-202211030001001"
- # print(getSimilarityOfString(_str1,_str2))
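- # getSimilarityOfString's implementation is project-specific; as a rough,
- # hypothetical stand-in, a difflib ratio yields the same kind of [0, 1] score:
- # import difflib
- # def _similarity_sketch(a, b):
- #     # SequenceMatcher.ratio() returns 1.0 for identical strings
- #     return difflib.SequenceMatcher(None, a, b).ratio()
- # print(_similarity_sketch("PMJJ-202211030004001", "PMJJ-202211030001001"))
- # The check_* helpers below each compare one field pair (titles, codes,
- # products, money) of two candidate duplicates: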
- # print(check_doctitle("强化桂城街道工地扬尘防控监管巡查第三方(二次)","广东省强化桂城街道工地扬尘防控监管巡查第三方(二次)"))
- # print(check_codes(["F-2022-027(MASCG-2-F-F-2022-0462)"],["F-2022-027(MASCG-2-F-F-2022-0462)"]))
- # print(check_product(None,None))
- # print(check_code("4451020073383382206021325","4451020073383382206021322"))
- # print(check_money("550.0","440.0","",""))
- # for i in range(0,2):
- #     print(i)
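- # location_pattern grabs administrative place names (a 1-2 character prefix
- # followed by 市/区/镇/县/村), so the two findall calls below compare the
- # locations named in two otherwise similar project titles: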
- # location_pattern = re.compile(".{1,2}市|.{1,2}区|.{1,2}镇|.{1,2}县|.{1,2}村")
- # print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇前路富代都村示范点农用塑料薄膜棚)"))
- # print(re.findall(location_pattern,"宁古线乡村振兴高优农业融合发展建设项目(洋中镇天湖村粮蔬基地农用塑料薄膜棚)"))
- # package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))") # "第?": the question mark was removed to fix strings like "纯木浆8包/箱复印" being mistaken for a package number
- # _match = re.search(package_number_pattern,"2021年盘山县高标准农田建设项目三标段(高升街道)开标记录")
- # if _match is not None:
- #     print(_match.groupdict()["name"])
- # print(re.findall("((标[段号的包])[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4})","[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ"))
- # print(check_doctitle("[南宁市]桂林银行南宁办公大楼装修工程标段Ⅲ","桂林银行南宁办公大楼装修工程标段ⅡGXYLG20182005-N中标公告"))
- # c = f_get_extractCount()
- # _json = '''
- # { "attachmentTypes": "", "bidway": "", "code": [ "LCQTCG-2022-313" ], "cost_time": { "attrs": 0.02, "codename": 0.16, "deposit": 0.0, "nerToken": 0.8400000000000001, "person": 0.01, "prem": 0.02, "preprocess": 0.96, "product": 0.12, "product_attrs": 0.01, "punish": 0.11, "roleRuleFinal": 0.0, "rule": 0.0, "rule_channel": 0.0, "tableToText": 0.09000381469726562, "tendereeRuleRecall": 0.0, "time": 0.01, "total_unit_money": 0.0 }, "demand_info": { "data": [], "header": [], "header_col": [] }, "deposit_patment_way": "", "docchannel": { "docchannel": "招标公告", "doctype": "采招数据" }, "docid": "", "doctitle_refine": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设(一标段)膨胀剂(暂估价)项目", "exist_table": 1, "extract_count": 5, "fail_reason": "", "fingerprint": "md5=b1ab0ee9cf9e1c5acc17477b9c0433cc", "match_enterprise": [], "match_enterprise_type": 0, "moneysource": "", "name": "郑济高铁聊城西站配套基础设施建设项目一期枢纽功能区建设工程(一标段)膨胀剂(暂估价)采购项目", "nlp_enterprise": [ "中建八局第一建设有限公司", "山东东岳项目管理有限公司", "聊城市公共资源交易中心", "江苏国泰新点软件有限公司" ], "person_review": [], "prem": { "Project": { "code": "", "roleList": [ { "linklist": [ [ "", "15540110649" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "tenderee", "role_text": "中建八局第一建设有限公司", "serviceTime": "" }, { "linklist": [ [ "武工", "0635-2992305" ] ], "role_money": { "discount_ratio": "", "downward_floating_ratio": "", "floating_ratio": "", "money": 0, "money_unit": "" }, "role_name": "agency", "role_text": "山东东岳项目管理有限公司", "serviceTime": "" } ], "tendereeMoney": 0, "tendereeMoneyUnit": "" }, "一": { "code": "", "roleList": [], "tendereeMoney": 3267000.0, "tendereeMoneyUnit": "万元" } }, "process_time": "2022-05-30 14:31:13", "product": [ "枢纽功能区建设工程", "膨胀剂", "配套基础设施建设" ], "product_attrs": { "data": [], "header": [], "header_col": [] }, "serviceTime": "", "success": true, "time_bidclose": "2022-06-16", "time_bidopen": "2022-06-16", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnestMoneyEnd": "", "time_earnestMoneyStart": "", "time_getFileEnd": "2022-06-01", "time_getFileStart": "2022-05-26", "time_publicityEnd": "", "time_publicityStart": "", "time_registrationEnd": "", "time_registrationStart": "", "time_release": "2022-05-25", "total_tendereeMoney": 0, "total_tendereeMoneyUnit": "" }
- # '''
- # c = f_get_nlp_enterprise()
- # print(c.evaluate("山东东岳项目管理有限公司",_json))
- # print(c.evaluate(_json))
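- # f_set_docid follows a UDAF-style lifecycle: new_buffer() creates a per-group
- # accumulator, iterate() folds in one row at a time, and terminate() emits the
- # merged result. Each sample row below is tab-separated:
- # docid, page_time_stamp, extract_count, web_source_no, num, tenderee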
- # c = f_set_docid()
- # _s = '''
- # 154064190 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
- # 154064188 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
- # 154064175 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
- # 30201228 1512489600 4 04111-1 1 大连市妇女儿童医疗中心
- # 154064160 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
- # 154064168 1512489600 4 03689-11 1 大连市妇女儿童医疗中心
- # '''
- # buffer = c.new_buffer()
- # for _line in _s.split("\n"):
- #     _line = _line.strip()
- #     if _line=="":
- #         continue
- #     l_column = _line.split("\t")
- #     print(l_column)
- #     docid,page_time_stamp,extract_count,web_source_no,num,tenderee = l_column
- #     page_time_stamp = int(page_time_stamp)
- #     extract_count = int(extract_count)
- #     num = 1
- #     c.iterate(buffer,docid,page_time_stamp,extract_count,web_source_no,num,tenderee)
- # print(c.terminate(buffer))
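- # A minimal, hypothetical sketch of that new_buffer/iterate/terminate protocol
- # with a list-backed buffer (the real f_set_docid merges duplicate docids; this
- # class only illustrates the call shape):
- # class GroupConcatSketch(object):
- #     def new_buffer(self):
- #         return []                         # fresh per-group accumulator
- #     def iterate(self, buffer, value):
- #         buffer.append(str(value))         # fold one row into the buffer
- #     def terminate(self, buffer):
- #         return ",".join(buffer)           # emit the aggregated result
- # agg = GroupConcatSketch()
- # buf = agg.new_buffer()
- # for _docid in [154064190, 154064188]:
- #     agg.iterate(buf, _docid)
- # print(agg.terminate(buf))                 # -> "154064190,154064188"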