#coding:utf8
"""ODPS UDF/UDTF classes: date truncation, amount bucketing, enterprise-name cleanup."""

import re

from odps.udf import annotate, BaseUDAF, BaseUDTF

# Patterns are compiled once at import time instead of on every row.
# NOTE(review): the doubled "!" and ";" in the class below look like full-width
# variants that were lost in an encoding normalisation -- confirm against the
# upstream file before changing the pattern.
_NOISE_CHARS_RE = re.compile(r"[#!!&@$'\s\*\"{};;]")
# Leftovers of HTML entities/tags once "&", ";" and similar were stripped.
_ENTITY_RESIDUE_RE = re.compile(r"amp|lt|bramp|gt|nbsp|br")
# Three or more consecutive asterisks anywhere in the name.
_MASKED_RUN_RE = re.compile(r"\*{3,}")
_WHITESPACE_RE = re.compile(r"\s+")
_OPEN_RUN_RE = re.compile(r"\(+")
_CLOSE_RUN_RE = re.compile(r"\)+")
_BRACKET_GROUP_RE = re.compile(r"\(.+?\)")


def _to_halfwidth_brackets(text):
    """Return *text* with full-width parentheses replaced by ASCII ones."""
    return text.replace("（", "(").replace("）", ")")


@annotate('string->string')
class getYearMonth(object):
    """Scalar UDF returning the first seven characters of its input."""

    def evaluate(self, page_time):
        """Return ``page_time[:7]`` as a string, or ``""`` when it is None.

        NOTE(review): presumably the input looks like ``YYYY-MM-DD...`` so the
        slice is the year and month; the format is not validated here.
        """
        if page_time is None:
            return ""
        return str(page_time[:7])


@annotate('double->string')
class getMoneyRange(object):
    """Scalar UDF mapping an amount to a range label."""

    _EMPTY_LABEL = '等于0或空'
    # (exclusive upper bound, label) in ascending order; the first bound that
    # exceeds the amount decides the label.
    _BUCKETS = (
        (10 * 10000, '(0,10万)'),
        (100 * 10000, '[10万,100万)'),
        (500 * 10000, '[100万,500万)'),
        (1000 * 10000, '[500万,1000万)'),
        (10000 * 10000, '[1000万,1亿)'),
        (10 * 10000 * 10000, '[1亿,10亿)'),
        (100 * 10000 * 10000, '[10亿,100亿)'),
    )
    _TOP_LABEL = '[100亿,500亿]'

    def evaluate(self, money):
        """Return the label of the bucket containing *money*.

        None and 0 share one label.  Negative amounts fall into the first
        bucket and anything at or above the last bound gets the top label;
        neither case is range-checked.
        """
        if money is None or money == 0:
            return self._EMPTY_LABEL
        for upper_bound, label in self._BUCKETS:
            if money < upper_bound:
                return label
        return self._TOP_LABEL


@annotate('string->bigint')
class getdocidFromDocids(BaseUDTF):
    """Table function splitting a comma-separated id list into integer rows."""

    def process(self, docids):
        """Forward one integer per non-empty comma-separated token.

        None input and blank tokens (empty string, trailing comma) emit
        nothing.  A non-numeric token still raises ``ValueError`` so bad data
        is not silently dropped.
        """
        if docids is None:
            return
        for token in docids.split(","):
            token = token.strip()
            if not token:
                continue
            self.forward(int(token))


@annotate('string->string')
class fixEnterpriseName(object):
    """Scalar UDF that cleans an enterprise name or rejects it as ``""``."""

    def evaluate(self, name):
        """Return the cleaned name, or ``""`` when the name is rejected.

        A name is rejected when it is None, contains a run of three or more
        asterisks, is shorter than four characters after cleaning, or
        contains "有限公司" while being at most seven characters long.
        """
        if name is None:
            return ""
        # Checked on the raw input because cleaning removes the asterisks.
        if _MASKED_RUN_RE.search(name) is not None:
            return ""
        cleaned = _NOISE_CHARS_RE.sub("", name)
        cleaned = _ENTITY_RESIDUE_RE.sub("", cleaned)
        if len(cleaned) < 4:
            return ""
        if "有限公司" in cleaned and len(cleaned) <= 7:
            return ""
        return cleaned


@annotate('string->string')
class removeCommonWord(object):
    """Scalar UDF removing area names and common company words from a name."""

    def __init__(self):
        """Build the removal pattern from the area dictionary."""
        # Imported lazily; presumably AreaGet is a resource module that is only
        # importable inside the UDF runtime -- TODO confirm.
        from AreaGet import AreaGet

        self.dict_area = AreaGet().getDict_area()
        alternatives = []
        for area in self.dict_area.values():
            cname = area.get("cname", "")
            if cname:
                # Escaped so an area name is always matched literally.
                alternatives.append(re.escape(cname))
        alternatives.extend(
            ["[省市区县]", "有限", "公司", "股份", "分公司", "责任"]
        )
        # Joining a single list avoids a leading empty alternative (which
        # would match first everywhere) when there are no area names.
        self.pattern = re.compile("|".join(alternatives))

    def evaluate(self, name):
        """Return *name* with every pattern match removed; ``""`` for None."""
        if name is None:
            return ""
        return self.pattern.sub("", name)


@annotate("string->string,string,string,string,bigint,bigint")
class dealEnterpriseCircle(BaseUDTF):
    """Table function describing the last bracketed part of a name."""

    def __init__(self):
        """Load the area dictionary and collect its ``cname`` values."""
        # Imported lazily; presumably AreaGet is a resource module that is only
        # importable inside the UDF runtime -- TODO confirm.
        from AreaGet import AreaGet

        self.dict_area = AreaGet().getDict_area()
        self.set_area = set(
            area.get("cname") for area in self.dict_area.values()
        )

    def process(self, name):
        """Forward one row describing the last ``(...)`` group of *name*.

        The row is ``(name, new_name, before, circle, bool_area, bool_end)``:
        *name* with whitespace removed, *new_name* with brackets normalised,
        the text before the last group, the group's content, whether that
        content is a known area name, and whether a group ends the string.
        Without any group the text fields are ``""`` and the flags are 0.
        None input emits nothing.

        NOTE(review): the row is emitted once, after the loop; the original
        indentation was lost, so confirm it was not meant to be per group.
        """
        if name is None:
            return
        name = _WHITESPACE_RE.sub("", name)
        new_name = _to_halfwidth_brackets(name)
        # Collapse repeated brackets so "((x))" is treated like "(x)".
        new_name = _OPEN_RUN_RE.sub("(", new_name)
        new_name = _CLOSE_RUN_RE.sub(")", new_name)

        before = ""
        circle = ""
        bool_area = 0
        bool_end = 0
        for match in _BRACKET_GROUP_RE.finditer(new_name):
            start, end = match.span()
            circle = new_name[start + 1:end - 1]
            before = new_name[:start]
            if end >= len(new_name):
                bool_end = 1
            bool_area = 1 if circle in self.set_area else 0
        self.forward(name, new_name, before, circle, bool_area, bool_end)


@annotate('string->string')
class f_turn_circle(object):
    """Scalar UDF converting full-width parentheses to ASCII ones."""

    def evaluate(self, name):
        """Return *name* with half-width parentheses, or ``""`` for None.

        NOTE(review): the conversion direction (full-width to half-width) is
        taken from the ASCII bracket regexes in dealEnterpriseCircle; confirm
        that is what callers of this function expect.
        """
        if name is None:
            return ""
        return _to_halfwidth_brackets(name)