# coding: utf8
- from odps.udf import annotate,BaseUDAF,BaseUDTF
@annotate('string->string')
class getYearMonth(object):
    """UDF that returns the first seven characters of a time string.

    Presumably ``page_time`` looks like ``YYYY-MM-DD...`` so that the prefix
    is the year and month -- TODO confirm against the callers.
    """

    def evaluate(self, page_time):
        """Return ``page_time[:7]`` as a string, or ``""`` for NULL input."""
        return "" if page_time is None else str(page_time[:7])
@annotate('double->string')
class getMoneyRange(object):
    """UDF that maps an amount onto a textual range label."""

    # (exclusive upper bound, label) pairs, in ascending order of the bound.
    _BUCKETS = (
        (10 * 10000, '(0,10万)'),
        (100 * 10000, '[10万,100万)'),
        (500 * 10000, '[100万,500万)'),
        (1000 * 10000, '[500万,1000万)'),
        (10000 * 10000, '[1000万,1亿)'),
        (10 * 10000 * 10000, '[1亿,10亿)'),
        (100 * 10000 * 10000, '[10亿,100亿)'),
    )

    def evaluate(self, money):
        """Return the label of the first bucket whose upper bound exceeds ``money``.

        NULL and zero share a dedicated label. Negative amounts fall into the
        first bucket because only the upper bound is tested, and anything at
        or above the last bound receives the final label.
        """
        if money is None or money == 0:
            return '等于0或空'
        for upper_bound, label in self._BUCKETS:
            if money < upper_bound:
                return label
        return '[100亿,500亿]'
@annotate('string->bigint')
class getdocidFromDocids(BaseUDTF):
    """UDTF that explodes a comma-separated id list into one row per id."""

    def process(self, docids):
        """Forward each id found in ``docids`` as an integer.

        NULL input and blank tokens (empty string, doubled or trailing
        commas) produce no rows instead of aborting the job. A token that is
        non-blank but not an integer still raises ``ValueError`` so that
        corrupt data is not silently dropped.
        """
        if docids is None:
            return
        for token in docids.split(","):
            token = token.strip()
            if token:
                self.forward(int(token))
@annotate('string->string')
class fixEnterpriseName(object):
    """UDF that cleans an enterprise name and blanks out unusable ones."""

    def __init__(self):
        # ``global`` has to come before the import: on Python 3 the reverse
        # order is a SyntaxError ("assigned to before global declaration").
        global re
        import re
        # Punctuation and whitespace stripped from names. The ASCII and
        # full-width forms of '!' and ';' are both listed; the original class
        # repeated the ASCII characters, which was redundant.
        self._noise_chars = re.compile(r"""[#!！&@$'\s*"{};；]""")
        # Left-overs of HTML entities/tags once '&' and ';' have been removed.
        self._entity_residue = re.compile(r"amp|lt|bramp|gt|nbsp|br")
        # Three or more consecutive asterisks anywhere in the raw name.
        self._masked = re.compile(r"\*{3,}")

    def evaluate(self, name):
        """Return the cleaned name, or ``""`` when it is not usable.

        The result is ``""`` when ``name`` is NULL, when the raw name contains
        a run of three or more asterisks, when fewer than four characters
        remain after cleaning, or when the cleaned name contains "有限公司"
        and is at most seven characters long.
        """
        if name is None:
            return ""
        # Any long asterisk run rejects the name, not only the first run.
        if self._masked.search(name) is not None:
            return ""
        new_name = self._noise_chars.sub("", name)
        new_name = self._entity_residue.sub("", new_name)
        if len(new_name) < 4:
            return ""
        if "有限公司" in new_name and len(new_name) <= 7:
            return ""
        return new_name
@annotate('string->string')
class removeCommonWord(object):
    """UDF that strips area names and common company words from a name."""

    def __init__(self):
        # ``global`` has to come before the import: on Python 3 the reverse
        # order is a SyntaxError ("assigned to before global declaration").
        global re
        import re
        from AreaGet import AreaGet
        self.dict_area = AreaGet().getDict_area()
        self.pattern = re.compile(self._build_pattern(self.dict_area))

    @staticmethod
    def _build_pattern(dict_area):
        """Build the alternation of area names plus the fixed common words."""
        names = set()
        for info in dict_area.values():
            cname = info.get("cname")
            # Skip missing/empty names: an empty alternative would match
            # everywhere and None cannot be joined.
            if cname:
                names.add(cname)
        # Alternation is tried left to right, so longer names go first to
        # keep a short name from pre-empting a longer one that contains it.
        # The secondary sort key makes the pattern deterministic.
        ordered = sorted(names, key=lambda n: (-len(n), n))
        # Escape the names so regex metacharacters in them match literally.
        alternatives = [re.escape(n) for n in ordered]
        alternatives.append("[省市区县]|有限|公司|股份|分公司|责任")
        return "|".join(alternatives)

    def evaluate(self, name):
        """Return ``name`` with every pattern match removed (``""`` for NULL)."""
        if name is None:
            return ""
        return self.pattern.sub("", name)
@annotate("string->string,string,string,string,bigint,bigint")
class dealEnterpriseCircle(BaseUDTF):
    """UDTF that analyses the last bracketed segment of an enterprise name."""

    def __init__(self):
        # ``global`` has to come before the import: on Python 3 the reverse
        # order is a SyntaxError ("assigned to before global declaration").
        global re
        import re
        from AreaGet import AreaGet
        self.dict_area = AreaGet().getDict_area()
        self.set_area = set()
        for info in self.dict_area.values():
            cname = info.get("cname")
            # Keep None/"" out of the set so that a name without any bracket
            # (empty ``circle``) can never be reported as an area.
            if cname:
                self.set_area.add(cname)

    def process(self, name):
        """Forward one row describing the last ``(...)`` segment of ``name``.

        Forwarded columns, in order:
          * the name with all whitespace removed
          * that name with brackets normalised (full-width to ASCII, repeated
            brackets collapsed to one)
          * the text before the last bracketed segment ("" if there is none)
          * the content of that segment without its brackets ("" if none)
          * 1 if the content is a known area name, else 0
          * 1 if the segment ends the name, else 0

        NULL input forwards nothing.
        """
        if name is None:
            return
        name = re.sub(r"\s+", "", name)
        # Full-width brackets (U+FF08 / U+FF09) become ASCII ones.
        new_name = name.replace("（", "(").replace("）", ")")
        new_name = re.sub(r"\(+", "(", new_name)
        new_name = re.sub(r"\)+", ")", new_name)

        # Only the last bracketed segment is reported; walk to the end.
        last = None
        for last in re.finditer(r"\(.+?\)", new_name):
            pass

        circle = ""
        before = ""
        bool_end = 0
        if last is not None:
            circle = last.group()[1:-1]
            before = new_name[:last.start()]
            if last.end() >= len(new_name):
                bool_end = 1
        bool_area = 1 if circle in self.set_area else 0
        self.forward(name, new_name, before, circle, bool_area, bool_end)
@annotate('string->string')
class f_turn_circle(object):
    """UDF that converts full-width parentheses to ASCII parentheses."""

    def __init__(self):
        # ``evaluate`` does not use ``re``; the module-level binding is kept
        # only so that instantiation has the same side effect as before.
        # ``global`` has to come before the import: on Python 3 the reverse
        # order is a SyntaxError ("assigned to before global declaration").
        global re
        import re

    def evaluate(self, name):
        """Return ``name`` with U+FF08/U+FF09 replaced by "(" and ")".

        NULL input yields ``""``.
        """
        if name is None:
            return ""
        return name.replace("（", "(").replace("）", ")")
|