enterpriseFix.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. #coding:utf8
  2. from odps.udf import annotate,BaseUDAF,BaseUDTF
  3. @annotate('string->string')
  4. class getYearMonth(object):
  5. def evaluate(self,page_time):
  6. if page_time is None:
  7. return ""
  8. return str(page_time[:7])
  9. @annotate('double->string')
  10. class getMoneyRange(object):
  11. def evaluate(self,money):
  12. if money is None or money==0:
  13. return '等于0或空'
  14. elif money<10*10000:
  15. return '(0,10万)'
  16. elif money<100*10000:
  17. return '[10万,100万)'
  18. elif money<500*10000:
  19. return '[100万,500万)'
  20. elif money<1000*10000:
  21. return '[500万,1000万)'
  22. elif money<10000*10000:
  23. return '[1000万,1亿)'
  24. elif money<10*10000*10000:
  25. return '[1亿,10亿)'
  26. elif money<100*10000*10000:
  27. return '[10亿,100亿)'
  28. else:
  29. return '[100亿,500亿]'
  30. @annotate('string->bigint')
  31. class getdocidFromDocids(BaseUDTF):
  32. def process(self,docids):
  33. for docid in docids.split(","):
  34. self.forward(int(docid))
  35. @annotate('string->string')
  36. class fixEnterpriseName(object):
  37. def __init__(self):
  38. import re
  39. global re
  40. def evaluate(self,name):
  41. new_name = re.sub("[#!!&@$'\s\*\"{};;]","",name)
  42. new_name = re.sub("amp|lt|bramp|gt|nbsp|br","",new_name)
  43. _s = re.search("\*+",name)
  44. if _s is not None:
  45. if _s.span()[1]-_s.span()[0]>=3:
  46. new_name = ""
  47. if len(new_name)<4:
  48. new_name = ""
  49. if new_name.find("有限公司")>=0 and len(new_name)<=7:
  50. new_name = ""
  51. return new_name
  52. @annotate('string->string')
  53. class removeCommonWord(object):
  54. def __init__(self):
  55. from AreaGet import AreaGet
  56. import re
  57. global re
  58. self.dict_area = AreaGet().getDict_area()
  59. _pattern = ""
  60. list_name = []
  61. for k,v in self.dict_area.items():
  62. _name = v.get("cname","")
  63. if _name!="":
  64. list_name.append(_name)
  65. _pattern = "|".join(list_name)+"|[省市区县]|有限|公司|股份|分公司|责任"
  66. self.pattern = re.compile(_pattern)
  67. def evaluate(self,name):
  68. return re.sub(self.pattern,"",name)
  69. @annotate("string->string,string,string,string,bigint,bigint")
  70. class dealEnterpriseCircle(BaseUDTF):
  71. def __init__(self):
  72. from AreaGet import AreaGet
  73. import re
  74. global re
  75. self.dict_area = AreaGet().getDict_area()
  76. set_area = set()
  77. for k,v in self.dict_area.items():
  78. set_area.add(v.get("cname"))
  79. self.set_area = set_area
  80. def process(self,name):
  81. name = re.sub("\s+","",name)
  82. new_name = name.replace("(","(").replace(")",")")
  83. new_name = re.sub("\(+",'(',new_name)
  84. new_name = re.sub("\)+",')',new_name)
  85. bool_area = 0
  86. bool_end = 0
  87. circle = ""
  88. before = ""
  89. for _s in re.finditer("\(.+?\)",new_name):
  90. circle = new_name[_s.span()[0]:_s.span()[1]][1:-1]
  91. if _s.span()[1]>=len(new_name):
  92. bool_end = 1
  93. before = new_name[:_s.span()[0]]
  94. if circle in self.set_area:
  95. bool_area = 1
  96. else:
  97. bool_area = 0
  98. self.forward(name,new_name,before,circle,bool_area,bool_end)
  99. @annotate('string->string')
  100. class f_turn_circle(object):
  101. def __init__(self):
  102. import re
  103. global re
  104. def evaluate(self,name):
  105. if name is not None:
  106. return name.replace("(","(").replace(")",")")
  107. else:
  108. return ""