# json and time are not used below; they stay imported because the previous
# __init__ published them as module globals alongside logging and re.
import json
import logging
import re
import time

from odps.udf import annotate
from odps.udf import BaseUDTF

# Ordered (label, groups) rules. Each group is a list of regex patterns that
# must ALL be found in the text; a label is emitted once per matching group.
# A tuple (not a dict) keeps the output order equal to the declaration order
# on every Python version.
_TYPE_KEYWORDS = (
    ("风电", [['风电|风力发电']]),
    ("火电", [['煤电|煤发电|燃煤机组|燃气热电|焚烧发电|火电|火力发电|锅炉|燃机']]),
    ("水电", [['水电|水力发电']]),
    ("送变电", [['变电|送出|输送|架线|配电|电压穿越']]),
    ("核电", [['核电|核能发电']]),
    ("光伏发电", [['光伏|太阳能发电']]),
    ("调试", [["整套启动|性能试验|调整试验|调试|试验|测试|检测|预试"]]),
    ("监理", [["监理"]]),
    ("施工", [["施工|工程|建设"]]),
)

# Compile once at import time instead of re-resolving every pattern per row.
_COMPILED_TYPE_RULES = tuple(
    (label, [[re.compile(pattern) for pattern in group] for group in groups])
    for label, groups in _TYPE_KEYWORDS
)


@annotate('string -> string')
class f_analysis_type(BaseUDTF):
    """UDTF that labels a text with the type keywords it mentions.

    For each input string, every rule in the keyword table is tested with
    ``re.search``. If at least one label matches, a single row is forwarded
    containing the matched labels joined by commas. Nothing is forwarded for
    ``None`` input or when no rule matches.
    """

    def __init__(self):
        # Raw string so the regex escapes are not treated as (invalid) string
        # escapes; the resulting value is unchanged.
        self.time_pattern = r"\d{4}\-\d{2}\-\d{2}.*"
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    def process(self, doctextcon):
        """Forward the comma-joined labels whose keywords occur in the text.

        Args:
            doctextcon: Text to classify, or ``None``.
        """
        if doctextcon is None:
            return

        matched_labels = []
        for label, groups in _COMPILED_TYPE_RULES:
            for group in groups:
                # Every pattern of the group has to occur somewhere in the text.
                if all(pattern.search(doctextcon) for pattern in group):
                    matched_labels.append(label)

        if matched_labels:
            self.forward(",".join(matched_labels))