# documentAnalysis.py (1.7 KB)
from odps.udf import annotate
from odps.udf import BaseUDTF
  3. @annotate('string -> string')
  4. class f_analysis_type(BaseUDTF):
  5. def __init__(self):
  6. import logging
  7. import json
  8. import time,re
  9. global json,logging,time,re
  10. self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
  11. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  12. def process(self, doctextcon):
  13. if doctextcon is not None:
  14. list_match = []
  15. dict_type_keyword = {"风电":[['风电|风力发电']],
  16. "火电":[['煤电|煤发电|燃煤机组|燃气热电|焚烧发电|火电|火力发电|锅炉|燃机']],
  17. "水电":[['水电|水力发电']],
  18. "送变电":[['变电|送出|输送|架线|配电|电压穿越']],
  19. "核电":[['核电|核能发电']],
  20. "光伏发电":[['光伏|太阳能发电']],
  21. "调试":[["整套启动|性能试验|调整试验|调试|试验|测试|检测|预试"]],
  22. "监理":[["监理"]],
  23. "施工":[["施工|工程|建设"]]
  24. }
  25. for k,v in dict_type_keyword.items():
  26. for searchItem in v:
  27. all_match = True
  28. for _item in searchItem:
  29. if re.search(_item,doctextcon) is None:
  30. all_match = False
  31. if all_match:
  32. list_match.append(k)
  33. if len(list_match)>0:
  34. self.forward(",".join(list_match))