proposedBuildingProject.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. from odps.udf import annotate
  2. from odps.distcache import get_cache_archive
  3. from odps.distcache import get_cache_file
  4. from odps.udf import BaseUDTF
  5. from odps.udf import BaseUDAF
  6. import threading
  7. import logging
  8. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  9. import time
  10. import uuid
  11. import re
  12. # 配置pandas依赖包
  13. def include_package_path(res_name):
  14. import os, sys
  15. archive_files = get_cache_archive(res_name)
  16. dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
  17. if '.dist_info' not in f.name], key=lambda v: len(v))
  18. sys.path.append(dir_names[0])
  19. return os.path.dirname(dir_names[0])
  20. # 可能出现类似RuntimeError: xxx has been blocked by sandbox
  21. # 这是因为包含C的库,会被沙盘block,可设置set odps.isolation.session.enable = true
  22. def include_file(file_name):
  23. import os, sys
  24. so_file = get_cache_file(file_name)
  25. sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
  26. def include_so(file_name):
  27. import os, sys
  28. so_file = get_cache_file(file_name)
  29. with open(so_file.name, 'rb') as fp:
  30. content=fp.read()
  31. so = open(file_name, "wb")
  32. so.write(content)
  33. so.flush()
  34. so.close()
  35. #初始化业务数据包,由于上传限制,python版本以及archive解压包不统一等各种问题,需要手动导入
  36. def init_env(list_files,package_name):
  37. import os,sys
  38. if len(list_files)==1:
  39. so_file = get_cache_file(list_files[0])
  40. cmd_line = os.path.abspath(so_file.name)
  41. os.system("unzip -o %s -d %s"%(cmd_line,package_name))
  42. elif len(list_files)>1:
  43. cmd_line = "cat"
  44. for _file in list_files:
  45. so_file = get_cache_file(_file)
  46. cmd_line += " "+os.path.abspath(so_file.name)
  47. cmd_line += " > temp.zip"
  48. os.system(cmd_line)
  49. os.system("unzip -o temp.zip -d %s"%(package_name))
  50. # os.system("rm -rf %s/*.dist-info"%(package_name))
  51. # return os.listdir(os.path.abspath("local_package"))
  52. # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
  53. # os.system("source ~/.bashrc")
  54. sys.path.insert(0,os.path.abspath(package_name))
  55. # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
  56. def multiLoadEnv():
  57. def load_project():
  58. start_time = time.time()
  59. # init_env(["BiddingKG.zip.env.line"],str(uuid.uuid4()))
  60. init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
  61. logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
  62. def load_vector():
  63. start_time = time.time()
  64. init_env(["wiki_128_word_embedding_new.vector.env"],".")
  65. logging.info("init wiki_128_word_embedding_new cost %d"%(time.time()-start_time))
  66. start_time = time.time()
  67. init_env(["enterprise.zip.env"],".")
  68. # init_env(["LEGAL_ENTERPRISE.zip.env"],".")
  69. logging.info("init legal_enterprise.zip.env cost %d"%(time.time()-start_time))
  70. start_time = time.time()
  71. init_env(["so.env"],".")
  72. logging.info("init so.env cost %d"%(time.time()-start_time))
  73. def load_py():
  74. start_time = time.time()
  75. # self.out = init_env(["envs_py37.zip.env"],str(uuid.uuid4()))
  76. include_package_path("envs_py37.env.zip")
  77. logging.info("init envs_py37 cost %d"%(time.time()-start_time))
  78. load_project()
  79. load_vector()
  80. load_py()
  81. def getPattern():
  82. filename = "proposedBuildingKeyword.zip.env"
  83. init_env([filename],".")
  84. df = pd.read_excel("proposedBuildingKeyword.xlsx")
  85. dict_industry_keywords = {}
  86. for _industry,_keyword in zip(df["类别"],df["关键词"]):
  87. if _industry not in dict_industry_keywords:
  88. dict_industry_keywords[_industry] = set()
  89. dict_industry_keywords[_industry].add(_keyword)
  90. list_industry_p = []
  91. for k,v in dict_industry_keywords.items():
  92. if len(v)>0:
  93. list_industry_p.append("(?P<%s>%s)"%(k,"|".join(list(v))))
  94. _pattern = re.compile("|".join(list_industry_p))
  95. return _pattern
  96. dict_stage = {"设计阶段":"设计",
  97. "环评阶段":"环评",
  98. "施工准备":"监理",
  99. "施工在建":"施工"}
  100. list_stage_v = []
  101. for k,v in dict_stage.items():
  102. list_stage_v.append("(?P<%s>%s)"%(k,v))
  103. stage_pattern = "|".join(list_stage_v)
  104. def extract_industry(content,_pattern):
  105. list_stage = []
  106. for stage_search in re.finditer(_pattern,content):
  107. for k,v in stage_search.groupdict().items():
  108. if v is not None:
  109. list_stage.append(k)
  110. if len(list_stage)>0:
  111. return list_stage[0]
  112. return None
  113. def extract_legal_stage(content):
  114. if re.search("拍卖|转让|产权|出让|租赁|招租|采购",content) is not None:
  115. return None
  116. list_stage = []
  117. for stage_search in re.finditer(stage_pattern,content):
  118. for k,v in stage_search.groupdict().items():
  119. if v is not None:
  120. list_stage.append(k)
  121. if len(list_stage)>0:
  122. return list_stage[-1]
  123. return None
  124. def extract_proportion(content):
  125. _pattern = "(?P<proportion>((建筑|建设)面积|全长)[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
  126. _pattern_search = re.search(_pattern,content)
  127. _proportion = ""
  128. if _pattern_search is not None:
  129. _proportion = _pattern_search.groupdict().get("proportion","")
  130. if _proportion=="":
  131. _pattern = "(?P<proportion>((建筑|建设|区域)?面积|全长|项目规模)[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
  132. _pattern_search = re.search(_pattern,content)
  133. if _pattern_search is not None:
  134. _proportion = _pattern_search.groupdict().get("proportion","")
  135. return _proportion
  136. def extract_projectDigest(content):
  137. _pattern = "(?P<projectDigest>(项目|工程|标的|需求|建设|招标|采购|内容)(概况|规模|简介|信息|范围|内容|说明|摘要).{10,300})"
  138. _pattern_search = re.search(_pattern,content)
  139. _projectDigest = ""
  140. _find = ""
  141. if _pattern_search is not None:
  142. _find = _pattern_search.groupdict().get("projectDigest","")
  143. if len(_find)>0:
  144. _projectDigest = "。".join(_find.split("。")[0:3])
  145. return _projectDigest
  146. def extract_projectAddress(list_sentence,list_entity):
  147. for p_entity in list_entity:
  148. if len(p_entity.entity_text)>10 and p_entity.entity_type=="location":
  149. for _sentence in list_sentence:
  150. if _sentence.sentence_index==p_entity.sentence_index:
  151. _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
  152. if re.search("(项目|建设)(地址|地点)",_span[0]) is not None:
  153. return p_entity.entity_text
  154. return None
  155. def extract_begin_end_time(list_sentence,list_entity):
  156. _begin_time = None
  157. _end_time = None
  158. for p_entity in list_entity:
  159. if p_entity.entity_type=="time":
  160. for _sentence in list_sentence:
  161. if _sentence.sentence_index==p_entity.sentence_index:
  162. _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
  163. if re.search("开工(时间|日期)",_span[0]) is not None:
  164. _time_temp = timeFormat(p_entity.entity_text)
  165. if len(_time_temp)>0:
  166. _begin_time = _time_temp
  167. if re.search("(竣工|完工)(时间|日期)",_span[0]) is not None:
  168. _time_temp = timeFormat(p_entity.entity_text)
  169. if len(_time_temp)>0:
  170. _end_time = _time_temp
  171. return _begin_time,_end_time
  172. @annotate('bigint,string,string,string -> string,string,string,string,string,string,string,string')
  173. class extract_proposedBuilding(BaseUDTF):
  174. def __init__(self):
  175. multiLoadEnv()
  176. import pandas as pd
  177. global pd
  178. self._pattern = getPattern()
  179. import BiddingKG.dl.interface.Preprocessing as Preprocessing
  180. from BiddingKG.dl.common.Utils import spanWindow,timeFormat
  181. global Preprocessing,spanWindow,timeFormat
  182. def process(self, doc_id,dochtmlcon,doctitle,project_name):
  183. _stage = extract_legal_stage(doctitle)
  184. if _stage is not None:
  185. list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,dochtmlcon,"","",doctitle]],useselffool=True)
  186. for list_article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
  187. content = list_article.content
  188. _stage = extract_legal_stage(doctitle)
  189. if _stage is None:
  190. continue
  191. _industry = extract_industry(content,self._pattern)
  192. if _industry is None:
  193. continue
  194. _proportion = extract_proportion(content)
  195. _projectDigest = extract_projectDigest(content)
  196. _projectAddress = extract_projectAddress(list_sentence,list_entity)
  197. _begin_time,_end_time = extract_begin_end_time(list_sentence,list_entity)
  198. project_name_refind = ""
  199. if project_name is not None and len(project_name)>0:
  200. project_name_refind = re.sub("设计|环评|监理|施工","",project_name)
  201. if _stage is not None:
  202. self.forward(_stage,_proportion,_projectDigest,_projectAddress,_begin_time,_end_time,project_name_refind,_industry)
  203. @annotate('bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string')
  204. class f_remege_proposedBuildingProject(BaseUDAF):
  205. '''
  206. 项目编号、中标单位、len(项目编号)>7、中标单位<> ""、合并后非空招标单位数<2、合并后同公告类型非空金额相同
  207. '''
  208. def __init__(self):
  209. import logging
  210. import json,re
  211. global json,logging,re
  212. logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  213. def new_buffer(self):
  214. return [list()]
  215. def iterate(self, buffer,docid,page_time,province,city,district,tenderee,tenderee_contact,tenderee_phone,agency,
  216. project_code,project_name,stage,proportion,projectDigest,projectAddress,begin_time,end_time,
  217. project_name_refind,industry):
  218. buffer[0].append({"docid":docid,"page_time":page_time,"province":province,"city":city,"district":district,
  219. "tenderee":tenderee,"tenderee_contact":tenderee_contact,"tenderee_phone":tenderee_phone,
  220. "agency":agency,"project_code":project_code,"project_name":project_name,"stage":stage,"proportion":proportion,
  221. "projectDigest":projectDigest,"projectAddress":projectAddress,"begin_time":begin_time,"end_time":end_time,
  222. "project_name_refind":project_name_refind,"industry":industry})
  223. def merge(self, buffer, pbuffer):
  224. buffer[0].extend(pbuffer[0])
  225. def terminate(self, buffer):
  226. list_group = buffer[0]
  227. return json.dumps(list_group,ensure_ascii=False)