Prechádzať zdrojové kódy

修改重复入库的判定,项目编号排序

rogel 4 rokov pred
rodič
commit
ca10715bdf

+ 1 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -277,7 +277,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                 
                                 
 if __name__=="__main__":
 if __name__=="__main__":
     # edit_distance("GUMBO","GAMBOL")
     # edit_distance("GUMBO","GAMBOL")
-    # print(jaccard_score("GUMBO","GAMBOL"))
+    print(jaccard_score("周口经济开发区陈营运粮河两岸拆迁工地土工布覆盖项目竞争性谈判公告","周口经济开发区陈营运粮河两岸拆迁工地土工布覆盖项目-成交公告"))
 
 
     sentences = "广州比地数据科技有限公司比地数据科技有限公司1111111123沈阳南光工贸有限公司"
     sentences = "广州比地数据科技有限公司比地数据科技有限公司1111111123沈阳南光工贸有限公司"
     print(match_enterprise_max_first(sentences))
     print(match_enterprise_max_first(sentences))

+ 16 - 5
BiddingKG/dl/fingerprint/documentFingerprint.py

@@ -2,13 +2,23 @@
 
 
 import hashlib
 import hashlib
 import codecs
 import codecs
+from bs4 import BeautifulSoup
+import re
+
+def getHtmlText(sourceHtml):  # Return the text of sourceHtml with all whitespace removed, or sourceHtml itself when no text remains.
+    _text = BeautifulSoup(sourceHtml,"lxml").get_text()  # parse with the lxml parser and keep only the text content
+    _text = re.sub("\s*",'',_text)  # delete every whitespace run; NOTE(review): "\s" sits in a non-raw string — a raw string would avoid the invalid-escape warning
+    if len(_text)==0:
+        _text = sourceHtml  # nothing left after stripping: fall back to the unmodified input
+    return _text
 
 
 def getMD5(sourceHtml):
 def getMD5(sourceHtml):
     if sourceHtml is not None and len(sourceHtml)>0:
     if sourceHtml is not None and len(sourceHtml)>0:
-        if isinstance(sourceHtml,str):
-            bs = sourceHtml.encode()
-        elif isinstance(sourceHtml,bytes):
-            bs = sourceHtml
+        _text = getHtmlText(sourceHtml)
+        if isinstance(_text,str):
+            bs = _text.encode()
+        elif isinstance(_text,bytes):
+            bs = _text
         else:
         else:
             return ""
             return ""
         md5 = hashlib.md5()
         md5 = hashlib.md5()
@@ -25,5 +35,6 @@ def getFingerprint(sourceHtml):
     return _fingerprint
     return _fingerprint
 
 
 if __name__=="__main__":
 if __name__=="__main__":
-    sourceHtml = text = codecs.open("C:\\Users\\User\\Desktop\\2.html","rb",encoding="utf8").read()
+    sourceHtml = codecs.open("C:\\Users\\User\\Desktop\\2.html","rb",encoding="utf8").read()
+    # sourceHtml = "abcddafafffffffffffffffffffffffff你"
     print(getFingerprint(sourceHtml))
     print(getFingerprint(sourceHtml))

+ 3 - 0
BiddingKG/dl/interface/Preprocessing.py

@@ -1811,6 +1811,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 end_index_temp = bidway['end_index']
                 end_index_temp = bidway['end_index']
                 begin_index = changeIndexFromWordToWords(tokens, begin_index_temp)
                 begin_index = changeIndexFromWordToWords(tokens, begin_index_temp)
                 end_index = changeIndexFromWordToWords(tokens, end_index_temp)
                 end_index = changeIndexFromWordToWords(tokens, end_index_temp)
+                if begin_index is None or end_index is None:
+                    continue
+                print(begin_index_temp,end_index_temp,begin_index,end_index)
                 entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                 entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                 entity_text = bidway['body']
                 entity_text = bidway['body']
                 list_sentence_entitys.append(
                 list_sentence_entitys.append(

+ 8 - 5
BiddingKG/dl/interface/extract.py

@@ -97,11 +97,14 @@ def predict(doc_id,text,title=""):
     data_res["success"] = True
     data_res["success"] = True
 
 
 
 
-    # for list_entity in list_entitys:
-    #     for _entity in list_entity:
-    #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
-    #               (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
-    #                str(_entity.begin_index),str(_entity.end_index)))
+    for _article in list_articles:
+        log(_article.content)
+
+    for list_entity in list_entitys:
+        for _entity in list_entity:
+            log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
+                  (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
+                   str(_entity.begin_index),str(_entity.end_index)))
 
 
     return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
     return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
 
 

+ 2 - 1
BiddingKG/dl/interface/getAttributes.py

@@ -288,7 +288,8 @@ def get_legal_comba(list_entity,dict_role_combination):
                 if _prob>MAX_PROB:
                 if _prob>MAX_PROB:
                     MAX_PROB = _prob
                     MAX_PROB = _prob
                     _MAX_PROB_COMBA = [item]
                     _MAX_PROB_COMBA = [item]
-            new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
+            if _MAX_PROB_COMBA is not None:
+                new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
         _dict_legal_combination = new_dict_legal_combination
         _dict_legal_combination = new_dict_legal_combination
     #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
     #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
     _list_final_comba = circle_pageages(_dict_legal_combination)
     _list_final_comba = circle_pageages(_dict_legal_combination)

+ 1 - 0
BiddingKG/dl/interface/predictor.py

@@ -434,6 +434,7 @@ class CodeNamePredict():
                     othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text)
                     othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text)
                     if othercode != None:
                     if othercode != None:
                         item['code'].append(othercode.group(3))
                         item['code'].append(othercode.group(3))
+            item['code'].sort(key=lambda x:len(x),reverse=True)
             result.append(item)
             result.append(item)
 
 
             list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
             list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)

+ 80 - 2
BiddingKG/dl/test/7.py

@@ -1,10 +1,88 @@
-
+#coding:UTF-8
 a = 1<<10
 a = 1<<10
 b = bin(a)
 b = bin(a)
+print(int(b[2:],2)>>10)
 c = a | (1<<2)
 c = a | (1<<2)
 print(b)
 print(b)
 print(a)
 print(a)
 
 
 a = "1234"
 a = "1234"
 
 
-print("-",a[3:])
+print("-",a[3:])
+
+import math
+print(2**32)  # integer power
+print(math.pow(2,32))  # same power computed as a float
+
+print(hex(ord('g')))  # hexadecimal code point of 'g'
+
+def get_s16(val):  # Map val from the unsigned 32-bit range to its signed equivalent; NOTE(review): the name says "s16" but the thresholds are 32-bit.
+    if val < 0x80000000:  # below 2**31: the value is returned unchanged
+        return val
+    else:
+        return (val - 0x100000000)  # subtract 2**32 so values from 2**31 upward become negative
+print(hex(336860180))
+
+print(0xf0551700)
+print(0xe8c81900)
+
+print([int(math.floor(abs(math.sin(i + 1)) * (2 ** 32))) for i in range(64)])  # 64 values of floor(|sin(i+1)| * 2**32)
+
+import re
+_pattern = "(?P<projectDigest>项目概况.{10,1000})"  # module-level pattern; the functions below define their own local _pattern
+# _pattern = "(建筑面积[约为是]*[\d,]+(\.\d+)?[十百千万]*(㎡|平方米))"
+
+text = '''
+项目名称:淮上区2020年市政道路维修及绿化养护施工工程监理 
+'''
+text = text.replace("\r","").replace("\n",'')  # flatten the sample text onto a single line
+
+
+
+def extract_proportion(content):  # Return the first size/scale phrase matched in content, or "" when there is no match.
+    _pattern = "(?P<proportion>((建筑|建设|区域)?面积|全长|项目规模)[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万]*([\]】平方kK千万公㎡mM米里]*))"  # keyword, optional filler characters, a number, then optional magnitude and unit characters
+    _pattern_search = re.search(_pattern,content)
+    _proportion = ""
+    if _pattern_search is not None:
+        _proportion = _pattern_search.groupdict().get("proportion","")  # the named group spans the whole matched phrase
+    return _proportion
+
+def extract_projectDigest(content):  # Return up to the first three "。"-separated segments of the first digest-like passage in content, or "".
+    _pattern = "(?P<projectDigest>(项目|工程|标的|需求|建设|招标|采购|内容)(概况|规模|简介|信息|范围|内容|说明|摘要).{10,300})"  # a heading word pair followed by 10 to 300 further characters
+    _pattern_search = re.search(_pattern,content)
+    _projectDigest = ""
+    _find = ""
+    if _pattern_search is not None:
+        _find = _pattern_search.groupdict().get("projectDigest","")
+    if len(_find)>0:
+        _projectDigest = "。".join(_find.split("。")[0:3])  # keep at most three segments and re-join them with the same separator
+    return _projectDigest
+
+print(extract_proportion(text))
+
+print(re.findall(_pattern,text))  # uses the module-level _pattern, not the ones local to the functions
+print(extract_projectDigest(text))
+
+import uuid
+
+print(uuid.uuid4())  # prints a random UUID
+
+def extract_legal_stage(content,_pattern):  # Return the stage label of the last stage keyword found in content, or None; NOTE(review): the _pattern argument is never used.
+    dict_stage = {"设计阶段":"设计",
+                  "环评阶段":"环评",
+                  "施工准备":"监理",
+                  "施工在建":"施工"}  # stage label -> keyword that signals it
+    list_stage_v = []
+    for k,v in dict_stage.items():
+        list_stage_v.append("(?P<%s>%s)"%(k,v))  # one named group per stage, named after the stage label
+    stage_pattern = "|".join(list_stage_v)
+    list_stage = []
+    for stage_search in re.finditer(stage_pattern,content):
+        for k,v in stage_search.groupdict().items():
+            if v is not None:  # only the alternative that actually matched has a value
+                list_stage.append(k)
+    if len(list_stage)>0:
+        return list_stage[-1]  # the last occurrence in the text wins
+    return None
+
+print(extract_legal_stage(text,"工程"))  # the second argument has no effect on the result in this script's definition

+ 2 - 2
BiddingKG/dl/test/test4.py

@@ -61,8 +61,8 @@ if __name__=="__main__":
     # 购安装工程二标段,第一中标候选人,投标人名称,南阳市宝琛装饰工程有限责任公司,投标报价:147892
     # 购安装工程二标段,第一中标候选人,投标人名称,南阳市宝琛装饰工程有限责任公司,投标报价:147892
     # '''
     # '''
     print("start")
     print("start")
-    print(predict("12",content))
+    # print(predict("12",text))
     # print(predict("12", text))
     # print(predict("12", text))
-    # test("12",content)
+    test("12",text)
     print("takes",time.time()-_time1)
     print("takes",time.time()-_time1)
     pass
     pass

+ 56 - 16
BiddingKG/maxcompute/documentDumplicate.py

@@ -384,7 +384,15 @@ class in_stamp(object):
                 break
                 break
         return int_flag
         return int_flag
 
 
-@annotate('string -> bigint,bigint,bigint,bigint')
+def getConfidence(rule_id):  # Map a rule id to a confidence score: 30 for rule 0, 20 for rules 1 to 26, 10 for anything else.
+    if rule_id ==0:
+        return 30
+    elif rule_id >=1 and rule_id <=26:
+        return 20
+    else:
+        return 10  # also reached for negative ids; NOTE(review): a None rule_id would fail on the >= comparison under Python 3
+
+@annotate('string,bigint -> bigint,bigint,bigint,bigint,bigint')
 class f_split_group_single(BaseUDTF):
 class f_split_group_single(BaseUDTF):
     '''
     '''
     将多个组拆解成多条记录
     将多个组拆解成多条记录
@@ -396,13 +404,13 @@ class f_split_group_single(BaseUDTF):
         global json,logging
         global json,logging
         logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
         logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 
-    def process(self, json_set_docid):
+    def process(self, json_set_docid,rule_id):
         list_group = json.loads(json_set_docid)
         list_group = json.loads(json_set_docid)
         for item in list_group:
         for item in list_group:
             for index_i in range(len(item)):
             for index_i in range(len(item)):
                 for index_j in range(len(item)):
                 for index_j in range(len(item)):
                     if index_i!=index_j and item[index_i]["docid"]!=item[index_j]["docid"]:
                     if index_i!=index_j and item[index_i]["docid"]!=item[index_j]["docid"]:
-                        self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"])
+                        self.forward(item[index_i]["docid"],item[index_j]["docid"],item[index_i]["extract_count"],item[index_j]["extract_count"],getConfidence(rule_id))
 
 
 
 
 @annotate('bigint,string->string')
 @annotate('bigint,string->string')
@@ -703,6 +711,20 @@ def getSet(list_dict,key):
                     _set.add(str(item[key]))
                     _set.add(str(item[key]))
     return _set
     return _set
 
 
+def getDiffIndex(list_dict,key):  # Return the index of the first item at which more than one distinct value of key has been seen, or len(list_dict) if that never happens.
+    _set = set()  # distinct normalised values seen so far
+    for _i in range(len(list_dict)):
+        item = list_dict[_i]
+        if key in item:
+            if item[key]!='' and item[key] is not None:  # empty and missing values do not count as a value
+                if re.search("^\d[\d\.]*$",item[key]) is not None:  # NOTE(review): re.search raises TypeError if the value is not a string
+                    _set.add(str(float(item[key])))  # normalise numerics so "1" and "1.0" compare equal; NOTE(review): the regex also accepts e.g. "1.2.3", which makes float() raise ValueError
+                else:
+                    _set.add(str(item[key]))
+        if len(_set)>1:
+            return _i  # items before this index all share a single value
+    return len(list_dict)
+
 @annotate('bigint,string -> bigint,bigint')
 @annotate('bigint,string -> bigint,bigint')
 class f_getGroup_dumpFinal(BaseUDTF):
 class f_getGroup_dumpFinal(BaseUDTF):
     '''
     '''
@@ -722,7 +744,7 @@ class f_getGroup_dumpFinal(BaseUDTF):
             for _docid in list_docids:
             for _docid in list_docids:
                 self.forward(int(docid),int(_docid))
                 self.forward(int(docid),int(_docid))
 
 
-@annotate('bigint,bigint,string,string,string,string,bigint,bigint->string')
+@annotate('bigint,bigint,string,string,string,string,bigint,bigint,bigint->string')
 class f_redump_limit_num(BaseUDAF):
 class f_redump_limit_num(BaseUDAF):
     '''
     '''
     去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
     去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
@@ -737,9 +759,10 @@ class f_redump_limit_num(BaseUDAF):
     def new_buffer(self):
     def new_buffer(self):
         return [list()]
         return [list()]
 
 
-    def iterate(self, buffer,main_docid,docid,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,extract_count1,extract_count2):
+    def iterate(self, buffer,main_docid,docid,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,extract_count1,extract_count2,confidence):
         buffer[0].append({"main_docid":main_docid,"docid":docid,"set_limit_column1":set_limit_column1,"set_limit_column2":set_limit_column2,
         buffer[0].append({"main_docid":main_docid,"docid":docid,"set_limit_column1":set_limit_column1,"set_limit_column2":set_limit_column2,
-                          "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,"extract_count2":extract_count2})
+                          "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,
+                          "extract_count2":extract_count2,"confidence":confidence})
 
 
     def merge(self, buffer, pbuffer):
     def merge(self, buffer, pbuffer):
         buffer[0].extend(pbuffer[0])
         buffer[0].extend(pbuffer[0])
@@ -747,25 +770,42 @@ class f_redump_limit_num(BaseUDAF):
     def terminate(self, buffer):
     def terminate(self, buffer):
         list_group = []
         list_group = []
         the_group = buffer[0]
         the_group = buffer[0]
+        the_group.sort(key=lambda x:x["confidence"],reverse=True)
         if len(the_group)>5:
         if len(the_group)>5:
             keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
             keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
         else:
         else:
             keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
             keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
-        stay = True
-        for _key in keys:
-            if len(getSet(the_group,_key))>1:
-                stay = False
-                break
+
         final_group = []
         final_group = []
-        if stay:
+
+        #置信度
+        list_key_index = []
+        for _k in keys:
+            list_key_index.append(getDiffIndex(the_group,_k))
+
+        _index = min(list_key_index)
+        if _index>1:
             main_docid = the_group[0]["main_docid"]
             main_docid = the_group[0]["main_docid"]
-            for item in the_group:
+            for item in the_group[:_index]:
                 if item["docid"]!=main_docid:
                 if item["docid"]!=main_docid:
-                    final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"]})
+                    final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"],"confidence":item["confidence"]})
+
+
+        # stay = True
+        # for _key in keys:
+        #     if len(getSet(the_group,_key))>1:
+        #         stay = False
+        #         break
+        #
+        # if stay:
+        #     main_docid = the_group[0]["main_docid"]
+        #     for item in the_group:
+        #         if item["docid"]!=main_docid:
+        #             final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"]})
 
 
         return json.dumps(final_group)
         return json.dumps(final_group)
 
 
-@annotate('string -> bigint,bigint,bigint,bigint')
+@annotate('string -> bigint,bigint,bigint,bigint,bigint')
 class f_get_dumpFinal_checked(BaseUDTF):
 class f_get_dumpFinal_checked(BaseUDTF):
     '''
     '''
     从最后的结果中获取组
     从最后的结果中获取组
@@ -781,4 +821,4 @@ class f_get_dumpFinal_checked(BaseUDTF):
         if list_group is not None:
         if list_group is not None:
             final_group = json.loads(list_group)
             final_group = json.loads(list_group)
             for _group in final_group:
             for _group in final_group:
-                self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"])
+                self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"],_group["confidence"])

+ 3 - 2
BiddingKG/maxcompute/evaluates.py

@@ -77,7 +77,8 @@ def multiLoadEnv():
         logging.info("init wiki_128_word_embedding_new cost %d"%(time.time()-start_time))
         logging.info("init wiki_128_word_embedding_new cost %d"%(time.time()-start_time))
 
 
         start_time = time.time()
         start_time = time.time()
-        init_env(["legal_enterprise.zip.env"],".")
+        init_env(["enterprise.zip.env"],".")
+        # init_env(["LEGAL_ENTERPRISE.zip.env"],".")
         logging.info("init legal_enterprise.zip.env cost %d"%(time.time()-start_time))
         logging.info("init legal_enterprise.zip.env cost %d"%(time.time()-start_time))
 
 
         start_time = time.time()
         start_time = time.time()
@@ -147,6 +148,6 @@ class Extract(BaseUDTF):
                 return json.JSONEncoder.default(self, obj)
                 return json.JSONEncoder.default(self, obj)
 
 
     def process(self,content,_doc_id,_title,page_time):
     def process(self,content,_doc_id,_title,page_time):
-        if content is not None and _doc_id not in [105677700,126694044,126795572,126951461]:
+        if content is not None and _doc_id not in [105677700,126694044,126795572,126951461,71708072]:
             result_json = predict(str(_doc_id),content,str(_title))
             result_json = predict(str(_doc_id),content,str(_title))
             self.forward(page_time,int(_doc_id),result_json)
             self.forward(page_time,int(_doc_id),result_json)

+ 263 - 0
BiddingKG/maxcompute/proposedBuildingProject.py

@@ -0,0 +1,263 @@
+from odps.udf import annotate
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+from odps.udf import BaseUDTF
+from odps.udf import BaseUDAF
+
+import threading
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import time
+import uuid
+import re
+
+
+# Configure the pandas dependency package
+def include_package_path(res_name):  # Append one directory from the cached archive res_name to sys.path and return that directory's parent.
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))  # directories of all entries whose name lacks '.dist_info', shortest path string first
+    sys.path.append(dir_names[0])  # NOTE(review): raises IndexError when the archive yields no eligible entry
+
+    return os.path.dirname(dir_names[0])
+
+# May fail with an error such as RuntimeError: xxx has been blocked by sandbox
+# That happens because libraries containing C code are blocked by the sandbox; it can be allowed with: set odps.isolation.session.enable = true
+def include_file(file_name):  # Append the directory holding the cached resource file_name to sys.path.
+    import os, sys
+    so_file = get_cache_file(file_name)
+    sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
+
+def include_so(file_name):  # Copy the cached resource file_name into a file of the same name, path taken relative to the current working directory.
+    import os, sys
+    so_file = get_cache_file(file_name)
+
+    with open(so_file.name, 'rb') as fp:
+        content=fp.read()  # the whole resource is read into memory
+        so = open(file_name, "wb")  # NOTE(review): not opened via `with`, so the handle is not closed if write() raises
+        so.write(content)
+        so.flush()
+        so.close()
+
+# Initialise the business data packages; because of upload limits, Python-version differences and inconsistent archive extraction, they have to be imported manually
+def init_env(list_files,package_name):  # Unzip the cached resource(s) in list_files into package_name and put that directory first on sys.path.
+    import os,sys
+
+    if len(list_files)==1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip -o %s -d %s"%(cmd_line,package_name))  # -o overwrites existing files; NOTE(review): paths are interpolated into a shell string unquoted and the exit status is ignored
+    elif len(list_files)>1:
+        cmd_line = "cat"  # several resources are treated as parts of one zip and concatenated in list order
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " "+os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip -o temp.zip -d %s"%(package_name))
+    # os.system("rm -rf %s/*.dist-info"%(package_name))
+    # return os.listdir(os.path.abspath("local_package"))
+    # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
+    # os.system("source ~/.bashrc")
+    sys.path.insert(0,os.path.abspath(package_name))  # runs even when list_files is empty and nothing was extracted
+
+    # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
+def multiLoadEnv():  # Load the project code, the vector/enterprise/so resources and the Python package archive, one after another.
+    def load_project():  # extract the project archive into a fresh uuid-named directory
+        start_time = time.time()
+        # init_env(["BiddingKG.zip.env.line"],str(uuid.uuid4()))
+        init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
+        logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))  # NOTE(review): the message names the .line archive while the .backup archive is what gets loaded
+
+    def load_vector():  # extract the embedding, enterprise and so resources into the current directory
+        start_time = time.time()
+        init_env(["wiki_128_word_embedding_new.vector.env"],".")
+        logging.info("init wiki_128_word_embedding_new cost %d"%(time.time()-start_time))
+
+        start_time = time.time()
+        init_env(["enterprise.zip.env"],".")
+        # init_env(["LEGAL_ENTERPRISE.zip.env"],".")
+        logging.info("init legal_enterprise.zip.env cost %d"%(time.time()-start_time))  # NOTE(review): the message says legal_enterprise while enterprise.zip.env is what gets loaded
+
+        start_time = time.time()
+        init_env(["so.env"],".")
+        logging.info("init so.env cost %d"%(time.time()-start_time))
+
+    def load_py():  # add the packaged Python environment to sys.path
+        start_time = time.time()
+        # self.out = init_env(["envs_py37.zip.env"],str(uuid.uuid4()))
+        include_package_path("envs_py37.env.zip")
+        logging.info("init envs_py37 cost %d"%(time.time()-start_time))
+
+    load_project()  # the three loaders run sequentially in this order; no threads are started here
+    load_vector()
+    load_py()
+
+def getPattern():  # Build one compiled regex with a named group per industry from the keyword spreadsheet.
+    filename = "proposedBuildingKeyword.zip.env"
+    init_env([filename],".")  # extract the keyword archive into the current directory
+    df = pd.read_excel("proposedBuildingKeyword.xlsx")  # relies on a module-global `pd`, which this function does not import itself
+    dict_industry_keywords = {}  # industry label -> set of its keywords
+    for _industry,_keyword in zip(df["类别"],df["关键词"]):
+        if _industry not in dict_industry_keywords:
+            dict_industry_keywords[_industry] = set()
+        dict_industry_keywords[_industry].add(_keyword)
+    list_industry_p = []
+    for k,v in dict_industry_keywords.items():
+        if len(v)>0:
+            list_industry_p.append("(?P<%s>%s)"%(k,"|".join(list(v))))  # NOTE(review): labels become group names and keywords are inserted unescaped, so both must be regex-safe — confirm against the spreadsheet
+    _pattern = re.compile("|".join(list_industry_p))
+    return _pattern
+
+dict_stage = {"设计阶段":"设计",
+              "环评阶段":"环评",
+              "施工准备":"监理",
+              "施工在建":"施工"}  # stage label -> keyword that signals it
+list_stage_v = []
+for k,v in dict_stage.items():
+    list_stage_v.append("(?P<%s>%s)"%(k,v))  # one named group per stage, named after the stage label
+stage_pattern = "|".join(list_stage_v)  # alternation of all stage groups, built once at import time
+
+def extract_industry(content,_pattern):  # Return the name of the group behind the first _pattern match in content, or None when nothing matches.
+    list_stage = []  # despite the name, this collects the matched group (industry) names
+    for stage_search in re.finditer(_pattern,content):
+        for k,v in stage_search.groupdict().items():
+            if v is not None:  # only the alternative that actually matched has a value
+                list_stage.append(k)
+    if len(list_stage)>0:
+        return list_stage[0]  # the earliest match in the text wins
+    return None
+
+def extract_legal_stage(content):  # Return the stage label of the last stage keyword in content, or None if excluded or no keyword is present.
+    if re.search("拍卖|转让|产权|出让|租赁|招租|采购",content) is not None:  # any of these words rules the text out entirely
+        return None
+    list_stage = []
+    for stage_search in re.finditer(stage_pattern,content):
+        for k,v in stage_search.groupdict().items():
+            if v is not None:  # only the alternative that actually matched has a value
+                list_stage.append(k)
+    if len(list_stage)>0:
+        return list_stage[-1]  # the last occurrence in the text wins
+    return None
+
+
+def extract_proportion(content):  # Return the first size/scale phrase found in content, trying a narrow pattern before a broader one; "" when neither matches.
+    _pattern = "(?P<proportion>((建筑|建设)面积|全长)[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+))"  # narrow pass: only the more specific keywords, and at least one unit character is required
+    _pattern_search = re.search(_pattern,content)
+    _proportion = ""
+    if _pattern_search is not None:
+        _proportion = _pattern_search.groupdict().get("proportion","")
+    if _proportion=="":
+        _pattern = "(?P<proportion>((建筑|建设|区域)?面积|全长|项目规模)[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+))"  # fallback pass: the keyword prefix is optional and further keywords are accepted
+        _pattern_search = re.search(_pattern,content)
+        if _pattern_search is not None:
+            _proportion = _pattern_search.groupdict().get("proportion","")
+    return _proportion
+
+def extract_projectDigest(content):  # Return up to the first three "。"-separated segments of the first digest-like passage in content, or "".
+    _pattern = "(?P<projectDigest>(项目|工程|标的|需求|建设|招标|采购|内容)(概况|规模|简介|信息|范围|内容|说明|摘要).{10,300})"  # a heading word pair followed by 10 to 300 further characters
+    _pattern_search = re.search(_pattern,content)
+    _projectDigest = ""
+    _find = ""
+    if _pattern_search is not None:
+        _find = _pattern_search.groupdict().get("projectDigest","")
+    if len(_find)>0:
+        _projectDigest = "。".join(_find.split("。")[0:3])  # keep at most three segments and re-join them with the same separator
+    return _projectDigest
+
+def extract_projectAddress(list_sentence,list_entity):  # Return the text of the first long location entity whose surrounding window mentions a project/construction address, or None.
+    for p_entity in list_entity:
+        if len(p_entity.entity_text)>10 and p_entity.entity_type=="location":  # only location entities longer than 10 characters are considered
+            for _sentence in list_sentence:
+                if _sentence.sentence_index==p_entity.sentence_index:  # find the sentence the entity belongs to
+                    _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)  # spanWindow is not defined in this file; it must already be a module global when this runs
+                    if re.search("(项目|建设)(地址|地点)",_span[0]) is not None:  # presumably _span[0] is the text before the entity — TODO confirm against spanWindow
+                        return p_entity.entity_text
+    return None
+
+def extract_begin_end_time(list_sentence,list_entity):  # Return (begin_time, end_time) taken from time entities whose window mentions a start or completion date; each is None if not found.
+    _begin_time = None
+    _end_time = None
+    for p_entity in list_entity:
+        if p_entity.entity_type=="time":
+            for _sentence in list_sentence:
+                if _sentence.sentence_index==p_entity.sentence_index:  # find the sentence the entity belongs to
+                    _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)  # spanWindow and timeFormat are not defined in this file; they must already be module globals when this runs
+                    if re.search("开工(时间|日期)",_span[0]) is not None:  # presumably _span[0] is the text before the entity — TODO confirm against spanWindow
+                        _time_temp = timeFormat(p_entity.entity_text)
+                        if len(_time_temp)>0:  # an empty result is ignored
+                            _begin_time = _time_temp
+                    if re.search("(竣工|完工)(时间|日期)",_span[0]) is not None:
+                        _time_temp = timeFormat(p_entity.entity_text)
+                        if len(_time_temp)>0:
+                            _end_time = _time_temp
+
+    return _begin_time,_end_time  # no early exit: a later qualifying entity overwrites an earlier one
+
+
+@annotate('bigint,string,string,string -> string,string,string,string,string,string,string,string')
+class extract_proposedBuilding(BaseUDTF):  # UDTF that emits stage, proportion, digest, address, begin/end time, cleaned project name and industry for a document.
+
+    def __init__(self):  # Load the runtime resources, then publish pandas and the BiddingKG helpers as module globals.
+        multiLoadEnv()
+        import pandas as pd
+        global pd
+        self._pattern = getPattern()  # getPattern reads the global `pd` declared just above
+        import BiddingKG.dl.interface.Preprocessing as Preprocessing
+        from BiddingKG.dl.common.Utils import spanWindow,timeFormat
+
+        global Preprocessing,spanWindow,timeFormat  # the module-level extract_* helpers look these names up as globals
+
+
+    def process(self, doc_id,dochtmlcon,doctitle,project_name):  # Forward one row per preprocessed article when the title yields a stage and the content yields an industry.
+        _stage = extract_legal_stage(doctitle)  # the stage comes from the title only
+        if _stage is not None:  # preprocessing is skipped entirely when the title has no stage
+            list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,dochtmlcon,"","",doctitle]],useselffool=True)
+            for list_article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
+                content = list_article.content
+                _stage = extract_legal_stage(doctitle)  # NOTE(review): same call and argument as before the loop, so this check cannot fail here
+                if _stage is None:
+                    continue
+                _industry = extract_industry(content,self._pattern)
+                if _industry is None:  # no industry keyword in the content: nothing is emitted for this article
+                    continue
+                _proportion = extract_proportion(content)
+                _projectDigest = extract_projectDigest(content)
+                _projectAddress = extract_projectAddress(list_sentence,list_entity)
+                _begin_time,_end_time = extract_begin_end_time(list_sentence,list_entity)
+                project_name_refind = ""
+                if project_name is not None and len(project_name)>0:
+                    project_name_refind = re.sub("设计|环评|监理|施工","",project_name)  # strip the stage keywords from the project name
+                if _stage is not None:  # always true at this point; see the redundant check above
+                    self.forward(_stage,_proportion,_projectDigest,_projectAddress,_begin_time,_end_time,project_name_refind,_industry)  # address and times may be None
+
+
+@annotate('bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string')
+class f_remege_proposedBuildingProject(BaseUDAF):  # UDAF that collects every input row of a group and returns them as one JSON array string.
+    '''
+    项目编号、中标单位、len(项目编号)>7、中标单位<> ""、合并后非空招标单位数<2、合并后同公告类型非空金额相同
+    '''  # NOTE(review): this docstring lists merge conditions that the code below does not implement — the class only collects rows
+    def __init__(self):  # Import logging, json and re as module globals and configure logging.
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):  # The buffer is a one-element list wrapping the list of collected rows.
+        return [list()]
+
+    def iterate(self, buffer,docid,page_time,tenderee,tenderee_contact,tenderee_phone,agency,
+                project_code,project_name,stage,proportion,projectDigest,projectAddress,begin_time,end_time,
+                project_name_refind,industry):  # Append the current row to the buffer as a dict keyed by column name.
+        buffer[0].append({"docid":docid,"page_time":page_time,"tenderee":tenderee,"tenderee_contact":tenderee_contact,"tenderee_phone":tenderee_phone,
+                          "agency":agency,"project_code":project_code,"project_name":project_name,"stage":stage,"proportion":proportion,
+                          "projectDigest":projectDigest,"projectAddress":projectAddress,"begin_time":begin_time,"end_time":end_time,
+                          "project_name_refind":project_name_refind,"industry":industry})
+
+    def merge(self, buffer, pbuffer):  # Concatenate the rows of a partial buffer onto this buffer.
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):  # Serialise all collected rows; no filtering, merging or de-duplication is applied.
+        list_group = buffer[0]
+        return json.dumps(list_group)