2 年之前 · 1f9672fef5
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,6 +1,6 @@
 
															 <?xml version="1.0" encoding="UTF-8"?>
														
 
															 <project version="4">
														
 
															-  <component name="ProjectRootManager" version="2" languageLevel="JDK_13" default="false" project-jdk-name="Python 3.7 (py37)" project-jdk-type="Python SDK">
														
 
															+  <component name="ProjectRootManager" version="2" languageLevel="JDK_13" default="false" project-jdk-name="Python 3.5 (dl_nlp)" project-jdk-type="Python SDK">
														
 
															     <output url="file://$PROJECT_DIR$/out" />
														
 
															   </component>
														
 
															 </project>
														
--- a/BaseDataMaintenance/dataMonitor/data_monitor.py
+++ b/BaseDataMaintenance/dataMonitor/data_monitor.py
@@ -13,7 +13,7 @@ from BaseDataMaintenance.dataSource.setttings import *
 
															 from queue import Queue
														
 
															 from BaseDataMaintenance.common.multiThread import MultiThreadHandler
														
 
															-from BaseDataMaintenance.java.MQInfo import getAllQueueSize,getQueueSize
														
 
															+
														
 
															 from BaseDataMaintenance.maintenance.dataflow_settings import *
														
 
															 import pandas as pd
														
@@ -30,6 +30,9 @@ flow_init_log_dir = "/data/python/flow_init_log"
 
															 flow_init_check_dir = "/data/python/flow_init_check"
														
 
															+flow_dumplicate_log_path = "/python_log/flow_dumplicate.log"
														
 
															+
														
 
															+
														
 
															 class BaseDataMonitor():
														
 
															     def __init__(self):
														
@@ -195,7 +198,7 @@ class BaseDataMonitor():
 
															     def monitor_attachment(self):
														
 
															-
														
 
															+        from BaseDataMaintenance.java.MQInfo import getAllQueueSize,getQueueSize
														
 
															         try:
														
 
															             # query = BoolQuery(must_queries=[
														
 
															             #     RangeQuery("status",0,11),
														
@@ -280,7 +283,7 @@ class BaseDataMonitor():
 
															             traceback.print_exc()
														
 
															     def monitor_extract(self):
														
 
															-
														
 
															+        from BaseDataMaintenance.java.MQInfo import getAllQueueSize,getQueueSize
														
 
															         try:
														
 
															             # query = BoolQuery(must_queries=[
														
 
															             #     RangeQuery("status",11,61),
														
@@ -488,8 +491,13 @@ class BaseDataMonitor():
 
															                                                                             columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
														
 
															         if total_count>=1000:
														
 
															-            _msg = "数据流报警：待去重公告数为:%d"%(total_count)
														
 
															-            sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS)
														
 
															+            _cmd = 'cat %s | grep -c "%s.*upgrate True save"'%(flow_dumplicate_log_path,self.get_last_tenmin_time())
														
 
															+            process_count = self.cmd_execute(_cmd)
														
 
															+            atAll = False
														
 
															+            if int(process_count)==0:
														
 
															+                atAll = True
														
 
															+            _msg = "数据流报警：待去重公告数为:%d,最近十分钟去重数为：%s"%(total_count,str(process_count))
														
 
															+            sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=atAll)
														
 
															             # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
														
@@ -588,7 +596,7 @@ class BaseDataMonitor():
 
															         # scheduler.add_job(self.monitor_attachment,"cron",minute="*/10")
														
 
															         scheduler.add_job(self.monitor_extract,"cron",minute="*/10")
														
 
															         scheduler.add_job(self.monitor_proposedBuilding,"cron",hour="*/3")
														
 
															-        scheduler.add_job(self.monitor_dumplicate,"cron",minute="*/10")
														
 
															+        # scheduler.add_job(self.monitor_dumplicate,"cron",minute="*/10")
														
 
															         scheduler.add_job(self.monitor_sychr,"cron",minute="*/10")
														
 
															         scheduler.add_job(self.monitor_preproject,"cron",hour="8")
														
 
															         scheduler.add_job(self.monitor_merge,"cron",hour="*/1")
														
--- a/BaseDataMaintenance/maintenance/dataflow.py
+++ b/BaseDataMaintenance/maintenance/dataflow.py
@@ -32,6 +32,8 @@ from BaseDataMaintenance.maxcompute.documentMerge import *
 
															 from BaseDataMaintenance.common.otsUtils import *
														
 
															 from BaseDataMaintenance.common.activateMQUtils import *
														
 
															+from BaseDataMaintenance.dataMonitor.data_monitor import BaseDataMonitor
														
 
															+
														
 
															 from BaseDataMaintenance.dataSource.pool import ConnectorPool
														
 
															 def getSet(list_dict,key):
														
@@ -2167,6 +2169,7 @@ class Dataflow_dumplicate(Dataflow):
 
															         logging.basicConfig(level = logging.info,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
														
 
															         self.fix_doc_docid = None
														
 
															+        self.bdm = BaseDataMonitor()
														
 
															         if start_delete_listener:
														
 
															             self.delete_comsumer_counts = 2
														
@@ -2931,17 +2934,15 @@ class Dataflow_dumplicate(Dataflow):
 
															         _dict = {}
														
 
															         #更新公共属性
														
 
															         for k,v in project_dict.items():
														
 
															-            if v is None or v=="":
														
 
															+            if v is None or v=="" or v=="[]" or v=="未知":
														
 
															                 continue
														
 
															             if k in (project_project_dynamics,project_product,project_project_codes,project_docids):
														
 
															                 continue
														
 
															-            for _proj in projects:
														
 
															-                if k not in _proj:
														
 
															-                    _dict[k] = v
														
 
															-                elif _proj.get(k,"未知")=="未知":
														
 
															-                    _dict[k] = v
														
 
															         for _proj in projects:
														
 
															             _proj.update(_dict)
														
 
															+        for _proj in projects:
														
 
															+            if _proj.get(project_page_time,"")<project_dict.get(project_page_time,""):
														
 
															+                _proj[project_page_time] = project_dict.get(project_page_time,"")
														
 
															         #拼接属性
														
 
															         append_dict = {}
														
@@ -3749,44 +3750,47 @@ class Dataflow_dumplicate(Dataflow):
 
															         :param status_to:
														
 
															         :return:
														
 
															         '''
														
 
															-        list_docids = []
														
 
															-        _docid = item.get(document_tmp_docid)
														
 
															-        list_docids.append(_docid)
														
 
															-        if isinstance(dup_docid,list):
														
 
															-            list_docids.extend(dup_docid)
														
 
															-        list_docids = [a for a in list_docids if a is not None]
														
 
															+        try:
														
 
															+            list_docids = []
														
 
															+            _docid = item.get(document_tmp_docid)
														
 
															+            list_docids.append(_docid)
														
 
															+            if isinstance(dup_docid,list):
														
 
															+                list_docids.extend(dup_docid)
														
 
															+            list_docids = [a for a in list_docids if a is not None]
														
 
															-        _time = time.time()
														
 
															-        list_projects = self.search_projects_with_document(list_docids)
														
 
															-        # log("search projects takes:%.3f"%(time.time()-_time))
														
 
															-        if len(list_projects)==0:
														
 
															-            # _time = time.time()
														
 
															-            list_docs = self.search_docs(list_docids)
														
 
															-            # log("search document takes:%.3f"%(time.time()-_time))
														
 
															-            # _time = time.time()
														
 
															-            list_projects = self.generate_projects_from_document(list_docs)
														
 
															-            # log("generate projects takes:%.3f"%(time.time()-_time))
														
 
															-        else:
														
 
															             _time = time.time()
														
 
															-            self.update_projects_by_document(_docid,save,list_projects)
														
 
															-            # log("update projects takes:%.3f"%(time.time()-_time))
														
 
															-        _time = time.time()
														
 
															-        list_projects = dumplicate_projects(list_projects)
														
 
															-        # log("dumplicate projects takes:%.3f"%(time.time()-_time))
														
 
															-        _time = time.time()
														
 
															-        list_projects = self.merge_projects(list_projects,b_log)
														
 
															-        # log("merge projects takes:%.3f"%(time.time()-_time))
														
 
															+            list_projects = self.search_projects_with_document(list_docids)
														
 
															+            # log("search projects takes:%.3f"%(time.time()-_time))
														
 
															+            if len(list_projects)==0:
														
 
															+                # _time = time.time()
														
 
															+                list_docs = self.search_docs(list_docids)
														
 
															+                # log("search document takes:%.3f"%(time.time()-_time))
														
 
															+                # _time = time.time()
														
 
															+                list_projects = self.generate_projects_from_document(list_docs)
														
 
															+                # log("generate projects takes:%.3f"%(time.time()-_time))
														
 
															+            else:
														
 
															+                _time = time.time()
														
 
															+                self.update_projects_by_document(_docid,save,list_projects)
														
 
															+                # log("update projects takes:%.3f"%(time.time()-_time))
														
 
															+            _time = time.time()
														
 
															+            list_projects = dumplicate_projects(list_projects)
														
 
															+            # log("dumplicate projects takes:%.3f"%(time.time()-_time))
														
 
															+            _time = time.time()
														
 
															+            list_projects = self.merge_projects(list_projects,b_log)
														
 
															+            # log("merge projects takes:%.3f"%(time.time()-_time))
														
 
															-        _time = time.time()
														
 
															-        dumplicate_document_in_merge(list_projects)
														
 
															-        log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
														
 
															+            _time = time.time()
														
 
															+            dumplicate_document_in_merge(list_projects)
														
 
															+            log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
														
 
															-        _time = time.time()
														
 
															-        project_json = to_project_json(list_projects)
														
 
															-        # log("json projects takes:%.3f"%(time.time()-_time))
														
 
															-        if b_log:
														
 
															-            log("project_json:%s"%project_json)
														
 
															-        return project_json
														
 
															+            _time = time.time()
														
 
															+            project_json = to_project_json(list_projects)
														
 
															+            # log("json projects takes:%.3f"%(time.time()-_time))
														
 
															+            if b_log:
														
 
															+                log("project_json:%s"%project_json)
														
 
															+            return project_json
														
 
															+        except Exception as e:
														
 
															+            raise RuntimeError("error on dumplicate")
														
 
															     def is_exist_fingerprint(self,final_list,_docid,_fingerprint,table_name):
														
 
															         set_fingerprint = set()
														
@@ -3905,7 +3909,7 @@ class Dataflow_dumplicate(Dataflow):
 
															                 dtmp.setValue(document_tmp_projects,"[]",True)
														
 
															             else:
														
 
															                 dtmp.setValue(document_tmp_projects,self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log),True)
														
 
															-
														
 
															+            log(dtmp.getProperties().get(document_tmp_projects))
														
 
															             log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
														
 
															             if upgrade:
														
 
															                 if table_name=="document_tmp":
														
@@ -3914,7 +3918,16 @@ class Dataflow_dumplicate(Dataflow):
 
															                 # print(dtmp.getProperties())
														
 
															                 dtmp.setValue(document_tmp_dup_docid,dmp_docid,True)
														
 
															                 dtmp.setValue(document_tmp_best_docid,best_docid,True)
														
 
															-                dtmp.update_row(self.ots_client)
														
 
															+                _flag = dtmp.update_row(self.ots_client)
														
 
															+                if not _flag:
														
 
															+                    for i in range(10):
														
 
															+                        list_proj_json = dtmp.getProperties().get(document_tmp_projects)
														
 
															+                        if list_proj_json is not None:
														
 
															+                            list_proj = json.loads(list_proj_json)
														
 
															+                            dtmp.setValue(document_tmp_projects,json.dumps(list_proj[:len(list_proj)//2]),True)
														
 
															+                            if dtmp.update_row(self.ots_client):
														
 
															+                                break
														
 
															+
														
 
															             # log("dump takes %.2f"%(time.time()-start_time))
														
 
															         except Exception as e:
														
@@ -3986,6 +3999,7 @@ class Dataflow_dumplicate(Dataflow):
 
															         schedule = BlockingScheduler()
														
 
															         schedule.add_job(self.flow_dumplicate,"cron",second="*/5")
														
 
															         # schedule.add_job(self.flow_dumpcate_comsumer,"cron",second="*/10")
														
 
															+        schedule.add_job(self.bdm.monitor_dumplicate,"cron",minute="*/10")
														
 
															         schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
														
 
															         schedule.start()
														
@@ -4107,7 +4121,7 @@ if __name__ == '__main__':
 
															     df_dump = Dataflow_dumplicate(start_delete_listener=False)
														
 
															     # df_dump.start_flow_dumplicate()
														
 
															     a = time.time()
														
 
															-    df_dump.test_dumplicate(268920229)
														
 
															+    df_dump.test_dumplicate(237450072)
														
 
															     # df_dump.test_merge([292315564],[287890754])
														
 
															     # df_dump.flow_remove_project_tmp()
														
 
															     print("takes",time.time()-a)
														
--- a/BaseDataMaintenance/maintenance/dataflow_mq.py
+++ b/BaseDataMaintenance/maintenance/dataflow_mq.py
@@ -15,6 +15,8 @@ from BaseDataMaintenance.common.Utils import article_limit
 
															 from BaseDataMaintenance.common.documentFingerprint import getFingerprint
														
 
															 from BaseDataMaintenance.model.postgres.document_extract import *
														
 
															+
														
 
															+
														
 
															 class ActiveMQListener():
														
 
															     def __init__(self,conn,_queue,*args,**kwargs):
														
--- a/BaseDataMaintenance/maxcompute/documentAnalysis.py
+++ b/BaseDataMaintenance/maxcompute/documentAnalysis.py
@@ -1,6 +1,111 @@
 
															+#coding:utf8
														
 
															 from odps.udf import annotate
														
 
															-from odps.udf import BaseUDTF
														
 
															+from odps.udf import BaseUDTF,BaseUDAF
														
 
															+from odps.distcache import get_cache_archive
														
 
															+from odps.distcache import get_cache_file
														
 
															+import copy
														
 
															+import sys,os,re
														
 
															+
														
 
															+import threading
														
 
															+import logging
														
 
															+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
														
 
															+import time
														
 
															+from multiprocessing import Process,Queue
														
 
															+
														
 
															+def log(msg):
														
 
															+    logging.info(msg)
														
 
															+
														
 
															+
														
 
															+# 配置pandas依赖包
														
 
															+def include_package_path(res_name):
														
 
															+    import os, sys
														
 
															+    archive_files = get_cache_archive(res_name)
														
 
															+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
														
 
															+                        if '.dist_info' not in f.name], key=lambda v: len(v))
														
 
															+    _path = dir_names[0].split(".zip/files")[0]+".zip/files"
														
 
															+    log("add path:%s"%(_path))
														
 
															+    sys.path.append(_path)
														
 
															+    return _path
														
 
															+
														
 
															+# 可能出现类似RuntimeError: xxx has been blocked by sandbox
														
 
															+# 这是因为包含C的库，会被沙盘block，可设置set odps.isolation.session.enable = true
														
 
															+def include_file(file_name):
														
 
															+    import os, sys
														
 
															+    so_file = get_cache_file(file_name)
														
 
															+    sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
														
 
															+
														
 
															+def include_so(file_name):
														
 
															+    import os, sys
														
 
															+    so_file = get_cache_file(file_name)
														
 
															+
														
 
															+    with open(so_file.name, 'rb') as fp:
														
 
															+        content=fp.read()
														
 
															+        so = open(file_name, "wb")
														
 
															+        so.write(content)
														
 
															+        so.flush()
														
 
															+        so.close()
														
 
															+
														
 
															+#初始化业务数据包，由于上传限制,python版本以及archive解压包不统一等各种问题，需要手动导入
														
 
															+def init_env(list_files,package_name):
														
 
															+    import os,sys
														
 
															+
														
 
															+    if len(list_files)==1:
														
 
															+        so_file = get_cache_file(list_files[0])
														
 
															+        cmd_line = os.path.abspath(so_file.name)
														
 
															+        os.system("unzip -o %s -d %s"%(cmd_line,package_name))
														
 
															+    elif len(list_files)>1:
														
 
															+        cmd_line = "cat"
														
 
															+        for _file in list_files:
														
 
															+            so_file = get_cache_file(_file)
														
 
															+            cmd_line += " "+os.path.abspath(so_file.name)
														
 
															+        cmd_line += " > temp.zip"
														
 
															+        os.system(cmd_line)
														
 
															+        os.system("unzip -o temp.zip -d %s"%(package_name))
														
 
															+    # os.system("rm -rf %s/*.dist-info"%(package_name))
														
 
															+    # return os.listdir(os.path.abspath("local_package"))
														
 
															+    # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
														
 
															+    # os.system("source ~/.bashrc")
														
 
															+    sys.path.insert(0,os.path.abspath(package_name))
														
 
															+
														
 
															+    # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
														
 
															+def multiLoadEnv():
														
 
															+    def load_project():
														
 
															+        start_time = time.time()
														
 
															+        ## init_env(["BiddingKG.zip.env.baseline"],str(uuid.uuid4()))
														
 
															+        # init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
														
 
															+        #改为zip引入
														
 
															+        log("=======")
														
 
															+        include_package_path("BiddingKG.baseline.zip")
														
 
															+        # include_package_path("BiddingKG.backup.zip")
														
 
															+        logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
														
 
															+
														
 
															+    def load_vector():
														
 
															+        start_time = time.time()
														
 
															+        # init_env(["wiki_128_word_embedding_new.vector.env"],".")
														
 
															+        include_package_path("wiki.zip")
														
 
															+        logging.info("init wiki_128_word_embedding_new cost %d"%(time.time()-start_time))
														
 
															+
														
 
															+        start_time = time.time()
														
 
															+        # init_env(["enterprise.zip.env"],".")
														
 
															+        # init_env(["LEGAL_ENTERPRISE.zip.env"],".")
														
 
															+        include_package_path("enterprise.zip")
														
 
															+        logging.info("init legal_enterprise.zip.env cost %d"%(time.time()-start_time))
														
 
															+
														
 
															+        start_time = time.time()
														
 
															+        init_env(["so.env"],".")
														
 
															+        logging.info("init so.env cost %d"%(time.time()-start_time))
														
 
															+
														
 
															+    def load_py():
														
 
															+        start_time = time.time()
														
 
															+        # self.out = init_env(["envs_py37.zip.env"],str(uuid.uuid4()))
														
 
															+        include_package_path("envs_py37.env.zip")
														
 
															+        # include_package_path("envs_py35.zip")
														
 
															+        logging.info("init envs_py cost %d"%(time.time()-start_time))
														
 
															+
														
 
															+    load_project()
														
 
															+    load_vector()
														
 
															+    load_py()
														
 
															 @annotate('string -> string')
														
 
															 class f_analysis_type(BaseUDTF):
														
@@ -38,3 +143,834 @@ class f_analysis_type(BaseUDTF):
 
															             if len(list_match)>0:
														
 
															                 self.forward(",".join(list_match))
														
 
															+
														
 
# Data cleaning
def segment(soup,final=True):
    '''
    Flatten a BeautifulSoup node into one cleaned-up string.

    Two paths exist:

    * Short-circuit: when ``soup`` is a td/a/span/p node with at most one
      non-empty leaf descendant, its text is returned after newline and
      whitespace replacement only.
    * General: punctuation is inserted after selected tags (this mutates
      ``soup`` in place), the text is extracted, ASCII punctuation next to
      Chinese characters is converted to full-width, runs of punctuation are
      collapsed, all whitespace is removed and attachment markers are
      separated from the preceding text by "。".

    :param soup: node exposing ``name``, ``attrs``, ``find_all`` and
        ``get_text`` (presumably a bs4 Tag/BeautifulSoup object - confirm
        against callers).
    :param final: when truthy, the ``##space##`` placeholder is turned back
        into a space before punctuation is collapsed.
    :return: the cleaned text.
    '''
    # print("==")
    # print(soup)
    # print("====")
    #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
    subspaceList = ["td",'a',"span","p"]
    if soup.name in subspaceList:
        # Count the leaf descendants that carry non-blank text
        _count = 0
        for child in soup.find_all(recursive=True):
            if child.get_text().strip()!="" and len(child.find_all())==0:
                _count += 1
        if _count<=1:
            text = soup.get_text()
            # 2020/11/24 rule added for large sites: when the visible text
            # contains "..." and (minus its last 3 characters) is contained in
            # the title attribute, use the title attribute instead
            if 'title' in soup.attrs:
                if '...' in soup.get_text() and soup.get_text().strip()[:-3] in soup.attrs['title']:
                    text = soup.attrs['title']

            # Lengths of the whitespace-separated tokens
            _list = []
            for x in re.split("\s+",text):
                if x.strip()!="":
                    _list.append(len(x))
            # Whitespace becomes "，" only when every token is longer than
            # 2 characters; otherwise whitespace is simply removed
            if len(_list)>0:
                _minLength = min(_list)
                if _minLength>2:
                    _substr = "，"
                else:
                    _substr = ""
            else:
                _substr = ""

            text = text.replace("\r\n","，").replace("\n","，")
            text = re.sub("\s+",_substr,text)
            # text = re.sub("\s+","##space##",text)
            return text
    segList = ["title"]
    commaList = ["div","br","td","p","li"]
    #commaList = []
    # NOTE(review): spaceList and tbodies are assigned but never read below
    spaceList = ["span"]
    tbodies = soup.find_all('tbody')
    if len(tbodies) == 0:
        tbodies = soup.find_all('table')
    # Walk every descendant and insert punctuation after selected tags
    for child in soup.find_all(recursive=True):
        # print(child.name,child.get_text())
        if child.name in segList:
            child.insert_after("。")
        if child.name in commaList:
            child.insert_after("，")
        # if child.name == 'div' and 'class' in child.attrs:
        #     # add the "attachment" marker
        #     if "richTextFetch" in child['class']:
        #         child.insert_before("##attachment##")
        # print(child.parent)
        # if child.name in subspaceList:
        #     child.insert_before("#subs"+str(child.name)+"#")
        #     child.insert_after("#sube"+str(child.name)+"#")
        # if child.name in spaceList:
        #     child.insert_after(" ")
    text = str(soup.get_text())
    # Replace an ASCII colon adjacent to a Chinese character with a full-width colon
    text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])","：",text)
    # Same for commas
    text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])","，",text)
    # Same for semicolons
    text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])","；",text)
    # Exclamation marks adjacent to a Chinese character become a Chinese full stop
    text = re.sub("(?<=[\u4e00-\u9fa5])[!！]|[!！](?=[\u4e00-\u9fa5])","。",text)
    # Replace runs of 2+ question marks (unrecognised formatting) and newlines with " ", update:2021/7/20
    text = re.sub("[？\?]{2,}|\n"," ",text)


    # Replace the ASCII double quote with "“", otherwise the deepdive import fails
    # text = text.replace('"',"“").replace("\r","").replace("\n","，")
    text = text.replace('"',"“").replace("\r","").replace("\n","").replace("\\n","") # 2022/1/4 fix: turning non-paragraph \n into a comma split company names apart, e.g. span \n南航\n上海\n分公司
    # print('==1',text)
    # text = re.sub("\s{4,}","，",text)
    # Handle runs of 4+ whitespace characters found in announcements
    if re.search("\s{4,}",text):
        _text = ""
        # Split on 。 then ， then ： and rebuild with exactly one of each
        # separator; inside each piece, a run of 4+ whitespace characters is
        # a split point
        for _sent in re.split("。+",text):
            for _sent2 in re.split('，+',_sent):
                for _sent3 in re.split("：+",_sent2):
                    for _t in re.split("\s{4,}",_sent3):
                        # fragments shorter than 3 characters are appended
                        # as-is, longer ones are prefixed with "，"
                        if len(_t)<3:
                            _text += _t
                        else:
                            _text += "，"+_t
                    _text += "："
                # drop the trailing "：" before closing the comma segment
                _text = _text[:-1]
                _text += "，"
            # drop the trailing "，" before closing the sentence
            _text = _text[:-1]
            _text += "。"
        # drop the trailing "。"
        _text = _text[:-1]
        text = _text
    # print('==2',text)
    # Replace punctuation

    # Replace consecutive punctuation

    if final:
        text = re.sub("##space##"," ",text)

    punc_pattern = "(?P<del>[。，；：:,\s]+)"

    # Collapse each run of punctuation/whitespace, longest runs first
    list_punc = re.findall(punc_pattern,text)
    list_punc.sort(key=lambda x:len(x),reverse=True)
    for punc_del in list_punc:
        if len(punc_del)>1:
            if len(punc_del.strip())>0:
                if "：" in punc_del.strip():
                    # a run containing "：" keeps the colon (plus "。" when present)
                    if "。" in punc_del.strip():
                        text = re.sub(punc_del, "：。", text)
                    else:
                        text = re.sub(punc_del,"：",text)
                else:
                    # keep only the first non-whitespace mark of the run
                    text = re.sub(punc_del,punc_del.strip()[0],text)   # 2021/12/09 fix: punctuation inserted after some tags used to replace the original punctuation
            else:
                # whitespace-only run
                text = re.sub(punc_del,"",text)


    # Collapse consecutive Chinese full stops into one (empty pieces, including
    # leading/trailing ones, are dropped by the filter)
    text_split = text.split("。")
    text_split = [x for x in text_split if len(x)>0]
    text = "。".join(text_split)

    # # Remove all whitespace inside the tags
    # for subs in subspaceList:
    #     patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
    #     while(True):
    #         oneMatch = re.search(re.compile(patten),text)
    #         if oneMatch is not None:
    #             _match = oneMatch.group(1)
    #             text = text.replace("#subs"+str(subs)+"#"+_match+"#sube"+str(subs)+"#",_match)
    #         else:
    #             break

    # Oversized text raised an error, so it is processed in chunks
    LOOP_LEN = 10000
    LOOP_BEGIN = 0
    _text = ""



    # Texts of 10,000,000 characters or more skip this pass entirely
    if len(text)<10000000:
        while(LOOP_BEGIN<len(text)):
            # per chunk: remove all whitespace, convert full-width parentheses to ASCII
            _text += re.sub("）",")",re.sub("（","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
            LOOP_BEGIN += LOOP_LEN
        text = _text
    # Put a full stop in front of attachment markers so body text and
    # attachment content do not run together.
    # NOTE(review): the pattern replaces the single non-"。" character that
    # precedes the marker rather than inserting before it - confirm that
    # dropping that character is intended.
    text = re.sub("[^。](?=##attachment##)","。",text)
    text = re.sub("[^。](?=##attachment_begin##)","。",text)
    text = re.sub("[^。](?=##attachment_end##)","。",text)
    # ...and strip a full stop that directly follows a begin/end marker
    text = re.sub("##attachment_begin##。","##attachment_begin##",text)
    text = re.sub("##attachment_end##。","##attachment_end##",text)

    return text
														
 
															+def fixSpan(tbody):
														
 
															+    # 处理colspan, rowspan信息补全问题
														
 
															+    #trs = tbody.findChildren('tr', recursive=False)
														
 
															+
														
 
															+    trs = getTrs(tbody)
														
 
															+    ths_len = 0
														
 
															+    ths = list()
														
 
															+    #修改为先进行列补全再进行行补全，否则可能会出现表格解析混乱
														
 
															+    # 遍历每一个tr
														
 
															+
														
 
															+    for indtr, tr in enumerate(trs):
														
 
															+        ths_tmp = tr.findChildren('th', recursive=False)
														
 
															+        #不补全含有表格的tr
														
 
															+        if len(tr.findChildren('table'))>0:
														
 
															+            continue
														
 
															+        if len(ths_tmp) > 0:
														
 
															+            ths_len = ths_len + len(ths_tmp)
														
 
															+            for th in ths_tmp:
														
 
															+                ths.append(th)
														
 
															+        # 遍历每行中的element
														
 
															+        tds = tr.findChildren(recursive=False)
														
 
															+        for indtd, td in enumerate(tds):
														
 
															+            # 若有colspan 则补全同一行下一个位置
														
 
															+            if 'colspan' in td.attrs:
														
 
															+                if str(re.sub("[^0-9]","",str(td['colspan'])))!="":
														
 
															+                    col = int(re.sub("[^0-9]","",str(td['colspan'])))
														
 
															+                    if col<100 and len(td.get_text())<1000:
														
 
															+                        td['colspan'] = 1
														
 
															+                        for i in range(1, col, 1):
														
 
															+                            td.insert_after(copy.copy(td))
														
 
															+
														
 
															+    for indtr, tr in enumerate(trs):
														
 
															+        ths_tmp = tr.findChildren('th', recursive=False)
														
 
															+        #不补全含有表格的tr
														
 
															+        if len(tr.findChildren('table'))>0:
														
 
															+            continue
														
 
															+        if len(ths_tmp) > 0:
														
 
															+            ths_len = ths_len + len(ths_tmp)
														
 
															+            for th in ths_tmp:
														
 
															+                ths.append(th)
														
 
															+        # 遍历每行中的element
														
 
															+        tds = tr.findChildren(recursive=False)
														
 
															+        for indtd, td in enumerate(tds):
														
 
															+            # 若有rowspan 则补全下一行同样位置
														
 
															+            if 'rowspan' in td.attrs:
														
 
															+                if str(re.sub("[^0-9]","",str(td['rowspan'])))!="":
														
 
															+                    row = int(re.sub("[^0-9]","",str(td['rowspan'])))
														
 
															+                    td['rowspan'] = 1
														
 
															+                    for i in range(1, row, 1):
														
 
															+                        # 获取下一行的所有td， 在对应的位置插入
														
 
															+                        if indtr+i<len(trs):
														
 
															+                            tds1 = trs[indtr + i].findChildren(['td','th'], recursive=False)
														
 
															+                            if len(tds1) >= (indtd) and len(tds1)>0:
														
 
															+                                if indtd > 0:
														
 
															+                                    tds1[indtd - 1].insert_after(copy.copy(td))
														
 
															+                                else:
														
 
															+                                    tds1[0].insert_before(copy.copy(td))
														
 
															+                            elif indtd-2>0 and len(tds1) > 0 and len(tds1) == indtd - 1:  # 修正某些表格最后一列没补全
														
 
															+                                tds1[indtd-2].insert_after(copy.copy(td))
														
 
															+def getTable(tbody):
														
 
															+    #trs = tbody.findChildren('tr', recursive=False)
														
 
															+    trs = getTrs(tbody)
														
 
															+    inner_table = []
														
 
															+    for tr in trs:
														
 
															+        tr_line = []
														
 
															+        tds = tr.findChildren(['td','th'], recursive=False)
														
 
															+        if len(tds)==0:
														
 
															+            tr_line.append([re.sub('\xa0','',segment(tr,final=False)),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
														
 
															+        for td in tds:
														
 
															+            tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
														
 
															+            #tr_line.append([td.get_text(),0])
														
 
															+        inner_table.append(tr_line)
														
 
															+    return inner_table
														
 
															+
														
 
															+def getTrs(tbody):
														
 
															+    #获取所有的tr
														
 
															+    trs = []
														
 
															+    objs = tbody.find_all(recursive=False)
														
 
															+    for obj in objs:
														
 
															+        if obj.name=="tr":
														
 
															+            trs.append(obj)
														
 
															+        if obj.name=="tbody":
														
 
															+            for tr in obj.find_all("tr",recursive=False):
														
 
															+                trs.append(tr)
														
 
															+    return trs
														
 
															+
														
 
															+#处理表格不对齐的问题
														
 
															+def fixTable(inner_table,fix_value=""):
														
 
															+    maxWidth = 0
														
 
															+    for item in inner_table:
														
 
															+        if len(item)>maxWidth:
														
 
															+            maxWidth = len(item)
														
 
															+    if maxWidth > 100:
														
 
															+        # log('表格列数大于100，表格异常不做处理。')
														
 
															+        return []
														
 
															+    for i in range(len(inner_table)):
														
 
															+        if len(inner_table[i])<maxWidth:
														
 
															+            for j in range(maxWidth-len(inner_table[i])):
														
 
															+                inner_table[i].append([fix_value,0])
														
 
															+    return inner_table
														
 
															+
														
 
															+def getTableText(inner_table):
														
 
															+    height = len(inner_table)
														
 
															+    table_text = []
														
 
															+    if height>0:
														
 
															+        width = len(inner_table[0])
														
 
															+        for _h in range(height):
														
 
															+            _line = []
														
 
															+            for _w in range(width):
														
 
															+                _line.append(inner_table[_h][_w][0])
														
 
															+            table_text.append(_line)
														
 
															+    return table_text
														
 
															+
														
 
															+@annotate('string -> string')
														
 
															+class f_table_preprocess(BaseUDTF):
														
 
															+    '''
														
 
															+    获取所有的表格并返回表格数组的json
														
 
															+    '''
														
 
															+    def __init__(self):
														
 
															+        import logging
														
 
															+        import json
														
 
															+        import time,re
														
 
															+
														
 
															+        global json,logging,time,re
														
 
															+        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
														
 
															+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
														
 
															+        include_package_path("envs_py37.env.zip")
														
 
															+        from bs4 import BeautifulSoup
														
 
															+        global BeautifulSoup
														
 
															+
														
 
															+
														
 
															+    def process(self, dochtmlcon):
														
 
															+        if dochtmlcon is not None:
														
 
															+            _soup = BeautifulSoup(dochtmlcon,"lxml")
														
 
															+            richText = _soup.find("div",attrs={"class":"richTextFetch"})
														
 
															+            if richText is not None:
														
 
															+                richText.decompose()
														
 
															+            list_table = _soup.find_all("table")
														
 
															+            list_table.reverse()
														
 
															+            for tbody in list_table:
														
 
															+                fixSpan(tbody)
														
 
															+                inner_table = getTable(tbody)
														
 
															+                inner_table = fixTable(inner_table)
														
 
															+                list_text = getTableText(inner_table)
														
 
															+                print(list_text)
														
 
															+                table_text_json = json.dumps(list_text,ensure_ascii=False)
														
 
															+                if len(table_text_json)<200000:
														
 
															+                    self.forward(table_text_json)
														
 
															+                tbody.decompose()
														
 
															+            list_table = _soup.find_all("tbody")
														
 
															+            list_table.reverse()
														
 
															+            for tbody in list_table:
														
 
															+                fixSpan(tbody)
														
 
															+                inner_table = getTable(tbody)
														
 
															+                inner_table = fixTable(inner_table)
														
 
															+                list_text = getTableText(inner_table)
														
 
															+                table_text_json = json.dumps(list_text,ensure_ascii=False)
														
 
															+                if len(table_text_json)<200000:
														
 
															+                    self.forward(table_text_json)
														
 
															+
														
 
															+
														
 
															+def get_top_n_words(list_words,n,skip_punctuation=True):
														
 
															+    top_n_words = []
														
 
															+    for _word in list_words:
														
 
															+        if skip_punctuation and _word in (',',"。","，","：","(","（",")","）",""):
														
 
															+            continue
														
 
															+        top_n_words.append(_word)
														
 
															+        if len(top_n_words)>=n:
														
 
															+            break
														
 
															+    return top_n_words
														
 
															+
														
 
															+
														
 
															+time_pattern = re.compile("^\d{4}[\-年/]\d{2}[\-月/]\d{2}日?(\s*\d{2}[:时]\d{2}[:分]\d{2})?$")
														
 
															+time_pattern_cn = re.compile("\d{4}[年]\d{1,2}[月]\d{1,2}日?")
														
 
															+phone_pattern = re.compile('^(1[3|4|5|7|8|9][0-9][-|——|—]?\d{4}[-|——|—]?\d{4}|\d{3,4}[-|——|—]\d{7,8}/\d{3,8}|\d{3,4}[-|——|—]\d{7,8}转\d{1,4}|\d{3,4}[-|——|—]\d{7,8}|[\（|\(]0\d{2,3}[\）|\)]\d{7,8})$') # 联系电话
														
 
															+list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
														
 
															+                      "key_word": "((?P<text_key_word>(?:[￥¥]+，?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,，（\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[）\)]?)\s*[，,:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>-*[0-9][\d,]*(?:\.\d+)?(?P<science_key_word>(E-?\d+))?(?:，?)[百千]{,1})(?:[（\(]?(?P<filter_>[%])*\s*(单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[）\)]?))",
														
 
															+                      "front_m":"((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)])\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>-*[0-9][\d,]*(?:\.\d+)?(?P<science_front_m>(E-?\d+))?(?:，?)[百千]*)())",
														
 
															+                      "behind_m":"(()()(?P<money_behind_m>-*[0-9][\d,]*(?:\.\d+)?(?P<science_behind_m>(E-?\d+))?(?:，?)[百千]*)(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
														
 
															+list_m_p = []
														
 
															+for k,v in list_money_pattern.items():
														
 
															+    list_m_p.append(v)
														
 
															+
														
 
															+money_pattern = re.compile("^(%s)$"%"|".join(list_m_p))
														
 
															+money_pattern_cn = re.compile(list_money_pattern["cn"])
														
 
															+number_pattern = re.compile("^[0-9]+(\.\d+)?$")
														
 
															+eng_pattern = re.compile("[0-9a-zA-Z\-\(\)（）:/\.￥¥_\[\]【】@]{1,}")
														
 
															+filename_pattern = "\.(pdf|doc|docx|xls|xlsx|zip|rar|swf|txt|html)$"
														
 
															+
														
 
															+
														
 
															+
														
 
															+def extract_external_types(text):
														
 
															+    list_types = []
														
 
															+    #extract eng sentences
														
 
															+    import re
														
 
															+    for _match in re.finditer(eng_pattern,text):
														
 
															+        match_text = text[_match.start():_match.end()]
														
 
															+        if re.search(filename_pattern,match_text) is not None:
														
 
															+            list_types.append((_match.start(),_match.end(),"##filename##",match_text))
														
 
															+        elif re.search(time_pattern,match_text) is not None:
														
 
															+            list_types.append((_match.start(),_match.end(),"##time##",match_text))
														
 
															+        elif re.search(phone_pattern,match_text) is not None:
														
 
															+            list_types.append((_match.start(),_match.end(),"##phone##",match_text))
														
 
															+        elif re.search(money_pattern,match_text) is not None:
														
 
															+            list_types.append((_match.start(),_match.end(),"##money##",match_text))
														
 
															+        elif re.search(number_pattern,match_text) is not None:
														
 
															+            list_types.append((_match.start(),_match.end(),"##number##",match_text))
														
 
															+        else:
														
 
															+            _len = len(match_text)
														
 
															+            if _len<5:
														
 
															+                list_types.append((_match.start(),_match.end(),"##engsentence_<5##",match_text))
														
 
															+            elif _len>=5:
														
 
															+                list_types.append((_match.start(),_match.end(),"##engsentence_>=5##",match_text))
														
 
															+
														
 
															+    _search = re.search(time_pattern_cn,text)
														
 
															+    if _search is not None:
														
 
															+        match_text = text[_search.start():_search.end()]
														
 
															+        list_types.append((_search.start(),_search.end(),"##time##",match_text))
														
 
															+
														
 
															+    _search = re.search(money_pattern_cn,text)
														
 
															+    if _search is not None:
														
 
															+        match_text = text[_search.start():_search.end()]
														
 
															+        list_types.append((_search.start(),_search.end(),"##money##",match_text))
														
 
															+
														
 
															+    set_begin_end = set()
														
 
															+    final_types = []
														
 
															+    list_types.sort(key=lambda x:len(x[3]),reverse=True)
														
 
															+    for _t in list_types:
														
 
															+        _begin = _t[0]
														
 
															+        _end = _t[1]
														
 
															+        _exists = False
														
 
															+        for _b,_e in set_begin_end:
														
 
															+            if _begin>=_b and _begin<_e:
														
 
															+                _exists = True
														
 
															+                break
														
 
															+            if _end>_b and _end<=_e:
														
 
															+                _exists = True
														
 
															+                break
														
 
															+        if not _exists:
														
 
															+            set_begin_end.add((_begin,_end))
														
 
															+            final_types.append(_t)
														
 
															+
														
 
															+    return final_types
														
 
															+
														
 
															+def extract_text_types(text,list_entitys):
														
 
															+    set_types = set()
														
 
															+    #extract eng sentences
														
 
															+
														
 
															+    list_types = extract_external_types(text)
														
 
															+
														
 
															+    for _e in list_types:
														
 
															+        if isinstance(_e,tuple):
														
 
															+            _type = "%s"%(_e[2])
														
 
															+            _text = _e[3]
														
 
															+            set_types.add(_type)
														
 
															+        elif isinstance(_e,str):
														
 
															+            set_types.add(_e)
														
 
															+
														
 
															+    for _e in list_entitys:
														
 
															+        if isinstance(_e,tuple):
														
 
															+            _type = "##%s##"%(_e[2])
														
 
															+            _text = _e[3]
														
 
															+            set_types.add(_type)
														
 
															+        elif isinstance(_e,str):
														
 
															+            set_types.add(_e)
														
 
															+
														
 
															+    return list(set_types)
														
 
															+
														
 
															+
														
 
															+@annotate("string->string")
														
 
															+class f_table_cell_process():
														
 
															+    '''
														
 
															+    将表格单元格数据处理成字典，单元格去重，清洗，类型识别
														
 
															+    '''
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        import logging
														
 
															+        import json
														
 
															+        import time,re
														
 
															+
														
 
															+        global json,logging,time,re
														
 
															+        # sys.path.insert(0,"F:\Workspace2016\BiddingKG")
														
 
															+        # from BiddingKG.dl.foolnltk.selffool.selffool_ner import SelfNer
														
 
															+        # include_package_path("jieba0.42.zip")
														
 
															+
														
 
															+        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
														
 
															+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
														
 
															+        multiLoadEnv()
														
 
															+        from BiddingKG.dl.foolnltk.selffool.selffool_ner import SelfNer
														
 
															+        from BiddingKG.dl.table_head.predict import predict
														
 
															+        from bs4 import BeautifulSoup
														
 
															+        import jieba
														
 
															+        global BeautifulSoup,SelfNer,jieba,predict
														
 
															+        self.selfner = SelfNer()
														
 
															+
														
 
															+    def set_head_table(self,inner_table):
														
 
															+        if len(inner_table)>0:
														
 
															+            copy_inner_table = copy.deepcopy(inner_table)
														
 
															+
														
 
															+            for i in range(len(copy_inner_table)):
														
 
															+                for j in range(len(copy_inner_table[i])):
														
 
															+                    # 删掉单格前后符号，以免影响表头预测
														
 
															+                    col = copy_inner_table[i][j]
														
 
															+                    col = re.sub("^[^\u4e00-\u9fa5a-zA-Z0-9]+", "", col)
														
 
															+                    col = re.sub("[^\u4e00-\u9fa5a-zA-Z0-9]+$", "", col)
														
 
															+                    copy_inner_table[i][j] = col
														
 
															+
														
 
															+            # 模型预测表头
														
 
															+            predict_list = predict(copy_inner_table)
														
 
															+
														
 
															+            # 组合结果
														
 
															+            for i in range(len(copy_inner_table)):
														
 
															+                for j in range(len(copy_inner_table[i])):
														
 
															+                    copy_inner_table[i][j] = [inner_table[i][j], int(predict_list[i][j])]
														
 
															+
														
 
															+            # print("table_head before repair", inner_table)
														
 
															+
														
 
															+            return copy_inner_table
														
 
															+
														
 
															+    def evaluate(self, list_text_json):
														
 
															+        if list_text_json is None or list_text_json=="":
														
 
															+            return
														
 
															+        list_text = json.loads(list_text_json)
														
 
															+
														
 
															+        list_table_cell = []
														
 
															+        height = len(list_text)
														
 
															+        if height>0:
														
 
															+            width = len(list_text[0])
														
 
															+            if width>50 or height>50:
														
 
															+                return
														
 
															+            inner_table = self.set_head_table(list_text)
														
 
															+
														
 
															+            for _h in range(height):
														
 
															+                _line = []
														
 
															+                for _w in range(width):
														
 
															+                    text = list_text[_h][_w]
														
 
															+                    list_words = list(jieba.cut(text))
														
 
															+                    #取20个词
														
 
															+                    top_20_words = get_top_n_words(list_words,20,True)
														
 
															+
														
 
															+                    ner_text = "".join(top_20_words)
														
 
															+
														
 
															+                    if len(ner_text)>0:
														
 
															+                        list_entity = self.selfner.ner([ner_text])[0]
														
 
															+                    else:
														
 
															+                        list_entity = []
														
 
															+                    list_types = extract_text_types(ner_text,list_entity)
														
 
															+                    _cell = {"top_n_words":top_20_words,
														
 
															+                             "list_types":list_types}
														
 
															+                    top_words_2gram = []
														
 
															+                    for _i in range(len(top_20_words)-1):
														
 
															+                        top_words_2gram.append("".join(top_20_words[_i:_i+2]))
														
 
															+                    _cell["top_words_2gram"] = top_words_2gram
														
 
															+                    _cell["is_head"] = inner_table[_h][_w][1]
														
 
															+                    _line.append(_cell)
														
 
															+                list_table_cell.append(_line)
														
 
															+        return json.dumps(list_table_cell,ensure_ascii=False)
														
 
															+
														
 
															+
														
 
															+@annotate('string -> string,string,string,string')
														
 
															+class f_words_contact(BaseUDTF):
														
 
															+    '''
														
 
															+    生成关联的词组信息，以供后续计算
														
 
															+    '''
														
 
															+    def __init__(self):
														
 
															+        import logging
														
 
															+        import json
														
 
															+        import time,re
														
 
															+        global json,logging,time,re
														
 
															+        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
														
 
															+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
														
 
															+
														
 
															+    def process(self, table_cell_json):
														
 
															+        if table_cell_json is not None:
														
 
															+            list_cell = json.loads(table_cell_json)
														
 
															+            height = len(list_cell)
														
 
															+            list_a = []
														
 
															+            if height>0:
														
 
															+                width = len(list_cell[0])
														
 
															+                for _h in range(height):
														
 
															+                    for _w in range(width):
														
 
															+                        _cell = list_cell[_h][_w]
														
 
															+                        top_words = _cell["top_n_words"]
														
 
															+                        list_types = _cell["list_types"]
														
 
															+                        top_words_2gram = _cell["top_words_2gram"]
														
 
															+                        is_head = _cell["is_head"]
														
 
															+                        list_w = []
														
 
															+                        list_w.extend(top_words)
														
 
															+                        list_w.extend(list_types)
														
 
															+                        list_w.extend(top_words_2gram)
														
 
															+                        whole_w1 = "".join(top_words)
														
 
															+                        # _type = "single"
														
 
															+                        # for _w1 in list_w:
														
 
															+                        #     self.forward(_type,_w1,"")
														
 
															+                        #FIND THE LEFT SIDE
														
 
															+                        if is_head==1:
														
 
															+                            continue
														
 
															+
														
 
															+                        _find = False
														
 
															+                        for _w1 in range(_w):
														
 
															+                            _cell1 = list_cell[_h][_w1]
														
 
															+                            top_words1 = _cell1["top_n_words"]
														
 
															+                            list_types1 = _cell1["list_types"]
														
 
															+                            top_words_2gram1 = _cell1["top_words_2gram"]
														
 
															+                            is_head1 = _cell1["is_head"]
														
 
															+                            if is_head1==0:
														
 
															+                                if _find:
														
 
															+                                    break
														
 
															+                                else:
														
 
															+                                    continue
														
 
															+                            else:
														
 
															+                                _find = True
														
 
															+                            list_w1 = []
														
 
															+                            list_w1.extend(top_words1)
														
 
															+                            list_w1.extend(list_types1)
														
 
															+                            list_w1.extend(top_words_2gram1)
														
 
															+                            whole_w2 = "".join(top_words1)
														
 
															+                            _type = "pair"
														
 
															+                            # for _w1 in list_w:
														
 
															+                            #     # for _w2 in list_w1:
														
 
															+                            #     self.forward(_type,_w1,_w2)
														
 
															+                            self.forward(",".join(list_types),",".join(top_words),whole_w1,whole_w2)
														
 
															+
														
 
															+                        #FIND THE RIGHT SIDE
														
 
															+                        _find = False
														
 
															+                        for _h1 in range(_h):
														
 
															+                            _cell1 = list_cell[_h1][_w]
														
 
															+                            top_words1 = _cell1["top_n_words"]
														
 
															+                            list_types1 = _cell1["list_types"]
														
 
															+                            top_words_2gram1 = _cell1["top_words_2gram"]
														
 
															+                            list_w1 = []
														
 
															+                            list_w1.extend(top_words1)
														
 
															+                            list_w1.extend(list_types1)
														
 
															+                            list_w1.extend(top_words_2gram1)
														
 
															+                            is_head1 = _cell1["is_head"]
														
 
															+                            if is_head1==0:
														
 
															+                                continue
														
 
															+                                if _find:
														
 
															+                                    break
														
 
															+                                else:
														
 
															+                                    continue
														
 
															+                            else:
														
 
															+                                _find = True
														
 
															+                            whole_w2 = "".join(top_words1)
														
 
															+                            _type = "pair"
														
 
															+                            # for _w1 in list_w:
														
 
															+                            #     # for _w2 in list_w1:
														
 
															+                            #     #     self.forward(_type,_w1,_w2)
														
 
															+                            #     self.forward(_type,_w1,whole_w2)
														
 
															+                            self.forward(",".join(list_types),",".join(top_words),whole_w1,whole_w2)
														
 
															+
														
 
															+def regenerate_tokens_types(list_tokens,list_types):
														
 
															+    set_begin_end = set()
														
 
															+    set_types = set()
														
 
															+    for _t in list_types:
														
 
															+        _begin = _t[0]
														
 
															+        _end = _t[1]
														
 
															+        _type = _t[2]
														
 
															+        set_begin_end.add((_begin,_end))
														
 
															+        set_types.add(_type)
														
 
															+
														
 
															+    _begin = 0
														
 
															+    list_result = []
														
 
															+    for _token in list_tokens:
														
 
															+        _exists = False
														
 
															+        _end = _begin+len(_token)
														
 
															+        for _b,_e in set_begin_end:
														
 
															+            if _begin>=_b and _begin<_e:
														
 
															+                _exists = True
														
 
															+                break
														
 
															+            if _end>_b and _end<=_e:
														
 
															+                _exists = True
														
 
															+                break
														
 
															+        if not _exists:
														
 
															+            list_result.append(_token)
														
 
															+        _begin = _end
														
 
															+    list_result.extend(list(set_types))
														
 
															+    return list_result
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+@annotate('string,string,string,bigint -> string,string,double')
														
 
															+class f_process_words_contact(BaseUDTF):
														
 
															+    '''
														
 
															+    生成关联的词组信息，以供后续计算
														
 
															+    '''
														
 
															+    def __init__(self):
														
 
															+        import logging
														
 
															+        import json
														
 
															+        import time,re
														
 
															+        global json,logging,time,re
														
 
															+        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
														
 
															+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
														
 
															+
														
 
															+    def process(self, type_center,tokens_center,word_left_top,pair_counts):
														
 
															+        #所有tokens和type共享counts
														
 
															+        #修改token和type互斥
														
 
															+        list_types = type_center.split(",")
														
 
															+
														
 
															+        set_types = set()
														
 
															+        for _t in list_types:
														
 
															+            if "engsentence" in _t or "money" in _t or "float" in _t or "number" in _t or "phone" in _t or "filename" in _t:
														
 
															+                continue
														
 
															+            set_types.add(_t)
														
 
															+
														
 
															+
														
 
															+        list_tokens = tokens_center.split(",")
														
 
															+        text = "".join(list_tokens)
														
 
															+        list_types = extract_external_types(text)
														
 
															+        print("list_tokens:",str(list_tokens))
														
 
															+        print("list_types:",str(list_types))
														
 
															+        list_result = regenerate_tokens_types(list_tokens,list_types)
														
 
															+        print("list_result:",str(list_result))
														
 
															+        _len = len(list_tokens)+len(set_types)
														
 
															+        avg_len = round(pair_counts/_len,2)
														
 
															+        for _t in list_result:
														
 
															+            self.forward(word_left_top,_t,avg_len)
														
 
															+        for _t in set_types:
														
 
															+            self.forward(word_left_top,_t,avg_len)
														
 
															+
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    # table_preprocess = f_table_preprocess()
														
 
															+    a = '''
														
 
															+    <div> 
														
 
															+  <input type="hidden" name="projectCode" value="5002287530902902303030047"> 
														
 
															+  <table width="100%">         
														
 
															+   <tbody>  
														
 
															+    <tr> 
														
 
															+     <td>  </td>   
														
 
															+     <td> 选取时间 <br> 2023-03-08 09:00:00 </td> 
														
 
															+     <td></td> 
														
 
															+     <td> 已有1家中介机构报名参加 </td> 
														
 
															+    </tr> 
														
 
															+   </tbody> 
														
 
															+  </table>  
														
 
															+  <div>
														
 
															+    关于为【2022年度财务报表审计服务】 公开选取【会计师事务所服务】机构的公告 
														
 
															+  </div>    
														
 
															+  <p> 该项目为直接选取项目，由张力从报名的中介服务机构中直接选定一家中介服务机构进行项目服务。 </p>  
														
 
															+  <table>     
														
 
															+   <tbody> 
														
 
															+    <tr> 
														
 
															+     <td> 项目名称jofujieojifw-fewoife </td> 
														
 
															+     <td> 2022年度财务报表审计服务 </td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 采购人 </td> 
														
 
															+     <td> 
														
 
															+      <div>
														
 
															+        <a target="_blank" class="markBlue" href="/bdqyhx/266418872542117888.html" style="color: #3083EB !important;text-decoration: underline;">重庆百里竹海旅游开发建设有限公司</a> 
														
 
															+      </div> 
														
 
															+      <div>      
														
 
															+       <span> （5.0） </span>  
														
 
															+      </div> </td> 
														
 
															+    </tr>  
														
 
															+    <tr> 
														
 
															+     <td> 投资审批项目 </td> 
														
 
															+     <td> 否 </td> 
														
 
															+    </tr>  
														
 
															+    <tr> 
														
 
															+     <td> 项目规模 </td> 
														
 
															+     <td>     </td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 资金来源 </td> 
														
 
															+     <td>  国企资金   </td> 
														
 
															+    </tr>  
														
 
															+    <tr> 
														
 
															+     <td> 项目实施地行政区划 </td> 
														
 
															+     <td> 重庆市梁平区 </td> 
														
 
															+    </tr>    
														
 
															+    <tr> 
														
 
															+     <td> 是否破产业务服务项目采购 </td> 
														
 
															+     <td> 否 </td> 
														
 
															+    </tr>  
														
 
															+    <tr> 
														
 
															+     <td> 是否为行政管理中介服务事项采购 </td> 
														
 
															+     <td> 否 </td> 
														
 
															+    </tr>  
														
 
															+    <tr> 
														
 
															+     <td> 所需服务类型 </td> 
														
 
															+     <td> 会计师事务所服务 </td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 服务内容 </td> 
														
 
															+     <td> 审计2022年度会计报表并出具审计报告 </td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 中介机构要求 </td> 
														
 
															+     <td> 以采购公告为准 </td> 
														
 
															+    </tr>  
														
 
															+    <tr> 
														
 
															+     <td> 资质（资格）要求 </td> 
														
 
															+     <td> 需中介机构具备其中一项服务类型/资质子项  <a target="_blank" class="markBlue" href="https://zjcs.cqggzy.com/cq-zjcs-pub/purchaseNotice/qualityMultiView/919b647d-e66c-4bc8-a47c-67a3ab1f3e29" rel="noreferrer"> 点击查看资质专业要求 </a> </td> 
														
 
															+    </tr>      
														
 
															+    <tr> 
														
 
															+     <td> 其他要求说明 </td> 
														
 
															+     <td> 无 </td> 
														
 
															+    </tr>  
														
 
															+    <tr> 
														
 
															+     <td> 服务时限及说明 </td> 
														
 
															+     <td> 广东签订合同之日起30日内出具财务会计年度审计报告 </td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 合同签订时限及说明 </td> 
														
 
															+     <td> 发布采购公告起7日内公布中选单位，并于5日内签订合同。 </td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 服务金额 </td> 
														
 
															+     <td>  张丽</td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 金额说明 </td> 
														
 
															+     <td> 按照渝价发（2011）257号文《重庆市会计师事务所服务收费管理办法》的收费标准控制在4.5折以内计算。 </td> 
														
 
															+    </tr>   
														
 
															+    <tr> 
														
 
															+     <td> 选取方式 </td> 
														
 
															+     <td> 直接选取 </td> 
														
 
															+    </tr>    
														
 
															+    <tr> 
														
 
															+     <td> 需规避机构 </td> 
														
 
															+     <td> </td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 规避原因 </td> 
														
 
															+     <td> </td> 
														
 
															+    </tr>  
														
 
															+    <tr> 
														
 
															+     <td> 选取时间 </td> 
														
 
															+     <td> 2023-03-08 09:00:00 </td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 资质备案要求 </td> 
														
 
															+     <td>  </td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 采购人业务咨询电话 </td> 
														
 
															+     <td> <a target="_blank" class="markBlue" href="/bdqyhx/266418872542117888.html" style="color: #3083EB !important;text-decoration: underline;">重庆百里竹海旅游开发建设有限公司</a>（19123659525） </td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 监督举报电话 </td> 
														
 
															+     <td> 53888139 </td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 备注 </td> 
														
 
															+     <td> 无 </td> 
														
 
															+    </tr> 
														
 
															+    <tr> 
														
 
															+     <td> 采购需求书下载 </td> 
														
 
															+     <td> <a target="_blank" class="markBlue" rel="noreferrer"> 百里竹海.pdf </a> <br> </td> 
														
 
															+    </tr> 
														
 
															+   </tbody> 
														
 
															+  </table>   
														
 
															+  <p> <font color="red"> 特别提醒：请各中介机构报名前认真审核公司资质、资格是否符合采购公告相关要求，超越资质范围承揽业务，资质和资格条件或者能力不符合采购公告要求而响应采购公告进行报名可能被认定不良行为。 </font> <br> </p>  
														
 
															+  <p> <br> 2023-03-03 </p>  
														
 
															+ </div>
														
 
															+    '''
														
 
															+
														
 
															+    # table_text_json = table_preprocess.process(a)
														
 
															+    #
														
 
															+    # cell_process = f_table_cell_process()
														
 
															+    # table_cell_json = cell_process.evaluate(table_text_json)
														
 
															+    # print(table_cell_json)
														
 
															+    a = "12-23-23"
														
 
															+    a =  ['GGJY', '-', 'JZ', '-', '2014002']
														
 
															+    t = "".join(a)
														
 
															+    print(extract_external_types(t))
														
 
															+    print(regenerate_tokens_types(a,extract_external_types(t)))
														
--- a/BaseDataMaintenance/maxcompute/documentDumplicate.py
+++ b/BaseDataMaintenance/maxcompute/documentDumplicate.py