Browse Source

Adjust monitoring alerts; do not alert when the proposed-building count is 0

luojiehua 2 years ago
Commit
1f9672fef5

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" languageLevel="JDK_13" default="false" project-jdk-name="Python 3.7 (py37)" project-jdk-type="Python SDK">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_13" default="false" project-jdk-name="Python 3.5 (dl_nlp)" project-jdk-type="Python SDK">
     <output url="file://$PROJECT_DIR$/out" />
   </component>
 </project>

+ 14 - 6
BaseDataMaintenance/dataMonitor/data_monitor.py

@@ -13,7 +13,7 @@ from BaseDataMaintenance.dataSource.setttings import *
 from queue import Queue
 from BaseDataMaintenance.common.multiThread import MultiThreadHandler
 
-from BaseDataMaintenance.java.MQInfo import getAllQueueSize,getQueueSize
+
 from BaseDataMaintenance.maintenance.dataflow_settings import *
 
 import pandas as pd
@@ -30,6 +30,9 @@ flow_init_log_dir = "/data/python/flow_init_log"
 flow_init_check_dir = "/data/python/flow_init_check"
 
 
+flow_dumplicate_log_path = "/python_log/flow_dumplicate.log"
+
+
 class BaseDataMonitor():
 
     def __init__(self):
@@ -195,7 +198,7 @@ class BaseDataMonitor():
 
 
     def monitor_attachment(self):
-
+        from BaseDataMaintenance.java.MQInfo import getAllQueueSize,getQueueSize
         try:
             # query = BoolQuery(must_queries=[
             #     RangeQuery("status",0,11),
@@ -280,7 +283,7 @@ class BaseDataMonitor():
             traceback.print_exc()
 
     def monitor_extract(self):
-
+        from BaseDataMaintenance.java.MQInfo import getAllQueueSize,getQueueSize
         try:
             # query = BoolQuery(must_queries=[
             #     RangeQuery("status",11,61),
@@ -488,8 +491,13 @@ class BaseDataMonitor():
                                                                             columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
 
         if total_count>=1000:
-            _msg = "数据流报警:待去重公告数为:%d"%(total_count)
-            sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS)
+            _cmd = 'cat %s | grep -c "%s.*upgrate True save"'%(flow_dumplicate_log_path,self.get_last_tenmin_time())
+            process_count = self.cmd_execute(_cmd)
+            atAll = False
+            if int(process_count)==0:
+                atAll = True
+            _msg = "数据流报警:待去重公告数为:%d,最近十分钟去重数为:%s"%(total_count,str(process_count))
+            sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=atAll)
             # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
 
 
@@ -588,7 +596,7 @@ class BaseDataMonitor():
         # scheduler.add_job(self.monitor_attachment,"cron",minute="*/10")
         scheduler.add_job(self.monitor_extract,"cron",minute="*/10")
         scheduler.add_job(self.monitor_proposedBuilding,"cron",hour="*/3")
-        scheduler.add_job(self.monitor_dumplicate,"cron",minute="*/10")
+        # scheduler.add_job(self.monitor_dumplicate,"cron",minute="*/10")
         scheduler.add_job(self.monitor_sychr,"cron",minute="*/10")
         scheduler.add_job(self.monitor_preproject,"cron",hour="8")
         scheduler.add_job(self.monitor_merge,"cron",hour="*/1")
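
Review note on monitor_dumplicate: the alert now pairs the backlog count with a throughput probe. It greps flow_dumplicate.log for lines matching "upgrate True save" within the last ten minutes and only @-mentions everyone (atAll=True) when that count is zero, i.e. a backlog with no visible progress. A minimal standalone sketch of the same check, assuming Python 3.7+; count_recent_dedup and get_last_tenmin_time here are hypothetical stand-ins for the class helpers, and the log timestamp format is an assumption:

import subprocess
from datetime import datetime, timedelta

FLOW_DUMPLICATE_LOG_PATH = "/python_log/flow_dumplicate.log"  # path added in this commit

def get_last_tenmin_time(fmt="%Y-%m-%d %H:%M"):
    # stand-in: a timestamp prefix identifying the last ten-minute window
    # (assumes log lines begin with "%Y-%m-%d %H:%M:%S" timestamps)
    return (datetime.now() - timedelta(minutes=10)).strftime(fmt)

def count_recent_dedup(log_path=FLOW_DUMPLICATE_LOG_PATH):
    # same shell pipeline as the diff: count "upgrate True save" log lines
    _cmd = 'cat %s | grep -c "%s.*upgrate True save"' % (log_path, get_last_tenmin_time())
    out = subprocess.run(_cmd, shell=True, capture_output=True, text=True).stdout
    return int(out.strip() or 0)

def should_at_all(total_count, process_count):
    # page everyone only when the backlog is large and nothing was processed
    return total_count >= 1000 and process_count == 0

Note that the grep pattern has to match the log line emitted in dataflow.py verbatim, including the "upgrate" spelling.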

+ 57 - 43
BaseDataMaintenance/maintenance/dataflow.py

@@ -32,6 +32,8 @@ from BaseDataMaintenance.maxcompute.documentMerge import *
 from BaseDataMaintenance.common.otsUtils import *
 from BaseDataMaintenance.common.activateMQUtils import *
 
+from BaseDataMaintenance.dataMonitor.data_monitor import BaseDataMonitor
+
 from BaseDataMaintenance.dataSource.pool import ConnectorPool
 
 def getSet(list_dict,key):
@@ -2167,6 +2169,7 @@ class Dataflow_dumplicate(Dataflow):
         logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
         self.fix_doc_docid = None
+        self.bdm = BaseDataMonitor()
 
         if start_delete_listener:
             self.delete_comsumer_counts = 2
@@ -2931,17 +2934,15 @@ class Dataflow_dumplicate(Dataflow):
         _dict = {}
         # update the shared properties
         for k,v in project_dict.items():
-            if v is None or v=="":
+            if v is None or v=="" or v=="[]" or v=="未知":
                 continue
             if k in (project_project_dynamics,project_product,project_project_codes,project_docids):
                 continue
+            _dict[k] = v
-            for _proj in projects:
-                if k not in _proj:
-                    _dict[k] = v
-                elif _proj.get(k,"未知")=="未知":
-                    _dict[k] = v
         for _proj in projects:
             _proj.update(_dict)
+        for _proj in projects:
+            if _proj.get(project_page_time,"")<project_dict.get(project_page_time,""):
+                _proj[project_page_time] = project_dict.get(project_page_time,"")
 
         # concatenate properties
         append_dict = {}
@@ -3749,44 +3750,47 @@ class Dataflow_dumplicate(Dataflow):
         :param status_to:
         :return:
         '''
-        list_docids = []
-        _docid = item.get(document_tmp_docid)
-        list_docids.append(_docid)
-        if isinstance(dup_docid,list):
-            list_docids.extend(dup_docid)
-        list_docids = [a for a in list_docids if a is not None]
+        try:
+            list_docids = []
+            _docid = item.get(document_tmp_docid)
+            list_docids.append(_docid)
+            if isinstance(dup_docid,list):
+                list_docids.extend(dup_docid)
+            list_docids = [a for a in list_docids if a is not None]
 
-        _time = time.time()
-        list_projects = self.search_projects_with_document(list_docids)
-        # log("search projects takes:%.3f"%(time.time()-_time))
-        if len(list_projects)==0:
-            # _time = time.time()
-            list_docs = self.search_docs(list_docids)
-            # log("search document takes:%.3f"%(time.time()-_time))
-            # _time = time.time()
-            list_projects = self.generate_projects_from_document(list_docs)
-            # log("generate projects takes:%.3f"%(time.time()-_time))
-        else:
             _time = time.time()
-            self.update_projects_by_document(_docid,save,list_projects)
-            # log("update projects takes:%.3f"%(time.time()-_time))
-        _time = time.time()
-        list_projects = dumplicate_projects(list_projects)
-        # log("dumplicate projects takes:%.3f"%(time.time()-_time))
-        _time = time.time()
-        list_projects = self.merge_projects(list_projects,b_log)
-        # log("merge projects takes:%.3f"%(time.time()-_time))
+            list_projects = self.search_projects_with_document(list_docids)
+            # log("search projects takes:%.3f"%(time.time()-_time))
+            if len(list_projects)==0:
+                # _time = time.time()
+                list_docs = self.search_docs(list_docids)
+                # log("search document takes:%.3f"%(time.time()-_time))
+                # _time = time.time()
+                list_projects = self.generate_projects_from_document(list_docs)
+                # log("generate projects takes:%.3f"%(time.time()-_time))
+            else:
+                _time = time.time()
+                self.update_projects_by_document(_docid,save,list_projects)
+                # log("update projects takes:%.3f"%(time.time()-_time))
+            _time = time.time()
+            list_projects = dumplicate_projects(list_projects)
+            # log("dumplicate projects takes:%.3f"%(time.time()-_time))
+            _time = time.time()
+            list_projects = self.merge_projects(list_projects,b_log)
+            # log("merge projects takes:%.3f"%(time.time()-_time))
 
-        _time = time.time()
-        dumplicate_document_in_merge(list_projects)
-        log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
+            _time = time.time()
+            dumplicate_document_in_merge(list_projects)
+            log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
 
-        _time = time.time()
-        project_json = to_project_json(list_projects)
-        # log("json projects takes:%.3f"%(time.time()-_time))
-        if b_log:
-            log("project_json:%s"%project_json)
-        return project_json
+            _time = time.time()
+            project_json = to_project_json(list_projects)
+            # log("json projects takes:%.3f"%(time.time()-_time))
+            if b_log:
+                log("project_json:%s"%project_json)
+            return project_json
+        except Exception as e:
+            raise RuntimeError("error on dumplicate")
 
     def is_exist_fingerprint(self,final_list,_docid,_fingerprint,table_name):
         set_fingerprint = set()
@@ -3905,7 +3909,7 @@ class Dataflow_dumplicate(Dataflow):
                 dtmp.setValue(document_tmp_projects,"[]",True)
             else:
                 dtmp.setValue(document_tmp_projects,self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log),True)
-
+            log(dtmp.getProperties().get(document_tmp_projects))
             log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
             if upgrade:
                 if table_name=="document_tmp":
@@ -3914,7 +3918,16 @@ class Dataflow_dumplicate(Dataflow):
                 # print(dtmp.getProperties())
                 dtmp.setValue(document_tmp_dup_docid,dmp_docid,True)
                 dtmp.setValue(document_tmp_best_docid,best_docid,True)
-                dtmp.update_row(self.ots_client)
+                _flag = dtmp.update_row(self.ots_client)
+                if not _flag:
+                    for i in range(10):
+                        list_proj_json = dtmp.getProperties().get(document_tmp_projects)
+                        if list_proj_json is not None:
+                            list_proj = json.loads(list_proj_json)
+                            dtmp.setValue(document_tmp_projects,json.dumps(list_proj[:len(list_proj)//2]),True)
+                            if dtmp.update_row(self.ots_client):
+                                break
+
 
             # log("dump takes %.2f"%(time.time()-start_time))
         except Exception as e:
@@ -3986,6 +3999,7 @@ class Dataflow_dumplicate(Dataflow):
         schedule = BlockingScheduler()
         schedule.add_job(self.flow_dumplicate,"cron",second="*/5")
         # schedule.add_job(self.flow_dumpcate_comsumer,"cron",second="*/10")
+        schedule.add_job(self.bdm.monitor_dumplicate,"cron",minute="*/10")
         schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
         schedule.start()
 
@@ -4107,7 +4121,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(268920229)
+    df_dump.test_dumplicate(237450072)
     # df_dump.test_merge([292315564],[287890754])
     # df_dump.flow_remove_project_tmp()
     print("takes",time.time()-a)

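Scheduling-wise, monitor_dumplicate moves out of the monitor service (commented out in data_monitor.py above) and into the dedup flow's own BlockingScheduler, so the ten-minute check runs in the process that actually writes flow_dumplicate.log. A minimal APScheduler sketch of that wiring, with placeholder job bodies:

from apscheduler.schedulers.blocking import BlockingScheduler

def flow_dumplicate():
    pass  # placeholder: one dedup pass

def monitor_dumplicate():
    pass  # placeholder: the backlog/throughput check

schedule = BlockingScheduler()
schedule.add_job(flow_dumplicate, "cron", second="*/5")      # dedup every 5 seconds
schedule.add_job(monitor_dumplicate, "cron", minute="*/10")  # monitor every 10 minutes
schedule.start()  # blocks; both jobs share this process and its log
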
+ 2 - 0
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -15,6 +15,8 @@ from BaseDataMaintenance.common.Utils import article_limit
 from BaseDataMaintenance.common.documentFingerprint import getFingerprint
 from BaseDataMaintenance.model.postgres.document_extract import *
 
+
+
 class ActiveMQListener():
 
     def __init__(self,conn,_queue,*args,**kwargs):

+ 937 - 1
BaseDataMaintenance/maxcompute/documentAnalysis.py

@@ -1,6 +1,111 @@
+#coding:utf8
 from odps.udf import annotate
-from odps.udf import BaseUDTF
+from odps.udf import BaseUDTF,BaseUDAF
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+import copy
 
+import sys,os,re
+
+import threading
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import time
+from multiprocessing import Process,Queue
+
+def log(msg):
+    logging.info(msg)
+
+
+# Configure the pandas dependency package
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+    _path = dir_names[0].split(".zip/files")[0]+".zip/files"
+    log("add path:%s"%(_path))
+    sys.path.append(_path)
+    return _path
+
+# A RuntimeError like "xxx has been blocked by sandbox" may occur here:
+# libraries containing C code are blocked by the sandbox; set odps.isolation.session.enable = true to allow them
+def include_file(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+    sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
+
+def include_so(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+
+    with open(so_file.name, 'rb') as fp:
+        content=fp.read()
+        so = open(file_name, "wb")
+        so.write(content)
+        so.flush()
+        so.close()
+
+# Initialize the business data packages; because of upload limits, Python version mismatches and inconsistent archive extraction, they must be imported manually
+def init_env(list_files,package_name):
+    import os,sys
+
+    if len(list_files)==1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip -o %s -d %s"%(cmd_line,package_name))
+    elif len(list_files)>1:
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " "+os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip -o temp.zip -d %s"%(package_name))
+    # os.system("rm -rf %s/*.dist-info"%(package_name))
+    # return os.listdir(os.path.abspath("local_package"))
+    # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
+    # os.system("source ~/.bashrc")
+    sys.path.insert(0,os.path.abspath(package_name))
+
+    # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
+def multiLoadEnv():
+    def load_project():
+        start_time = time.time()
+        ## init_env(["BiddingKG.zip.env.baseline"],str(uuid.uuid4()))
+        # init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
+        # switched to importing via zip
+        log("=======")
+        include_package_path("BiddingKG.baseline.zip")
+        # include_package_path("BiddingKG.backup.zip")
+        logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
+
+    def load_vector():
+        start_time = time.time()
+        # init_env(["wiki_128_word_embedding_new.vector.env"],".")
+        include_package_path("wiki.zip")
+        logging.info("init wiki_128_word_embedding_new cost %d"%(time.time()-start_time))
+
+        start_time = time.time()
+        # init_env(["enterprise.zip.env"],".")
+        # init_env(["LEGAL_ENTERPRISE.zip.env"],".")
+        include_package_path("enterprise.zip")
+        logging.info("init legal_enterprise.zip.env cost %d"%(time.time()-start_time))
+
+        start_time = time.time()
+        init_env(["so.env"],".")
+        logging.info("init so.env cost %d"%(time.time()-start_time))
+
+    def load_py():
+        start_time = time.time()
+        # self.out = init_env(["envs_py37.zip.env"],str(uuid.uuid4()))
+        include_package_path("envs_py37.env.zip")
+        # include_package_path("envs_py35.zip")
+        logging.info("init envs_py cost %d"%(time.time()-start_time))
+
+    load_project()
+    load_vector()
+    load_py()
 
 @annotate('string -> string')
 class f_analysis_type(BaseUDTF):
@@ -38,3 +143,834 @@ class f_analysis_type(BaseUDTF):
 
             if len(list_match)>0:
                 self.forward(",".join(list_match))
+
+# Data cleaning
+def segment(soup,final=True):
+    # print("==")
+    # print(soup)
+    # print("====")
+    #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
+    subspaceList = ["td",'a',"span","p"]
+    if soup.name in subspaceList:
+        # count the leaf nodes that hold text
+        _count = 0
+        for child in soup.find_all(recursive=True):
+            if child.get_text().strip()!="" and len(child.find_all())==0:
+                _count += 1
+        if _count<=1:
+            text = soup.get_text()
+            # 2020/11/24 rule added for large sites
+            if 'title' in soup.attrs:
+                if '...' in soup.get_text() and soup.get_text().strip()[:-3] in soup.attrs['title']:
+                    text = soup.attrs['title']
+
+            _list = []
+            for x in re.split("\s+",text):
+                if x.strip()!="":
+                    _list.append(len(x))
+            if len(_list)>0:
+                _minLength = min(_list)
+                if _minLength>2:
+                    _substr = ","
+                else:
+                    _substr = ""
+            else:
+                _substr = ""
+
+            text = text.replace("\r\n",",").replace("\n",",")
+            text = re.sub("\s+",_substr,text)
+            # text = re.sub("\s+","##space##",text)
+            return text
+    segList = ["title"]
+    commaList = ["div","br","td","p","li"]
+    #commaList = []
+    spaceList = ["span"]
+    tbodies = soup.find_all('tbody')
+    if len(tbodies) == 0:
+        tbodies = soup.find_all('table')
+    # recursively traverse all nodes and insert separator symbols
+    for child in soup.find_all(recursive=True):
+        # print(child.name,child.get_text())
+        if child.name in segList:
+            child.insert_after("。")
+        if child.name in commaList:
+            child.insert_after(",")
+        # if child.name == 'div' and 'class' in child.attrs:
+        #     # add the "attachment" marker
+        #     if "richTextFetch" in child['class']:
+        #         child.insert_before("##attachment##")
+        # print(child.parent)
+        # if child.name in subspaceList:
+        #     child.insert_before("#subs"+str(child.name)+"#")
+        #     child.insert_after("#sube"+str(child.name)+"#")
+        # if child.name in spaceList:
+        #     child.insert_after(" ")
+    text = str(soup.get_text())
+    # replace ASCII colons next to Chinese characters with full-width colons
+    text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])",":",text)
+    # replace with full-width commas
+    text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
+    # replace with full-width semicolons
+    text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
+    # replace exclamation marks with full-width periods
+    text = re.sub("(?<=[\u4e00-\u9fa5])[!!]|[!!](?=[\u4e00-\u9fa5])","。",text)
+    # replace runs of unrecognized question marks with " ", update: 2021/7/20
+    text = re.sub("[?\?]{2,}|\n"," ",text)
+
+
+    #替换"""为"“",否则导入deepdive出错
+    # text = text.replace('"',"“").replace("\r","").replace("\n",",")
+    text = text.replace('"',"“").replace("\r","").replace("\n","").replace("\\n","") # 2022/1/4 fix: replacing non-paragraph \n with commas was splitting company names, e.g. span \n南航\n上海\n分公司
+    # print('==1',text)
+    # text = re.sub("\s{4,}",",",text)
+    # handle replacement of " " whitespace runs in announcements
+    if re.search("\s{4,}",text):
+        _text = ""
+        for _sent in re.split("。+",text):
+            for _sent2 in re.split(',+',_sent):
+                for _sent3 in re.split(":+",_sent2):
+                    for _t in re.split("\s{4,}",_sent3):
+                        if len(_t)<3:
+                            _text += _t
+                        else:
+                            _text += ","+_t
+                    _text += ":"
+                _text = _text[:-1]
+                _text += ","
+            _text = _text[:-1]
+            _text += "。"
+        _text = _text[:-1]
+        text = _text
+    # print('==2',text)
+    # replace punctuation
+
+    # collapse consecutive punctuation
+
+    if final:
+        text = re.sub("##space##"," ",text)
+
+    punc_pattern = "(?P<del>[。,;::,\s]+)"
+
+    list_punc = re.findall(punc_pattern,text)
+    list_punc.sort(key=lambda x:len(x),reverse=True)
+    for punc_del in list_punc:
+        if len(punc_del)>1:
+            if len(punc_del.strip())>0:
+                if ":" in punc_del.strip():
+                    if "。" in punc_del.strip():
+                        text = re.sub(punc_del, ":。", text)
+                    else:
+                        text = re.sub(punc_del,":",text)
+                else:
+                    text = re.sub(punc_del,punc_del.strip()[0],text)   # 2021/12/09 fix: symbols inserted after some tags were overwriting the originals
+            else:
+                text = re.sub(punc_del,"",text)
+
+
+    # collapse consecutive full-width periods into one
+    text_split = text.split("。")
+    text_split = [x for x in text_split if len(x)>0]
+    text = "。".join(text_split)
+
+    # # remove all whitespace inside tags
+    # for subs in subspaceList:
+    #     patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
+    #     while(True):
+    #         oneMatch = re.search(re.compile(patten),text)
+    #         if oneMatch is not None:
+    #             _match = oneMatch.group(1)
+    #             text = text.replace("#subs"+str(subs)+"#"+_match+"#sube"+str(subs)+"#",_match)
+    #         else:
+    #             break
+
+    # very long text raises errors, so substitute in chunks
+    LOOP_LEN = 10000
+    LOOP_BEGIN = 0
+    _text = ""
+
+
+
+    if len(text)<10000000:
+        while(LOOP_BEGIN<len(text)):
+            _text += re.sub(")",")",re.sub("(","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
+            LOOP_BEGIN += LOOP_LEN
+        text = _text
+    # force a period before attachment markers so body text and attachment content stay separated
+    text = re.sub("[^。](?=##attachment##)","。",text)
+    text = re.sub("[^。](?=##attachment_begin##)","。",text)
+    text = re.sub("[^。](?=##attachment_end##)","。",text)
+    text = re.sub("##attachment_begin##。","##attachment_begin##",text)
+    text = re.sub("##attachment_end##。","##attachment_end##",text)
+
+    return text
+def fixSpan(tbody):
+    # fill in the cells implied by colspan/rowspan
+    #trs = tbody.findChildren('tr', recursive=False)
+
+    trs = getTrs(tbody)
+    ths_len = 0
+    ths = list()
+    # complete columns first, then rows; otherwise table parsing may get scrambled
+    # iterate over every tr
+
+    for indtr, tr in enumerate(trs):
+        ths_tmp = tr.findChildren('th', recursive=False)
+        # skip trs that contain nested tables
+        if len(tr.findChildren('table'))>0:
+            continue
+        if len(ths_tmp) > 0:
+            ths_len = ths_len + len(ths_tmp)
+            for th in ths_tmp:
+                ths.append(th)
+        # iterate over the elements in the row
+        tds = tr.findChildren(recursive=False)
+        for indtd, td in enumerate(tds):
+            # 若有colspan 则补全同一行下一个位置
+            if 'colspan' in td.attrs:
+                if str(re.sub("[^0-9]","",str(td['colspan'])))!="":
+                    col = int(re.sub("[^0-9]","",str(td['colspan'])))
+                    if col<100 and len(td.get_text())<1000:
+                        td['colspan'] = 1
+                        for i in range(1, col, 1):
+                            td.insert_after(copy.copy(td))
+
+    for indtr, tr in enumerate(trs):
+        ths_tmp = tr.findChildren('th', recursive=False)
+        # skip trs that contain nested tables
+        if len(tr.findChildren('table'))>0:
+            continue
+        if len(ths_tmp) > 0:
+            ths_len = ths_len + len(ths_tmp)
+            for th in ths_tmp:
+                ths.append(th)
+        # iterate over the elements in the row
+        tds = tr.findChildren(recursive=False)
+        for indtd, td in enumerate(tds):
+            # if rowspan is set, duplicate the cell into the same position of the following rows
+            if 'rowspan' in td.attrs:
+                if str(re.sub("[^0-9]","",str(td['rowspan'])))!="":
+                    row = int(re.sub("[^0-9]","",str(td['rowspan'])))
+                    td['rowspan'] = 1
+                    for i in range(1, row, 1):
+                        # fetch the next row's tds and insert at the matching position
+                        if indtr+i<len(trs):
+                            tds1 = trs[indtr + i].findChildren(['td','th'], recursive=False)
+                            if len(tds1) >= (indtd) and len(tds1)>0:
+                                if indtd > 0:
+                                    tds1[indtd - 1].insert_after(copy.copy(td))
+                                else:
+                                    tds1[0].insert_before(copy.copy(td))
+                            elif indtd-2>0 and len(tds1) > 0 and len(tds1) == indtd - 1:  # fix: the last column of some tables was not being filled
+                                tds1[indtd-2].insert_after(copy.copy(td))
+def getTable(tbody):
+    #trs = tbody.findChildren('tr', recursive=False)
+    trs = getTrs(tbody)
+    inner_table = []
+    for tr in trs:
+        tr_line = []
+        tds = tr.findChildren(['td','th'], recursive=False)
+        if len(tds)==0:
+            tr_line.append([re.sub('\xa0','',segment(tr,final=False)),0]) # 2021/12/21 fix: tables without tds were losing data
+        for td in tds:
+            tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
+            #tr_line.append([td.get_text(),0])
+        inner_table.append(tr_line)
+    return inner_table
+
+def getTrs(tbody):
+    # collect all trs
+    trs = []
+    objs = tbody.find_all(recursive=False)
+    for obj in objs:
+        if obj.name=="tr":
+            trs.append(obj)
+        if obj.name=="tbody":
+            for tr in obj.find_all("tr",recursive=False):
+                trs.append(tr)
+    return trs
+
+# fix misaligned tables
+def fixTable(inner_table,fix_value=""):
+    maxWidth = 0
+    for item in inner_table:
+        if len(item)>maxWidth:
+            maxWidth = len(item)
+    if maxWidth > 100:
+        # log('table has more than 100 columns; abnormal table, skipped')
+        return []
+    for i in range(len(inner_table)):
+        if len(inner_table[i])<maxWidth:
+            for j in range(maxWidth-len(inner_table[i])):
+                inner_table[i].append([fix_value,0])
+    return inner_table
+
+def getTableText(inner_table):
+    height = len(inner_table)
+    table_text = []
+    if height>0:
+        width = len(inner_table[0])
+        for _h in range(height):
+            _line = []
+            for _w in range(width):
+                _line.append(inner_table[_h][_w][0])
+            table_text.append(_line)
+    return table_text
+
+@annotate('string -> string')
+class f_table_preprocess(BaseUDTF):
+    '''
+    Extract every table and return each table's text matrix as JSON
+    '''
+    def __init__(self):
+        import logging
+        import json
+        import time,re
+
+        global json,logging,time,re
+        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        include_package_path("envs_py37.env.zip")
+        from bs4 import BeautifulSoup
+        global BeautifulSoup
+
+
+    def process(self, dochtmlcon):
+        if dochtmlcon is not None:
+            _soup = BeautifulSoup(dochtmlcon,"lxml")
+            richText = _soup.find("div",attrs={"class":"richTextFetch"})
+            if richText is not None:
+                richText.decompose()
+            list_table = _soup.find_all("table")
+            list_table.reverse()
+            for tbody in list_table:
+                fixSpan(tbody)
+                inner_table = getTable(tbody)
+                inner_table = fixTable(inner_table)
+                list_text = getTableText(inner_table)
+                print(list_text)
+                table_text_json = json.dumps(list_text,ensure_ascii=False)
+                if len(table_text_json)<200000:
+                    self.forward(table_text_json)
+                tbody.decompose()
+            list_table = _soup.find_all("tbody")
+            list_table.reverse()
+            for tbody in list_table:
+                fixSpan(tbody)
+                inner_table = getTable(tbody)
+                inner_table = fixTable(inner_table)
+                list_text = getTableText(inner_table)
+                table_text_json = json.dumps(list_text,ensure_ascii=False)
+                if len(table_text_json)<200000:
+                    self.forward(table_text_json)
+
+
+def get_top_n_words(list_words,n,skip_punctuation=True):
+    top_n_words = []
+    for _word in list_words:
+        if skip_punctuation and _word in (',',"。",",",":","(","(",")",")",""):
+            continue
+        top_n_words.append(_word)
+        if len(top_n_words)>=n:
+            break
+    return top_n_words
+
+
+time_pattern = re.compile("^\d{4}[\-年/]\d{2}[\-月/]\d{2}日?(\s*\d{2}[:时]\d{2}[:分]\d{2})?$")
+time_pattern_cn = re.compile("\d{4}[年]\d{1,2}[月]\d{1,2}日?")
+phone_pattern = re.compile('^(1[3|4|5|7|8|9][0-9][-|——|—]?\d{4}[-|——|—]?\d{4}|\d{3,4}[-|——|—]\d{7,8}/\d{3,8}|\d{3,4}[-|——|—]\d{7,8}转\d{1,4}|\d{3,4}[-|——|—]\d{7,8}|[\(|\(]0\d{2,3}[\)|\)]\d{7,8})$') # contact phone numbers
+list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
+                      "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>-*[0-9][\d,]*(?:\.\d+)?(?P<science_key_word>(E-?\d+))?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                      "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>-*[0-9][\d,]*(?:\.\d+)?(?P<science_front_m>(E-?\d+))?(?:,?)[百千]*)())",
+                      "behind_m":"(()()(?P<money_behind_m>-*[0-9][\d,]*(?:\.\d+)?(?P<science_behind_m>(E-?\d+))?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
+list_m_p = []
+for k,v in list_money_pattern.items():
+    list_m_p.append(v)
+
+money_pattern = re.compile("^(%s)$"%"|".join(list_m_p))
+money_pattern_cn = re.compile(list_money_pattern["cn"])
+number_pattern = re.compile("^[0-9]+(\.\d+)?$")
+eng_pattern = re.compile("[0-9a-zA-Z\-\(\)():/\.¥¥_\[\]【】@]{1,}")
+filename_pattern = "\.(pdf|doc|docx|xls|xlsx|zip|rar|swf|txt|html)$"
+
+
+
+def extract_external_types(text):
+    list_types = []
+    #extract eng sentences
+    import re
+    for _match in re.finditer(eng_pattern,text):
+        match_text = text[_match.start():_match.end()]
+        if re.search(filename_pattern,match_text) is not None:
+            list_types.append((_match.start(),_match.end(),"##filename##",match_text))
+        elif re.search(time_pattern,match_text) is not None:
+            list_types.append((_match.start(),_match.end(),"##time##",match_text))
+        elif re.search(phone_pattern,match_text) is not None:
+            list_types.append((_match.start(),_match.end(),"##phone##",match_text))
+        elif re.search(money_pattern,match_text) is not None:
+            list_types.append((_match.start(),_match.end(),"##money##",match_text))
+        elif re.search(number_pattern,match_text) is not None:
+            list_types.append((_match.start(),_match.end(),"##number##",match_text))
+        else:
+            _len = len(match_text)
+            if _len<5:
+                list_types.append((_match.start(),_match.end(),"##engsentence_<5##",match_text))
+            elif _len>=5:
+                list_types.append((_match.start(),_match.end(),"##engsentence_>=5##",match_text))
+
+    _search = re.search(time_pattern_cn,text)
+    if _search is not None:
+        match_text = text[_search.start():_search.end()]
+        list_types.append((_search.start(),_search.end(),"##time##",match_text))
+
+    _search = re.search(money_pattern_cn,text)
+    if _search is not None:
+        match_text = text[_search.start():_search.end()]
+        list_types.append((_search.start(),_search.end(),"##money##",match_text))
+
+    set_begin_end = set()
+    final_types = []
+    list_types.sort(key=lambda x:len(x[3]),reverse=True)
+    for _t in list_types:
+        _begin = _t[0]
+        _end = _t[1]
+        _exists = False
+        for _b,_e in set_begin_end:
+            if _begin>=_b and _begin<_e:
+                _exists = True
+                break
+            if _end>_b and _end<=_e:
+                _exists = True
+                break
+        if not _exists:
+            set_begin_end.add((_begin,_end))
+            final_types.append(_t)
+
+    return final_types
+
+def extract_text_types(text,list_entitys):
+    set_types = set()
+    #extract eng sentences
+
+    list_types = extract_external_types(text)
+
+    for _e in list_types:
+        if isinstance(_e,tuple):
+            _type = "%s"%(_e[2])
+            _text = _e[3]
+            set_types.add(_type)
+        elif isinstance(_e,str):
+            set_types.add(_e)
+
+    for _e in list_entitys:
+        if isinstance(_e,tuple):
+            _type = "##%s##"%(_e[2])
+            _text = _e[3]
+            set_types.add(_type)
+        elif isinstance(_e,str):
+            set_types.add(_e)
+
+    return list(set_types)
+
+
+@annotate("string->string")
+class f_table_cell_process():
+    '''
+    Process table cells into dicts: dedupe, clean, and recognize the type of each cell
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        import time,re
+
+        global json,logging,time,re
+        # sys.path.insert(0,"F:\Workspace2016\BiddingKG")
+        # from BiddingKG.dl.foolnltk.selffool.selffool_ner import SelfNer
+        # include_package_path("jieba0.42.zip")
+
+        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        multiLoadEnv()
+        from BiddingKG.dl.foolnltk.selffool.selffool_ner import SelfNer
+        from BiddingKG.dl.table_head.predict import predict
+        from bs4 import BeautifulSoup
+        import jieba
+        global BeautifulSoup,SelfNer,jieba,predict
+        self.selfner = SelfNer()
+
+    def set_head_table(self,inner_table):
+        if len(inner_table)>0:
+            copy_inner_table = copy.deepcopy(inner_table)
+
+            for i in range(len(copy_inner_table)):
+                for j in range(len(copy_inner_table[i])):
+                    # strip leading/trailing symbols from the cell so they don't skew header prediction
+                    col = copy_inner_table[i][j]
+                    col = re.sub("^[^\u4e00-\u9fa5a-zA-Z0-9]+", "", col)
+                    col = re.sub("[^\u4e00-\u9fa5a-zA-Z0-9]+$", "", col)
+                    copy_inner_table[i][j] = col
+
+            # predict table headers with the model
+            predict_list = predict(copy_inner_table)
+
+            # combine the results
+            for i in range(len(copy_inner_table)):
+                for j in range(len(copy_inner_table[i])):
+                    copy_inner_table[i][j] = [inner_table[i][j], int(predict_list[i][j])]
+
+            # print("table_head before repair", inner_table)
+
+            return copy_inner_table
+
+    def evaluate(self, list_text_json):
+        if list_text_json is None or list_text_json=="":
+            return
+        list_text = json.loads(list_text_json)
+
+        list_table_cell = []
+        height = len(list_text)
+        if height>0:
+            width = len(list_text[0])
+            if width>50 or height>50:
+                return
+            inner_table = self.set_head_table(list_text)
+
+            for _h in range(height):
+                _line = []
+                for _w in range(width):
+                    text = list_text[_h][_w]
+                    list_words = list(jieba.cut(text))
+                    # take the top 20 tokens
+                    top_20_words = get_top_n_words(list_words,20,True)
+
+                    ner_text = "".join(top_20_words)
+
+                    if len(ner_text)>0:
+                        list_entity = self.selfner.ner([ner_text])[0]
+                    else:
+                        list_entity = []
+                    list_types = extract_text_types(ner_text,list_entity)
+                    _cell = {"top_n_words":top_20_words,
+                             "list_types":list_types}
+                    top_words_2gram = []
+                    for _i in range(len(top_20_words)-1):
+                        top_words_2gram.append("".join(top_20_words[_i:_i+2]))
+                    _cell["top_words_2gram"] = top_words_2gram
+                    _cell["is_head"] = inner_table[_h][_w][1]
+                    _line.append(_cell)
+                list_table_cell.append(_line)
+        return json.dumps(list_table_cell,ensure_ascii=False)
+
+
+@annotate('string -> string,string,string,string')
+class f_words_contact(BaseUDTF):
+    '''
+    Generate associated word-pair information for downstream computation
+    '''
+    def __init__(self):
+        import logging
+        import json
+        import time,re
+        global json,logging,time,re
+        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self, table_cell_json):
+        if table_cell_json is not None:
+            list_cell = json.loads(table_cell_json)
+            height = len(list_cell)
+            list_a = []
+            if height>0:
+                width = len(list_cell[0])
+                for _h in range(height):
+                    for _w in range(width):
+                        _cell = list_cell[_h][_w]
+                        top_words = _cell["top_n_words"]
+                        list_types = _cell["list_types"]
+                        top_words_2gram = _cell["top_words_2gram"]
+                        is_head = _cell["is_head"]
+                        list_w = []
+                        list_w.extend(top_words)
+                        list_w.extend(list_types)
+                        list_w.extend(top_words_2gram)
+                        whole_w1 = "".join(top_words)
+                        # _type = "single"
+                        # for _w1 in list_w:
+                        #     self.forward(_type,_w1,"")
+                        #FIND THE LEFT SIDE
+                        if is_head==1:
+                            continue
+
+                        _find = False
+                        for _w1 in range(_w):
+                            _cell1 = list_cell[_h][_w1]
+                            top_words1 = _cell1["top_n_words"]
+                            list_types1 = _cell1["list_types"]
+                            top_words_2gram1 = _cell1["top_words_2gram"]
+                            is_head1 = _cell1["is_head"]
+                            if is_head1==0:
+                                if _find:
+                                    break
+                                else:
+                                    continue
+                            else:
+                                _find = True
+                            list_w1 = []
+                            list_w1.extend(top_words1)
+                            list_w1.extend(list_types1)
+                            list_w1.extend(top_words_2gram1)
+                            whole_w2 = "".join(top_words1)
+                            _type = "pair"
+                            # for _w1 in list_w:
+                            #     # for _w2 in list_w1:
+                            #     self.forward(_type,_w1,_w2)
+                            self.forward(",".join(list_types),",".join(top_words),whole_w1,whole_w2)
+
+                        #FIND THE TOP SIDE
+                        _find = False
+                        for _h1 in range(_h):
+                            _cell1 = list_cell[_h1][_w]
+                            top_words1 = _cell1["top_n_words"]
+                            list_types1 = _cell1["list_types"]
+                            top_words_2gram1 = _cell1["top_words_2gram"]
+                            list_w1 = []
+                            list_w1.extend(top_words1)
+                            list_w1.extend(list_types1)
+                            list_w1.extend(top_words_2gram1)
+                            is_head1 = _cell1["is_head"]
+                            if is_head1==0:
+                                if _find:
+                                    break
+                                else:
+                                    continue
+                            else:
+                                _find = True
+                            whole_w2 = "".join(top_words1)
+                            _type = "pair"
+                            # for _w1 in list_w:
+                            #     # for _w2 in list_w1:
+                            #     #     self.forward(_type,_w1,_w2)
+                            #     self.forward(_type,_w1,whole_w2)
+                            self.forward(",".join(list_types),",".join(top_words),whole_w1,whole_w2)
+
+def regenerate_tokens_types(list_tokens,list_types):
+    set_begin_end = set()
+    set_types = set()
+    for _t in list_types:
+        _begin = _t[0]
+        _end = _t[1]
+        _type = _t[2]
+        set_begin_end.add((_begin,_end))
+        set_types.add(_type)
+
+    _begin = 0
+    list_result = []
+    for _token in list_tokens:
+        _exists = False
+        _end = _begin+len(_token)
+        for _b,_e in set_begin_end:
+            if _begin>=_b and _begin<_e:
+                _exists = True
+                break
+            if _end>_b and _end<=_e:
+                _exists = True
+                break
+        if not _exists:
+            list_result.append(_token)
+        _begin = _end
+    list_result.extend(list(set_types))
+    return list_result
+
+
+
+
+@annotate('string,string,string,bigint -> string,string,double')
+class f_process_words_contact(BaseUDTF):
+    '''
+    Generate associated word-pair information for downstream computation
+    '''
+    def __init__(self):
+        import logging
+        import json
+        import time,re
+        global json,logging,time,re
+        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self, type_center,tokens_center,word_left_top,pair_counts):
+        # all tokens and types share the counts
+        # changed: tokens and types are mutually exclusive
+        list_types = type_center.split(",")
+
+        set_types = set()
+        for _t in list_types:
+            if "engsentence" in _t or "money" in _t or "float" in _t or "number" in _t or "phone" in _t or "filename" in _t:
+                continue
+            set_types.add(_t)
+
+
+        list_tokens = tokens_center.split(",")
+        text = "".join(list_tokens)
+        list_types = extract_external_types(text)
+        print("list_tokens:",str(list_tokens))
+        print("list_types:",str(list_types))
+        list_result = regenerate_tokens_types(list_tokens,list_types)
+        print("list_result:",str(list_result))
+        _len = len(list_tokens)+len(set_types)
+        avg_len = round(pair_counts/_len,2)
+        for _t in list_result:
+            self.forward(word_left_top,_t,avg_len)
+        for _t in set_types:
+            self.forward(word_left_top,_t,avg_len)
+
+
+
+if __name__ == '__main__':
+    # table_preprocess = f_table_preprocess()
+    a = '''
+    <div> 
+  <input type="hidden" name="projectCode" value="5002287530902902303030047"> 
+  <table width="100%">         
+   <tbody>  
+    <tr> 
+     <td>  </td>   
+     <td> 选取时间 <br> 2023-03-08 09:00:00 </td> 
+     <td></td> 
+     <td> 已有1家中介机构报名参加 </td> 
+    </tr> 
+   </tbody> 
+  </table>  
+  <div>
+    关于为【2022年度财务报表审计服务】 公开选取【会计师事务所服务】机构的公告 
+  </div>    
+  <p> 该项目为直接选取项目,由张力从报名的中介服务机构中直接选定一家中介服务机构进行项目服务。 </p>  
+  <table>     
+   <tbody> 
+    <tr> 
+     <td> 项目名称jofujieojifw-fewoife </td> 
+     <td> 2022年度财务报表审计服务 </td> 
+    </tr> 
+    <tr> 
+     <td> 采购人 </td> 
+     <td> 
+      <div>
+        <a target="_blank" class="markBlue" href="/bdqyhx/266418872542117888.html" style="color: #3083EB !important;text-decoration: underline;">重庆百里竹海旅游开发建设有限公司</a> 
+      </div> 
+      <div>      
+       <span> (5.0) </span>  
+      </div> </td> 
+    </tr>  
+    <tr> 
+     <td> 投资审批项目 </td> 
+     <td> 否 </td> 
+    </tr>  
+    <tr> 
+     <td> 项目规模 </td> 
+     <td>     </td> 
+    </tr> 
+    <tr> 
+     <td> 资金来源 </td> 
+     <td>  国企资金   </td> 
+    </tr>  
+    <tr> 
+     <td> 项目实施地行政区划 </td> 
+     <td> 重庆市梁平区 </td> 
+    </tr>    
+    <tr> 
+     <td> 是否破产业务服务项目采购 </td> 
+     <td> 否 </td> 
+    </tr>  
+    <tr> 
+     <td> 是否为行政管理中介服务事项采购 </td> 
+     <td> 否 </td> 
+    </tr>  
+    <tr> 
+     <td> 所需服务类型 </td> 
+     <td> 会计师事务所服务 </td> 
+    </tr> 
+    <tr> 
+     <td> 服务内容 </td> 
+     <td> 审计2022年度会计报表并出具审计报告 </td> 
+    </tr> 
+    <tr> 
+     <td> 中介机构要求 </td> 
+     <td> 以采购公告为准 </td> 
+    </tr>  
+    <tr> 
+     <td> 资质(资格)要求 </td> 
+     <td> 需中介机构具备其中一项服务类型/资质子项  <a target="_blank" class="markBlue" href="https://zjcs.cqggzy.com/cq-zjcs-pub/purchaseNotice/qualityMultiView/919b647d-e66c-4bc8-a47c-67a3ab1f3e29" rel="noreferrer"> 点击查看资质专业要求 </a> </td> 
+    </tr>      
+    <tr> 
+     <td> 其他要求说明 </td> 
+     <td> 无 </td> 
+    </tr>  
+    <tr> 
+     <td> 服务时限及说明 </td> 
+     <td> 广东签订合同之日起30日内出具财务会计年度审计报告 </td> 
+    </tr> 
+    <tr> 
+     <td> 合同签订时限及说明 </td> 
+     <td> 发布采购公告起7日内公布中选单位,并于5日内签订合同。 </td> 
+    </tr> 
+    <tr> 
+     <td> 服务金额 </td> 
+     <td>  张丽</td> 
+    </tr> 
+    <tr> 
+     <td> 金额说明 </td> 
+     <td> 按照渝价发(2011)257号文《重庆市会计师事务所服务收费管理办法》的收费标准控制在4.5折以内计算。 </td> 
+    </tr>   
+    <tr> 
+     <td> 选取方式 </td> 
+     <td> 直接选取 </td> 
+    </tr>    
+    <tr> 
+     <td> 需规避机构 </td> 
+     <td> </td> 
+    </tr> 
+    <tr> 
+     <td> 规避原因 </td> 
+     <td> </td> 
+    </tr>  
+    <tr> 
+     <td> 选取时间 </td> 
+     <td> 2023-03-08 09:00:00 </td> 
+    </tr> 
+    <tr> 
+     <td> 资质备案要求 </td> 
+     <td>  </td> 
+    </tr> 
+    <tr> 
+     <td> 采购人业务咨询电话 </td> 
+     <td> <a target="_blank" class="markBlue" href="/bdqyhx/266418872542117888.html" style="color: #3083EB !important;text-decoration: underline;">重庆百里竹海旅游开发建设有限公司</a>(19123659525) </td> 
+    </tr> 
+    <tr> 
+     <td> 监督举报电话 </td> 
+     <td> 53888139 </td> 
+    </tr> 
+    <tr> 
+     <td> 备注 </td> 
+     <td> 无 </td> 
+    </tr> 
+    <tr> 
+     <td> 采购需求书下载 </td> 
+     <td> <a target="_blank" class="markBlue" rel="noreferrer"> 百里竹海.pdf </a> <br> </td> 
+    </tr> 
+   </tbody> 
+  </table>   
+  <p> <font color="red"> 特别提醒:请各中介机构报名前认真审核公司资质、资格是否符合采购公告相关要求,超越资质范围承揽业务,资质和资格条件或者能力不符合采购公告要求而响应采购公告进行报名可能被认定不良行为。 </font> <br> </p>  
+  <p> <br> 2023-03-03 </p>  
+ </div>
+    '''
+
+    # table_text_json = table_preprocess.process(a)
+    #
+    # cell_process = f_table_cell_process()
+    # table_cell_json = cell_process.evaluate(table_text_json)
+    # print(table_cell_json)
+    a = "12-23-23"
+    a =  ['GGJY', '-', 'JZ', '-', '2014002']
+    t = "".join(a)
+    print(extract_external_types(t))
+    print(regenerate_tokens_types(a,extract_external_types(t)))
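
Usage sketch for the new table helpers: fixSpan expands colspan/rowspan into duplicated cells, getTable turns a tbody into rows of [text, flag] pairs, fixTable pads rows to equal width, and getTableText keeps just the text. A toy example, assuming bs4/lxml are available and the functions above are importable; the printed result is an expectation from reading the code, not a verified output:

from bs4 import BeautifulSoup

html = '<table><tr><td rowspan="2">A</td><td>B</td></tr><tr><td>C</td></tr></table>'
tbody = BeautifulSoup(html, "lxml").find("table")
fixSpan(tbody)                            # duplicate the rowspan cell into row 2
inner_table = fixTable(getTable(tbody))   # rows of [text, 0], padded to max width
print(getTableText(inner_table))          # expected: [['A', 'B'], ['A', 'C']]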

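A quick probe of extract_external_types, which tags non-overlapping spans with coarse types (##time##, ##phone##, ##money##, ##filename##, ##number##, and English-sentence buckets), preferring longer matches. The expected tags below are an assumption from the patterns above:

sample = "开标时间2023-03-08 09:00:00,联系电话023-88888888,预算壹万元,详见bid_doc.pdf"
for begin, end, tag, matched in extract_external_types(sample):
    print(begin, end, tag, matched)
# expected tags include ##time## ("2023-03-08"), ##phone## ("023-88888888"),
# ##money## ("壹万元") and ##filename## ("bid_doc.pdf")
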
File diff suppressed because it is too large
+ 7 - 2
BaseDataMaintenance/maxcompute/documentDumplicate.py


Some files were not shown because too many files changed in this diff