View Source

Merge remote-tracking branch 'origin/master'

fangjiasheng, 5 months ago
parent commit 8bcf3fb0dd
31 changed files with 2616 additions and 456 deletions
  1. + 2 - 0      .idea/encodings.xml
  2. + 1 - 1      .idea/misc.xml
  3. + 77 - 0     BaseDataMaintenance/chat/ERNIE_utils.py
  4. + 86 - 0     BaseDataMaintenance/chat/chatUtil.py
  5. + 2 - 3      BaseDataMaintenance/common/Utils.py
  6. + 3 - 9      BaseDataMaintenance/common/activateMQUtils.py
  7. + 4 - 3      BaseDataMaintenance/common/documentFingerprint.py
  8. + 2 - 2      BaseDataMaintenance/common/ossUtils.py
  9. + 152 - 74   BaseDataMaintenance/dataMonitor/data_monitor.py
  10. + 5 - 5     BaseDataMaintenance/dataSource/setttings.py
  11. + 3 - 2     BaseDataMaintenance/fixDoc_to_queue_extract.py
  12. + 1 - 1     BaseDataMaintenance/maintenance/attachment/attachmentProcess.py
  13. + 238 - 136 BaseDataMaintenance/maintenance/dataflow.py
  14. + 218 - 92  BaseDataMaintenance/maintenance/dataflow_mq.py
  15. + 646 - 0   BaseDataMaintenance/maintenance/document/ApprovalData.py
  16. + 98 - 19   BaseDataMaintenance/maintenance/enterprise/enterprise2Redis.py
  17. + 164 - 0   BaseDataMaintenance/maintenance/gpt_extract.py
  18. + 3 - 3     BaseDataMaintenance/maintenance/preproject/fillColumns.py
  19. + 6 - 3     BaseDataMaintenance/maintenance/product/extract_data.py
  20. + 4 - 3     BaseDataMaintenance/maintenance/product/htmlparser.py
  21. + 21 - 1    BaseDataMaintenance/maxcompute/1.py
  22. + 218 - 13  BaseDataMaintenance/maxcompute/documentDumplicate.py
  23. + 237 - 26  BaseDataMaintenance/maxcompute/documentMerge.py
  24. + 17 - 0    BaseDataMaintenance/model/oracle/QiTaShiXinTemp.py
  25. + 38 - 10   BaseDataMaintenance/model/oracle/T_SHEN_PI_XIANG_MU.py
  26. + 17 - 0    BaseDataMaintenance/model/oracle/TouSuChuLiTemp.py
  27. + 215 - 0   BaseDataMaintenance/model/oracle/TouSuTemp.py
  28. + 56 - 0    BaseDataMaintenance/model/oracle/WeiFaJiLuTemp.py
  29. + 3 - 0     BaseDataMaintenance/model/ots/BaseModel.py
  30. + 59 - 32   BaseDataMaintenance/model/ots/document.py
  31. + 20 - 18   BaseDataMaintenance/model/ots/document_tmp.py

+ 2 - 0
.idea/encodings.xml

@@ -2,8 +2,10 @@
 <project version="4">
   <component name="Encoding">
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/attachmentProcessTime.xlsx" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/chat/chatUtil.py" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/dataSource/searchPaddle.py" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/attachment/2022-01-18_183521_export11.xlsx" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/gpt_extract.py" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/select_product_exclude_name_from_tw_prod.csv" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/select_product_product_name_exclude_name.csv" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/update_product.csv" charset="GBK" />

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" languageLevel="JDK_13" default="false" project-jdk-name="Python 3.7 (py37)" project-jdk-type="Python SDK">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_13" project-jdk-name="Python 3.7 (py37)" project-jdk-type="Python SDK">
     <output url="file://$PROJECT_DIR$/out" />
   </component>
 </project>

+ 77 - 0
BaseDataMaintenance/chat/ERNIE_utils.py

@@ -0,0 +1,77 @@
+
+import requests
+import json
+
+def get_access_token():
+    """
+    使用 API Key,Secret Key 获取access_token,替换下列示例中的应用API Key、应用Secret Key
+    """
+
+    url = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=gnwVXv96An9qMYqq9eWbeNqk&client_secret=mDsRQbCPsV4N7x28LbwkhTAaLmrrDnXk"
+    url = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=Ok8QMe4qIQOAex0F9Gf1uns0&client_secret=6DjGGDdvhnBaEOMdSXAg02KxZnQhWpbd"
+
+    payload = json.dumps("")
+    headers = {
+        'Content-Type': 'application/json',
+        'Accept': 'application/json'
+    }
+
+    response = requests.request("POST", url, headers=headers, data=payload)
+    return response.json().get("access_token")
+
+def main():
+    _token = get_access_token()
+    # _token = "24.93c9d66ffc94ffaef6c6c9d35770a5f5.2592000.1701242081.282335-37357318"
+    url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions?access_token=" + _token
+
+    # url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/xuanyuan_70b_chat?access_token=" + _token
+
+    payload = json.dumps({
+        "messages": [
+            {
+                "role": "user",
+                "content": '''
+               今天是几号
+                '''
+            }
+        ]
+    })
+    headers = {
+        'Content-Type': 'application/json'
+    }
+
+
+
+
+    response = requests.request("POST", url, headers=headers, data=payload)
+
+    print(response.text)
+
+def chat(msg,token=None,api_url=None):
+    if token is None:
+        token = get_access_token()
+    # _token = "24.93c9d66ffc94ffaef6c6c9d35770a5f5.2592000.1701242081.282335-37357318"
+    if api_url is None:
+        api_url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions"
+        # api_url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/ernie-3.5-128k"
+    url =  api_url+"?access_token="+ token
+    payload = json.dumps({
+        "messages": [
+            {
+                "role": "user",
+                "content": '''
+               %s
+                '''%msg
+            }
+        ],
+        "stream":False
+    })
+    headers = {
+        'Content-Type': 'application/json'
+    }
+    response = requests.request("POST", url, headers=headers, data=payload)
+
+    return response
+
+if __name__ == '__main__':
+    main()
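
A minimal usage sketch for the new chat() helper (hedged: it assumes the hard-coded Baidu credentials above are still valid, and that the ERNIE completions endpoint returns its reply in the JSON "result" field):

    from BaseDataMaintenance.chat.ERNIE_utils import chat

    # Single-turn prompt; chat() fetches an access_token itself when token is None.
    resp = chat("请用一句话概括招标公告通常包含哪些要素")
    data = resp.json()
    # On success the ERNIE chat API places the generated text under "result".
    print(data.get("result", data))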

+ 86 - 0
BaseDataMaintenance/chat/chatUtil.py

@@ -0,0 +1,86 @@
+#coding:utf8
+
+from bs4 import BeautifulSoup
+import re
+
+def html2text(_html):
+
+    if type(_html)==str:
+        _soup = BeautifulSoup(_html,"lxml")
+    else:
+        _soup = _html
+    list_table = _soup.find_all("table")
+    list_tbody = _soup.find_all("tbody")
+    if len(list_table)>0 or len(list_tbody)>0:
+        list_childs = _soup.find_all(recursive=False)
+        list_child_text = []
+        for child in list_childs:
+            list_child_text.append(html2text(child))
+        return "\n".join(list_child_text)
+
+    else:
+        if _soup.name=="table" or _soup.name=="tbody":
+            _table_text = ""
+            trs = _soup.find_all("tr")
+            list_tr_text = []
+            for tr in trs:
+                tds = tr.find_all("th")
+                if len(tds)>0:
+                    list_td_text = []
+                    for td in tds:
+                        list_td_text.append(re.sub('\s','',td.get_text()))
+                    list_tr_text.append("|".join(list_td_text))
+                tds = tr.find_all("td")
+                if len(tds)>0:
+                    list_td_text = []
+                    for td in tds:
+                        list_td_text.append(re.sub('\s','',td.get_text()))
+                    list_tr_text.append("|".join(list_td_text))
+            _table_text = "%s\n\n"%"\n".join(list_tr_text)
+            if _table_text == "":
+                _table_text = _soup.get_text()
+            _soup.decompose()
+            return _table_text
+        else:
+            _text = re.sub('\s','',_soup.get_text().strip())
+            _soup.decompose()
+            return _text
+
+def table2list(_html):
+    if type(_html)==str:
+        _soup = BeautifulSoup(_html,'lxml')
+    else:
+        _soup = _html
+    print("===",type(_soup),_soup.name)
+    if _soup.name=="table" or _soup.name=="tbody":
+        _table_text = ""
+        trs = _soup.find_all("tr")
+        list_tr_text = []
+        for tr in trs:
+            tds = tr.find_all("th")
+            if len(tds)>0:
+                list_td_text = []
+                for td in tds:
+                    list_td_text.append(re.sub('\s','',td.get_text()))
+                if len(list_td_text)>0:
+                    list_tr_text.append(list_td_text)
+            tds = tr.find_all("td")
+            if len(tds)>0:
+                list_td_text = []
+                for td in tds:
+                    list_td_text.append(re.sub('\s','',td.get_text()))
+                if len(list_td_text)>0:
+                    list_tr_text.append(list_td_text)
+        return list_tr_text
+
+def tableList2text(table_list):
+    list_tr_text = []
+    for tr in table_list:
+        tds = tr
+        if len(tds)>0:
+            list_td_text = []
+            for td in tds:
+                list_td_text.append(re.sub('\s','',td))
+            list_tr_text.append("|".join(list_td_text))
+    _table_text = "%s\n\n"%"\n".join(list_tr_text)
+    return _table_text

+ 2 - 3
BaseDataMaintenance/common/Utils.py

@@ -720,7 +720,6 @@ def getMultipleFactor(unit):
     MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"圆":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)}
     return MultipleFactor.get(unit)
 
-
 def getUnifyMoney(money):
     '''
     @summary:将中文金额字符串转换为数字金额
@@ -735,9 +734,9 @@ def getUnifyMoney(money):
     money = re.sub("[,,]","",money)
     money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money)
     result = Decimal(0)
-    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖","一","二","三","四","五","六","七","八","九"]
+    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
     # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
-    chnFactorUnits = ["圆", "元","兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
+    chnFactorUnits = ["兆", "亿", "万", "仟", '千', "佰", '百', "拾", '十',"圆", "元", "角", "分"]  # 20240611 修复大写提取错误 '陆拾陆亿伍千柒佰零叁万肆千叁佰陆拾伍元' Decimal('11607430365')
 
     LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
     BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))
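
The reordered chnFactorUnits fixes the mixed-unit amount cited in the inline comment above; a quick check (expected value worked out from the written amount, not from the commit):

    from BaseDataMaintenance.common.Utils import getUnifyMoney

    # 陆拾陆亿 + 伍千柒佰零叁万 + 肆千叁佰陆拾伍元 = 6,600,000,000 + 57,030,000 + 4,365
    print(getUnifyMoney("陆拾陆亿伍千柒佰零叁万肆千叁佰陆拾伍元"))  # expected Decimal('6657034365')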

+ 3 - 9
BaseDataMaintenance/common/activateMQUtils.py

@@ -15,18 +15,12 @@ def send_msg_toacmq(pool_conn,msg,dest,retry_times=5):
         conn = pool_conn.getConnector()
         try:
             conn.send(body=str(msg), destination=dest, persistent='false')
+            pool_conn.putConnector(conn)
             return True
         except Exception as e:
             traceback.print_exc()
-            try:
-                conn.disconnect()
-            except Exception as e:
-                pass
-        finally:
-            if conn.is_connected():
-                pool_conn.putConnector(conn)
-            else:
-                del conn
+            time.sleep(2)
+            del conn
     return False
 
 class MyListener(object):

+ 4 - 3
BaseDataMaintenance/common/documentFingerprint.py

@@ -1,4 +1,4 @@
-
+#coding:utf8
 
 import hashlib
 import codecs
@@ -47,6 +47,7 @@ def getFingerprint(sourceHtml):
     return _fingerprint
 
 if __name__=="__main__":
-    sourceHtml = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","rb",encoding="utf8").read()
-    # sourceHtml = "abcddafafffffffffffffffffffffffff你"
+    # sourceHtml = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","rb",encoding="utf8").read()
+    sourceHtml = "天全县农村敬老院护理能力提升改造项目初步设计及概算审批公示"+'<div> <div> <div> 天全县农村敬老院护理能力提升改造项目初步设计及概算审批公示 </div> <div> <div> <p>一、办理事项:天全县农村敬老院护理能力提升改造项目初步设计及概算审批</p> <p>二、项目业主:天全县民政局</p> <p>三、项目代码:2107-511825-04-01-642123</p> <p>四、办理状态:办结。</p> <p>五、办理时间:2024年5月14日</p> </div> </div> </div> </div>'
+    sourceHtml = "天全县农村敬老院护理能力提升改造项目初步设计及概算审批公示"+'审批项目'
     print(getFingerprint(sourceHtml))
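
For context, a sketch of the fingerprint idea only (the real getFingerprint is defined earlier in documentFingerprint.py and is not shown in this hunk; the whitespace normalization below is an assumption):

    import hashlib
    import re

    def sketch_fingerprint(source_html):
        # Assumed behaviour: strip whitespace, then hash, so near-identical pages collide.
        normalized = re.sub(r"\s", "", source_html)
        return hashlib.md5(normalized.encode("utf8")).hexdigest()

    print(sketch_fingerprint("天全县农村敬老院护理能力提升改造项目初步设计及概算审批公示" + "审批项目"))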

+ 2 - 2
BaseDataMaintenance/common/ossUtils.py

@@ -108,7 +108,7 @@ def test_download(filemd5):
 
 
 if __name__=="__main__":
-    # print(getMDFFromFile('8a9c96a68803c2ad01881d0ee93618e5.pdf'))
-    test_download("892bde698088f1d61b5310782550d0e1")
+    print(getMDFFromFile(r'G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-09\中国区超低氮锅炉电锅炉招标文件与附件(1).zip'))
+    # test_download("892bde698088f1d61b5310782550d0e1")
     # print(bucket.sign_url("GET","0015//20220623/2022-06-22/WGH001018/1655926900020.png",86500*30))
     # print(time.strftime("%Y-%m-%d",time.localtime(1658655178)))

+ 152 - 74
BaseDataMaintenance/dataMonitor/data_monitor.py

@@ -1,20 +1,29 @@
-import os, sys
+
+
+import os,sys
 import subprocess
-from datetime import datetime, timedelta
+from datetime import datetime,timedelta
+
 import psutil
 from apscheduler.schedulers.blocking import BlockingScheduler
-from BaseDataMaintenance.dataSource.source import getConnect_ots, getConnect_activateMQ
+
+from BaseDataMaintenance.dataSource.source import getConnect_ots,getConnect_activateMQ
+
 from BaseDataMaintenance.dataSource.interface import *
 from BaseDataMaintenance.common.Utils import *
+
 from tablestore import *
 from BaseDataMaintenance.dataSource.setttings import *
 from queue import Queue
 from BaseDataMaintenance.common.multiThread import MultiThreadHandler
 
+
 from BaseDataMaintenance.maintenance.dataflow_settings import *
 
+
 import pandas as pd
 
+
 flow_attachment_log_path = "/data/python/flow_attachment.log"
 
 flow_extract_log_path = "/data/python/flow_extract.log"
@@ -27,6 +36,48 @@ flow_init_check_dir = "/data/python/flow_init_check"
 flow_dumplicate_log_path = "/home/appuser/python/flow_dumplicate.log"
 
 
+def fixDoc_to_queue_init(filename=""):
+    import pandas as pd
+    from BaseDataMaintenance.model.oracle.GongGaoTemp import dict_oracle2ots
+    from BaseDataMaintenance.model.oracle.TouSuTemp import dict_oracle2ots as dict_oracle2ots_tousu
+
+    from BaseDataMaintenance.dataSource.source import getConnection_oracle
+    current_path = os.path.abspath(os.path.dirname(__file__))
+    if filename=="":
+        filename = os.path.join(current_path,"check.xlsx")
+    df = pd.read_excel(filename)
+    if "docchannel" in dict_oracle2ots:
+        dict_oracle2ots.pop("docchannel")
+    row_name = ",".join(list(dict_oracle2ots.keys()))
+
+    list_tousu_keys = []
+    for k,v in dict_oracle2ots_tousu.items():
+        if str(k).isupper():
+            list_tousu_keys.append(k)
+    row_name_tousu = ",".join(list(list_tousu_keys))
+    conn = getConnection_oracle()
+    cursor = conn.cursor()
+    _count = 0
+    for uuid,tablename,_exists,_toolong in zip(df["uuid"],df["tablename"],df["exists"],df["tolong"]):
+        if _exists==0 and _toolong==0:
+            _count += 1
+            is_tousu = False
+            if tablename in ('bxkc.t_wei_fa_ji_lu_temp','bxkc.t_tou_su_chu_li_temp','bxkc.t_qi_ta_shi_xin_temp'):
+                is_tousu = True
+            _source = str(tablename).replace("_TEMP","")
+            if is_tousu:
+                _source = str(tablename).replace("_temp","")
+            _rowname = row_name_tousu if is_tousu else row_name
+
+            sql = " insert into %s(%s) select %s from %s where id='%s' "%(tablename,_rowname,_rowname,_source,uuid)
+            log("%d:%s"%(_count,sql))
+            cursor.execute(sql)
+
+    conn.commit()
+    conn.close()
+
+    return _count
+
 class BaseDataMonitor():
 
     def __init__(self):
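
For reference, the statement that the fixDoc_to_queue_init helper added above builds for one missed complaint-type row looks roughly like this (table name and uuid are illustrative placeholders, not values from the commit):

    # Illustrative only: reproducing the SQL string built inside fixDoc_to_queue_init.
    tablename = "bxkc.t_wei_fa_ji_lu_temp"        # hypothetical missed row from check.xlsx
    uuid = "<uuid from the check report>"         # placeholder
    row_name = "ID,PAGE_TIME,..."                 # upper-case keys of dict_oracle2ots_tousu (elided)
    _source = tablename.replace("_temp", "")
    sql = " insert into %s(%s) select %s from %s where id='%s' " % (tablename, row_name, row_name, _source, uuid)
    # -> " insert into bxkc.t_wei_fa_ji_lu_temp(ID,PAGE_TIME,...) select ID,PAGE_TIME,... from bxkc.t_wei_fa_ji_lu where id='<uuid>' "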
@@ -43,35 +94,32 @@ class BaseDataMonitor():
     def get_last_tenmin_time(self, nums=15):
         current_time = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
 
-        last_ten_minite_time = timeAdd(current_time, 0, "%Y-%m-%d %H:%M:%S", -10)
+        last_ten_minite_time = timeAdd(current_time,0,"%Y-%m-%d %H:%M:%S",-10)
         return last_ten_minite_time[:nums]
 
-    def check_document_uuid(self, log_filename):
+    def check_document_uuid(self,log_filename):
 
-        def _handle(_item, result_queue):
-            bool_query = BoolQuery(must_queries=[TermQuery("uuid", _item.get("uuid"))])
+        def _handle(_item,result_queue):
+            bool_query = BoolQuery(must_queries=[TermQuery("uuid",_item.get("uuid"))])
 
-            rows, next_token, total_count, is_all_succeed = ots_client.search("document_tmp", "document_tmp_index",
-                                                                              SearchQuery(bool_query,
-                                                                                          get_total_count=True),
-                                                                              columns_to_get=ColumnsToGet(
-                                                                                  return_type=ColumnReturnType.NONE))
+            rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
+                                                                           SearchQuery(bool_query,get_total_count=True),
+                                                                           columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
 
             _item["exists"] = total_count
-
-        check_filename = "%s_check.xlsx" % (log_filename)
+        check_filename = "%s_check.xlsx"%(log_filename)
         list_uuid = []
         task_queue = Queue()
         dict_tolong = {}
         if not os.path.exists(check_filename) and os.path.exists(log_filename):
             _regrex = "delete\s+(?P<tablename>bxkc[^\s]+)\s+.*ID='(?P<uuid>.+)'"
             _regrex_tolong = "msg too long:(?P<uuid>[^,]+),\d+"
-            with open(log_filename, "r", encoding="utf8") as f:
+            with open(log_filename,"r",encoding="utf8") as f:
                 while 1:
                     _line = f.readline()
                     if not _line:
                         break
-                    _match = re.search(_regrex, _line)
+                    _match = re.search(_regrex,_line)
                     if _match is not None:
                         _uuid = _match.groupdict().get("uuid")
                         tablename = _match.groupdict().get("tablename")
@@ -99,36 +147,44 @@ class BaseDataMonitor():
                        "tolong": []}
 
             for _data in list_uuid:
-                for k, v in df_data.items():
-                    if k != "tolong":
+                for k,v in df_data.items():
+                    if k!="tolong":
                         v.append(_data.get(k))
-                df_data["tolong"].append(dict_tolong.get(_data["uuid"], 0))
+                df_data["tolong"].append(dict_tolong.get(_data["uuid"],0))
             df2 = pd.DataFrame(df_data)
             df2.to_excel(check_filename)
 
     def monitor_init(self):
 
-        def _handle(_item, result_queue):
-            bool_query = BoolQuery(must_queries=[TermQuery("uuid", _item.get("uuid"))])
+        def _handle(_item,result_queue):
+            bool_query = BoolQuery(must_queries=[TermQuery("uuid",_item.get("uuid"))])
 
-            rows, next_token, total_count, is_all_succeed = ots_client.search("document_tmp", "document_tmp_index",
-                                                                              SearchQuery(bool_query,
-                                                                                          get_total_count=True),
-                                                                              columns_to_get=ColumnsToGet(
-                                                                                  return_type=ColumnReturnType.NONE))
+            rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
+                                                                           SearchQuery(bool_query,get_total_count=True),
+                                                                           columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
+
+            if total_count>0:
+                _item["exists"] = total_count
+            else:
+                bool_query = BoolQuery(must_queries=[TermQuery("uuid",_item.get("uuid"))])
+
+                rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                               SearchQuery(bool_query,get_total_count=True),
+                                                                               columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
+
+                _item["exists"] = total_count
 
-            _item["exists"] = total_count
 
         try:
             current_date = getCurrent_date("%Y-%m-%d")
 
-            last_date = timeAdd(current_date, -1, "%Y-%m-%d")
+            last_date = timeAdd(current_date,-1,"%Y-%m-%d")
 
             if not os.path.exists(flow_init_check_dir):
                 os.mkdir(flow_init_check_dir)
 
-            log_filename = os.path.join(flow_init_log_dir, "flow_init_%s.log" % (last_date))
-            check_filename = os.path.join(flow_init_check_dir, "flow_init_%s.xlsx" % (last_date))
+            log_filename = os.path.join(flow_init_log_dir,"flow_init_%s.log"%(last_date))
+            check_filename = os.path.join(flow_init_check_dir,"flow_init_%s.xlsx"%(last_date))
 
             list_uuid = []
             task_queue = Queue()
@@ -136,66 +192,76 @@ class BaseDataMonitor():
             if not os.path.exists(check_filename) and os.path.exists(log_filename):
                 _regrex = "delete\s+(?P<tablename>bxkc[^\s]+)\s+.*ID='(?P<uuid>.+)'"
                 _regrex_tolong = "msg too long:(?P<uuid>[^,]+),\d+"
-                with open(log_filename, "r", encoding="utf8") as f:
+                with open(log_filename,"r",encoding="utf8") as f:
                     while 1:
                         _line = f.readline()
                         if not _line:
                             break
-                        _match = re.search(_regrex, _line)
+                        _match = re.search(_regrex,_line)
                         if _match is not None:
                             _uuid = _match.groupdict().get("uuid")
                             tablename = _match.groupdict().get("tablename")
                             if _uuid is not None:
-                                list_uuid.append({"uuid": _uuid, "tablename": tablename})
-                        _match = re.search(_regrex_tolong, _line)
+                                list_uuid.append({"uuid":_uuid,"tablename":tablename})
+                        _match = re.search(_regrex_tolong,_line)
                         if _match is not None:
                             _uuid = _match.groupdict().get("uuid")
                             dict_tolong[_uuid] = 1
 
-                if list_uuid == 0:
+
+                if list_uuid==0:
                     _msg = "数据遗漏检查出错"
-                    sentMsgToDD(_msg, ACCESS_TOKEN_DATAWORKS, atAll=True)
+                    sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=True)
                     # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
 
                 ots_client = getConnect_ots()
 
                 for _d in list_uuid:
                     task_queue.put(_d)
-                mt = MultiThreadHandler(task_queue, _handle, None, 30)
+                mt = MultiThreadHandler(task_queue,_handle,None,30)
                 mt.run()
-                df_data = {"uuid": [],
-                           "tablename": [],
-                           "exists": [],
-                           "tolong": []}
+                df_data = {"uuid":[],
+                           "tablename":[],
+                           "exists":[],
+                           "tolong":[]}
 
                 for _data in list_uuid:
-                    for k, v in df_data.items():
-                        if k != "tolong":
+                    for k,v in df_data.items():
+                        if k!="tolong":
                             v.append(_data.get(k))
-                    df_data["tolong"].append(dict_tolong.get(_data["uuid"], 0))
+                    df_data["tolong"].append(dict_tolong.get(_data["uuid"],0))
                 df2 = pd.DataFrame(df_data)
                 df2.to_excel(check_filename)
 
             counts = 0
             df_data = pd.read_excel(check_filename)
-            for _exists, _tolong in zip(df_data["exists"], df_data["tolong"]):
-                if _exists == 0 and _tolong == 0:
+            for _exists,_tolong in zip(df_data["exists"],df_data["tolong"]):
+                if _exists==0 and _tolong==0:
                     counts += 1
-            if counts > 0:
-                _msg = "数据遗漏检查报警,%s有%s条公告遗漏,详见%s" % (last_date, str(counts), check_filename)
-                sentMsgToDD(_msg, ACCESS_TOKEN_DATAWORKS, atAll=True)
+            if counts>0:
+                _msg = "数据遗漏检查报警,%s有%s条公告遗漏,详见%s"%(last_date,str(counts),check_filename)
+                sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=True)
                 # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
 
+                _count = fixDoc_to_queue_init(check_filename)
+                if _count>0:
+                    _msg = "数据遗漏检查报警%d条公告已重新同步"%(_count)
+                    sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=True)
+                    df_data.to_excel("%s_bak.xlsx"%check_filename)
+                    os.remove(check_filename)
+
+
 
 
         except Exception as e:
             _msg = "数据遗漏检查报错"
-            sentMsgToDD(_msg, ACCESS_TOKEN_DATAWORKS, atAll=True)
+            sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=True)
             # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
             traceback.print_exc()
 
+
     def monitor_attachment(self):
-        from BaseDataMaintenance.java.MQInfo import getAllQueueSize, getQueueSize
+        from BaseDataMaintenance.java.MQInfo import getAllQueueSize,getQueueSize
         try:
             # query = BoolQuery(must_queries=[
             #     RangeQuery("status",0,11),
@@ -206,7 +272,7 @@ class BaseDataMonitor():
             #                                                                            columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
             total_count_todeal = getQueueSize("dataflow_attachment")
 
-            if total_count_todeal > 1000:
+            if total_count_todeal>1000:
                 # query = BoolQuery(must_queries=[
                 #     RangeQuery("crtime",self.get_last_tenmin_time(16))
                 # ])
@@ -224,7 +290,11 @@ class BaseDataMonitor():
                 #                                                                            SearchQuery(query,None,True),
                 #                                                                            columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
 
-                # 通过命令行获取日志情况
+
+
+
+
+                #通过命令行获取日志情况
                 # _cmd = 'cat %s | grep -c "%s.*process filemd5"'%(flow_attachment_log_path,self.get_last_tenmin_time())
                 # log(_cmd)
                 # process_count = self.cmd_execute(_cmd)
@@ -239,16 +309,15 @@ class BaseDataMonitor():
 
                 # _msg = "附件提取队列报警:队列堆积%s条公告,最近十分钟处理公告附件数:%s,处理成功数:%s"%(str(total_count_todeal),str(process_count),str(process_succeed_count))
 
-                # 通过读取文件获取日志情况
+                #通过读取文件获取日志情况
                 dict_type = {}
-                _pattern = "%s.*process filemd5\:[^\s]* (?P<result>(True|False)) of type\:(?P<type>[^\s]*).*download:(?P<downloadtime>\d+\.\d+)s recognize takes (?P<costtime>\d+)s upload takes (?P<uploadtime>\d+\.\d+)s" % (
-                    re.escape(self.get_last_tenmin_time()))
-                with open(flow_attachment_log_path, "r", encoding="utf8") as f:
+                _pattern = "%s.*process filemd5\:[^\s]* (?P<result>(True|False)) of type\:(?P<type>[^\s]*).*download:(?P<downloadtime>\d+\.\d+)s recognize takes (?P<costtime>\d+)s upload takes (?P<uploadtime>\d+\.\d+)s"%(re.escape(self.get_last_tenmin_time()))
+                with open(flow_attachment_log_path,"r",encoding="utf8") as f:
                     while True:
                         line = f.readline()
                         if not line:
                             break
-                        _match = re.search(_pattern, str(line))
+                        _match = re.search(_pattern,str(line))
                         if _match is not None:
                             _type = _match.groupdict().get("type")
                             _result = _match.groupdict().get("result")
@@ -333,9 +402,12 @@ class BaseDataMonitor():
 
             total_count_todeal = getQueueSize("dataflow_extract")
 
-            if total_count_todeal > 500:
-                _cmd = 'cat %s | grep "%s" | grep -c "process.*docid"' % (
-                flow_extract_log_path, self.get_last_tenmin_time())
+            if total_count_todeal>1000:
+                _cmd = 'cat %s | grep "%s" | grep -c "要素提取失败:docid"'%(flow_extract_log_path,self.get_last_tenmin_time())
+                log(_cmd)
+                process_failed_count = self.cmd_execute(_cmd)
+
+                _cmd = 'cat %s | grep "%s" | grep -c "process.*docid"'%(flow_extract_log_path,self.get_last_tenmin_time())
                 log(_cmd)
                 process_count = self.cmd_execute(_cmd)
                 _cmd = 'cat %s | grep "%s" | grep -c "process.*docid.*1$"' % (
@@ -369,8 +441,8 @@ class BaseDataMonitor():
                 #                                                                              SearchQuery(query,None,True),
                 #                                                                              columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
 
-                _msg = "要素提取队列报警:队列堆积%s条公告,最近十分钟入库数:%s,最近十分钟处理公告数:%s,其中成功处理数:%s,查库免提取数:%s" % (
-                str(total_count_todeal), str(init_count), str(process_count), str(success_count), str(exists_count))
+                _msg = "要素提取队列报警:队列堆积%s条公告,最近十分钟入库数:%s,最近十分钟处理公告数:%s,其中成功处理数:%s,处理失败数:%s,查库免提取数:%s" % (
+                str(total_count_todeal), str(init_count), str(process_count), str(success_count), str(process_failed_count),str(exists_count))
                 log(_msg)
                 atAll = False
                 if success_count == 0:
@@ -432,9 +504,9 @@ class BaseDataMonitor():
                                                                                columns_to_get=ColumnsToGet(
                                                                                    return_type=ColumnReturnType.NONE))
 
-        if total_count >= 200:
-            _msg = "数据流报警:待同步到成品表公告数为:%d" % (total_count)
-            sentMsgToDD(_msg, ACCESS_TOKEN_DATAWORKS)
+        if total_count>=2000:
+            _msg = "数据流报警:待同步到成品表公告数为:%d"%(total_count)
+            sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS)
             # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
 
     def monitor_preproject(self):
@@ -681,26 +753,28 @@ class BaseDataMonitor():
             sentMsgToDD(_msg, ACCESS_TOKEN_DATAWORKS)
             # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
 
+
     def start_monitor(self):
-        # 数据监控
+        #数据监控
 
         scheduler = BlockingScheduler()
 
         # scheduler.add_job(self.monitor_attachment,"cron",minute="*/10")
-        scheduler.add_job(self.monitor_extract, "cron", minute="*/10")
-        scheduler.add_job(self.monitor_proposedBuilding, "cron", hour="*/11")
+        scheduler.add_job(self.monitor_extract,"cron",minute="*/10")
+        scheduler.add_job(self.monitor_proposedBuilding,"cron",hour="*/11")
         # scheduler.add_job(self.monitor_dumplicate,"cron",minute="*/10")
-        scheduler.add_job(self.monitor_sychr, "cron", minute="*/10")
+        scheduler.add_job(self.monitor_merge,"cron",hour="*/2")
+        scheduler.add_job(self.monitor_sychr, "cron", minute="*/30")
         scheduler.add_job(self.monitor_preproject, "cron", hour="8")
-        scheduler.add_job(self.monitor_merge, "cron", minute="*/60")
         scheduler.add_job(self.monitor_init, "cron", hour="*/3")
         scheduler.start()
 
+
     def start_attach_monitor(self):
-        # 附件监控
+        #附件监控
         scheduler = BlockingScheduler()
 
-        scheduler.add_job(self.monitor_attachment, "cron", minute="*/10")
+        scheduler.add_job(self.monitor_attachment,"cron",minute="*/10")
         scheduler.start()
 
 
@@ -775,11 +849,15 @@ def monitor_convert_interface():
 
 
 if __name__ == '__main__':
+
     # dm = BaseDataMonitor()
     # # dm.start_monitor()
     # log_filename = "C:\\Users\\Administrator\\Desktop\\flow_init_2023-02-03.log"
     # dm.check_document_uuid(log_filename)
 
-    sentMsgToDD("报警test_msg", ACCESS_TOKEN_DATAWORKS)
+    sentMsgToDD("报警test_msg",ACCESS_TOKEN_DATAWORKS)
     # dm.monitor_proposedBuilding()
     # print(dm.get_last_tenmin_time(16))
+
+
+

+ 5 - 5
BaseDataMaintenance/dataSource/setttings.py

@@ -43,12 +43,12 @@ oracle_host = "121.46.18.113"
 oracle_port = 10522
 oracle_host = "192.168.0.150"
 oracle_port = 1522
-# oracle_user = "bxkc_data_readonly"
-# oracle_pass = "P7WUrgcz0@#j8pjg"
-oracle_user = "BXKC_WRITE"
+# oracle_user = "BXKC_DATA_READONLY"
+# oracle_pass = "nXcQG3Z8DW=Hzr!h"
+# oracle_user = "BXKC_WRITE"
+# oracle_pass = "PHNhX3%rVy4@fDB&"
+oracle_user = "bxkc_db"
 oracle_pass = "PHNhX3%rVy4@fDB&"
-# oracle_user = "bxkc_db"
-# oracle_pass = "xb9F#24Hd#5rStr9"
 oracle_db = "yanphone"
 
 ots_AccessKeyId = 'LTAI5tFuoxHm8Uxrr5nT8wTZ'

+ 3 - 2
BaseDataMaintenance/fixDoc_to_queue_extract.py

@@ -3,9 +3,10 @@ import sys,os
 
 sys.path.append(os.path.dirname(__file__)+"/..")
 
-from BaseDataMaintenance.maintenance.dataflow_mq import fixDoc_to_queue_extract,fixDoc_to_queue_init
+from BaseDataMaintenance.maintenance.dataflow_mq import fixDoc_to_queue_extract
+from BaseDataMaintenance.dataMonitor.data_monitor import fixDoc_to_queue_init
 
 
 if __name__ == '__main__':
     # fixDoc_to_queue_extract()
-    fixDoc_to_queue_init(filename="/data/python/flow_init_check/flow_init_2023-12-28.xlsx")
+    fixDoc_to_queue_init(filename="/data/python/flow_init_check/flow_init_2024-12-02.xlsx")

+ 1 - 1
BaseDataMaintenance/maintenance/attachment/attachmentProcess.py

@@ -811,7 +811,7 @@ class AttachmentRec():
                             attach.setValue(attachment_process_time,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
                             attach.setValue(attachment_status,ATTACHMENT_PROCESSED_FAILED)
                             log("attach interface failed of docid:%s filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
-                            sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
+                            # sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
 
 
                 attach.update_row(self.ots_client)

+ 238 - 136
BaseDataMaintenance/maintenance/dataflow.py

@@ -260,7 +260,7 @@ class Dataflow():
                         log("process filemd5:%s of type:%s with size:%.3fM download:%ds recognize takes %ds,ret_size:%d"%(filemd5,_filetype,round(_size/1024/1024,4),time_download,time.time()-start_time,len(_html)))
                     else:
                         log("attach interface failed of docid:%s filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
-                        sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
+                        # sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
                         _html = ""
                         return False
 
@@ -350,8 +350,8 @@ class Dataflow():
 
 
     def generate_dumplicate_query(self,_dict,_dict_must_not,set_match=set(["project_code","project_codes","product"]),set_nested=set(["win_tenderer","bidding_budget","win_bid_price"]),
-                                  set_term=set(["project_name","doctitle_refine","docchannel","tenderee","agency","web_source_no","fingerprint","save","docid"]),
-                                  set_range=set(["page_time","status"]),set_phrase=set(["doctitle"])):
+                                  set_term=set(["doctitle_refine","docchannel","tenderee","agency","web_source_no","fingerprint","save","docid"]),
+                                  set_range=set(["page_time","status"]),set_phrase=set(["doctitle","project_name"])):
         list_must_queries = []
         list_must_no_queries = []
         for k,v in _dict.items():
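
The signature change above moves project_name (and doctitle) out of the exact-term set into the phrase set; a minimal sketch of the difference with the tablestore SDK, assuming the index analyzes these fields and that set_phrase maps to MatchPhraseQuery:

    from tablestore import BoolQuery, TermQuery, MatchPhraseQuery

    # TermQuery only hits when the stored value equals the query string exactly;
    # MatchPhraseQuery matches the tokenized phrase, so slightly re-worded project
    # names can still be grouped together during dedup.
    exact_q  = TermQuery("tenderee", "天全县民政局")
    phrase_q = MatchPhraseQuery("project_name", "农村敬老院护理能力提升改造项目")
    bool_query = BoolQuery(must_queries=[exact_q, phrase_q])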
@@ -415,7 +415,10 @@ class Dataflow():
         if agency is not None and agency!="":
             extract_count += 1
         if sub_docs_json is not None:
-            sub_docs = json.loads(sub_docs_json)
+            try:
+                sub_docs = json.loads(sub_docs_json)
+            except Exception as e:
+                sub_docs = []
             sub_docs.sort(key=lambda x:float(x.get("bidding_budget",0)),reverse=True)
             sub_docs.sort(key=lambda x:float(x.get("win_bid_price",0)),reverse=True)
             # log("==%s"%(str(sub_docs)))
@@ -2203,7 +2206,7 @@ class Dataflow_dumplicate(Dataflow):
                 createComsumer(listener,self.doc_delete_queue)
 
 
-    def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart","time_release"]):
+    def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart"]):
         dict_time = {}
         for k in keys:
             dict_time[k] = _extract.get(k)
@@ -2231,10 +2234,12 @@ class Dataflow_dumplicate(Dataflow):
         _dict["moneys_attachment"] = set(_extract.get("moneys_attachment",[]))
         _dict["nlp_enterprise"] = json.dumps({"indoctextcon":_extract.get("nlp_enterprise",[]),
                                        "notindoctextcon":_extract.get("nlp_enterprise_attachment",[])},ensure_ascii=False)
-        _dict["extract_count"] = self.c_f_get_extractCount.evaluate(extract_json)
+        _dict["extract_count"] = _extract.get("extract_count",0)
         _dict["package"] = self.c_f_get_package.evaluate(extract_json)
         _dict["project_name"] = _extract.get("name","")
         _dict["dict_time"] = self.get_dict_time(_extract)
+        _dict["punish"] = _extract.get("punish",{})
+        _dict["approval"] = _extract.get("approval",[])
 
     def dumplicate_fianl_check(self,base_list,b_log=False):
         the_group = base_list
@@ -2272,22 +2277,22 @@ class Dataflow_dumplicate(Dataflow):
     def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
         document_less = _dict1
         docid_less = _dict1["docid"]
-        docchannel_less = document_less["docchannel"]
-        page_time_less = document_less["page_time"]
+        docchannel_less = document_less.get("docchannel",0)
+        page_time_less = document_less.get("page_time")
         doctitle_refine_less = document_less["doctitle_refine"]
-        project_codes_less = document_less["project_codes"]
+        project_codes_less = document_less.get("project_codes")
         nlp_enterprise_less = document_less["nlp_enterprise"]
-        tenderee_less = document_less["tenderee"]
-        agency_less = document_less["agency"]
+        tenderee_less = document_less.get("tenderee","")
+        agency_less = document_less.get("agency")
         win_tenderer_less = document_less["win_tenderer"]
         bidding_budget_less = document_less["bidding_budget"]
         win_bid_price_less = document_less["win_bid_price"]
-        product_less = document_less["product"]
-        package_less = document_less["package"]
-        json_time_less = document_less["dict_time"]
-        project_name_less = document_less["project_name"]
-        fingerprint_less = document_less["fingerprint"]
-        extract_count_less = document_less["extract_count"]
+        product_less = document_less.get("product")
+        package_less = document_less.get("package")
+        json_time_less = document_less.get("dict_time")
+        project_name_less = document_less.get("project_name")
+        fingerprint_less = document_less.get("fingerprint")
+        extract_count_less = document_less.get("extract_count",0)
         web_source_no_less = document_less.get("web_source_no")
         province_less = document_less.get("province")
         city_less = document_less.get("city")
@@ -2295,26 +2300,29 @@ class Dataflow_dumplicate(Dataflow):
         moneys_less = document_less.get("moneys")
         moneys_attachment_less = document_less.get("moneys_attachment")
         page_attachments_less = document_less.get(document_tmp_attachment_path,"[]")
+        punish_less = document_less.get("punish",{})
+        approval_less = document_less.get("approval",[])
+        source_type_less = document_less.get("source_type")
 
 
         document_greater = _dict2
         docid_greater = _dict2["docid"]
         page_time_greater = document_greater["page_time"]
-        docchannel_greater = document_greater["docchannel"]
-        doctitle_refine_greater = document_greater["doctitle_refine"]
+        docchannel_greater = document_greater.get("docchannel",0)
+        doctitle_refine_greater = document_greater.get("doctitle_refine","")
         project_codes_greater = document_greater["project_codes"]
         nlp_enterprise_greater = document_greater["nlp_enterprise"]
-        tenderee_greater = document_greater["tenderee"]
-        agency_greater = document_greater["agency"]
+        tenderee_greater = document_greater.get("tenderee","")
+        agency_greater = document_greater.get("agency","")
         win_tenderer_greater = document_greater["win_tenderer"]
         bidding_budget_greater = document_greater["bidding_budget"]
         win_bid_price_greater = document_greater["win_bid_price"]
-        product_greater = document_greater["product"]
-        package_greater = document_greater["package"]
+        product_greater = document_greater.get("product")
+        package_greater = document_greater.get("package")
         json_time_greater = document_greater["dict_time"]
-        project_name_greater = document_greater["project_name"]
-        fingerprint_greater = document_greater["fingerprint"]
-        extract_count_greater = document_greater["extract_count"]
+        project_name_greater = document_greater.get("project_name")
+        fingerprint_greater = document_greater.get("fingerprint")
+        extract_count_greater = document_greater.get("extract_count",0)
         web_source_no_greater = document_greater.get("web_source_no")
         province_greater = document_greater.get("province")
         city_greater = document_greater.get("city")
@@ -2324,12 +2332,16 @@ class Dataflow_dumplicate(Dataflow):
         moneys_attachment_greater = document_greater.get("moneys_attachment")
         page_attachments_greater = document_greater.get(document_tmp_attachment_path,"[]")
 
+        punish_greater = document_greater.get("punish",{})
+        approval_greater = document_greater.get("approval",[])
+        source_type_greater = document_greater.get("source_type")
+
         hard_level=1
         if web_source_no_less==web_source_no_greater=="17397-3":
             hard_level=2
 
         if self.check_rule==1:
-            _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
+            _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater,punish_less=punish_less,punish_greater=punish_greater,approval_less=approval_less,approval_greater=approval_greater,source_type_less=source_type_less,source_type_greater=source_type_greater)
         else:
             _prob = check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater)
 
@@ -2559,7 +2571,7 @@ class Dataflow_dumplicate(Dataflow):
                 else:
                     bool_query = _query
                 rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
-                                                                                    SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=30,get_total_count=True),
+                                                                                    SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=100,get_total_count=True),
                                                                                     ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
                 list_dict = getRow_ots(rows)
                 list_data = []
@@ -2854,7 +2866,7 @@ class Dataflow_dumplicate(Dataflow):
 
         return list_rules,table_name,table_index
 
-    def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_name]):
+    def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]):
         q_size = self.queue_dumplicate.qsize()
         log("dumplicate queue size %d"%(q_size))
 
@@ -2939,7 +2951,7 @@ class Dataflow_dumplicate(Dataflow):
         # mt.run()
 
 
-    def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_nlp_enterprise,document_nlp_enterprise_attachment]):
+    def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_nlp_enterprise,document_nlp_enterprise_attachment,document_tenderee_code,document_agency_code,document_candidates]):
         '''
         根据docid查询公告内容,先查询document_tmp,再查询document
         :param list_docids:
@@ -3049,7 +3061,7 @@ class Dataflow_dumplicate(Dataflow):
                     continue
             if v is None or v=="" or v=="[]" or v=="未知":
                 continue
-            if k in (project_project_dynamics,project_product,project_project_codes,project_docids):
+            if k in (project_project_dynamics,project_product,project_project_codes,project_docids,project_candidates):
                 continue
             _dict[k] = v
         for _proj in projects:
@@ -3058,14 +3070,19 @@ class Dataflow_dumplicate(Dataflow):
             if _proj.get(project_page_time,"")<project_dict.get(project_page_time,""):
                 _proj[project_page_time] = project_dict.get(project_page_time,"")
 
-        #拼接属性
-        append_dict = {}
-        set_docid = set()
-        set_product = set()
-        set_code = set()
-        set_nlp_enterprise = set()
-        set_nlp_enterprise_attachment = set()
+
         for _proj in projects:
+            #拼接属性
+            append_dict = {}
+            set_docid = set()
+            set_product = set()
+            set_code = set()
+            set_nlp_enterprise = set()
+            set_nlp_enterprise_attachment = set()
+            set_candidates = set()
+
+
+
             _docids = _proj.get(project_docids,"")
             _codes = _proj.get(project_project_codes,"")
             _product = _proj.get(project_product,"")
@@ -3081,15 +3098,22 @@ class Dataflow_dumplicate(Dataflow):
             try:
                 set_nlp_enterprise |= set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
                 set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
-            except Exception as e:
-                pass
+                list_candidates = json.loads(project_dict.get(project_candidates,"[]"))
+                for item in list_candidates:
+                    if item.get("name") is not None and item.get("name") not in set_candidates:
+                        set_candidates.add(item.get("name"))
 
-            set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
-            set_product = set_product | set(project_dict.get(project_product,"").split(","))
 
-            try:
+                set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
+                set_product = set_product | set(project_dict.get(project_product,"").split(","))
+
                 set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
                 set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
+
+                for item in json.loads(_proj.get(project_candidates,"[]")):
+                    if item.get("name") is not None and item.get("name") not in set_candidates:
+                        set_candidates.add(item.get("name"))
+                        list_candidates.append(item)
             except Exception as e:
                 pass
 
@@ -3101,6 +3125,7 @@ class Dataflow_dumplicate(Dataflow):
 
             append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
             append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
+            append_dict[project_candidates] = json.dumps(list_candidates,ensure_ascii=False)
 
 
             dict_dynamic = {}
@@ -3119,6 +3144,7 @@ class Dataflow_dumplicate(Dataflow):
             list_dynamics.sort(key=lambda x:x.get(document_page_time,""))
 
             append_dict[project_project_dynamics] = json.dumps(list_dynamics[:100],ensure_ascii=False)
+
             _proj.update(append_dict)
 
 
@@ -3151,74 +3177,84 @@ class Dataflow_dumplicate(Dataflow):
 
 
         #更新私有属性
-        for _pp in list_package_properties:
-
-            flag_update = False
-            sub_project_name = _pp.get(project_sub_project_name,"")
-            if sub_project_name=="Project":
-                sub_project_name = ""
-            win_tenderer = _pp.get(project_win_tenderer,"")
-            win_bid_price = _pp.get(project_win_bid_price,0)
-            bidding_budget = _pp.get(project_bidding_budget,0)
-            if win_tenderer!="" and bidding_budget!=0:
-                _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
-                if _key in dict_package:
-                    if self.is_same_package(_pp,dict_package[_key]):
-                        ud = self.getUpdate_dict(_pp)
-                        self.set_project_uuid(ud,dict_package[_key].get("uuid"))
-                        dict_package[_key].update(ud)
-                        flag_update = True
-                        continue
-            if win_tenderer!="" and  win_bid_price!=0:
-                _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
-                if _key in dict_package:
-                    if self.is_same_package(_pp,dict_package[_key]):
-                        ud = self.getUpdate_dict(_pp)
-                        self.set_project_uuid(ud,dict_package[_key].get("uuid"))
-                        dict_package[_key].update(ud)
-                        flag_update = True
-                        continue
-            if win_tenderer!="":
-                _key = "%s-%s"%(sub_project_name,win_tenderer)
-                if _key in dict_package:
-                    if self.is_same_package(_pp,dict_package[_key]):
-                        ud = self.getUpdate_dict(_pp)
-                        self.set_project_uuid(ud,dict_package[_key].get("uuid"))
-                        dict_package[_key].update(ud)
-                        flag_update = True
-                        continue
-            if bidding_budget!=0:
-                _key = "%s-%s"%(sub_project_name,str(bidding_budget))
-                if _key in dict_package:
-                    if self.is_same_package(_pp,dict_package[_key]):
-                        ud = self.getUpdate_dict(_pp)
-                        self.set_project_uuid(ud,dict_package[_key].get("uuid"))
-                        dict_package[_key].update(ud)
-                        flag_update = True
-                        continue
-            if not flag_update:
-                _pp.update(project_dict)
-                projects.append(_pp)
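+        # fast path: a single existing project with a single set of package properties is merged directly, without key matching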
+        if len(projects)==1 and len(list_package_properties)==1:
+            _pp = list_package_properties[0]
+            pp = projects[0]
+            ud = self.getUpdate_dict(_pp)
+            self.set_project_uuid(ud,pp.get("uuid"))
+            pp.update(_pp)
+        else:
 
+            for _pp in list_package_properties:
 
-                _counts = 0
+                flag_update = False
+                sub_project_name = _pp.get(project_sub_project_name,"")
+                if sub_project_name=="Project":
+                    sub_project_name = ""
+                win_tenderer = _pp.get(project_win_tenderer,"")
+                win_bid_price = _pp.get(project_win_bid_price,0)
+                bidding_budget = _pp.get(project_bidding_budget,0)
                 if win_tenderer!="" and bidding_budget!=0:
                     _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
-                    dict_package[_key] = _pp
-                _counts += 1
+                    if _key in dict_package:
+                        if self.is_same_package(_pp,dict_package[_key]):
+                            ud = self.getUpdate_dict(_pp)
+                            self.set_project_uuid(ud,dict_package[_key].get("uuid"))
+                            dict_package[_key].update(ud)
+                            flag_update = True
+                            continue
                 if win_tenderer!="" and  win_bid_price!=0:
                     _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
-                    dict_package[_key] = _pp
-                    _counts +=1
-                if _counts==0:
-                    if win_tenderer!="":
-                        _key = "%s-%s"%(sub_project_name,win_tenderer)
+                    if _key in dict_package:
+                        if self.is_same_package(_pp,dict_package[_key]):
+                            ud = self.getUpdate_dict(_pp)
+                            self.set_project_uuid(ud,dict_package[_key].get("uuid"))
+                            dict_package[_key].update(ud)
+                            flag_update = True
+                            continue
+                if win_tenderer!="":
+                    _key = "%s-%s"%(sub_project_name,win_tenderer)
+                    if _key in dict_package:
+                        if self.is_same_package(_pp,dict_package[_key]):
+                            ud = self.getUpdate_dict(_pp)
+                            self.set_project_uuid(ud,dict_package[_key].get("uuid"))
+                            dict_package[_key].update(ud)
+                            flag_update = True
+                            continue
+                if bidding_budget!=0:
+                    _key = "%s-%s"%(sub_project_name,str(bidding_budget))
+                    if _key in dict_package:
+                        if self.is_same_package(_pp,dict_package[_key]):
+                            ud = self.getUpdate_dict(_pp)
+                            self.set_project_uuid(ud,dict_package[_key].get("uuid"))
+                            dict_package[_key].update(ud)
+                            flag_update = True
+                            continue
+                if not flag_update:
+                    _pp.update(project_dict)
+                    projects.append(_pp)
+
+
+                    _counts = 0
+                    if win_tenderer!="" and bidding_budget!=0:
+                        _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
                         dict_package[_key] = _pp
-                        _counts += 1
-                    if bidding_budget!=0:
-                        _key = "%s-%s"%(sub_project_name,str(bidding_budget))
+                    _counts += 1
+                    if win_tenderer!="" and  win_bid_price!=0:
+                        _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
                         dict_package[_key] = _pp
-                        _counts += 1
+                        _counts +=1
+                    if _counts==0:
+                        if win_tenderer!="":
+                            _key = "%s-%s"%(sub_project_name,win_tenderer)
+                            dict_package[_key] = _pp
+                            _counts += 1
+                        if bidding_budget!=0:
+                            _key = "%s-%s"%(sub_project_name,str(bidding_budget))
+                            dict_package[_key] = _pp
+                            _counts += 1
+
+
 
 
 
@@ -3255,33 +3291,42 @@ class Dataflow_dumplicate(Dataflow):
             list_projects = dumplicate_projects(list_projects)
         list_projects.extend(list_delete_projects)
         project_json = to_project_json(list_projects)
-        print("delete_json",project_json)
         return project_json
 
 
     def delete_doc_handle(self,_dict,result_queue):
         headers = _dict.get("frame")
         conn = _dict.get("conn")
-        log("==========delete")
+
         if headers is not None:
             message_id = headers.headers["message-id"]
             body = headers.body
             item = json.loads(body)
             docid = item.get("docid")
+            log("==========start delete docid:%s"%(str(docid)))
             if docid is None:
-                return
+                ackMsg(conn,message_id)
+                return
             delete_result = self.delete_projects_by_document(docid)
 
+            log("1")
             _uuid = uuid4().hex
             _d = {PROJECT_PROCESS_UUID:_uuid,
                   PROJECT_PROCESS_CRTIME:1,
                   PROJECT_PROCESS_PROJECTS:delete_result}
             _pp = Project_process(_d)
-            if _pp.update_row(self.ots_client):
+            log("2")
+            try:
+                if _pp.update_row(self.ots_client):
+                    ackMsg(conn,message_id)
+            except Exception as e:
                 ackMsg(conn,message_id)
+            log("3")
             #no longer push the result to the result queue; insert into the project_process table instead
             # if send_msg_toacmq(self.pool_mq_ali,delete_result,self.doc_delete_result):
             #     ackMsg(conn,message_id)
+            log("==========end delete docid:%s"%(str(docid)))
+        else:
+            log("has not headers")
 
     def generate_common_properties(self,list_docs):
         '''
@@ -3539,6 +3584,9 @@ class Dataflow_dumplicate(Dataflow):
             project_info_source,
             project_nlp_enterprise,
             project_nlp_enterprise_attachment,
+            project_tenderee_code,
+            project_agency_code,
+            project_candidates
         ],sort="page_time",table_name="project2",table_index="project2_index")
 
         return list_project_dict
@@ -3654,6 +3702,14 @@ class Dataflow_dumplicate(Dataflow):
                       should_q_cod]
             list_query.append([_query,2])
 
+        if win_tenderer!="" and sub_project_name!="":
+            _query = [TermQuery(project_win_tenderer,win_tenderer),
+                      TermQuery(project_sub_project_name,sub_project_name)
+                                             ]
+            list_query.append([_query,2])
+
+
+
         if win_tenderer!="" and float(win_bid_price)>0:
             _query = [TermQuery(project_win_tenderer,win_tenderer),
                                              TermQuery(project_win_bid_price,win_bid_price)]
@@ -3710,10 +3766,7 @@ class Dataflow_dumplicate(Dataflow):
                 _uuid = _proj.get("uuid")
                 if _uuid is not None:
                     set_uuid = set_uuid | set(_uuid.split(","))
-            must_not_q = []
-            for _uuid in list(set_uuid):
-                must_not_q.append(TermQuery("uuid",_uuid))
-                print("must_not_q uuid:%s"%(_uuid))
+
 
 
             projects_merge_count = 0
@@ -3729,6 +3782,10 @@ class Dataflow_dumplicate(Dataflow):
             docids = ""
             for _proj in list_projects[:30]:
 
+                must_not_q = []
+                for _uuid in list(set_uuid):
+                    must_not_q.append(TermQuery("uuid",_uuid))
+
                 docids = _proj.get(project_docids,"")
                 page_time = _proj.get(project_page_time,"")
                 project_codes = _proj.get(project_project_codes,"")
@@ -3754,8 +3811,8 @@ class Dataflow_dumplicate(Dataflow):
                 district = _proj.get(project_district,"")
 
                 if is_yanshou:
-                    page_time_less = timeAdd(page_time,-750)
-                    page_time_greater = timeAdd(page_time,720)
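+                    # acceptance (验收) projects merge over a wider page_time window (-850/+820 days) than other projects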
+                    page_time_less = timeAdd(page_time,-850)
+                    page_time_greater = timeAdd(page_time,820)
                 else:
                     page_time_less = timeAdd(page_time,-450)
                     page_time_greater = timeAdd(page_time,420)
@@ -3784,6 +3841,7 @@ class Dataflow_dumplicate(Dataflow):
 
                 if page_time_less is not None and page_time_greater is not None:
                     must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
+                                    # RangeQuery("status",201,301)
                                 ]
 
                 #sub_project_name is optional rather than a required condition
@@ -3832,7 +3890,8 @@ class Dataflow_dumplicate(Dataflow):
                 list_merge_data.sort(key=lambda x:x.get(project_page_time,""))
                 list_merge_data.sort(key=lambda x:x.get(project_bidding_budget,-1))
                 # log(page_time_less+"=="+page_time_greater)
-                # log("list_merge_data:%s"%(str(list_merge_data)))
+                if b_log:
+                    log("list_merge_data count:%d"%(len(list_merge_data)))
                 list_check_data = []
                 for _data in list_merge_data:
                     _time = time.time()
@@ -3858,8 +3917,9 @@ class Dataflow_dumplicate(Dataflow):
                         update_projects_by_project(_data,[_proj])
                         projects_update_time += time.time()-_time
 
-            whole_time = time.time()-whole_time_start
-            log("%s %s merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(search_table,docids,whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
+                whole_time = time.time()-whole_time_start
+                log("%s %s merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(search_table,docids,whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
+
 
             return list_projects
         except Exception as e:
@@ -3892,10 +3952,9 @@ class Dataflow_dumplicate(Dataflow):
             list_docids = [a for a in list_docids if a is not None]
 
 
-
             _time = time.time()
             list_projects = self.search_projects_with_document(list_docids)
-            # log("search projects takes:%.3f"%(time.time()-_time))
+            log("search %d projects takes:%.3f"%(len(list_projects),time.time()-_time))
             if len(list_projects)==0:
                 # _time = time.time()
                 list_docs = self.search_docs(list_docids)
@@ -3914,7 +3973,6 @@ class Dataflow_dumplicate(Dataflow):
             list_projects = self.merge_projects(list_projects,b_log)
             # log("merge projects takes:%.3f"%(time.time()-_time))
 
-
             _time = time.time()
             list_merge_dump = dumplicate_document_in_merge(list_projects,dup_docid[:-1])
             # log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
@@ -3923,6 +3981,27 @@ class Dataflow_dumplicate(Dataflow):
                 list_projects = []
 
             _time = time.time()
+
+            projects = list_projects
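+            # drop docids whose update_document flag is "true" from dup_docid so updated documents are not treated as duplicates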
+            for _proj in projects:
+                dup_docid = _proj.get(project_dup_docid,"")
+                list_dup_docid = dup_docid.split(",")
+                new_dup_docid = []
+                for _docid in list_dup_docid:
+                    if _docid=="":
+                        continue
+                    docid = int(_docid)
+                    _d = {"partitionkey":docid%500+1,
+                          "docid":docid,
+                          }
+                    _doc = Document(_d)
+
+                    if _doc.fix_columns(self.ots_client,[document_update_document],True):
+                        if _doc.getProperties().get(document_update_document,"")!="true":
+                            new_dup_docid.append(str(docid))
+                _proj[project_dup_docid] = ",".join(new_dup_docid)
+            list_projects = projects
+
             project_json = to_project_json(list_projects)
             # log("json projects takes:%.3f"%(time.time()-_time))
             if b_log:
@@ -3957,6 +4036,11 @@ class Dataflow_dumplicate(Dataflow):
         has_before = False
         has_after = False
 
+        bidclose_time = page_time
+        web_source_name = item.get(document_tmp_web_source_name,"")
+
+
+
         if len(page_time)>0:
             l_page_time = timeAdd(page_time,days=-90)
             dict_time = item.get("dict_time",{})
@@ -3966,6 +4050,14 @@ class Dataflow_dumplicate(Dataflow):
                         has_before = True
                     if v>page_time:
                         has_after = True
+                    if k==document_tmp_time_bidclose:
+                        bidclose_time = v
+
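+        # for these two sources, a bid-close time earlier than the publish time fails the page_time check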
+        set_web_source = {"中国招标投标公共服务平台","比地招标"}
+
+        if web_source_name in set_web_source and bidclose_time<page_time:
+            return False
+
         log("check page_time has_before %s has_after %s"%(str(has_before),str(has_after)))
         if has_before:
             _query = BoolQuery(must_queries=[MatchPhraseQuery(document_doctitle,item.get(document_doctitle,""))],
@@ -4024,7 +4116,7 @@ class Dataflow_dumplicate(Dataflow):
                 singleNum_keys = _rule["singleNum_keys"]
                 contain_keys = _rule["contain_keys"]
                 multiNum_keys = _rule["multiNum_keys"]
-                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path],b_log=b_log)
+                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path,document_tmp_source_stage,document_tmp_source_type,document_update_document],b_log=b_log)
                 _i += step
 
 
@@ -4049,7 +4141,8 @@ class Dataflow_dumplicate(Dataflow):
 
             dup_docid = set()
             for _dict in final_list:
-                dup_docid.add(_dict.get(document_tmp_docid))
+                if _dict.get("update_document","")!="true":
+                    dup_docid.add(_dict.get(document_tmp_docid))
             if item.get(document_tmp_docid) in dup_docid:
                 dup_docid.remove(item.get(document_tmp_docid))
 
@@ -4057,7 +4150,7 @@ class Dataflow_dumplicate(Dataflow):
             remove_list = []
 
 
-            if self.check_page_time(item) and (len(final_list)==0 or best_docid==item.get(document_tmp_docid)):
+            if (self.check_page_time(item) and (len(final_list)==0 or best_docid==item.get(document_tmp_docid))) or item.get("update_document","")=="true":
                 dtmp.setValue(document_tmp_save,1,True)
                 # dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
                 dmp_docid = ",".join([str(a) for a in list(dup_docid)])
@@ -4071,6 +4164,7 @@ class Dataflow_dumplicate(Dataflow):
                     for _dict in final_list:
                         if _dict.get(document_tmp_docid) in dup_docid:
                             remove_list.append(_dict)
+
                     dmp_docid = ",".join([str(a) for a in list(dup_docid)])
                     dmp_docid = "%d,%s"%(best_docid,dmp_docid)
                 else:
@@ -4082,16 +4176,19 @@ class Dataflow_dumplicate(Dataflow):
             list_docids = list(dup_docid)
             list_docids.append(best_docid)
 
-            if item.get(document_update_document)=="true":
-                dtmp.setValue(document_tmp_save,1,True)
+            # if item.get(document_update_document)=="true":
+            #     dtmp.setValue(document_tmp_save,1,True)
 
             list_merge_dump = []
             if (exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0) or item.get(document_docchannel,0) in (301,302):
-                log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
+                if exist_finterprint:
+                    log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
                 dtmp.setValue(document_tmp_projects,"[]",True)
             else:
                 project_json,list_merge_dump = self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log)
-                if list_merge_dump is not None and str(item.get(document_tmp_docid)) in list_merge_dump:
+
+
+                if list_merge_dump is not None and str(item.get(document_tmp_docid)) in list_merge_dump and item.get("update_document","")!="true":
                     dtmp.setValue(document_tmp_save,0,True)
                 dtmp.setValue(document_tmp_projects,project_json,True)
             log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
@@ -4145,19 +4242,23 @@ class Dataflow_dumplicate(Dataflow):
 
 
 
+        current_date = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
+        before_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-20)
+        after_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-5)
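+        # only touch documents whose opertime is between 20 and 5 minutes old, presumably to skip documents still being processed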
         if self.fix_doc_docid is None:
-            current_date = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
-            before_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-5)
             bool_query = BoolQuery(must_queries=[
                 TermQuery(document_tmp_save,1),
                 RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
-                RangeQuery(document_tmp_opertime,before_date)
+                RangeQuery(document_tmp_docchannel,0,300),
+                RangeQuery(document_tmp_opertime,before_date,after_date)
             ])
         else:
             bool_query = BoolQuery(must_queries=[
                 TermQuery(document_tmp_save,1),
                 RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
-                RangeQuery(document_tmp_docid,self.fix_doc_docid)
+                RangeQuery(document_tmp_docchannel,0,300),
+                RangeQuery(document_tmp_docid,self.fix_doc_docid),
+                RangeQuery(document_tmp_opertime,before_date,after_date)
             ])
 
         list_data = []
@@ -4192,7 +4293,7 @@ class Dataflow_dumplicate(Dataflow):
         schedule.add_job(self.bdm.monitor_dumplicate,"cron",minute="*/10")
         schedule.add_job(self.flow_remove,"cron",hour="20")
         schedule.add_job(self.flow_remove_project_tmp,"cron",hour="20")
-        # schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
+        schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="*/10")
         schedule.start()
 
     def changeSaveStatus(self,list_dict):
@@ -4213,16 +4314,17 @@ class Dataflow_dumplicate(Dataflow):
                           document_tmp_save:0
                           }
                     _d_tmp = Document_tmp(_d)
-                    if _d_tmp.fix_columns(self.ots_client,["status"],True):
+                    if _d_tmp.fix_columns(self.ots_client,["status",document_update_document],True):
                         if _d_tmp.getProperties().get("status")==1:
-                            _d_tmp.setValue("status",0,True)
-                            _d_tmp.update_row(self.ots_client)
+                            if _d_tmp.getProperties().get(document_update_document,"")!="true":
+                                _d_tmp.setValue("status",0,True)
+                                _d_tmp.update_row(self.ots_client)
 
 
 
     def test_dumplicate(self,docid):
         # columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status]
-        columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_name]
+        columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]
         bool_query = BoolQuery(must_queries=[
             TermQuery("docid",docid)
         ])
@@ -4413,7 +4515,7 @@ if __name__ == '__main__':
     # test_attachment_interface()
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
-    df_dump.test_dumplicate(455485514
+    df_dump.test_dumplicate(576859812
                             )
     # compare_dumplicate_check()
     # df_dump.test_merge([391898061

+ 218 - 92
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -9,7 +9,7 @@ from BaseDataMaintenance.model.postgres.attachment import Attachment_postgres
 import os
 from BaseDataMaintenance.common.ossUtils import *
 from BaseDataMaintenance.dataSource.pool import ConnectorPool
-from BaseDataMaintenance.model.ots.document import Document
+from BaseDataMaintenance.model.ots.document import Document,document_attachment_path_filemd5
 
 from BaseDataMaintenance.common.Utils import article_limit
 from BaseDataMaintenance.common.documentFingerprint import getFingerprint
@@ -108,7 +108,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
 
     def start_attachment_listener(self):
         for _i in range(self.comsumer_count):
-            listener_attachment = self.AttachmentMQListener(getConnect_activateMQ(),self.attachment_listener_handler,_i)
+            listener_attachment = self.AttachmentMQListener(getConnect_activateMQ(),self.attachment_listener_handler ,_i)
             createComsumer(listener_attachment,self.mq_attachment)
             self.list_attachment_comsumer.append(listener_attachment)
 
@@ -254,26 +254,43 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
         '''
 
         try:
+            start_time = time.time()
+
             item = _dict.get("item")
             list_attach = _dict.get("list_attach")
             conn = _dict["conn"]
             message_id = _dict.get("message_id")
 
+            if "retry_times" not in item:
+                item["retry_times"] = 5
             _retry_times = item.get("retry_times",0)
+
+
             dhtml = Document_html({"partitionkey":item.get("partitionkey"),
                                    "docid":item.get("docid")})
 
             _dochtmlcon = item.get(document_tmp_dochtmlcon,"")
             dhtml.setValue(document_tmp_dochtmlcon,_dochtmlcon,True)
             dhtml.delete_bidi_a()
-            dtmp = Document_tmp(item)
-
 
-            start_time = time.time()
             #调用识别接口
             _succeed,list_html,swf_urls = self.rec_attachments_by_interface(list_attach,_dochtmlcon,save=True)
 
+            # write the attachment classification results back into the document's page_attachments
+            page_attachments = json.loads(item.get(document_tmp_attachment_path,"[]"))
+            if len(page_attachments)>0:
+                for _attachment in page_attachments:
+                    filemd5 = _attachment.get(document_attachment_path_filemd5,"")
+                    classification = None
+                    for _attach in list_attach:
+                        if _attach.getProperties().get(attachment_filemd5,"")==filemd5:
+                            classification = _attach.getProperties().get(attachment_classification,"")
+                            break
+                    if classification is not None:
+                        _attachment[attachment_classification] = classification
+                item[document_tmp_attachment_path] = json.dumps(page_attachments,ensure_ascii=False)
 
+            dtmp = Document_tmp(item)
 
             _to_ack = False
             if not _succeed and _retry_times<self.retry_times:
@@ -301,6 +318,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                     dhtml.updateSWFImages(swf_urls)
                     dhtml.updateAttachment(list_html)
 
+
                     dtmp.setValue(document_tmp_attachment_extract_status,1,True)
                     dtmp.setValue(document_tmp_dochtmlcon,dhtml.getProperties().get(document_tmp_dochtmlcon),True)
                     send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(dtmp.getProperties(),cls=MyEncoder),self.mq_extract)
@@ -435,7 +453,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                     if len(_html)>1:
                         _html = "interface return error"
                     else:
-                        sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
+                        # sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
                         _html = ""
 
                         return False
@@ -630,7 +648,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                     log("getAttachments search in ots:%s"%(_filemd5))
                     _attach = {attachment_filemd5:_filemd5}
                     _attach_ots = attachment(_attach)
-                    if _attach_ots.fix_columns(self.ots_client,[attachment_status,attachment_path,attachment_attachmenthtml,attachment_attachmentcon,attachment_filetype,attachment_swfUrls,attachment_process_time],True):
+                    if _attach_ots.fix_columns(self.ots_client,[attachment_status,attachment_path,attachment_attachmenthtml,attachment_attachmentcon,attachment_filetype,attachment_swfUrls,attachment_process_time,attachment_classification],True):
                         if _attach_ots.getProperties().get(attachment_status) is not None:
                             log("getAttachments find in ots:%s"%(_filemd5))
                             _attach_pg = Attachment_postgres(_attach_ots.getProperties())
@@ -828,14 +846,17 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
             self.list_extract_comsumer.append(listener_extract)
 
         while 1:
-            for _i in range(len(self.list_extract_comsumer)):
-                if self.list_extract_comsumer[_i].conn.is_connected():
-                    continue
-                else:
-                    listener = self.ExtractListener(getConnect_activateMQ(),self.comsumer_handle,_i)
-                    createComsumer(listener,self.mq_extract)
-                    self.list_extract_comsumer[_i] = listener
-            time.sleep(5)
+            try:
+                for _i in range(len(self.list_extract_comsumer)):
+                    if self.list_extract_comsumer[_i].conn.is_connected():
+                        continue
+                    else:
+                        listener = self.ExtractListener(getConnect_activateMQ(),self.comsumer_handle,_i)
+                        createComsumer(listener,self.mq_extract)
+                        self.list_extract_comsumer[_i] = listener
+                time.sleep(5)
+            except Exception as e:
+                traceback.print_exc()
 
     def monitor_listener(self):
         for i in range(len(self.list_extract_comsumer)):
@@ -978,6 +999,8 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
     def comsumer_handle(self,_dict,result_queue):
         try:
             log("start handle")
+            data = {}
+
             frame = _dict["frame"]
             conn = _dict["conn"]
             message_id = frame.headers["message-id"]
@@ -999,7 +1022,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                 log("docid %s dochtmlcon too long len %d "%(str(item.get("docid")),html_len))
                 try:
                     _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
-                    _soup = BeautifulSoup(_dochtmlcon,"html5lib")
+                    _soup = BeautifulSoup(_dochtmlcon,"lxml")
                     all_len = len(_soup.get_text()) # text length of the whole announcement
                     _attachment = _soup.find("div", attrs={"class": "richTextFetch"})
                     attachment_len = len(_attachment.get_text()) if _attachment else 0 # text length of the attachment part
@@ -1026,7 +1049,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
             _extract.setValue(document_extract2_docid,item.get(document_docid))
             all_done = 1
 
-            data = {}
+
             for k,v in item.items():
                 data[k] = v
             data["timeout"] = 440
@@ -1042,8 +1065,9 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
             data["web_source_no"] = item.get(document_tmp_web_source_no,"")
             data["web_source_name"] = item.get(document_tmp_web_source_name,"")
             data["original_docchannel"] = item.get(document_tmp_original_docchannel,"")
+            data["page_attachments"] = item.get(document_tmp_attachment_path,"[]")
 
-            _fingerprint = getFingerprint(str(data["title"])+str(data["content"]))
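+            # append original_docchannel so identical text published through different channels gets distinct fingerprints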
+            _fingerprint = getFingerprint(str(data["title"])+str(data["content"]))+str(data["original_docchannel"])
 
             if all_done>0:
                 _time = time.time()
@@ -1078,9 +1102,11 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
             # if all_done>0 and len(_extract.getProperties().get(document_extract2_extract_json,""))<=2:
             #     all_done = -4
             _extract.setValue(document_extract2_industry_json,"{}",True)
+            _to_ack = True
             try:
                 if all_done!=1:
-                    sentMsgToDD("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
+                    # sentMsgToDD("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
+                    log("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
                     if extract_times>=10:
                         #process as succeed
                         dtmp.setValue(document_tmp_dochtmlcon,"",False)
@@ -1138,15 +1164,20 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
 
             if _to_ack:
                 ackMsg(conn,message_id,subscription)
-            log("process %s docid:%d %s"%(str(_to_ack),data["doc_id"],str(all_done)))
+            else:
+                item["extract_times"] -= 1
+                send_msg_toacmq(self.pool_mq,json.dumps(item,ensure_ascii=False),self.mq_extract)
+                ackMsg(conn,message_id,subscription)
+            log("process %s docid:%d %s"%(str(_to_ack),data.get("doc_id"),str(all_done)))
         except requests.ConnectionError as e1:
             item["extract_times"] -= 1
             if send_msg_toacmq(self.pool_mq,json.dumps(item,ensure_ascii=False),self.mq_extract):
                 ackMsg(conn,message_id,subscription)
         except Exception as e:
             traceback.print_exc()
-            sentMsgToDD("要素提取失败:docid:%d with result:%s"%(item.get(document_tmp_docid),str(e)))
-            log("process %s docid: failed message_id:%s"%(data["doc_id"],message_id))
+            # sentMsgToDD("要素提取失败:docid:%d with result:%s"%(item.get(document_tmp_docid),str(e)))
+            log("要素提取失败:docid:%d with result:%s"%(item.get(document_tmp_docid),str(e)))
+            log("process %s docid: failed message_id:%s"%(data.get("doc_id"),message_id))
             if extract_times>=10:
                 #process as succeed
                 dtmp.setValue(document_tmp_dochtmlcon,"",False)
@@ -1360,7 +1391,7 @@ class Dataflow_init(Dataflow):
         conn_oracle = self.pool_oracle.getConnector()
 
         try:
-            list_obj = object.select_rows(conn_oracle,type(object),object.table_name,[],limit=1000)
+            list_obj = object.select_rows(conn_oracle,type(object),object.table_name,[])
             for _obj in list_obj:
                 ots_dict = _obj.getProperties_ots()
 
@@ -1379,7 +1410,7 @@ class Dataflow_init(Dataflow):
             traceback.print_exc()
             self.pool_oracle.decrease()
 
-    def shengpi2mq(self):
+    def shenpi2mq(self):
 
         conn_oracle = self.pool_oracle.getConnector()
 
@@ -1395,36 +1426,131 @@ class Dataflow_init(Dataflow):
                     if max_shenpi_id>self.base_shenpi_id:
                         max_shenpi_id -= self.base_shenpi_id
                     self.max_shenpi_id = max_shenpi_id
-            if self.max_shenpi_id is not None:
-                # select data in order
-                list_data = T_SHEN_PI_XIANG_MU.select_rows(conn_oracle,self.max_shenpi_id,)
 
-                # send data to mq one by one with max_shenpi_id updated
-                for _data in list_data:
-                    _id = _data.getProperties().get(T_SHEN_PI_XIANG_MU_ID)
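+                # 60383953 appears to be a historical watermark; ids at or below it are assumed to be already imported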
+                if self.max_shenpi_id<60383953:
+                    self.max_shenpi_id = 60383953
 
-                    ots_dict = _data.getProperties_ots()
-                    if ots_dict["docid"]<self.base_shenpi_id:
-                        ots_dict["docid"] += self.base_shenpi_id
 
-                    if ots_dict.get(T_SHEN_PI_XIANG_MU_PAGE_ATTACHMENTS,"") !='[]':
-                        if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_attachment):
-                            self.max_shenpi_id = _id
-                        else:
-                            log("sent shenpi message to mq failed %s"%(_id))
-                            break
-                    else:
-                        if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_extract):
-                            self.max_shenpi_id = _id
-                        else:
-                            log("sent shenpi message to mq failed %s"%(_id))
-                            break
+            if self.max_shenpi_id is not None:
+                # select data in order
+
+                origin_max_shenpi_id = T_SHEN_PI_XIANG_MU.get_max_id(conn_oracle)
+
+                if origin_max_shenpi_id is not None:
+                    log("shenpi origin_max_shenpi_id:%d current_id:%d"%(origin_max_shenpi_id,self.max_shenpi_id))
+                    for _id_i in range(self.max_shenpi_id+1,origin_max_shenpi_id+1):
+                        list_data = T_SHEN_PI_XIANG_MU.select_rows(conn_oracle,_id_i)
+
+                        # send data to mq one by one with max_shenpi_id updated
+                        for _data in list_data:
+
+                            _id = _data.getProperties().get(T_SHEN_PI_XIANG_MU_ID)
+
+                            ots_dict = _data.getProperties_ots()
+                            if ots_dict["docid"]<self.base_shenpi_id:
+                                ots_dict["docid"] += self.base_shenpi_id
+                                ots_dict["partitionkey"] = ots_dict["docid"]%500+1
+
+                            if ots_dict.get(T_SHEN_PI_XIANG_MU_PAGE_ATTACHMENTS,"") !='[]':
+                                if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_attachment):
+                                    self.max_shenpi_id = _id
+                                else:
+                                    log("sent shenpi message to mq failed %s"%(_id))
+                                    break
+                            else:
+                                if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_extract):
+                                    self.max_shenpi_id = _id
+                                else:
+                                    log("sent shenpi message to mq failed %s"%(_id))
+                                    break
+            self.pool_oracle.putConnector(conn_oracle)
 
         except Exception as e:
+            log("shenpi error")
             traceback.print_exc()
             self.pool_oracle.decrease()
 
+    def fix_shenpi(self):
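+        # backfill approval (审批) documents: scan oracle ids in [begin_id, end_id) across threads and write any rows missing from the document table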
+
+        pool_oracle = ConnectorPool(10,15,getConnection_oracle)
+        begin_id = 0
+        end_id = 64790010
+        thread_num = 15
+        step = (end_id-begin_id)//thread_num
+        list_items = []
+        for _i in range(thread_num):
+            _begin = _i*step
+            _end = (_i+1)*step-1
+            if _i==thread_num-1:
+                _end = end_id
+            list_items.append((_begin,_end,_i))
+        task_queue = Queue()
+        for item in list_items:
+            task_queue.put(item)
+
+        fix_count_list = []
+
+        def _handle(item,result_queue):
+            conn_oracle = pool_oracle.getConnector()
+            (begin_id,end_id,thread_id) = item
+
+            _count = 0
+            for _id_i in range(begin_id,end_id):
+                try:
+                    bool_query = BoolQuery(must_queries=[
+                        TermQuery("docchannel",302),
+                        TermQuery("original_id",_id_i)
+                    ])
+                    rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
+                                                                                        SearchQuery(bool_query,get_total_count=True))
+                    if total_count>0:
+                        continue
+
+                    # bool_query = BoolQuery(must_queries=[
+                    #     TermQuery("id",_id_i),
+                    # ])
+                    # rows,next_token,total_count,is_all_succeed = self.ots_client.search("t_shen_pi_xiang_mu","t_shen_pi_xiang_mu_index",
+                    #                                                                     SearchQuery(bool_query,get_total_count=True))
+                    # if total_count>0:
+                    #     continue
+
+                    try:
+                        list_data = T_SHEN_PI_XIANG_MU.select_rows(conn_oracle,_id_i)
+                    except Exception as e:
+                        continue
+
+                    # send data to mq one by one with max_shenpi_id updated
+                    for _data in list_data:
+
+                        _id = _data.getProperties().get(T_SHEN_PI_XIANG_MU_ID)
+
+                        ots_dict = _data.getProperties_ots()
+                        if ots_dict["docid"]<self.base_shenpi_id:
+                            ots_dict["docid"] += self.base_shenpi_id
+                            ots_dict["partitionkey"] = ots_dict["docid"]%500+1
+                        ots_dict["status"] = 201
+                        dict_1 = {}
+                        dict_2 = {}
+                        for k,v in ots_dict.items():
+                            if k!="dochtmlcon":
+                                dict_1[k] = v
+                            if k in ('partitionkey',"docid","dochtmlcon"):
+                                dict_2[k] = v
+                        d_1 = Document(dict_1)
+                        d_2 = Document(dict_2)
+                        d_1.update_row(self.ots_client)
+                        d_2.update_row(self.ots_capacity)
+                        _count += 1
+                except Exception as e:
+                    traceback.print_exc()
+
+                log("thread_id:%d=%d/%d/%d"%(thread_id,_id_i-begin_id,_count,end_id-begin_id))
+            fix_count_list.append(_count)
+            pool_oracle.putConnector(conn_oracle)
 
+        mt = MultiThreadHandler(task_queue,_handle,None,thread_count=thread_num)
+        mt.run()
+        print(fix_count_list,sum(fix_count_list))
 
     def ots2mq(self):
         try:
@@ -1432,13 +1558,34 @@ class Dataflow_init(Dataflow):
 
             rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
                                                                                 SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(document_docid)]),get_total_count=True,limit=100),
-                                                                                ColumnsToGet(return_type=ColumnReturnType.ALL))
+                                                                                ColumnsToGet(return_type=ColumnReturnType.NONE))
             list_data = getRow_ots(rows)
+            task_queue = Queue()
             for _data in list_data:
+                task_queue.put(_data)
+
+
+            while next_token:
+                rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
+                                                                                    SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
+                                                                                    ColumnsToGet(return_type=ColumnReturnType.NONE))
+                list_data = getRow_ots(rows)
+
+                for _data in list_data:
+                    task_queue.put(_data)
+
+                if task_queue.qsize()>=1000:
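+                    # cap each pass at roughly 1000 documents; the remainder is picked up on the next scheduled run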
+                    break
+
+            def _handle(_data,result_queue):
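+                # fetch the full row, then route it to the attachment queue or the extract queue depending on page_attachments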
+
                 _d = {document_tmp_partitionkey:_data.get(document_tmp_partitionkey),
                       document_tmp_docid:_data.get(document_tmp_docid),
                       document_tmp_status:0}
                 _document = Document(_d)
+                _document.fix_columns(self.ots_client,None,True)
+                _data = _document.getProperties()
+
                 page_attachments = _data.get(document_tmp_attachment_path,"[]")
 
                 _document_html = Document(_data)
@@ -1453,36 +1600,16 @@ class Dataflow_init(Dataflow):
                     _data[document_tmp_status] = status
                     send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(_document_html.getProperties(),cls=MyEncoder),self.mq_extract)
                 if send_succeed:
+                    _document.setValue(document_tmp_status,0,True)
                     _document.update_row(self.ots_client)
                 else:
                     log("send_msg_error2222")
-            while next_token:
-                rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
-                                                                                    SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
-                                                                                    ColumnsToGet(return_type=ColumnReturnType.ALL))
-                list_data = getRow_ots(rows)
-                for _data in list_data:
-                    _d = {document_tmp_partitionkey:_data.get(document_tmp_partitionkey),
-                          document_tmp_docid:_data.get(document_tmp_docid),
-                          document_tmp_status:0}
-                    _document = Document(_d)
-                    page_attachments = _data.get(document_tmp_attachment_path,"[]")
 
-                    _document_html = Document(_data)
-                    _document_html.fix_columns(self.ots_capacity,[document_tmp_dochtmlcon],True)
+            if task_queue.qsize()>0:
+                mt = MultiThreadHandler(task_queue,_handle,None,15)
+                mt.run()
+
 
-                    if page_attachments!="[]":
-                        status = random.randint(1,10)
-                        _data[document_tmp_status] = status
-                        send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(_document_html.getProperties(),cls=MyEncoder),self.mq_attachment)
-                    else:
-                        status = random.randint(11,50)
-                        _data[document_tmp_status] = status
-                        send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(_document_html.getProperties(),cls=MyEncoder),self.mq_extract)
-                    if send_succeed:
-                        _document.update_row(self.ots_client)
-                    else:
-                        log("send_msg_error2222")
         except Exception as e:
             traceback.print_exc()
 
@@ -1501,6 +1628,8 @@ class Dataflow_init(Dataflow):
                 _document = Document_tmp(_d)
                 page_attachments = _data.get(document_tmp_attachment_path,"[]")
 
+                log("refix doc %s from document_tmp"%(str(_data.get(document_tmp_docid))))
+
                 _document_html = Document_html(_data)
                 _document_html.fix_columns(self.ots_client,[document_tmp_dochtmlcon],True)
 
@@ -1593,7 +1722,14 @@ class Dataflow_init(Dataflow):
         from BaseDataMaintenance.model.oracle.TuDiKuangChanTemp import TuDiKuangChanTemp
         from BaseDataMaintenance.model.oracle.ZhaoBiaoDaYiTemp import ZhaoBiaoDaYiTemp
         from BaseDataMaintenance.model.oracle.ZhaoBiaoWenJianTemp import ZhaoBiaoWenJianTemp
+
+        from BaseDataMaintenance.model.oracle.TouSuChuLiTemp import TouSuChuLiTemp
+        from BaseDataMaintenance.model.oracle.WeiFaJiLuTemp import WeiFaJiLuTemp
+        from BaseDataMaintenance.model.oracle.QiTaShiXinTemp import QiTaShiXin
+
+
         schedule = BlockingScheduler()
+
         schedule.add_job(self.temp2mq,"cron",args=(CaiGouYiXiangTemp({}),),second="*/10")
         schedule.add_job(self.temp2mq,"cron",args=(PaiMaiChuRangTemp({}),),second="*/10")
         schedule.add_job(self.temp2mq,"cron",args=(ZhaoBiaoGongGaoTemp({}),),second="*/10")
@@ -1606,14 +1742,24 @@ class Dataflow_init(Dataflow):
         schedule.add_job(self.temp2mq,"cron",args=(TuDiKuangChanTemp({}),),second="*/10")
         schedule.add_job(self.temp2mq,"cron",args=(ZhaoBiaoDaYiTemp({}),),second="*/10")
         schedule.add_job(self.temp2mq,"cron",args=(ZhaoBiaoWenJianTemp({}),),second="*/10")
+
+        schedule.add_job(self.temp2mq,"cron",args=(TouSuChuLiTemp({}),),second="*/10")
+        schedule.add_job(self.temp2mq,"cron",args=(WeiFaJiLuTemp({}),),second="*/10")
+        schedule.add_job(self.temp2mq,"cron",args=(QiTaShiXin({}),),second="*/10")
+
         schedule.add_job(self.ots2mq,"cron",second="*/10")
         schedule.add_job(self.otstmp2mq,"cron",second="*/10")
         schedule.add_job(self.monitor_listener,"cron",minute="*/1")
+
+        schedule.add_job(self.shenpi2mq,"cron",minute="*/1")
         schedule.start()
 
 
 
 
+
+
+
 def transform_attachment():
     from BaseDataMaintenance.model.ots.attachment import attachment
     from BaseDataMaintenance.model.postgres.attachment import Attachment_postgres
@@ -1802,27 +1948,7 @@ def check_data_synchronization():
 
 current_path = os.path.abspath(os.path.dirname(__file__))
 
-def fixDoc_to_queue_init(filename=""):
-    import pandas as pd
-    from BaseDataMaintenance.model.oracle.GongGaoTemp import dict_oracle2ots
-    if filename=="":
-        filename = os.path.join(current_path,"check.xlsx")
-    df = pd.read_excel(filename)
-    if "docchannel" in dict_oracle2ots:
-        dict_oracle2ots.pop("docchannel")
-    row_name = ",".join(list(dict_oracle2ots.keys()))
-    conn = getConnection_oracle()
-    cursor = conn.cursor()
-    _count = 0
-    for uuid,tablename,_exists,_toolong in zip(df["uuid"],df["tablename"],df["exists"],df["tolong"]):
-        if _exists==0 and _toolong==0:
-            _count += 1
-            _source = str(tablename).replace("_TEMP","")
-            sql = " insert into %s(%s) select %s from %s where id='%s' "%(tablename,row_name,row_name,_source,uuid)
-            cursor.execute(sql)
-            log("%d:%s"%(_count,sql))
-    conn.commit()
-    conn.close()
+
 
 if __name__ == '__main__':
     # di = Dataflow_init()

+ 646 - 0
BaseDataMaintenance/maintenance/document/ApprovalData.py

@@ -0,0 +1,646 @@
+
+from BaseDataMaintenance.common.Utils import *
+from BaseDataMaintenance.dataSource.source import getConnect_ots,getConnect_ots_capacity
+from tablestore import *
+import pandas as pd
+from queue import Queue
+from BaseDataMaintenance.common.multiThread import MultiThreadHandler
+from BaseDataMaintenance.model.ots.document import Document
+
+import json
+from uuid import uuid4
+from bs4 import BeautifulSoup
+
+'''
+"approval": [
+        {
+            "approval_items": "", # approval items
+            "approval_result": "", # approval result
+            "approver": "", # approving department
+            "city": "深圳",
+            "construct_company": "深圳市赛孚电子科技有限公司", # construction unit
+            "construction_scale": "", # construction scale
+            "declare_company": "", # declaring unit
+            "district": "光明",
+            "doc_num": "", # approval document number
+            "evaluation_agency": "", # environmental impact assessment agency
+            "legal_person": "陈雷", # project legal person
+            "moneysource": "", # source of funds
+            "phone": "",
+            "pro_type": "", # declaration type
+            "project_addr": "广东省深圳市光明区玉塘街道田寮社区第七工业区26栋301",
+            "project_code": "",
+            "project_name": "深圳市赛孚电子科技有限公司销售医用射线装置项目",
+            "properties": "新建", # construction nature (e.g. new construction)
+            "province": "广东",
+            "time_commencement": "", # commencement time
+            "time_completion": "", # completion time
+            "time_declare": "", # declaration time
+            "total_tendereeMoney": "200000", # total investment
+            "year_limit": "", # construction period (years)
+            "compilation_unit": "编制单位",  # compiling unit
+            "publisher": "发布单位",  # publishing unit
+            "time_approval": "审批时间",  # approval time
+            "time_release": "发布日期"  # release date
+        }
+    ]
+'''
+
+
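+# maps approval field names to their Chinese display labels, apparently used when exporting approval records to tables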
+key_trans = {
+    "doctitle":"公告标题",
+    "page_time":"公告时间",
+    "province": "省份",
+    "city": "城市",
+    "district": "地区",
+
+    "approval_items": "审批事项",
+    "approval_result": "审批结果",
+    "declare_company": "申报单位",
+    "construct_company": "建设单位",
+    "evaluation_agency": "环评机构",
+    "approver": "审批部门",
+    "compilation_unit": "编制单位",
+    "publisher": "发布单位",
+
+    "total_tendereeMoney": "总投资",
+    "construction_scale": "建设规模",
+    "proportion":"建筑面积",
+    "usearea":"用地面积",
+
+    "doc_num": "审批文号",
+
+    "legal_person": "项目法人",
+    "moneysource": "资金来源",
+    "moneyuse":"资金构成",
+    "env_invest":"环保投资",
+    "phone": "电话",
+    "pro_type": "申报类型",
+    "project_addr": "项目地址",
+    "project_code": "项目编号",
+    "project_name": "项目名称",
+    "properties": "建设性质",
+    "time_commencement": "开工时间",
+    "time_completion": "竣工时间",
+    "time_declare": "申报时间",
+
+    "year_limit": "建设年限",
+
+    "time_approval":"审批时间",
+    "time_release": "发布日期"
+}
+
+key_trans_d = {"docid":"公告id"}
+key_trans_d.update(key_trans)
+
+
+
+
+def extract_proportion(content, has_preffix=True):
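+    # extract a construction-area / length expression (e.g. "建筑面积约1200平方米") and return (raw match, normalized value with unit)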
+    if not content:
+        return "", ""
+    # log("content")
+    # log(content)
+    suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
+    reg_dict = {
+        0: "(?P<proportion>(总((建筑|建设)(面积|规模)|长|长度))" + suffix,
+        1: "(?P<proportion>((建筑|建设)(面积|规模)|全长)" + suffix,
+        2: "(?P<proportion>((建筑|建设|区域)?面积|全长|项目规模)" + suffix
+    }
+
+    if not has_preffix:
+        reg_dict[3] = "(?P<proportion>" + suffix
+
+    _proportion = ""
+    for i in range(len(list(reg_dict.keys()))):
+        if _proportion:
+            break
+        _pattern = reg_dict.get(i)
+        # logging.info('content ' + str(content))
+        match = re.search(_pattern, str(content))
+        if match:
+            _proportion = match.groupdict().get("proportion", "")
+
+    if not _proportion:
+        return "", ""
+
+    # normalize the matched expression into a numeric value with a unit
+    multiple_cnt = 1
+    digit = ""
+
+    # extract the numeric part
+    match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
+    if match:
+        # logging.info(str(_proportion) + '  ' + str(match.group()))
+        d1 = match.group('d1')
+        d2 = match.group('d2')
+        try:
+            d1 = int(re.sub(',', '', d1))
+        except:
+            return "", ""
+        if d2:
+            d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
+            # print('d1, d2', d1, d2)
+            d1 += d2
+        digit = d1
+    # print('digit', digit)
+
+    # resolve the Chinese multiplier (十/百/千/万/亿)
+    _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
+    match = re.search('[十百千万亿]+', _proportion2)
+    _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
+    if match:
+        for c in match.group():
+            multiple_cnt *= _dict.get(c)
+        _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
+    else:
+        _proportion3 = _proportion2
+    # print('multiple_cnt2', multiple_cnt)
+
+    # decide whether the unit is an area (㎡) or a length (m)
+    match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
+    if match:
+        unit = '㎡'
+    else:
+        unit = 'm'
+
+    # resolve the unit multiplier (千/公里/亩/顷 etc.)
+    match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
+    if match:
+        if unit == 'm':
+            if re.search('[kK千公]', match.group()):
+                multiple_cnt *= 1000
+            elif re.search('[里]', match.group()):
+                multiple_cnt *= Decimal(str(500))
+        else:
+            if '亩' in match.group():
+                multiple_cnt *= Decimal(str(666.67))
+            elif '顷' in match.group():
+                multiple_cnt *= 10000
+            elif re.search('千米|公里|k[mM㎡]', match.group()):
+                multiple_cnt *= 1000000
+    # print('multiple_cnt1', multiple_cnt)
+
+    # 拼接
+    digit = str(digit * multiple_cnt) + unit
+
+    return _proportion, digit
+
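+# extract_usearea: same normalization steps as extract_proportion above, but keyed on
+# land-use phrases (总用地面积/占地面积/使用面积 ...); returns (matched phrase, normalized value).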
+def extract_usearea(content, has_preffix=True):
+    if not content:
+        return "", ""
+    # log("content")
+    # log(content)
+    suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
+    reg_dict = {
+        0: "(?P<proportion>(总((用地|占地|使用)(面积|规模)|长|长度))" + suffix,
+        1: "(?P<proportion>((用地|占地|使用)(面积|规模)|全长)" + suffix,
+        2: "(?P<proportion>((用地|占地|使用)?面积)" + suffix
+    }
+
+    if not has_preffix:
+        reg_dict[3] = "(?P<proportion>" + suffix
+
+    _proportion = ""
+    for i in range(len(list(reg_dict.keys()))):
+        if _proportion:
+            break
+        _pattern = reg_dict.get(i)
+        # logging.info('content ' + str(content))
+        match = re.search(_pattern, str(content))
+        if match:
+            _proportion = match.groupdict().get("proportion", "")
+
+    if not _proportion:
+        return "", ""
+
+    # 统一格式
+    multiple_cnt = 1
+    digit = ""
+
+    # 确定具体数字
+    match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
+    if match:
+        # logging.info(str(_proportion) + '  ' + str(match.group()))
+        d1 = match.group('d1')
+        d2 = match.group('d2')
+        try:
+            d1 = int(re.sub(',', '', d1))
+        except:
+            return "", ""
+        if d2:
+            d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
+            # print('d1, d2', d1, d2)
+            d1 += d2
+        digit = d1
+    # print('digit', digit)
+
+    # 确定中文倍数
+    _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
+    match = re.search('[十百千万亿]+', _proportion2)
+    _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
+    if match:
+        for c in match.group():
+            multiple_cnt *= _dict.get(c)
+        _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
+    else:
+        _proportion3 = _proportion2
+    # print('multiple_cnt2', multiple_cnt)
+
+    # 确定面积/长度
+    match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
+    if match:
+        unit = '㎡'
+    else:
+        unit = 'm'
+
+    # 确定单位倍数
+    match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
+    if match:
+        if unit == 'm':
+            if re.search('[kK千公]', match.group()):
+                multiple_cnt *= 1000
+            elif re.search('[里]', match.group()):
+                multiple_cnt *= Decimal(str(500))
+        else:
+            if '亩' in match.group():
+                multiple_cnt *= Decimal(str(666.67))
+            elif '顷' in match.group():
+                multiple_cnt *= 10000
+            elif re.search('千米|公里|k[mM㎡]', match.group()):
+                multiple_cnt *= 1000000
+    # print('multiple_cnt1', multiple_cnt)
+
+    # 拼接
+    digit = str(digit * multiple_cnt) + unit
+
+    return _proportion, digit
+
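+# extract_env_invest: grab the environmental-protection investment ("环保投资…元") and return it
+# as a unified money value via getUnifyMoney, or "" when absent or zero.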
+def extract_env_invest(content):
+    pattern = "环保投资[大概约为是::]*(?P<invs>\d+(\.\d+)?万?元)"
+
+    match = re.search(pattern,content)
+    if match is not None:
+        invest =  match.groupdict().get("invs","")
+        money = getUnifyMoney(invest)
+        if money>0:
+            return money
+    return ""
+
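+# extract_moneyuse: keep the short sentences that state a fee/expense amount ("…费…万元")
+# and join them with "," as the 资金构成 field.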
+def extract_moneyuse(content):
+    list_sentences = re.split(",|。",content)
+    list_data = []
+    pattern = "^.{,20}[费用|预备费|费][大概约为是::]*\d+(\.\d+)?万?元.{,20}$"
+    for sentence in list_sentences:
+        match = re.search(pattern,sentence)
+        if match is not None:
+            list_data.append(sentence)
+    return ",".join(list_data)
+
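+# get_approval_data: load one document by docid, run the extractors above over its html text,
+# and copy page_time/proportion/usearea/env_invest/moneyuse plus area info onto every entry
+# of its approval_json; returns the enriched list (None if the document has no approval_json).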
+def get_approval_data(ots_client,ots_capacity,docid):
+
+    bool_query = BoolQuery(must_queries=[
+        TermQuery("docid",docid)
+    ])
+    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query),
+                                                                   ColumnsToGet(["doctitle","project_name","page_time","project_code","approval_json","extract_json"],return_type=ColumnReturnType.SPECIFIED))
+    list_data = getRow_ots(rows)
+    for _d in list_data:
+        approval_json = _d.get("approval_json")
+        partitionkey = _d.get("partitionkey")
+        docid = _d.get("docid")
+        doctitle = _d.get("doctitle")
+        project_name = _d.get("project_name")
+        page_time = _d.get("page_time")
+        extract_json = _d.get("extract_json")
+
+        _d_html = {"partitionkey":partitionkey,"docid":docid}
+        _html = Document(_d_html)
+        _html.fix_columns(ots_capacity,["dochtmlcon"],True)
+        dochtml = _html.getProperties().get("dochtmlcon","")
+        doctextcon = BeautifulSoup(dochtml,"lxml").get_text()
+        attachmenttextcon = ""
+        try:
+            _extract = json.loads(extract_json)
+        except Exception  as e:
+            _extract = {}
+        proportion = _extract.get("pb",{}).get("proportion")
+        _,usearea = extract_usearea(doctextcon+attachmenttextcon)
+        env_invest = extract_env_invest(doctextcon+attachmenttextcon)
+        moneyuse = extract_moneyuse(doctextcon+attachmenttextcon)
+
+        if approval_json:
+            list_approval = json.loads(approval_json)
+            for _appr in list_approval:
+                _appr["partitionkey"] = partitionkey
+                _appr["docid"] = docid
+                _appr["doctitle"] = doctitle
+                _appr["page_time"] = page_time
+                _appr["proportion"] = proportion
+                _appr["usearea"] = usearea
+                _appr["env_invest"] = env_invest
+                _appr["moneyuse"] = moneyuse
+
+                fix_area(ots_client,_appr)
+
+                construction_scale = _appr.get("construction_scale","")
+                proportion,_ = extract_proportion(construction_scale)
+                if proportion!="":
+                    _appr["proportion"] = proportion
+                _,usearea = extract_usearea(construction_scale)
+                if usearea!="":
+                    _appr["usearea"] = usearea
+                env_invest = extract_env_invest(construction_scale)
+                if env_invest!="":
+                    _appr["env_invest"] = env_invest
+                moneyuse = extract_moneyuse(construction_scale)
+                if moneyuse!="":
+                    _appr["moneyuse"] = moneyuse
+
+            return list_approval
+
+
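+# check_approval: two approval records are treated as the same project when at least one of the
+# key fields (companies, total money, proportion, usearea, doc_num, project_code) is non-empty
+# and identical on both sides.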
+def check_approval(appr1,appr2):
+    check_keys = ["declare_company","construct_company","total_tendereeMoney","proportion","usearea","doc_num","project_code"]
+    same_count = 0
+    for k in check_keys:
+        if k in appr1 and k in appr2:
+            if appr1[k]==appr2[k] and appr1[k] is not None and appr1[k]!="":
+                same_count += 1
+
+    if same_count>=1:
+        return True
+    return False
+
+
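+# merge_approval_real: phrase-search other published documents (status 201-301) whose title, body
+# or attachment text mentions this approval's doc_num, doctitle, project_name or project_code,
+# enrich their approval entries the same way as get_approval_data, and keep those passing check_approval.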
+def merge_approval_real(ots_client,ots_capacity,approval):
+    doc_num = approval.get("doc_num","")
+    doctitle = approval.get("doctitle","")
+    project_name = approval.get("project_name","")
+    project_code = approval.get("project_code","")
+
+    docid = approval.get("docid")
+    should_queries = []
+
+    if doc_num!="":
+        should_queries.append(MatchPhraseQuery("doctitle",doc_num))
+        should_queries.append(MatchPhraseQuery("doctextcon",doc_num))
+        should_queries.append(MatchPhraseQuery("attachmenttextcon",doc_num))
+    if doctitle!="":
+        should_queries.append(MatchPhraseQuery("doctitle",doctitle))
+        should_queries.append(MatchPhraseQuery("doctextcon",doctitle))
+        should_queries.append(MatchPhraseQuery("attachmenttextcon",doctitle))
+    if project_name!="":
+        should_queries.append(MatchPhraseQuery("doctitle",project_name))
+        should_queries.append(MatchPhraseQuery("doctextcon",project_name))
+        should_queries.append(MatchPhraseQuery("attachmenttextcon",project_name))
+    if project_code!="":
+        should_queries.append(MatchPhraseQuery("doctitle",project_code))
+        should_queries.append(MatchPhraseQuery("doctextcon",project_code))
+        should_queries.append(MatchPhraseQuery("attachmenttextcon",project_code))
+
+
+    _query = BoolQuery(should_queries=should_queries,must_not_queries=[TermQuery("docid",docid)])
+    bool_query = BoolQuery(must_queries=[
+        RangeQuery("status",201,301),
+        _query
+    ])
+    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query),
+                                                                   ColumnsToGet(["doctitle","page_time","project_name","project_code","approval_json","extract_json"],return_type=ColumnReturnType.SPECIFIED))
+    list_data = getRow_ots(rows)
+    approvals = [approval]
+    for _d in list_data:
+        approval_json = _d.get("approval_json")
+        partitionkey = _d.get("partitionkey")
+        docid = _d.get("docid")
+        doctitle = _d.get("doctitle")
+        project_name = _d.get("project_name")
+        page_time = _d.get("page_time")
+        extract_json = _d.get("extract_json")
+
+
+        _d_html = {"partitionkey":partitionkey,"docid":docid}
+        _html = Document(_d_html)
+        _html.fix_columns(ots_capacity,["dochtmlcon"],True)
+        dochtml = _html.getProperties().get("dochtmlcon","")
+        doctextcon = BeautifulSoup(dochtml,"lxml").get_text()
+        attachmenttextcon = ""
+
+        try:
+            _extract = json.loads(extract_json)
+        except Exception  as e:
+            _extract = {}
+        proportion = _extract.get("pb",{}).get("proportion")
+        _,usearea = extract_usearea(doctextcon+attachmenttextcon)
+        env_invest = extract_env_invest(doctextcon+attachmenttextcon)
+        moneyuse = extract_moneyuse(doctextcon+attachmenttextcon)
+        if approval_json:
+            list_approval = json.loads(approval_json)
+            for _appr in list_approval:
+                _appr["partitionkey"] = partitionkey
+                _appr["docid"] = docid
+                _appr["doctitle"] = doctitle
+                _appr["page_time"] = page_time
+                _appr["usearea"] = usearea
+                _appr["env_invest"] = env_invest
+                _appr["moneyuse"] = moneyuse
+
+                fix_area(ots_client,_appr)
+
+                construction_scale = _appr.get("construction_scale","")
+                proportion,_ = extract_proportion(construction_scale)
+                if proportion!="":
+                    _appr["proportion"] = proportion
+                _,usearea = extract_usearea(construction_scale)
+                if usearea!="":
+                    _appr["usearea"] = usearea
+                env_invest = extract_env_invest(construction_scale)
+                if env_invest!="":
+                    _appr["env_invest"] = env_invest
+                moneyuse = extract_moneyuse(construction_scale)
+                if moneyuse!="":
+                    _appr["moneyuse"] = moneyuse
+                if check_approval(approval,_appr):
+                    approvals.append(_appr)
+    return approvals
+
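+# get_enterprise_area: look up province/city/district of a company name in the enterprise index;
+# returns an empty dict when the company is unknown.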
+def get_enterprise_area(ots_client,name):
+    bool_query = BoolQuery(must_queries=[
+        TermQuery("name",name)
+    ])
+    rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
+                                                                   SearchQuery(bool_query),
+                                                                   ColumnsToGet(["province","city","district"],return_type=ColumnReturnType.SPECIFIED))
+    list_data = getRow_ots(rows)
+    _d = {}
+    if len(list_data)>0:
+        _d["province"] = list_data[0].get("province","")
+        _d["city"] = list_data[0].get("city","")
+        _d["district"] = list_data[0].get("district","")
+    return _d
+
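+# area_count: number of province/city/district fields carrying a meaningful value
+# (i.e. not empty, 全国 or 未知).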
+def area_count(_d):
+    keys = ["province","city","district"]
+    return sum([1 if _d.get(k,"") not in ("","全国","未知") else 0 for k in keys])
+
+def fix_area(ots_client,appr):
+    if appr.get("district","")!="":
+        return
+    declare_company = appr.get("declare_company","")
+    _d = get_enterprise_area(ots_client,declare_company)
+    if area_count(_d)>area_count(appr):
+        appr.update(_d)
+
+    construct_company = appr.get("construct_company","")
+    _d = get_enterprise_area(ots_client,construct_company)
+    if area_count(_d)>area_count(appr):
+        appr.update(_d)
+
+    approver = appr.get("approver","")
+    _d = get_enterprise_area(ots_client,approver)
+    if area_count(_d)>area_count(appr):
+        appr.update(_d)
+
+    compilation_unit = appr.get("compilation_unit","")
+    _d = get_enterprise_area(ots_client,compilation_unit)
+    if area_count(_d)>area_count(appr):
+        appr.update(_d)
+
+    publisher = appr.get("publisher","")
+    _d = get_enterprise_area(ots_client,publisher)
+    if area_count(_d)>area_count(appr):
+        appr.update(_d)
+
+
+
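+# generate_projects: fold the matched approval records (sorted by page_time ascending) into one
+# project dict under a fresh uuid; later records overwrite non-empty fields, while the
+# province/city/district triple is only refreshed when one of its levels is still missing.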
+def generate_projects(approvals):
+    project_id = str(uuid4())
+    approvals.sort(key=lambda x:x.get("page_time",""),reverse=False)
+    _dict = {}
+    for appr in approvals:
+        _d = {}
+        _d_area = {}
+        for k,v in appr.items():
+            if v is not None and v!="":
+                if k in ("province","city","district"):
+                    _d_area[k] = v
+                else:
+                    _d[k] = v
+        if _dict.get("province","")=="" and _d_area.get("province","")!="":
+            _dict.update(_d_area)
+        if _dict.get("city","")=="" and _d_area.get("city","")!="":
+            _dict.update(_d_area)
+        if _dict.get("district","")=="" and _d_area.get("district","")!="":
+            _dict.update(_d_area)
+        _dict.update(_d)
+    _dict["id"] = project_id
+    return _dict
+
+
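+# merge_approval: driver for a one-off run. Collect the docids of approval documents
+# (docchannel 302) published on 2024-11-04, merge their approval records across documents with
+# 30 worker threads, group them into projects and export the results to a.xlsx / b.xlsx.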
+def merge_approval():
+    ots_client = getConnect_ots()
+    ots_capacity = getConnect_ots_capacity()
+
+    list_data = []
+
+    # filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-11\20241104审批项目公告_审批要素.xlsx"
+    # df = pd.read_excel(filename)
+    # _count = 0
+    # for docid in df["公告id"]:
+    #     docid = int(docid)
+    #     _count += 1
+    #     # if _count>3000:
+    #     #     break
+    #     # if docid!=400066972170 and docid!=400066972181:
+    #     #     continue
+    #     # list_approval = get_approval_data(ots_client,docid)
+    #     # if list_approval:
+    #     #     list_data.extend(list_approval)
+    #     list_data.append(docid)
+
+    bool_query = BoolQuery(must_queries=[
+        RangeQuery("status",201,301),
+        TermQuery("page_time","2024-11-04"),
+        TermQuery("docchannel",302),
+    ])
+    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time")]),limit=100,get_total_count=True),
+                                                                   ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
+    list_row = getRow_ots(rows)
+    for _data in list_row:
+        list_data.append(_data.get("docid"))
+
+    while next_token:
+        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                   ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
+        list_row = getRow_ots(rows)
+        for _data in list_row:
+            list_data.append(_data.get("docid"))
+        print("%d/%d"%(len(list_data),total_count))
+        # if len(list_data)>=2000:
+        #     break
+
+    task_queue = Queue()
+    for _data in list_data:
+        task_queue.put(_data)
+
+    result_queue = Queue()
+
+    def merge_approval_handle(docid,result_queue):
+        print("docid",docid)
+        list_approval = get_approval_data(ots_client,ots_capacity,docid)
+        if list_approval:
+            for appr in list_approval:
+                approvals = merge_approval_real(ots_client,ots_capacity,appr)
+                result_queue.put(approvals)
+
+    mt = MultiThreadHandler(task_queue,merge_approval_handle,result_queue,30)
+    mt.run()
+
+    list_approvals = []
+    try:
+        while 1:
+            item = result_queue.get(timeout=1)
+            list_approvals.append(item)
+    except:
+        pass
+
+    data_approval = []
+    data_approvals_p = []
+    for approvals in list_approvals:
+        _project = generate_projects(approvals)
+        _project_id = _project.get("id")
+
+        for _approval in approvals:
+
+            _d = {"项目id":_project_id}
+            for k,v in key_trans_d.items():
+                if k in _approval:
+                    _d[v] = _approval[k]
+                else:
+                    _d[v] = ""
+            data_approval.append(_d)
+        _d = {"项目id":_project_id}
+        for k,v in key_trans.items():
+            if k in _project:
+                _d[v] = _project[k]
+            else:
+                _d[v] = ""
+        data_approvals_p.append(_d)
+
+
+
+    df_approval = pd.DataFrame(data_approval)
+    df_approvals_p = pd.DataFrame(data_approvals_p)
+    df_approval.to_excel("a.xlsx")
+    df_approvals_p.to_excel("b.xlsx")
+
+
+
+
+
+if __name__ == '__main__':
+    merge_approval()

+ 98 - 19
BaseDataMaintenance/maintenance/enterprise/enterprise2Redis.py

@@ -64,8 +64,7 @@ class enterprise2Redis():
                                   http_auth=('elastic', 'WWBu9#1HWHo$$gJm'),
                                   port=9200)
         body = {
-            "_source": ["name", "history_names", 'legal_person', 'reg_capital', 'credit_code', 'tax_number',
-                        'reg_number', 'org_number',
+            "_source": ["name", 'credit_code',
                         "zhao_biao_number", "zhong_biao_number", "dai_li_number", "bid_number"],
             'query': {  # 查询命令
                 "bool": {
@@ -86,7 +85,7 @@ class enterprise2Redis():
                 }
             },
             "sort": [
-                {"create_time": "desc"}
+                {"update_time": "desc"}
             ]
         }
 
@@ -114,21 +113,34 @@ class enterprise2Redis():
         for item in result:
             item = item['_source']
             name = item['name']
-            history_names = item.get("history_names", "")
-            legal_person = item.get("legal_person", "")
-            reg_capital = item.get("reg_capital", "")
+            # history_names = item.get("history_names", "")
+            # legal_person = item.get("legal_person", "")
+            # reg_capital = item.get("reg_capital", "")
             credit_code = item.get("credit_code", "")
-            tax_number = item.get("tax_number", "")
-            reg_number = item.get("reg_number", "")
-            org_number = item.get("org_number", "")
+            credit_code = re.sub("\s","",credit_code)
+            credit_code = credit_code if re.search("^[\dA-Z]{2}\d{6}[\dA-Z]{10}$",credit_code) else ""
+            # tax_number = item.get("tax_number", "")
+            # tax_number = re.sub("\s","",tax_number)
+            # tax_number = tax_number if len(tax_number)>=15 and not re.search("@|\d{4}-\d{2}-\d{2}|\.(com|cn|COM|CN)",tax_number) else ""
+            # reg_number = item.get("reg_number", "")
+            # reg_number = re.sub("\s","",reg_number)
+            # reg_number = reg_number if len(reg_number)>=10 and not re.search("@|\d{4}-\d{2}-\d{2}|\.(com|cn|COM|CN)",reg_number) else ""
+            # org_number = item.get("org_number", "") # 已弃用,统一社会信用代码取代组织机构代码
+            # org_number = org_number if not re.search("@|\d{4}-\d{2}-\d{2}|\.(com|cn|COM|CN)",org_number) else ""
             zhao_biao_number = item.get("zhao_biao_number", 0)
+            zhao_biao_number = zhao_biao_number if zhao_biao_number else 0
             zhong_biao_number = item.get("zhong_biao_number", 0)
+            zhong_biao_number = zhong_biao_number if zhong_biao_number else 0
             dai_li_number = item.get("dai_li_number", 0)
+            dai_li_number = dai_li_number if dai_li_number else 0
             bid_number = item.get("bid_number", 0)
+            bid_number = bid_number if bid_number else 0
 
             num = 0
-            for business in [history_names, legal_person, reg_capital, credit_code, tax_number, reg_number, org_number]:
-                if len(str(business).replace("-", "")) > 1:
+            for business in [credit_code]: # 新增实体只判断credit_code
+                business = re.sub("\s-","",str(business))
+                business = re.sub("^nan$","",business)
+                if len(business) > 1:
                     num += 1
             isLegal = isLegalNewName(name)
             if isLegal >= 0:
@@ -136,14 +148,16 @@ class enterprise2Redis():
                     legal_name_num += 1
                     _json = {"have_business": 1, "zhao_biao_number": zhao_biao_number,
                              "zhong_biao_number": zhong_biao_number,
-                             "dai_li_number": dai_li_number, "bid_number": bid_number}
+                             "dai_li_number": dai_li_number, "bid_number": bid_number,
+                             "credit_code":credit_code}
                     _json = json.dumps(_json, ensure_ascii=False)
                     add_redis_list.append((name, _json))
                 elif num == 0 and bid_number > 0 and len(name) > 4:
                     legal_name_num += 1
                     _json = {"have_business": 0, "zhao_biao_number": zhao_biao_number,
                              "zhong_biao_number": zhong_biao_number,
-                             "dai_li_number": dai_li_number, "bid_number": bid_number}
+                             "dai_li_number": dai_li_number, "bid_number": bid_number,
+                             "credit_code":credit_code}
                     _json = json.dumps(_json, ensure_ascii=False)
                     add_redis_list.append((name, _json))
 
@@ -259,13 +273,21 @@ def isLegalNewName(enterprise_name):
         return -1
     if re.search("[区市镇乡县洲州路街]$", enterprise_name) and not re.search("(超市|门市|保护区|园区|景区|校区|社区|服务区|工区|小区|集市|花市|夜市|学区|旅游区|矿区|林区|度假区|示范区|菜市)$", enterprise_name):
         return -1
-    if re.search("^个人|^个体|测试$", enterprise_name):
+    # if re.search("^.?(个人|个体|测试)|(个人|个体|测试).?$", enterprise_name):
+    if re.search("^.?测试|测试.?$", enterprise_name):
         return -1
     if re.search("个人|个体", enterprise_name):
-        _split = re.split("个人|个体", enterprise_name)
-        if len(_split[0]) <= 5:
-            return -1
-    if re.search("测试", enterprise_name) and len(enterprise_name) < 8:
+        if re.search("(个人|个体).?工商户",enterprise_name):
+            #  按照字数过滤,比如剔除个体工商户这些字眼之后还有6个字以上的,可能是有用的 2024/12/5新增
+            _name = re.sub("(个人|个体).?[工商户]*|[\(\)(){}\{\}\[\]【】]","",enterprise_name)
+            if len(re.findall("[\u4e00-\u9fa5]", _name))<=4:
+                return -1
+        else:
+            _split = re.split("个人|个体", enterprise_name)
+            if len(_split[0]) <= 5:
+                return -1
+
+    if (re.search("测试", enterprise_name) and len(enterprise_name) < 8) or len(re.findall("测试", enterprise_name))>1:
         return -1
     if re.search("^(省|自治[县州区]|市|县|区|镇|乡|街道)", enterprise_name) and not re.search(
             "^(镇江|乡宁|镇原|镇海|镇安|镇巴|镇坪|镇赉|镇康|镇沅|镇雄|镇远|镇宁|乡城|镇平|市中|市南|市北)", enterprise_name):
@@ -281,7 +303,61 @@ def isLegalNewName(enterprise_name):
         return 0
     return 1
 
-
+def test1():
+    legal_name_num = 0
+    add_redis_list = []
+    result =[{'_source': {'reg_number': '230602601147025', 'org_number': 'MADB2NAN6', 'update_time': '2024-08-01 07:28:14', 'credit_code': '92230602MADB2NAN6N', 'name': '大庆萨尔图区若飞物资经销处(个体工商户)'}, '_score': None, 'sort': ['2024-08-01 07:28:14'], '_index': 'enterprise_v3', '_type': '_doc', '_id': '大庆萨尔图区若飞物资经销处(个体工商户)'}]
+    for item in result:
+        item = item['_source']
+        name = item['name']
+        # history_names = item.get("history_names", "")
+        # legal_person = item.get("legal_person", "")
+        # reg_capital = item.get("reg_capital", "")
+        credit_code = item.get("credit_code", "")
+        credit_code = re.sub("\s", "", credit_code)
+        credit_code = credit_code if re.search("^[\dA-Z]{2}\d{6}[\dA-Z]{10}$", credit_code) else ""
+        # tax_number = item.get("tax_number", "")
+        # tax_number = re.sub("\s","",tax_number)
+        # tax_number = tax_number if len(tax_number)>=15 and not re.search("@|\d{4}-\d{2}-\d{2}|\.(com|cn|COM|CN)",tax_number) else ""
+        # reg_number = item.get("reg_number", "")
+        # reg_number = re.sub("\s","",reg_number)
+        # reg_number = reg_number if len(reg_number)>=10 and not re.search("@|\d{4}-\d{2}-\d{2}|\.(com|cn|COM|CN)",reg_number) else ""
+        # org_number = item.get("org_number", "") # 已弃用,统一社会信用代码取代组织机构代码
+        # org_number = org_number if not re.search("@|\d{4}-\d{2}-\d{2}|\.(com|cn|COM|CN)",org_number) else ""
+        zhao_biao_number = item.get("zhao_biao_number", 0)
+        zhao_biao_number = zhao_biao_number if zhao_biao_number else 0
+        zhong_biao_number = item.get("zhong_biao_number", 0)
+        zhong_biao_number = zhong_biao_number if zhong_biao_number else 0
+        dai_li_number = item.get("dai_li_number", 0)
+        dai_li_number = dai_li_number if dai_li_number else 0
+        bid_number = item.get("bid_number", 0)
+        bid_number = bid_number if bid_number else 0
+
+        num = 0
+        for business in [credit_code]:  # 新增实体只判断credit_code
+            business = re.sub("\s-", "", str(business))
+            business = re.sub("^nan$", "", business)
+            if len(business) > 1:
+                num += 1
+        isLegal = isLegalNewName(name)
+        if isLegal >= 0:
+            if num >= 1 and len(name) > 4:
+                legal_name_num += 1
+                _json = {"have_business": 1, "zhao_biao_number": zhao_biao_number,
+                         "zhong_biao_number": zhong_biao_number,
+                         "dai_li_number": dai_li_number, "bid_number": bid_number,
+                         "credit_code": credit_code}
+                _json = json.dumps(_json, ensure_ascii=False)
+                add_redis_list.append((name, _json))
+            elif num == 0 and bid_number > 0 and len(name) > 4:
+                legal_name_num += 1
+                _json = {"have_business": 0, "zhao_biao_number": zhao_biao_number,
+                         "zhong_biao_number": zhong_biao_number,
+                         "dai_li_number": dai_li_number, "bid_number": bid_number,
+                         "credit_code": credit_code}
+                _json = json.dumps(_json, ensure_ascii=False)
+                add_redis_list.append((name, _json))
+    print(add_redis_list)
 
 if __name__ == '__main__':
 
@@ -308,6 +384,9 @@ if __name__ == '__main__':
 
     # e = enterprise2Redis()
     # e.monitor_enterprise2redis()
+
+    # print(isLegalNewName('大庆萨尔图区若飞物资经销处(个体工商户)'))
+
     pass
 
 

+ 164 - 0
BaseDataMaintenance/maintenance/gpt_extract.py

@@ -0,0 +1,164 @@
+#coding:utf8
+
+from BaseDataMaintenance.chat.ERNIE_utils import *
+
+from BaseDataMaintenance.dataSource.source import getConnect_ots
+from BaseDataMaintenance.chat.chatUtil import *
+
+from tablestore import *
+from BaseDataMaintenance.common.Utils import getRow_ots,getCurrent_date,timeAdd
+from bs4 import BeautifulSoup
+import json
+import re
+import pandas as pd
+import time
+
+
+
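+# get_columns: fetch the requested columns of a single document from the document index.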
+def get_columns(ots_client,docid,columns):
+
+    bool_query = BoolQuery(must_queries=[TermQuery("docid",docid)])
+    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query),
+                                                                   ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+    list_data = getRow_ots(rows)
+    _dict = {}
+    if len(list_data)==1:
+        _dict = list_data[0]
+    return _dict
+
+
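+# jsonchat: send the prompt to the chat model with up to try_times attempts, back off when the
+# response carries an error_msg, and return the JSON string found inside a ```json ... ``` block
+# of the reply (None if no attempt yields parseable JSON).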
+def jsonchat(msg,try_times):
+
+    try:
+        print(msg)
+    except Exception as e:
+        pass
+    while try_times>0:
+        try:
+            try_times -= 1
+            resp = chat(msg)
+            time.sleep(1)
+
+            if resp.status_code == 200:
+                result_dict = json.loads(resp.text)
+                result = result_dict.get("result", "")
+                error_msg = result_dict.get("error_msg")
+                if error_msg is not None:
+                    print("error_msg",error_msg)
+                    time.sleep(10)
+                    continue
+                _pattern = "```json(?P<json>.*)```"
+                _search = re.search(_pattern, result, re.DOTALL)
+                if _search is not None:
+                    _json = _search.groupdict().get("json")
+                    _d = json.loads(_json)
+                    return _json
+        except Exception as e:
+            pass
+
+
+def extract_tenderee():
+    filename = r'F:\Workspace2016\DataMining\data\2024-11-26_174430_数据导出.xlsx'
+    df = pd.read_excel(filename)
+
+    ots_client = getConnect_ots()
+
+    list_data = []
+
+    for docid in df["docid"]:
+        docid = int(docid)
+        # if docid!=559799502:
+        #     continue
+        _dict = get_columns(ots_client,docid,["doctextcon","attachmenttextcon","nlp_enterprise","nlp_enterprise_attachment"])
+        doctextcon = _dict.get("doctextcon","")
+        attachmenttextcon = _dict.get("attachmenttextcon","")
+        nlp_enterprise = _dict.get("nlp_enterprise","")
+        nlp_enterprise_attachment = _dict.get("nlp_enterprise_attachment","")
+
+        pre_tenderee = ""
+        if len(nlp_enterprise)>2:
+            _ent = json.loads(nlp_enterprise)
+            pre_tenderee = _ent[0]
+        if len(nlp_enterprise_attachment)>2:
+            _ent = json.loads(nlp_enterprise_attachment)
+            pre_tenderee = _ent[0]
+
+        msg = '''从内容中提取出招标人,招标人应该是公司实体,如果没有则返回"",返回结果为json格式{"tenderee":""}\n%s\n%s''' % (str(doctextcon),str(attachmenttextcon))
+        _json = jsonchat(msg,3)
+        new_tenderee = ""
+        if _json is not None:
+            _d = json.loads(_json)
+            new_tenderee = _d.get("tenderee")
+        new_d = {"docid":docid,"nlp_enterprise":nlp_enterprise,"nlp_enterprise_attachment":nlp_enterprise_attachment,
+                 "pre_tenderee":pre_tenderee,"new_tenderee":new_tenderee}
+        list_data.append(new_d)
+        print(new_d)
+    df1 = pd.DataFrame(list_data)
+    df1.to_excel("tenderee_extract.xlsx",columns=["docid","nlp_enterprise","nlp_enterprise_attachment","pre_tenderee","new_tenderee"])
+
+def prompt_tenderee():
+    _prompt = '招标人,招标人应该是公司实体,如果没有则返回""'
+    _ret = {"招标人":""}
+    return _prompt,_ret
+
+def prompt_budget():
+    _prompt = "预算金额,如果没有则默认0"
+    _ret = {"预算金额":0}
+    return _prompt,_ret
+
+def prompt_win_tenderer():
+    _prompt = '中标人及其中标金额,中标人应该是公司实体,中标金额没有则默认0,中标人与中标金额放到一个字典中,如果有多个,则在数组中分别返回,如果没有则返回空数组'
+    _ret = {"中标人及金额":[{"中标人":"","中标金额":0}]}
+    return _prompt,_ret
+
+def extract_bidding_budget():
+    pass
+
+def extract_win_tenderer():
+    pass
+
+
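+# get_data_to_qualify: page through yesterday's published documents of channels 52/101/119/120
+# and collect their extract_json, stopping at `count` rows (or roughly 300k when count is not set).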
+def get_data_to_qualify(ots_client,count=-1):
+    current_date = getCurrent_date('%Y-%m-%d')
+    last_date = timeAdd(current_date,-1)
+    bool_query = BoolQuery(
+        must_queries=[
+            RangeQuery("crtime",last_date,current_date),
+            RangeQuery("status",201,301),
+            TermsQuery("docchannel",[52,101,119,120])
+        ]
+    )
+
+    list_data = []
+    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query,limit=100,get_total_count=True),
+                                                                   ColumnsToGet(["extract_json"],return_type=ColumnReturnType.SPECIFIED))
+    list_data.extend(getRow_ots(rows))
+    while 1:
+        if next_token is None or len(list_data)>=30*10000:
+            break
+        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                   ColumnsToGet(["extract_json"],return_type=ColumnReturnType.SPECIFIED))
+        list_data.extend(getRow_ots(rows))
+        if count>0 and len(list_data)>=count:
+            break
+    return list_data
+
+
+def quality_inspection():
+    # stub: quality inspection over the collected extract_json, not implemented yet
+    pass
+
+
+def merge_extract_json():
+    # stub: merging of extract_json results, not implemented yet
+    pass
+
+
+if __name__ == '__main__':
+    extract_tenderee()
+
+
+
+
+
+
+

+ 3 - 3
BaseDataMaintenance/maintenance/preproject/fillColumns.py

@@ -232,7 +232,7 @@ class PreprojectFill():
             win_tenderer_concat = _row.get(preproject_last_win_tenderer_contact)
             win_tenderer_phone = _row.get(preproject_last_win_tenderer_phone)
 
-
+            # 有联系人先根据联系人取电话
             if tenderee is not None and tenderee!="":
                 # if (tenderee_concat is None or tenderee_concat=="") and (tenderee_phone is None or tenderee_phone==""):
                 if tenderee_phone is None or tenderee_phone=="":
@@ -543,8 +543,8 @@ class PreprojectFill():
                         # in_doctextcon, last_doctitle, last_tenderee_contact, last_tenderee_phone
 
                         _preproject = Preproject(result_row)
-                        if not _preproject.exists_row(self.ots_client):
-                            _preproject.update_row(self.ots_client)
+                        # if not _preproject.exists_row(self.ots_client):
+                        _preproject.update_row(self.ots_client)
 
         _mul = MultiThreadHandler(self.purchaseIntention_process_queue,comsumer_handle,None,20)
         _mul.run()

File diff suppressed because it is too large
+ 6 - 3
BaseDataMaintenance/maintenance/product/extract_data.py


+ 4 - 3
BaseDataMaintenance/maintenance/product/htmlparser.py

@@ -120,11 +120,12 @@ class ParseDocument():
             _html = ""
         self.html = _html
 
-        # self.soup = BeautifulSoup(self.html,"lxml")
+
         # self.soup = BeautifulSoup(self.html,"html.parser")
         self.auto_merge_table = auto_merge_table
 
-        self.soup = BeautifulSoup(self.html,"html5lib")
+        self.soup = BeautifulSoup(self.html,"lxml")
+        # self.soup = BeautifulSoup(self.html,"html5lib")
         _body = self.soup.find("body")
         if _body is not None:
             self.soup = _body
@@ -199,7 +200,7 @@ class ParseDocument():
                 if v is not None:
                     groups.append((k,v))
         if len(groups):
-            # groups.sort(key=lambda x:x[0])
+            groups.sort(key=lambda x:x[0])
             return groups
         return None
 

File diff suppressed because it is too large
+ 21 - 1
BaseDataMaintenance/maxcompute/1.py


+ 218 - 13
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -237,6 +237,7 @@ class f_get_extractCount(object):
     def evaluate(self, extractjson):
         if extractjson is not None:
             _extract = json.loads(extractjson)
+            return _extract.get("extract_count",0)
         else:
             _extract = {}
         dict_pack = _extract.get("prem",{})
@@ -776,25 +777,33 @@ def getSimLevel(str1,str2):
     return _v
 
 def getLength(_str):
-    return len(_str if _str is not None else "")
+    return len(str(_str) if _str is not None else "")
 
 def check_money(bidding_budget_less,bidding_budget_greater,
                 win_bid_price_less,win_bid_price_greater,
                 moneys_less,moneys_greater,
                 moneys_attachment_less,moneys_attachment_greater):
 
+    bidding_budget_less_source = bidding_budget_less
+    bidding_budget_greater_source = bidding_budget_greater
+    win_bid_price_less_source = win_bid_price_less
+    win_bid_price_greater_source = win_bid_price_greater
     #只判断最高前六位
     if getLength(bidding_budget_less)>0:
+        bidding_budget_less_source = float(bidding_budget_less_source)
         bidding_budget_less = round(float(bidding_budget_less))
         bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
     if getLength(bidding_budget_greater)>0:
+        bidding_budget_greater_source = float(bidding_budget_greater_source)
         bidding_budget_greater = round(float(bidding_budget_greater))
         bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
 
     if getLength(win_bid_price_less)>0:
+        win_bid_price_less_source = float(win_bid_price_less_source)
         win_bid_price_less = round(float(win_bid_price_less))
         win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
     if getLength(win_bid_price_greater)>0:
+        win_bid_price_greater_source = float(win_bid_price_greater_source)
         win_bid_price_greater = round(float(win_bid_price_greater))
         win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
 
@@ -815,14 +824,21 @@ def check_money(bidding_budget_less,bidding_budget_greater,
                 budget_is_same = True
             if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
                 budget_is_same = True
+            if bidding_budget_less_source in moneys_greater or bidding_budget_less_source in moneys_attachment_greater:
+                budget_is_same = True
             if budget_greater in moneys_less or budget_greater in moneys_attachment_less:
                 budget_is_same = True
+            if bidding_budget_greater_source in moneys_less or bidding_budget_greater_source in moneys_attachment_less:
+                budget_is_same = True
             if budget_is_same=="":
                 return False
 
     if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
+
+
         price_less = float(win_bid_price_less)
         price_greater = float(win_bid_price_greater)
+
         if price_less!=price_greater:
 
             if min(price_less,price_greater)>0:
@@ -832,8 +848,12 @@ def check_money(bidding_budget_less,bidding_budget_greater,
                 price_is_same = True
             if price_less in moneys_greater or price_less in moneys_attachment_greater:
                 price_is_same = True
+            if win_bid_price_less_source in moneys_greater or win_bid_price_less_source in moneys_attachment_greater:
+                price_is_same = True
             if price_greater in moneys_less or price_greater in moneys_attachment_less:
                 price_is_same = True
+            if win_bid_price_greater_source in moneys_less or win_bid_price_greater_source in moneys_attachment_less:
+                price_is_same = True
             if price_is_same=="":
                 return False
     return True
@@ -867,6 +887,85 @@ def check_entity(nlp_enterprise_less,nlp_enterprise_greater,
         return False
     return True
 
+
+def check_punish(punish_less,punish_greater):
+    same_count = 0
+    not_same_count = 0
+    _flag = True
+    keys = list(set(list(punish_less.keys())) | set(list(punish_greater.keys())))
+    for k in keys:
+        v1 = punish_less.get(k)
+        v2 = punish_greater.get(k)
+        if getLength(v1)>0 and getLength(v2)>0:
+            if k=="punish_code":
+                if not check_codes([v1],[v2]):
+                    not_same_count += 1
+                    _flag = False
+                else:
+                    same_count += 1
+            if k=="punishDecision":
+                if getSimilarityOfString(v1,v2)>0.8:
+                    same_count += 1
+            if k in ("complainants","punishPeople","institutions"):
+                if v1==v2:
+                    same_count += 1
+                else:
+                    not_same_count += 1
+                    _flag = False
+    return _flag,same_count,not_same_count
+
+def check_source_type(source_type_less,source_type_greater):
+    if getLength(source_type_less)>0 and getLength(source_type_greater)>0:
+        if source_type_less!=source_type_greater:
+            return False
+    return True
+def check_approval(approval_less,approval_greater,b_log):
+
+    if b_log:
+        logging.info("approval_less %s==approval_greater %s"%(approval_less,approval_greater))
+    for _less in approval_less:
+        for _greater in approval_greater:
+            same_count = 0
+            not_same_count = 0
+            flag = True
+            keys = ["source_stage","source_type","doc_num","project_code","project_name","approval_items","approval_result","approver","construct_company","construction_scale","declare_company","evaluation_agency","legal_person","compilation_unit","time_approval"]
+            for k in keys:
+                v1 = _less.get(k)
+                v2 = _greater.get(k)
+                if getLength(v1)>0 and getLength(v2)>0:
+                    if k in ("source_stage","source_type"):
+                        if v1!=v2:
+                            flag = False
+
+                    if k in ("project_code","doc_num"):
+                        if check_codes([v1],[v2]):
+                            same_count += 1
+                        else:
+                            not_same_count += 1
+                            if b_log:
+                                logging.info("check approval %s false %s-%s"%(k,v1,v2))
+                            flag = False
+                    if k in ("approval_items","approval_result","project_name"):
+                        if getSimilarityOfString(v1,v2)>0.8:
+                            same_count += 1
+                        else:
+                            not_same_count += 1
+                    if k in ("approver","construct_company","declare_company","evaluation_agency","legal_person","compilation_unit"):
+                        if v1==v2:
+                            same_count += 1
+                        else:
+                            not_same_count += 1
+                            if b_log:
+                                logging.info("check approval %s false %s-%s"%(k,v1,v2))
+                            flag = False
+            if flag and same_count>1:
+                return flag,same_count,not_same_count
+    flag = True
+    if len(approval_less)>0 and len(approval_greater)>0:
+        flag = False
+    return flag,0,0
+
+
 def check_codes(project_codes_less,project_codes_greater):
     #check the similarity
     is_same = False
@@ -875,6 +974,8 @@ def check_codes(project_codes_less,project_codes_greater):
 
     for project_code_less in project_codes_less:
         for project_code_greater in project_codes_greater:
+            project_code_less = str(project_code_less).upper()
+            project_code_greater = str(project_code_greater).upper()
             code_sim = getSimilarityOfString(project_code_less,project_code_greater)
             if project_code_less is not None and project_code_greater is not None:
                 if code_sim>0.6:
@@ -900,6 +1001,7 @@ num_pattern = re.compile("^\d+(?:\.\d+)?$")
 num1_pattern = re.compile("[一二三四五六七八九A-Za-z]+")
 location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
 building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
+rebid_pattern = "再次|重新招标|[一二三四五六七八九十]+次"
 date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
 def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[]):
     if code_greater is None:
@@ -961,7 +1063,7 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
                 return False
 
     #check location and keywords
-    for _p in [num1_pattern,building_pattern]:
+    for _p in [num1_pattern,building_pattern,rebid_pattern]:
         num_all_l = re.findall(_p,doctitle_refind_less)
         num_all_g = re.findall(_p,doctitle_refind_greater)
         set_num_l = set(num_all_l)
@@ -995,19 +1097,70 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
                     return False
     return True
 
+
+def product_dump(list_product):
+    _product_l_l = []
+    list_product.sort(key=lambda x:len(x))
+    for _l in list_product:
+        _exists = False
+        for l1 in _product_l_l:
+            if l1 in _l:
+                _exists = True
+                break
+        if not _exists:
+            _product_l_l.append(_l)
+    return _product_l_l
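+# product_dump keeps only the shortest non-overlapping product names, e.g. (illustrative)
+# product_dump(["中央空调", "空调"]) -> ["空调"], since the longer name contains the shorter one.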
 def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
     if getLength(product_less)>0 and getLength(product_greater)>0:
 
         _product_l = product_less.split(split_char)
+        _product_l = product_dump(_product_l)
         _product_g = product_greater.split(split_char)
+        _product_g = product_dump(_product_g)
+        _title_l = doctitle_refine_less
+        _title_g = doctitle_refine_greater
         same_count = 0
         if len(_product_l)>len(_product_g):
             a = _product_g
             _product_g = _product_l
             _product_l = a
+            _title_l = doctitle_refine_greater
+            _title_g = doctitle_refine_less
+        set_product_l_in_title = set()
+        set_product_g_in_title = set()
+        for _l in _product_l:
+            if _title_l.find(_l)>=0:
+                set_product_l_in_title.add(_l)
+        for _g in _product_g:
+            if _title_g.find(_g)>=0:
+                set_product_g_in_title.add(_g)
+        # 限制标题出现的产品要有重叠
+        if len(set_product_l_in_title)>0 and len(set_product_g_in_title)>0:
+
+            
+            _set_union = set_product_l_in_title & set_product_g_in_title
+
+            # 不同的部门若有重叠则通过
+            diff_l = set_product_l_in_title-_set_union
+            diff_g = set_product_g_in_title-_set_union
+
+            diff_dump = product_dump(list(diff_l.union(diff_g)))
+            if not(len(diff_dump)<=len(diff_l) or len(diff_dump)<=len(diff_g)):
+                return False
+
+            # 过于严格,暂时取消
+            # if len(_set_union)==0:
+            #     return False
+            # if len(_set_union)!=len(set_product_l_in_title) and len(_set_union)!=len(set_product_g_in_title):
+            #     _l1 = list(set_product_l_in_title)
+            #     _l2 = list(set_product_g_in_title)
+            #     _l1.extend(_l2)
+            #     _l1 = product_dump(_l1)
+            #     if len(_l1)!=len(_set_union):
+            #         return False
         for _l in _product_l:
             for _g in _product_g:
-                if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>-0 or doctitle_refine_less.find(_g)>=0:
+                if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0 or doctitle_refine_less.find(_g)>=0:
                     same_count += 1
                     break
         if same_count/len(_product_l)>=0.5:
@@ -1020,12 +1173,15 @@ def check_package(package_less,package_greater,split_char=","):
 
         _product_l = package_less.split(split_char)
         _product_g = package_greater.split(split_char)
+        same_level = False
         for _l in _product_l:
             for _g in _product_g:
+                if abs(len(_l)-len(_g))<=2:
+                    same_level = True
                 if _l==_g:
                     return True
-
-        return False
+        if same_level:
+            return False
     return True
 
 def check_time(json_time_less,json_time_greater):
@@ -1056,7 +1212,7 @@ def check_time(json_time_less,json_time_greater):
         return 0
     return 1
 
-def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater="",moneys_less=set(),moneys_greater=set(),moneys_attachment_less=set(),moneys_attachment_greater=set(),page_attachments_less="[]",page_attachments_greater="[]"):
+def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater="",moneys_less=set(),moneys_greater=set(),moneys_attachment_less=set(),moneys_attachment_greater=set(),page_attachments_less="[]",page_attachments_greater="[]",punish_less = {},punish_greater = {},approval_less = [],approval_greater = [],source_type_less = None,source_type_greater=None):
     if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
         return 1
 
@@ -1100,6 +1256,11 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
             if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
                 return 1
 
+    #同一个站源,都有附件但附件没有重叠则不去重
+    if web_source_no_less==web_source_no_greater and len(set_md5_less)>0 and len(set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==0:
+        if b_log:
+            logging.info("same web_site,both has attach but not same web_source_no_less:%s,web_source_no_greater:%s"%(web_source_no_less,web_source_no_greater))
+        return 0
 
     if isinstance(project_codes_less,str):
         project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
@@ -1130,6 +1291,33 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
         same_count += 1
     if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
         same_count += 1
+
+    _flag,_c1,_c2 = check_punish(punish_less,punish_greater)
+    if not _flag:
+        if b_log:
+            logging.info("check_punish failed")
+        return 0
+    else:
+        if b_log:
+            logging.info("check_punish true %d"%(_c1))
+        same_count += _c1
+
+    _flag,_c1,_c2 = check_approval(approval_less,approval_greater,b_log)
+    if not _flag:
+        if b_log:
+            logging.info("check approval failed")
+        return 0
+    else:
+        if b_log:
+            logging.info("check approval true %d"%(_c1))
+        same_count += _c1
+
+    _flag = check_source_type(source_type_less,source_type_greater)
+    if not _flag:
+        if b_log:
+            logging.info("check source type failed")
+        return 0
+
     base_prob = 0
     if min_counts<3:
         base_prob = 0.9
@@ -1140,12 +1328,17 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
     else:
         base_prob = 0.6
     _prob = base_prob*same_count/all_count
-    if min(extract_count_less,extract_count_greater)<=3:
-        if _prob<0.1:
-            _prob = 0.15
+    if min(extract_count_less,extract_count_greater)<=3 and max(extract_count_less,extract_count_greater)>=5:
+        if _prob<0.1 and str(page_time_less)==str(page_time_greater):
+            if str(docchannel_less) not in ("302","303"):
+                _prob = 0.15
         if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
+            if b_log:
+                logging.info("province not same:%s-%s"%(province_less,province_greater))
             return 0
     if _prob<0.1:
+        if b_log:
+            logging.info("prob too low:%f"%(_prob))
         return _prob
 
     check_result = {"pass":1}
@@ -1207,8 +1400,7 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
         else:
             check_result["entity"] = 1
 
-    logging.info("moneys_less"+str(moneys_less)+"---"+str(moneys_attachment_less))
-    logging.info("moneys_less"+str(moneys_greater)+"---"+str(moneys_attachment_greater))
+
     if not check_money(bidding_budget_less,bidding_budget_greater,
                        win_bid_price_less,win_bid_price_greater,
                        moneys_less,moneys_greater,
@@ -1266,6 +1458,8 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
             check_result["time"] = 1
 
     if hard_level==2 and check_result["product"]<=1:
+        if b_log:
+            logging.info("hard_level %s and check_product less than 2"%(str(hard_level)))
         return 0
     if check_result.get("pass",0)==0:
         if b_log:
@@ -1506,7 +1700,11 @@ class f_dumplicate_check(BaseUDTF):
             page_attachments_less = '[]'
         if page_attachments_greater is None:
             page_attachments_greater = '[]'
-        _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
+        punish_less = _extract_less.get("punish",{})
+        punish_greater = _extract_greater.get("punish",{})
+        approval_less = _extract_less.get("approval",[])
+        approval_greater = _extract_greater.get("approval",[])
+        _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater,punish_less = punish_less,punish_greater = punish_greater,approval_less = approval_less,approval_greater = approval_greater)
         self.forward(_prob)
 
 @annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
@@ -1686,6 +1884,8 @@ class f_redump_probability_final_check(BaseUDAF):
                 web_source_no_greater = document_greater["web_source_no"]
                 extract_json_greater = document_greater["extract_json"]
                 page_attachments_greater = document_greater["page_attachments"]
+
+
                 _pass = True
 
                 for document_less in final_group:
@@ -1730,7 +1930,12 @@ class f_redump_probability_final_check(BaseUDAF):
                     if page_attachments_greater is None:
                         page_attachments_greater = '[]'
 
-                    _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,len(the_group),b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
+                    punish_less = _extract_less.get("punish",{})
+                    punish_greater = _extract_greater.get("punish",{})
+                    approval_less = _extract_less.get("approval",[])
+                    approval_greater = _extract_greater.get("approval",[])
+
+                    _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,len(the_group),b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater,punish_less = punish_less,punish_greater = punish_greater,approval_less = approval_less,approval_greater = approval_greater)
 
                     if _prob<0.1:
                         _pass = False
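
A minimal, standalone sketch of how the new punish/approval arguments can be pulled out of the two extract_json payloads with safe defaults before being handed to the rule check; the "punish"/"approval" keys and the keyword names match the hunk above, while the helper itself is illustrative:

    import json

    def load_punish_approval(extract_json_less, extract_json_greater):
        # Parse both extract_json payloads, falling back to empty containers so
        # a missing or malformed field never breaks the duplicate check.
        def _load(raw):
            try:
                return json.loads(raw) if raw else {}
            except Exception:
                return {}
        _less, _greater = _load(extract_json_less), _load(extract_json_greater)
        return dict(punish_less=_less.get("punish", {}),
                    punish_greater=_greater.get("punish", {}),
                    approval_less=_less.get("approval", []),
                    approval_greater=_greater.get("approval", []))

    # usage (same keyword names as the call in the hunk above):
    # _prob = check_dumplicate_rule(..., **load_punish_approval(extract_json_less, extract_json_greater))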

+ 237 - 26
BaseDataMaintenance/maxcompute/documentMerge.py

@@ -87,6 +87,15 @@ project_nlp_enterprise = "nlp_enterprise"
 project_nlp_enterprise_attachment = "nlp_enterprise_attachment"
 project_update_time = "update_time"
 project_tmp_attrs = "tmp_attrs"
+project_tenderee_code = "tenderee_code"
+project_agency_code = "agency_code"
+project_candidates = "candidates"
+
+project_win_tenderer_code = "win_tenderer_code"
+project_second_tenderer_code = "second_tenderer_code"
+project_third_tenderer_code = "third_tenderer_code"
+project_win_tenderer_joints = "win_tenderer_joints"
+project_multi_winners = "multi_winners"
 
 document_partitionkey = "partitionkey"
 document_docid = "docid"
@@ -148,6 +157,9 @@ document_time_release = "time_release"
 document_info_source = "info_source"
 document_nlp_enterprise = "nlp_enterprise"
 document_nlp_enterprise_attachment = "nlp_enterprise_attachment"
+document_tenderee_code = "tenderee_code"
+document_agency_code = "agency_code"
+document_candidates = "candidates"
 
 document_tmp_partitionkey = "partitionkey"
 document_tmp_docid = "docid"
@@ -183,6 +195,9 @@ document_tmp_opertime = "opertime"
 document_tmp_docchannel = "docchannel"
 document_tmp_original_docchannel = "original_docchannel"
 
+document_tmp_source_stage = "source_stage"
+document_tmp_source_type = "source_type"
+
 document_tmp_extract_json = "extract_json"
 document_tmp_industry_json = "industry_json"
 document_tmp_other_json = "other_json"
@@ -1516,7 +1531,7 @@ def generate_common_properties(list_docs):
     #计数法选择
     choose_dict = {}
     project_dict = {}
-    for _key in [document_bidway,document_industry,document_info_type,document_info_source,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count]:
+    for _key in [document_bidway,document_industry,document_info_type,document_info_source,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_tenderee_code,document_agency_code]:
         for _doc in list_docs:
             _value = _doc.get(_key,"")
             if _value!="":
@@ -1616,6 +1631,9 @@ def generate_common_properties(list_docs):
     remove_docids = set()
     set_nlp_enterprise = set()
     set_nlp_enterprise_attachment = set()
+
+    set_candidates = set()
+    list_candidates = []
     for _doc in list_docs:
         table_name = _doc.get("table_name")
         status = _doc.get(document_status,0)
@@ -1632,13 +1650,30 @@ def generate_common_properties(list_docs):
 
         is_multipack = True if len(sub_docs)>1 else False
         extract_count = _doc.get(document_tmp_extract_count,0)
+        candidates = _doc.get(document_candidates,"[]")
+
+        _province = _doc.get(document_province,"")
+        _city = _doc.get(document_city,"")
+        _district = _doc.get(document_district,"")
+
+        tenderee = _doc.get(document_tenderee,"")
+        agency = _doc.get(document_agency,"")
+
 
         try:
             set_nlp_enterprise |= set(json.loads(_doc.get(document_nlp_enterprise,"[]")))
             set_nlp_enterprise_attachment |= set(json.loads(_doc.get(document_nlp_enterprise_attachment,"[]")))
+
+            for item in json.loads(candidates):
+                if item.get("name") is not None and item.get("name") not in set_candidates:
+                    list_candidates.append(item)
+                    set_candidates.add(item.get("name"))
+
         except Exception as e:
             traceback.print_exc()
 
+
+
         if product is not None:
             list_product.extend(product.split(","))
 
@@ -1651,7 +1686,7 @@ def generate_common_properties(list_docs):
 
         if zhao_biao_page_time=="" and _docchannel in (51,52,102,103,114):
             zhao_biao_page_time = page_time
-        if zhong_biao_page_time=="" and _docchannel in (101,118,119,120):
+        if zhong_biao_page_time=="" and _docchannel in (101,118,119,120,121,122):
             zhong_biao_page_time = page_time
         is_visuable = 0
         if table_name=="document":
@@ -1675,7 +1710,12 @@ def generate_common_properties(list_docs):
                               document_page_time:page_time,
                               document_status:201 if is_visuable==1 else 401,
                               "is_multipack":is_multipack,
-                              document_tmp_extract_count:extract_count
+                              document_tmp_extract_count:extract_count,
+                              document_tenderee:tenderee,
+                              document_agency:agency,
+                              document_province:_province,
+                              document_city:_city,
+                              document_district:_district
                               }
                              )
 
@@ -1691,6 +1731,7 @@ def generate_common_properties(list_docs):
     project_dict[project_product] = ",".join(list(set(list_product)))
     project_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
     project_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
+    project_dict[project_candidates] = json.dumps(list_candidates[:100],ensure_ascii=False)
 
     return project_dict
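
A standalone sketch of the candidate-merging step added in this hunk: candidate lists from several documents are unioned, keeping only the first occurrence of each "name" and capping the result at 100 entries. The item shape is taken from the code above; the helper name is illustrative.

    import json

    def merge_candidates(json_lists, limit=100):
        # Union candidate lists, deduplicated by "name", keeping first-seen order
        # and capping the merged result, as in generate_common_properties above.
        seen, merged = set(), []
        for raw in json_lists:
            try:
                items = json.loads(raw or "[]")
            except Exception:
                continue
            for item in items:
                name = item.get("name")
                if name is not None and name not in seen:
                    seen.add(name)
                    merged.append(item)
        return json.dumps(merged[:limit], ensure_ascii=False)

    print(merge_candidates(['[{"name":"A"}]', '[{"name":"A"},{"name":"B"}]']))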
 
@@ -1716,6 +1757,7 @@ def generate_packages_properties(list_docs):
                 win_tenderer = _d.get(project_win_tenderer,"")
                 win_bid_price = _d.get(project_win_bid_price,"")
 
+
                 if sub_project_name=="Project":
 
                     win_exists = False
@@ -1937,7 +1979,7 @@ class f_generate_projects_from_document(BaseUDTF):
                 _product = list_product[_i%len(list_product)]
                 self.forward(_uuid,page_time,page_time_stamp,docids,project_name,_project_code,tenderee,agency,bidding_budget,win_tenderer,win_bid_price,_product,attrs_json)
 
-@annotate('string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,double,string,double,string,string,string,double,string,string,string,double,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string -> string,string,bigint,string,string,string,string,string,double,string,double,string,string')
+@annotate('string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,double,string,double,string,string,string,double,string,string,string,double,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string -> string,string,bigint,string,string,string,string,string,double,string,double,string,string')
 class f_generate_projects_from_project(BaseUDTF):
 
     def __init__(self):
@@ -2012,7 +2054,16 @@ class f_generate_projects_from_project(BaseUDTF):
                 info_source,
                 nlp_enterprise,
                 nlp_enterprise_attachment,
-                update_time):
+                update_time,
+                tenderee_code,
+                agency_code,
+                candidates,
+                win_tenderer_code,
+                second_tenderer_code,
+                third_tenderer_code,
+                win_tenderer_joints,
+                multi_winners
+                ):
         attrs_dict = {}
 
         attrs_dict[project_uuid] = uuid
@@ -2081,9 +2132,18 @@ class f_generate_projects_from_project(BaseUDTF):
         attrs_dict[project_nlp_enterprise_attachment] = nlp_enterprise_attachment
         attrs_dict[project_update_time] = update_time
 
+        attrs_dict[project_tenderee_code] = tenderee_code
+        attrs_dict[project_agency_code] = agency_code
+        attrs_dict[project_candidates] = candidates
+        attrs_dict[project_win_tenderer_code] = win_tenderer_code
+        attrs_dict[project_second_tenderer_code] = second_tenderer_code
+        attrs_dict[project_third_tenderer_code] = third_tenderer_code
+        attrs_dict[project_win_tenderer_joints] = win_tenderer_joints
+        attrs_dict[project_multi_winners] = multi_winners
 
         popNoneFromDict(attrs_dict)
 
+
         attrs_json = json.dumps(attrs_dict,ensure_ascii=False)
         if bidding_budget is None:
             bidding_budget = -1
@@ -2129,7 +2189,7 @@ def dumplicate_projects(list_projects,b_log=False):
     appendKeyvalueCount(list_projects)
     list_projects.sort(key=lambda x:str(x.get(project_page_time,"")))
     list_projects.sort(key=lambda x:x.get("keyvaluecount",0),reverse=True)
-    cluster_projects = list_projects[:10]
+    cluster_projects = list_projects[:100]
     _count = 10
     log("dumplicate projects rest %d"%len(cluster_projects))
     while _count>0:
@@ -2170,7 +2230,7 @@ def update_projects_by_project(project_dict,projects):
     _dict = {}
     #更新公共属性
     for k,v in project_dict.items():
-        if k in (project_project_dynamics,project_page_time,project_sub_project_name,project_product,project_project_codes,project_docids,project_uuid,project_nlp_enterprise,project_nlp_enterprise_attachment):
+        if k in (project_project_dynamics,project_page_time,project_sub_project_name,project_product,project_project_codes,project_docids,project_uuid,project_nlp_enterprise,project_nlp_enterprise_attachment,project_candidates):
             continue
         for _proj in projects:
             if k not in _proj:
@@ -2203,20 +2263,40 @@ def update_projects_by_project(project_dict,projects):
     set_delete_uuid = set()
     set_nlp_enterprise = set()
     set_nlp_enterprise_attachment = set()
+    set_update_uuid = set()
+
+    set_candidates = set()
+
+
+    list_candidates = []
+    try:
+        set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
+        set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
+        list_candidates = json.loads(project_dict.get(project_candidates,"[]"))
+    except Exception as e:
+        pass
+
     for _proj in projects:
         _docids = _proj.get(project_docids,"")
         _codes = _proj.get(project_project_codes,"")
         _product = _proj.get(project_product,"")
         _uuid = _proj.get(project_uuid,"")
+        update_uuid = _proj.get("project_uuid","")
         delete_uuid = _proj.get(project_delete_uuid,"")
         set_docid = set_docid | set(_docids.split(","))
         set_code = set_code | set(_codes.split(","))
         set_product = set_product | set(_product.split(","))
         set_uuid = set_uuid | set(_uuid.split(","))
+        set_update_uuid = set_update_uuid | set(update_uuid.split(","))
         set_delete_uuid = set_delete_uuid | set(delete_uuid.split(","))
         try:
             set_nlp_enterprise |= set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
             set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
+
+            for item in json.loads(_proj.get(project_candidates,"[]")):
+                if item.get("name") is not None and item.get("name") not in set_candidates:
+                    list_candidates.append(item)
+                    set_candidates.add(item.get("name"))
+
         except Exception as e:
             pass
     set_docid = set_docid | set(project_dict.get(project_docids,"").split(","))
@@ -2225,12 +2305,9 @@ def update_projects_by_project(project_dict,projects):
 
     set_uuid = set_uuid | set(project_dict.get(project_uuid,"").split(","))
     set_delete_uuid = set_delete_uuid | set(project_dict.get(project_delete_uuid,"").split(","))
+    set_update_uuid = set_update_uuid | set(project_dict.get("project_uuid","").split(","))
+
 
-    try:
-        set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
-        set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
-    except Exception as e:
-        pass
 
     append_dict[project_docids] = ",".join([a for a in list(set_docid) if a!=""])
     append_dict[project_docid_number] = len(set_docid)
@@ -2238,8 +2315,10 @@ def update_projects_by_project(project_dict,projects):
     append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""][:30])
     append_dict[project_uuid] = ",".join([a for a in list(set_uuid) if a!=""])
     append_dict[project_delete_uuid] = ",".join([a for a in list(set_delete_uuid) if a!=""])
+    append_dict["update_uuid"] = ",".join([a for a in list(set_update_uuid) if a!=""])
     append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
     append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
+    append_dict[project_candidates] = json.dumps(list_candidates,ensure_ascii=False)
 
     dict_dynamic = {}
     set_docid = set()
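
The docids / project codes / uuid / update_uuid fields handled above are plain comma-separated strings, so each merge boils down to a set union that drops empty entries; a small illustrative helper (the name is an assumption):

    def merge_csv_field(*values):
        # Union several comma-separated fields, dropping empty entries.
        merged = set()
        for v in values:
            merged |= set((v or "").split(","))
        return ",".join(a for a in merged if a != "")

    print(merge_csv_field("101,102", "102,103", ""))   # "101,102,103" in some order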
@@ -2568,6 +2647,7 @@ def check_project_codes_merge(list_code,list_code_to_merge,b_log):
 
 
 def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=False,simple_check=False):
+
     docids = _proj.get(project_docids,"")
     page_time = _proj.get(project_page_time,"")
     project_codes = _proj.get(project_project_codes,"")
@@ -2620,6 +2700,14 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=Fa
 
     project_dynamics_to_merge = _dict.get(project_project_dynamics)
 
+    # if len(set([docids,docids_to_merge])&set(["576859812","545764033"]))==2:
+    #     if return_prob:
+    #         return True,1
+    #     return True
+
+    if b_log:
+        log("check %s-%s ,%s-%s"%(docids,docids_to_merge,sub_project_name,sub_project_name_to_merge))
+
     is_few = False
     if (0 if project_codes=="" else 1) + (0 if project_name=="" else 1) + (0 if bidding_budget<0 else 1) +(0 if tenderee=="" else 1) + (0 if win_bid_price<0 else 1) + (0 if win_tenderer=="" else 1)<=1:
         is_few = True
@@ -2678,21 +2766,20 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=Fa
 
     prob_count += _codes_check
 
-    if is_few:
-        if _codes_check!=1:
-            if _title_check!=1:
-                if return_prob:
-                    return False,0
-                return False
-            if len(enterprise)>0 and len(enterprise_to_merge)>0:
-                if len(enterprise & enterprise_to_merge)==0:
-                    if return_prob:
-                        return False,0
-                    return False
-            if _product_check==-1:
+    if _codes_check!=1:
+        if _title_check!=1:
+            if return_prob:
+                return False,0
+            return False
+        if len(enterprise)>0 and len(enterprise_to_merge)>0:
+            if len(enterprise & enterprise_to_merge)==0:
                 if return_prob:
                     return False,0
                 return False
+        if _product_check==-1:
+            if return_prob:
+                return False,0
+            return False
 
     min_count = 2
     if product=="" or product_to_merge=="":
@@ -2737,8 +2824,7 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=Fa
 
     _prob = prob_count/8
 
-    if b_log:
-        log("check %s-%s result%s"%(docids,docids_to_merge,str(check_dict)))
+
     if _prob<0.15:
         if b_log:
             log("prob less than 0.15 prob_count:%d"%(prob_count))
@@ -2923,17 +3009,128 @@ class MyEncoder(json.JSONEncoder):
             return obj
         return json.JSONEncoder.default(self, obj)
 
+def update_document_from_dynamic(_proj):
+    try:
+        list_dynamic = []
+        try:
+            list_dynamic = json.loads(_proj.get(project_project_dynamics,"[]"))
+        except Exception as e:
+            pass
+
+        dict_update_dict = {}
+        dict_column_count = {}
+        dict_addr_count = {}
+        for _dynamic in list_dynamic:
+            docid = _dynamic.get(document_docid)
+            tenderee = _dynamic.get(document_tenderee)
+            agency = _dynamic.get(document_agency)
+            province = _dynamic.get(document_province)
+            city = _dynamic.get(document_city)
+            district = _dynamic.get(document_district)
+
+
+            if getLength(tenderee)>0:
+                if tenderee not in dict_column_count:
+                    dict_column_count[tenderee] = {"count":1,"type":document_tenderee,"value":tenderee}
+                else:
+                    dict_column_count[tenderee]["count"] += 1
+            if getLength(agency)>0:
+                if agency not in dict_column_count:
+                    dict_column_count[agency] = {"count":1,"type":document_agency,"value":agency}
+                else:
+                    dict_column_count[agency]["count"] += 1
+
+            if province is not None and city is not None and district is not None:
+                addr = "%s%s%s"%(province,city,district)
+                if addr not in dict_addr_count:
+                    dict_addr_count[addr] = {"count":1}
+                    dict_addr_count[addr][document_province] = province
+                    dict_addr_count[addr][document_city] = city
+                    dict_addr_count[addr][document_district] = district
+                    if district!="":
+                        dict_addr_count[addr]["level"] = 3
+                    elif city!="":
+                        dict_addr_count[addr]["level"] = 2
+                    else:
+                        dict_addr_count[addr]["level"] = 1
+                else:
+                    dict_addr_count[addr]["count"] += 1
+
+        dict_list_v = {}
+        for k,v in dict_column_count.items():
+            _type = v.get("type")
+            if _type not in dict_list_v:
+                dict_list_v[_type] = []
+            dict_list_v[_type].append(v)
+        for k,v in dict_list_v.items():
+            v.sort(key=lambda x:x["count"],reverse=True)
+            if len(v)>0:
+                _proj[k] = v[0]["value"]
+                for _dynamic in list_dynamic:
+                    docid = _dynamic.get(document_docid)
+                    _v = _dynamic.get(k)
+                    if _v is not None and _v!="":
+                        if _v!=v[0]["value"]:
+                            if docid not in dict_update_dict:
+                                dict_update_dict[docid] = {document_docid:docid}
+                            dict_update_dict[docid][k] = v[0]["value"]
+        list_v = []
+        for k,v in dict_addr_count.items():
+            list_v.append(v)
+        list_v.sort(key=lambda x:x.get("count",0),reverse=True)
+        list_v.sort(key=lambda x:x.get("level",0),reverse=True)
+        if len(list_v)>0:
+            province = list_v[0].get(document_province)
+            city = list_v[0].get(document_city)
+            district = list_v[0].get(document_district)
+
+            _proj[document_province] = province
+            _proj[document_city] = city
+            _proj[document_district] = district
+            for _dynamic in list_dynamic:
+                docid = _dynamic.get(document_docid)
+
+                if document_province in _dynamic:
+                    if _dynamic.get(document_province,"")==province or _dynamic.get(document_province,"") in ("全国","未知",""):
+                        if province!=_dynamic.get(document_province,"") or city!=_dynamic.get(document_city,"") or district!=_dynamic.get(document_district,""):
+                            if docid not in dict_update_dict:
+                                dict_update_dict[docid] = {document_docid:docid}
+                            dict_update_dict[docid][document_province] = province
+                            dict_update_dict[docid][document_city] = city
+                            dict_update_dict[docid][document_district] = district
+        update_v = []
+        for k,v in dict_update_dict.items():
+            update_v.append(v)
+        _proj["document_update"] = update_v
+    except Exception as e:
+        pass
+
+
+
+
 def to_project_json(projects):
 
     list_proj = []
     for _proj in projects:
         _uuid = _proj.get(project_uuid,"")
+        update_uuid = _proj.get("update_uuid","")
+        _project_uuid = _proj.get("project_uuid","")
         if "enterprise" in _proj:
             _proj.pop("enterprise")
         list_uuid = [a for a in _uuid.split(",") if a!=""]
+        list_update_uuid = [a for a in update_uuid.split(",") if a!=""]
+        if _project_uuid:
+            list_update_uuid.append(_project_uuid)
+        list_update_uuid = list(set(list_update_uuid))
         if len(list_uuid)>0:
             _proj["keep_uuid"] = list_uuid[0]
             _proj["delete_uuid"] = ",".join(list_uuid[1:])
+            list_update_uuid.extend(list_uuid[1:])
+            _proj["update_uuid"] = ",".join(list_update_uuid)
+        elif len(list_update_uuid)>0:
+            _proj["keep_uuid"] = list_update_uuid[0]
+            _proj["delete_uuid"] = _proj.get("delete_uuid","")
+            _proj["update_uuid"] = ",".join(list_update_uuid[1:])
         else:
             _proj["keep_uuid"] = _proj.get("keep_uuid","")
             to_delete = _proj.get("to_delete","")
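
update_document_from_dynamic above picks a project-level tenderee/agency and address by counting occurrences across the dynamics and preferring the most specific address. A compressed, standalone sketch of that selection (function and variable names are illustrative):

    from collections import Counter

    def pick_majority(values):
        # Most frequent non-empty value wins; ties keep first-seen order.
        counts = Counter(v for v in values if v)
        return counts.most_common(1)[0][0] if counts else ""

    def pick_addr(addrs):
        # addrs: list of (province, city, district) tuples. Prefer the most
        # specific level (district > city > province), then the highest count,
        # which is the ordering the two sorts above produce.
        counts = Counter(addrs)
        def level(addr):
            province, city, district = addr
            return 3 if district else 2 if city else 1
        ranked = sorted(counts.items(), key=lambda kv: (level(kv[0]), kv[1]), reverse=True)
        return ranked[0][0] if ranked else ("", "", "")

    print(pick_majority(["甲公司", "甲公司", "乙公司"]))                  # 甲公司
    print(pick_addr([("广东", "珠海", ""), ("广东", "珠海", "金湾区")]))   # ('广东', '珠海', '金湾区')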
@@ -2944,6 +3141,9 @@ def to_project_json(projects):
         list_proj.append(_proj)
         if project_uuid in _proj:
             _proj.pop(project_uuid)
+        if "project_uuid" in _proj:
+            _proj.pop("project_uuid")
+        update_document_from_dynamic(_proj)
     return json.dumps(list_proj,cls=MyEncoder,ensure_ascii=False)
 
 def get_page_time_dis(page_time,n_page_time):
@@ -2964,6 +3164,15 @@ def check_page_time_dup(page_time,n_page_time):
     return False
 
 
+def check_fix_document(doctitle,n_doctitle):
+    _fix = re.search("更正|更新|变更|澄清",doctitle)
+    _n_fix = re.search("更正|更新|变更|澄清",n_doctitle)
+    if _fix is not None and _n_fix is not None:
+        return True
+    if _fix is None and _n_fix is None:
+        return True
+    return False
+
 def dumplicate_document_in_merge(list_projects,dup_docid):
     '''
     合并时去重
@@ -3013,6 +3222,8 @@ def dumplicate_document_in_merge(list_projects,dup_docid):
                             continue
                         if is_multipack or n_is_multipack:
                             continue
+                        if not check_fix_document(doctitle,n_doctitle):
+                            continue
                         n_title_search = re.search("[一二三四五六七八九十1-9]+(?:次|标|包)",n_doctitle)
                         if title_search is None and n_title_search is None:
                             pass
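
The new check_fix_document guard only lets two documents deduplicate against each other when both titles, or neither, look like a correction/clarification notice; restated compactly with a quick check (the regex is copied from the hunk above):

    import re

    _FIX = re.compile("更正|更新|变更|澄清")

    def check_fix_document(doctitle, n_doctitle):
        # True only when both titles, or neither, contain a correction keyword.
        return (_FIX.search(doctitle) is None) == (_FIX.search(n_doctitle) is None)

    print(check_fix_document("某项目更正公告", "某项目澄清公告"))   # True
    print(check_fix_document("某项目更正公告", "某项目中标公告"))   # False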

+ 17 - 0
BaseDataMaintenance/model/oracle/QiTaShiXinTemp.py

@@ -0,0 +1,17 @@
+
+import traceback
+from BaseDataMaintenance.model.oracle.TouSuTemp import SouSuTemp
+
+dict_replace = {""}
+
+class QiTaShiXin(SouSuTemp):
+
+    def __init__(self,_dict):
+        SouSuTemp.__init__(self,_dict)
+        self.table_name = "bxkc.t_qi_ta_shi_xin_temp"
+        self.setValue("docchannel",303,True)
+        self.setValue("original_type","qi_ta_shi_xin",True)
+
+    def getPrimary_keys(self):
+        return ["ID"]
+

+ 38 - 10
BaseDataMaintenance/model/oracle/T_SHEN_PI_XIANG_MU.py

@@ -41,6 +41,8 @@ class T_SHEN_PI_XIANG_MU(BaseModel):
     def getProperties_ots(self):
         new_dict = {}
         for k,v in self.__dict__.items():
+            if k=="all_columns":
+                continue
             if v is not None:
                 if isinstance(v,(str,int,float)):
                     pass
@@ -52,12 +54,20 @@ class T_SHEN_PI_XIANG_MU(BaseModel):
         docid = int(new_dict.get("id",0))
         partition_key = docid%500+1
 
-        new_dict["partition_key"] = partition_key
+        new_dict["partitionkey"] = partition_key
         new_dict["docid"] = docid
         new_dict["original_id"] = str(new_dict.get(T_SHEN_PI_XIANG_MU_ID))
+        new_dict["uuid"] = str(new_dict.get(T_SHEN_PI_XIANG_MU_ID))
         new_dict.pop(T_SHEN_PI_XIANG_MU_ID)
 
-        new_dict["uuid"] = str(new_dict.get(T_SHEN_PI_XIANG_MU_ID))
+        try:
+            if new_dict.get(T_SHEN_PI_XIANG_MU_SOURCE_STAGE) is not None:
+                new_dict[T_SHEN_PI_XIANG_MU_SOURCE_STAGE] = int(new_dict.get(T_SHEN_PI_XIANG_MU_SOURCE_STAGE,0))
+            if new_dict.get(T_SHEN_PI_XIANG_MU_SOURCE_TYPE) is not None:
+                new_dict[T_SHEN_PI_XIANG_MU_SOURCE_TYPE] = int(new_dict.get(T_SHEN_PI_XIANG_MU_SOURCE_TYPE,0))
+        except Exception as e:
+            pass
+
 
         new_dict["crtime"] = new_dict.get(T_SHEN_PI_XIANG_MU_CREATE_TIME)
         new_dict["docchannel"] = 302
@@ -65,11 +75,13 @@ class T_SHEN_PI_XIANG_MU(BaseModel):
         new_dict["doctitle"] = new_dict.get(T_SHEN_PI_XIANG_MU_PAGE_TITLE,"")
         new_dict.pop(T_SHEN_PI_XIANG_MU_PAGE_TITLE)
 
-        new_dict["dochtmlcon"] = new_dict.get(T_SHEN_PI_XIANG_MU_PAGE_CONTENT)
-        new_dict.pop(T_SHEN_PI_XIANG_MU_PAGE_CONTENT)
+        new_dict["dochtmlcon"] = new_dict.get(T_SHEN_PI_XIANG_MU_PAGE_CONTENT,"")
+        if T_SHEN_PI_XIANG_MU_PAGE_CONTENT in new_dict:
+            new_dict.pop(T_SHEN_PI_XIANG_MU_PAGE_CONTENT)
 
-        new_dict["detail_link"] = new_dict.get(T_SHEN_PI_XIANG_MU_DETAILLINK)
-        new_dict.pop(T_SHEN_PI_XIANG_MU_DETAILLINK)
+        new_dict["detail_link"] = new_dict.get(T_SHEN_PI_XIANG_MU_DETAILLINK,"")
+        if T_SHEN_PI_XIANG_MU_DETAILLINK in new_dict:
+            new_dict.pop(T_SHEN_PI_XIANG_MU_DETAILLINK)
 
         new_dict[T_SHEN_PI_XIANG_MU_PAGE_ATTACHMENTS] = new_dict.get(T_SHEN_PI_XIANG_MU_ATTACHMENT_PATH,"[]")
 
@@ -81,15 +93,31 @@ class T_SHEN_PI_XIANG_MU(BaseModel):
             new_dict["original_docchannel"] = new_dict["docchannel"]
         return new_dict
 
-    def select_rows(conn,max_shenpi_id,limit=500):
+    @staticmethod
+    def get_max_id(conn):
+        cursor = conn.cursor()
+        sql = "select max(id) from %s"%("bxkc.t_shen_pi_xiang_mu_new")
+
+        cursor.execute(sql)
+        rows = cursor.fetchall()
+
+        if len(rows)>0 and rows[0][0] is not None:
+            max_id = rows[0][0]
+            log("select_max_id:%d"%(max_id))
+            return max_id
+        return None
+
+
+    @staticmethod
+    def select_rows(conn,_id,limit=500):
         list_result = []
         s_limit = ""
         if limit is not None:
             s_limit = "limit %d"%limit
-        s_where = " where id>%d "%(max_shenpi_id)
+        s_where = " where id=%d "%(_id)
 
         cursor = conn.cursor()
-        sql = "select %s from %s %s %s order by id asc"%("*","t_shen_pi_xiang_mu_new",s_where,s_limit)
+        sql = "select %s from %s %s "%("*","bxkc.t_shen_pi_xiang_mu_new",s_where)
         log("select rows:%s"%(sql))
         cursor.execute(sql)
 
@@ -98,7 +126,7 @@ class T_SHEN_PI_XIANG_MU(BaseModel):
         for row in rows:
             _dict = {}
             for _vol,_val in zip(vol,row):
-                _name = _vol[0]
+                _name = str(_vol[0]).lower()
                 _dict[_name] = _val
             list_result.append(T_SHEN_PI_XIANG_MU(_dict))
         return list_result
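
select_rows now lower-cases the Oracle column names read from cursor.description so they line up with the lowercase keys expected by getProperties_ots; the conversion in isolation (a DB-API cursor is assumed):

    def rows_to_dicts(cursor):
        # cursor.description yields (name, type, ...) tuples; Oracle reports the
        # names in upper case, so normalise them before building the row dicts.
        cols = [str(col[0]).lower() for col in cursor.description]
        return [dict(zip(cols, row)) for row in cursor.fetchall()]

    # usage: list_result = [T_SHEN_PI_XIANG_MU(d) for d in rows_to_dicts(cursor)]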

+ 17 - 0
BaseDataMaintenance/model/oracle/TouSuChuLiTemp.py

@@ -0,0 +1,17 @@
+
+import traceback
+from BaseDataMaintenance.model.oracle.TouSuTemp import SouSuTemp
+
+dict_replace = {""}
+
+class TouSuChuLiTemp(SouSuTemp):
+
+    def __init__(self,_dict):
+        SouSuTemp.__init__(self,_dict)
+        self.table_name = "bxkc.t_tou_su_chu_li_temp"
+        self.setValue("docchannel",303,True)
+        self.setValue("original_type","tou_su_chu_li",True)
+
+    def getPrimary_keys(self):
+        return ["ID"]
+

+ 215 - 0
BaseDataMaintenance/model/oracle/TouSuTemp.py

@@ -0,0 +1,215 @@
+
+import traceback
+from BaseDataMaintenance.model.oracle.BaseModel import BaseModel
+from datetime import datetime
+from BaseDataMaintenance.common.Utils import getCurrent_date,log
+
+dict_oracle2ots = {"WEB_SOURCE_NO":"web_source_no",
+                    "AREA":"area",
+                    "PROVINCE":"province",
+                    "CITY":"city",
+                    "WEB_SOURCE_NAME":"web_source_name",
+                    "INFO_SOURCE":"info_source",
+                    "INFO_TYPE":"info_type",
+                    "INDUSTRY":"industry",
+                    "ID":"uuid",
+                    "PAGE_TITLE":"doctitle",
+                    "PAGE_TIME":"page_time",
+                    "PAGE_CONTENT":"dochtmlcon",
+                    "ATTACHMENT_PATH":"page_attachments",
+                    "CREATE_TIME":"crtime",
+                    "DISTRICT":"district",
+                    "DETAILLINK":"detail_link",
+                   "RECORD_ID":"record_id",
+                   "PUNISHNO":"punishno",
+                   "INSTITUTION":"institution",
+                   "PUNISHTIME":"punish_time",
+                   "PUNISHTYPE":"punish_type",
+                   "COMPLAINANT":"complainant",
+                   "PUNISHPERPLE":"punish_perple",
+                   "PUNISHWHETHER":"punish_whether",
+                   "PUNISHDECISION":"punish_decision",
+                   "docchannel":"docchannel",
+                   "original_type":"original_type"}
+
+
+class SouSuTemp(BaseModel):
+
+    def __init__(self,_dict):
+        self.all_columns = []
+        for k,v in _dict.items():
+            self.setValue(k,v,True)
+
+    def getPrimary_keys(self):
+        raise NotImplementedError()
+
+    def getProperties(self):
+        return self.__dict__
+
+    def getProperties_ots(self):
+        new_dict = {}
+        for k,v in self.__dict__.items():
+            if k in dict_oracle2ots:
+                n_k = dict_oracle2ots[k]
+                if v is not None:
+                    if isinstance(v,(str,int,float)):
+                        pass
+                    elif isinstance(v,(datetime)):
+                        v = v.strftime("%Y-%m-%d %H:%M:%S")
+                    else:
+                        v = str(v)
+                    new_dict[n_k] = v
+        opertime = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
+        publishtime = "%s %s"%(new_dict.get("page_time",""),opertime.split(" ")[1])
+        new_dict["opertime"] = opertime
+        new_dict["publishtime"] = publishtime
+        if "docchannel" in new_dict:
+            new_dict["original_docchannel"] = new_dict["docchannel"]
+        return new_dict
+
+    def setValue(self,k,v,isColumn=False):
+        if "all_columns" not in self.__dict__:
+            self.all_columns = []
+        self.__dict__[k] = v
+        if isColumn:
+            if k not in (set(self.all_columns)):
+                self.all_columns.append(k)
+
+    def delete_row(self,conn):
+        try:
+            cursor = conn.cursor()
+            sql = "delete %s  "%(self.table_name)
+            s_where = " where 1=1 "
+            _set_keys = set(self.getPrimary_keys())
+            has_key = False
+            if len(_set_keys)==0:
+                return
+            for k,v in self.__dict__.items():
+                if k in _set_keys:
+                    if v is None or str(v)=="":
+                        raise RuntimeError("主键%s为空"%k)
+                    s_where += " and %s="%k
+                    if isinstance(v,str):
+                        s_where += "'%s' "%v
+                    else:
+                        s_where += "%d "%v
+                    has_key = True
+            log("delete sql:%s-%s %s"%(str(has_key),sql,s_where))
+            if has_key:
+                sql = "%s %s"%(sql,s_where)
+                update_rows = cursor.execute(sql)
+                conn.commit()
+                return update_rows
+        except Exception as e:
+            traceback.print_exc()
+        return 0
+
+    def insert_row(self,conn):
+        try:
+            cursor = conn.cursor()
+            sql = "insert into %s"%(self.table_name)
+            s_columns = "("
+            s_values = "values("
+            _set_columns = set(self.all_columns)
+            for k,v in self.__dict__.items():
+                if k in _set_columns:
+                    if v is not None and str(v)!="":
+                        s_columns += "%s,"%k
+
+                        if isinstance(v,(int,)):
+                            s_values += "%d,"%v
+
+                        elif isinstance(v,(datetime)):
+                            s_values += "to_date('%s','yyyy-MM-dd HH24:mi:ss'),"%v.strftime("%Y-%m-%d %H:%M:%S")
+                        else:
+                            s_values += "'%s',"%str(v).replace("'","''")
+            s_columns = "%s)"%s_columns[:-1]
+            s_values = "%s)"%s_values[:-1]
+            sql = "%s%s%s"%(sql,s_columns,s_values)
+            print("sql",sql)
+            cursor.execute(sql)
+            conn.commit()
+        except Exception as e:
+            traceback.print_exc()
+
+
+    def update_row(self,conn,conditions=[]):
+        cursor = conn.cursor()
+        sql = "update %s set "%(self.table_name)
+        s_columns = ""
+        s_where = " where 1=1 "
+        _set_columns = set(self.all_columns)
+        _set_keys = set(self.getPrimary_keys())
+        for k,v in self.__dict__.items():
+            if k in _set_columns and k not in _set_keys:
+                if v is not None and str(v)!="":
+                    s_columns += "%s="%k
+                    if isinstance(v,str):
+                        s_columns += "'%s',"%v
+                    else:
+                        s_columns += "%d,"%v
+            elif k in _set_keys:
+                if v is None or str(v)=="":
+                    raise RuntimeError("主键%s为空"%k)
+                s_where += " and %s="%k
+                if isinstance(v,str):
+                    s_where += "'%s' "%v
+                else:
+                    s_where += "%d "%v
+        s_columns = "%s"%s_columns[:-1]
+        sql = "%s%s%s"%(sql,s_columns,s_where)
+        update_rows = cursor.execute(sql)
+        conn.commit()
+        return update_rows
+
+
+
+    def exists(self,conn):
+        s_where = " where 1=1 "
+        _set_columns = set(self.all_columns)
+        _set_keys = set(self.getPrimary_keys())
+        for k,v in self.__dict__.items():
+            if k in _set_keys:
+                if v is None or str(v)=="":
+                    raise RuntimeError("主键%s为空"%k)
+                s_where += " and %s="%k
+                if isinstance(v,str):
+                    s_where += "'%s' "%v
+                else:
+                    s_where += "%d "%v
+        cursor = conn.cursor()
+        sql = "select count(1) from %s %s"%(self.table_name,s_where)
+        cursor.execute(sql)
+        rows = cursor.fetchall()
+        if rows[0][0]==0:
+            return False
+        return True
+
+    @staticmethod
+    def select_rows(conn,cls,table_name,conditions,rows_to_get="*",limit=60):
+        list_result = []
+        s_limit = ""
+        if limit is not None:
+            s_limit = " and rownum<=%d"%limit
+        if len(conditions)>0:
+            s_where = " where %s"%(" and ".join(conditions))
+        else:
+            s_where = " where 1=1 "
+
+        cursor = conn.cursor()
+        sql = "select %s from %s %s %s"%(rows_to_get,table_name,s_where,s_limit)
+        log(sql)
+        cursor.execute(sql)
+
+        vol = cursor.description
+        rows = cursor.fetchall()
+        for row in rows:
+            _dict = {}
+            for _vol,_val in zip(vol,row):
+                _name = _vol[0]
+                _dict[_name] = _val
+            list_result.append(cls(_dict))
+        return list_result
+
+
+
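
getProperties_ots above is essentially a rename-and-normalise pass driven by dict_oracle2ots; a simplified standalone sketch of the same idea (the mapping and the publishtime rule come from the class, the helper name is an assumption):

    from datetime import datetime

    def to_ots_properties(row, mapping):
        # Keep only mapped columns, rename them to their OTS field names and
        # normalise values (datetimes become "YYYY-mm-dd HH:MM:SS" strings).
        out = {}
        for k, v in row.items():
            if k in mapping and v is not None:
                if isinstance(v, datetime):
                    v = v.strftime("%Y-%m-%d %H:%M:%S")
                elif not isinstance(v, (str, int, float)):
                    v = str(v)
                out[mapping[k]] = v
        opertime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        out["opertime"] = opertime
        # publishtime = page_time date + the current time of day, as above
        out["publishtime"] = "%s %s" % (out.get("page_time", ""), opertime.split(" ")[1])
        return out

    # usage: new_dict = to_ots_properties({"PAGE_TITLE": "t", "PAGE_TIME": "2024-01-01"}, dict_oracle2ots)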

+ 56 - 0
BaseDataMaintenance/model/oracle/WeiFaJiLuTemp.py

@@ -0,0 +1,56 @@
+
+import traceback
+from BaseDataMaintenance.model.oracle.TouSuTemp import SouSuTemp
+
+dict_replace = {""}
+
+class WeiFaJiLuTemp(SouSuTemp):
+
+    def __init__(self,_dict):
+        SouSuTemp.__init__(self,_dict)
+        self.table_name = "bxkc.t_wei_fa_ji_lu_temp"
+        self.setValue("docchannel",303,True)
+        self.setValue("original_type","wei_fa_ji_lu",True)
+
+    def getPrimary_keys(self):
+        return ["ID"]
+
+    @staticmethod
+    def synchonize():
+        try:
+            print("123")
+            from BaseDataMaintenance.dataSource.source import getConnection_oracle
+            conn = getConnection_oracle()
+            cursor = conn.cursor()
+            has_commit = 0
+            while 1:
+                sql = '''
+                INSERT INTO bxkc.t_wei_fa_ji_lu_temp
+SELECT *
+FROM (
+         SELECT w.*
+         FROM bxkc.t_wei_fa_ji_lu w
+                  LEFT JOIN bxkc.id_wei_fa_ji_lu b ON w.id = b.id
+         WHERE b.id IS not NULL
+     ) res
+WHERE ROWNUM < 10001
+                '''
+                cursor.execute(sql)
+                row_effected = cursor.rowcount
+
+                if row_effected==0:
+                    break
+                print("row_effected",row_effected)
+                sql1 = '''
+                delete bxkc.id_wei_fa_ji_lu where id in (select id from bxkc.t_wei_fa_ji_lu_temp)
+                '''
+                cursor.execute(sql1)
+                conn.commit()
+
+
+        except Exception as e:
+            traceback.print_exc()
+
+
+if __name__ == '__main__':
+    WeiFaJiLuTemp.synchonize()

+ 3 - 0
BaseDataMaintenance/model/ots/BaseModel.py

@@ -20,9 +20,12 @@ class BaseModel():
         raise NotImplementedError
 
     def setValue(self,k,v,isColumn=True):
+        if k=="all_columns":
+            return
         if "all_columns" not in self.__dict__ or not isinstance(self.__dict__["all_columns"],(list)):
             self.all_columns = []
         self.__dict__[k] = v
+
         if isColumn:
             if k not in (set(self.all_columns)):
                 self.all_columns.append(k)
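
The new early return in setValue keeps a stray "all_columns" field in the source data from clobbering the internal column registry; a tiny reproduction of the behaviour (the class name is illustrative, the method body mirrors the one above):

    class Model:
        def setValue(self, k, v, isColumn=True):
            if k == "all_columns":          # protect the internal column registry
                return
            if "all_columns" not in self.__dict__ or not isinstance(self.__dict__["all_columns"], list):
                self.all_columns = []
            self.__dict__[k] = v
            if isColumn and k not in set(self.all_columns):
                self.all_columns.append(k)

    m = Model()
    m.setValue("docid", 1)
    m.setValue("all_columns", "oops")       # ignored instead of overwriting the list
    print(m.all_columns)                    # ['docid']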

+ 59 - 32
BaseDataMaintenance/model/ots/document.py

@@ -307,10 +307,11 @@ def turn_document_status():
 
         bool_query = BoolQuery(
             must_queries=[
-                # MatchPhraseQuery("doctitle","珠海城市职业技术学院2022年05月至2022年06月政府采购意向"),
-                WildcardQuery("web_source_no","03716-*"),
-                RangeQuery("page_time","2024-04-24"),
-                TermQuery("save",1)
+                MatchPhraseQuery("doctitle","破产清算案"),
+                MatchPhraseQuery("project_name","经相关部门批准后方可开展经营活动"),
+                # WildcardQuery("web_source_no","03716-*"),
+                # RangeQuery("product_number",500),
+                # TermQuery("save",1)
                 # RangeQuery("status",0,1),
                 # NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5")),
                 # TermQuery("docid",397656324)
@@ -341,25 +342,25 @@ def turn_document_status():
         #
         # )
 
-        rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
-                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
-                                                                       columns_to_get=ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
-        list_data = getRow_ots(rows)
-        print(total_count)
-        _count = len(list_data)
-        for _data in list_data:
-            _document = Document_tmp(_data)
-            task_queue.put(_document)
-        while next_token:
-            rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
-                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-                                                                           columns_to_get=ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
-            list_data = getRow_ots(rows)
-            _count += len(list_data)
-            print("%d/%d"%(_count,total_count))
-            for _data in list_data:
-                _document = Document_tmp(_data)
-                task_queue.put(_document)
+        # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+        #                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
+        #                                                                columns_to_get=ColumnsToGet(["product","product_number"],return_type=ColumnReturnType.SPECIFIED))
+        # list_data = getRow_ots(rows)
+        # print(total_count)
+        # _count = len(list_data)
+        # for _data in list_data:
+        #     _document = Document(_data)
+        #     task_queue.put(_document)
+        # while next_token:
+        #     rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+        #                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+        #                                                                    columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
+        #     list_data = getRow_ots(rows)
+        #     _count += len(list_data)
+        #     print("%d/%d"%(_count,total_count))
+        #     for _data in list_data:
+        #         _document = Document(_data)
+        #         task_queue.put(_document)
 
         # docids = [223820830,224445409]
         # for docid in docids:
@@ -367,19 +368,39 @@ def turn_document_status():
         #              document_partitionkey:int(docid)%500+1,
         #              }
         #     task_queue.put(Document(_dict))
-        # import pandas as pd
-        # df = pd.read_excel(r"F:\Workspace2016\DataMining\export\abc1.xlsx")
-        # for docid in df["docid1"]:
-        #     _dict = {document_docid:int(docid),
-        #              document_partitionkey:int(docid)%500+1,
-        #              }
-        #     task_queue.put(Document(_dict))
+        import pandas as pd
+        df = pd.read_csv(r"C:\Users\Administrator\Desktop\export_241224_6.csv")
+        list_docid = df["docid"]
+        # list_docid = [519497468]
+
+        # list_docid = []
+        # filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-10\金额缺失的id (1).txt"
+        # with open(filename,"r",encoding="utf8") as f:
+        #     while 1:
+        #         line = f.readline()
+        #         if not line:
+        #             break
+        #         line = line.strip()
+        #         docid = line.split('-')[-1]
+        #         if re.search("^\d+$",docid) is not None:
+        #             list_docid.append(int(docid))
+
+        for docid,construct_company,recall_flag in zip(list_docid,df["construct_company"],df["recall_flag"]):
+            if recall_flag == 1:
+                _dict = {document_docid:int(docid),
+                         document_partitionkey:int(docid)%500+1,
+                         "construct_company":construct_company
+                         }
+                task_queue.put(Document(_dict))
         # for docid in df["docid2"]:
         #     _dict = {document_docid:int(docid),
         #              document_partitionkey:int(docid)%500+1,
         #              }
         #     task_queue.put(Document(_dict))
-        # log("task_queue size:%d"%(task_queue.qsize()))
+        log("task_queue size:%d"%(task_queue.qsize()))
+
+
+
 
     def _handle(item,result_queue,ots_client):
         #change attach value
@@ -405,8 +426,14 @@ def turn_document_status():
         # item.setValue(document_province,"广东",True)
         # item.setValue(document_city,"珠海",True)
         # item.setValue(document_district,"金湾区",True)
-        item.setValue(document_status,66,True)
+        # item.setValue(document_status,66,True)
         # print(item.getProperties())
+        # item.setValue(document_status,1,True)
+        # product = item.getProperties().get(document_product)
+        # l_product = product.split(",")
+        # n_product = ",".join(l_product[:500])
+        # item.setValue(document_product,n_product,True)
+        # item.fix_columns(ots_client,["extract_json","doctitle",""],True)
         item.update_row(ots_client)
         # log("update %d status done"%(item.getProperties().get(document_docid)))
         pass

+ 20 - 18
BaseDataMaintenance/model/ots/document_tmp.py

@@ -254,6 +254,7 @@ def turn_document_tmp_status():
     ots_client = getConnect_ots()
 
     def producer1(task_queue,ots_client):
+        a = ''
         for l_a in a.split("\n"):
             l_a = l_a.strip()
             if l_a !="":
@@ -266,8 +267,8 @@ def turn_document_tmp_status():
         bool_query = BoolQuery(
             must_queries=[
                 # TermQuery("fingerprint","md5=2cc044b81ec13acddcc970b71b780365")
-                TermQuery("save",1),
-                RangeQuery("status",72),
+                # TermQuery("save",66),
+                RangeQuery("status",1,51),
                 # BoolQuery(should_queries=[
                 #                           # TermQuery("tenderee","山西利民工业有限责任公司"),
                 #                           # MatchPhraseQuery("doctitle","中国电信"),
@@ -280,11 +281,11 @@ def turn_document_tmp_status():
                 #                                  ]
                 # )
             ],
-            # must_not_queries=[
-            #     TermQuery("docid",288599518)
-            #     # ExistsQuery("status"),
-            #     # ExistsQuery("page_time"),
-            #                   ]
+            must_not_queries=[
+                # TermQuery("docid",288599518)
+                # ExistsQuery("doctitle"),
+                # ExistsQuery("page_time"),
+                              ]
         )
 
         rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
@@ -297,6 +298,7 @@ def turn_document_tmp_status():
         for _data in list_data:
             _document = Document_tmp(_data)
             task_queue.put(_document)
+        print(list_data)
         while next_token:
             rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
                                                                            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
@@ -348,23 +350,23 @@ def turn_document_tmp_status():
         # _extract_json = _extract_json.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '')
         # item.setValue(document_tmp_extract_json,_extract_json,True)
         # json.loads(_extract_json)
-        # item.setValue(document_tmp_status,71,True)
+        item.setValue(document_tmp_status,0,True)
         # item.setValue(document_tmp_save,1,True)
         # if item.exists_row(ots_client):
         #     item.update_row(ots_client)
         # print(item.getProperties())
-        # item.update_row(ots_client)
+        item.update_row(ots_client)
         # log("update %d status done"%(item.getProperties().get(document_tmp_docid)))
         # item.delete_row(ots_client)
-        from BaseDataMaintenance.model.ots.document import Document
-
-        Doc = Document(item.getProperties())
-        if Doc.fix_columns(ots_client,["status"],True):
-            if Doc.getProperties().get("status",0)>=401:
-                print(Doc.getProperties().get("docid"),"redo")
-                item.setValue("status",66,True)
-                item.update_row(ots_client)
-        pass
+        # from BaseDataMaintenance.model.ots.document import Document
+        #
+        # Doc = Document(item.getProperties())
+        # if Doc.fix_columns(ots_client,["status"],True):
+        #     if Doc.getProperties().get("status",0)>=401:
+        #         print(Doc.getProperties().get("docid"),"redo")
+        #         item.setValue("status",66,True)
+        #         item.update_row(ots_client)
+        # pass
 
     t_producer = Thread(target=producer,kwargs={"task_queue":task_queue,"ots_client":ots_client})
     t_producer.start()

Some files were not shown because too many files changed in this diff