View Source

Merge remote-tracking branch 'origin/master'

fangjiasheng, 5 months ago
parent commit 8bcf3fb0dd
31 changed files with 2616 additions and 456 deletions
  1. + 2 - 0      .idea/encodings.xml
  2. + 1 - 1      .idea/misc.xml
  3. + 77 - 0     BaseDataMaintenance/chat/ERNIE_utils.py
  4. + 86 - 0     BaseDataMaintenance/chat/chatUtil.py
  5. + 2 - 3      BaseDataMaintenance/common/Utils.py
  6. + 3 - 9      BaseDataMaintenance/common/activateMQUtils.py
  7. + 4 - 3      BaseDataMaintenance/common/documentFingerprint.py
  8. + 2 - 2      BaseDataMaintenance/common/ossUtils.py
  9. + 152 - 74   BaseDataMaintenance/dataMonitor/data_monitor.py
  10. + 5 - 5     BaseDataMaintenance/dataSource/setttings.py
  11. + 3 - 2     BaseDataMaintenance/fixDoc_to_queue_extract.py
  12. + 1 - 1     BaseDataMaintenance/maintenance/attachment/attachmentProcess.py
  13. + 238 - 136 BaseDataMaintenance/maintenance/dataflow.py
  14. + 218 - 92  BaseDataMaintenance/maintenance/dataflow_mq.py
  15. + 646 - 0   BaseDataMaintenance/maintenance/document/ApprovalData.py
  16. + 98 - 19   BaseDataMaintenance/maintenance/enterprise/enterprise2Redis.py
  17. + 164 - 0   BaseDataMaintenance/maintenance/gpt_extract.py
  18. + 3 - 3     BaseDataMaintenance/maintenance/preproject/fillColumns.py
  19. + 6 - 3     BaseDataMaintenance/maintenance/product/extract_data.py
  20. + 4 - 3     BaseDataMaintenance/maintenance/product/htmlparser.py
  21. + 21 - 1    BaseDataMaintenance/maxcompute/1.py
  22. + 218 - 13  BaseDataMaintenance/maxcompute/documentDumplicate.py
  23. + 237 - 26  BaseDataMaintenance/maxcompute/documentMerge.py
  24. + 17 - 0    BaseDataMaintenance/model/oracle/QiTaShiXinTemp.py
  25. + 38 - 10   BaseDataMaintenance/model/oracle/T_SHEN_PI_XIANG_MU.py
  26. + 17 - 0    BaseDataMaintenance/model/oracle/TouSuChuLiTemp.py
  27. + 215 - 0   BaseDataMaintenance/model/oracle/TouSuTemp.py
  28. + 56 - 0    BaseDataMaintenance/model/oracle/WeiFaJiLuTemp.py
  29. + 3 - 0     BaseDataMaintenance/model/ots/BaseModel.py
  30. + 59 - 32   BaseDataMaintenance/model/ots/document.py
  31. + 20 - 18   BaseDataMaintenance/model/ots/document_tmp.py

+ 2 - 0
.idea/encodings.xml

@@ -2,8 +2,10 @@
 <project version="4">
   <component name="Encoding">
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/attachmentProcessTime.xlsx" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/chat/chatUtil.py" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/dataSource/searchPaddle.py" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/attachment/2022-01-18_183521_export11.xlsx" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/gpt_extract.py" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/select_product_exclude_name_from_tw_prod.csv" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/select_product_product_name_exclude_name.csv" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/update_product.csv" charset="GBK" />

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" languageLevel="JDK_13" default="false" project-jdk-name="Python 3.7 (py37)" project-jdk-type="Python SDK">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_13" project-jdk-name="Python 3.7 (py37)" project-jdk-type="Python SDK">
     <output url="file://$PROJECT_DIR$/out" />
   </component>
 </project>

+ 77 - 0
BaseDataMaintenance/chat/ERNIE_utils.py

@@ -0,0 +1,77 @@
+
+import requests
+import json
+
+def get_access_token():
+    """
+    使用 API Key,Secret Key 获取access_token,替换下列示例中的应用API Key、应用Secret Key
+    """
+
+    url = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=gnwVXv96An9qMYqq9eWbeNqk&client_secret=mDsRQbCPsV4N7x28LbwkhTAaLmrrDnXk"
+    url = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=Ok8QMe4qIQOAex0F9Gf1uns0&client_secret=6DjGGDdvhnBaEOMdSXAg02KxZnQhWpbd"
+
+    payload = json.dumps("")
+    headers = {
+        'Content-Type': 'application/json',
+        'Accept': 'application/json'
+    }
+
+    response = requests.request("POST", url, headers=headers, data=payload)
+    return response.json().get("access_token")
+
+def main():
+    _token = get_access_token()
+    # _token = "24.93c9d66ffc94ffaef6c6c9d35770a5f5.2592000.1701242081.282335-37357318"
+    url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions?access_token=" + _token
+
+    # url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/xuanyuan_70b_chat?access_token=" + _token
+
+    payload = json.dumps({
+        "messages": [
+            {
+                "role": "user",
+                "content": '''
+               今天是几号
+                '''
+            }
+        ]
+    })
+    headers = {
+        'Content-Type': 'application/json'
+    }
+
+
+
+
+    response = requests.request("POST", url, headers=headers, data=payload)
+
+    print(response.text)
+
+def chat(msg,token=None,api_url=None):
+    if token is None:
+        token = get_access_token()
+    # _token = "24.93c9d66ffc94ffaef6c6c9d35770a5f5.2592000.1701242081.282335-37357318"
+    if api_url is None:
+        api_url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions"
+        # api_url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/ernie-3.5-128k"
+    url =  api_url+"?access_token="+ token
+    payload = json.dumps({
+        "messages": [
+            {
+                "role": "user",
+                "content": '''
+               %s
+                '''%msg
+            }
+        ],
+        "stream":False
+    })
+    headers = {
+        'Content-Type': 'application/json'
+    }
+    response = requests.request("POST", url, headers=headers, data=payload)
+
+    return response
+
+if __name__ == '__main__':
+    main()
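
A minimal usage sketch for the new chat() helper (hedged: it assumes the hard-coded Baidu credentials above are still valid, and that the ERNIE completions endpoint returns its reply in the JSON "result" field):

    from BaseDataMaintenance.chat.ERNIE_utils import chat

    # Single-turn prompt; chat() fetches an access_token itself when token is None.
    resp = chat("请用一句话概括招标公告通常包含哪些要素")
    data = resp.json()
    # On success the ERNIE chat API places the generated text under "result".
    print(data.get("result", data))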

+ 86 - 0
BaseDataMaintenance/chat/chatUtil.py

@@ -0,0 +1,86 @@
+#coding:utf8
+
+from bs4 import BeautifulSoup
+import re
+
+def html2text(_html):
+
+    if type(_html)==str:
+        _soup = BeautifulSoup(_html,"lxml")
+    else:
+        _soup = _html
+    list_table = _soup.find_all("table")
+    list_tbody = _soup.find_all("tbody")
+    if len(list_table)>0 or len(list_tbody)>0:
+        list_childs = _soup.find_all(recursive=False)
+        list_child_text = []
+        for child in list_childs:
+            list_child_text.append(html2text(child))
+        return "\n".join(list_child_text)
+
+    else:
+        if _soup.name=="table" or _soup.name=="tbody":
+            _table_text = ""
+            trs = _soup.find_all("tr")
+            list_tr_text = []
+            for tr in trs:
+                tds = tr.find_all("th")
+                if len(tds)>0:
+                    list_td_text = []
+                    for td in tds:
+                        list_td_text.append(re.sub('\s','',td.get_text()))
+                    list_tr_text.append("|".join(list_td_text))
+                tds = tr.find_all("td")
+                if len(tds)>0:
+                    list_td_text = []
+                    for td in tds:
+                        list_td_text.append(re.sub('\s','',td.get_text()))
+                    list_tr_text.append("|".join(list_td_text))
+            _table_text = "%s\n\n"%"\n".join(list_tr_text)
+            if _table_text == "":
+                _table_text = _soup.get_text()
+            _soup.decompose()
+            return _table_text
+        else:
+            _text = re.sub('\s','',_soup.get_text().strip())
+            _soup.decompose()
+            return _text
+
+def table2list(_html):
+    if type(_html)==str:
+        _soup = BeautifulSoup(_html,'lxml')
+    else:
+        _soup = _html
+    print("===",type(_soup),_soup.name)
+    if _soup.name=="table" or _soup.name=="tbody":
+        _table_text = ""
+        trs = _soup.find_all("tr")
+        list_tr_text = []
+        for tr in trs:
+            tds = tr.find_all("th")
+            if len(tds)>0:
+                list_td_text = []
+                for td in tds:
+                    list_td_text.append(re.sub('\s','',td.get_text()))
+                if len(list_td_text)>0:
+                    list_tr_text.append(list_td_text)
+            tds = tr.find_all("td")
+            if len(tds)>0:
+                list_td_text = []
+                for td in tds:
+                    list_td_text.append(re.sub('\s','',td.get_text()))
+                if len(list_td_text)>0:
+                    list_tr_text.append(list_td_text)
+        return list_tr_text
+
+def tableList2text(table_list):
+    list_tr_text = []
+    for tr in table_list:
+        tds = tr
+        if len(tds)>0:
+            list_td_text = []
+            for td in tds:
+                list_td_text.append(re.sub('\s','',td))
+            list_tr_text.append("|".join(list_td_text))
+    _table_text = "%s\n\n"%"\n".join(list_tr_text)
+    return _table_text

+ 2 - 3
BaseDataMaintenance/common/Utils.py

@@ -720,7 +720,6 @@ def getMultipleFactor(unit):
     MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"圆":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)}
     return MultipleFactor.get(unit)
 
-
 def getUnifyMoney(money):
     '''
     @summary:将中文金额字符串转换为数字金额
@@ -735,9 +734,9 @@ def getUnifyMoney(money):
     money = re.sub("[,,]","",money)
     money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money)
     result = Decimal(0)
-    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖","一","二","三","四","五","六","七","八","九"]
+    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
     # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
-    chnFactorUnits = ["圆", "元","兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
+    chnFactorUnits = ["兆", "亿", "万", "仟", '千', "佰", '百', "拾", '十',"圆", "元", "角", "分"]  # 20240611 修复大写提取错误 '陆拾陆亿伍千柒佰零叁万肆千叁佰陆拾伍元' Decimal('11607430365')
 
     LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
     BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))
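
The reordered chnFactorUnits fixes the mixed-unit amount cited in the inline comment above; a quick check (expected value worked out from the written amount, not from the commit):

    from BaseDataMaintenance.common.Utils import getUnifyMoney

    # 陆拾陆亿 + 伍千柒佰零叁万 + 肆千叁佰陆拾伍元 = 6,600,000,000 + 57,030,000 + 4,365
    print(getUnifyMoney("陆拾陆亿伍千柒佰零叁万肆千叁佰陆拾伍元"))  # expected Decimal('6657034365')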

+ 3 - 9
BaseDataMaintenance/common/activateMQUtils.py

@@ -15,18 +15,12 @@ def send_msg_toacmq(pool_conn,msg,dest,retry_times=5):
         conn = pool_conn.getConnector()
         try:
             conn.send(body=str(msg), destination=dest, persistent='false')
+            pool_conn.putConnector(conn)
             return True
         except Exception as e:
             traceback.print_exc()
-            try:
-                conn.disconnect()
-            except Exception as e:
-                pass
-        finally:
-            if conn.is_connected():
-                pool_conn.putConnector(conn)
-            else:
-                del conn
+            time.sleep(2)
+            del conn
     return False
 
 class MyListener(object):

+ 4 - 3
BaseDataMaintenance/common/documentFingerprint.py

@@ -1,4 +1,4 @@
-
+#coding:utf8
 
 import hashlib
 import codecs
@@ -47,6 +47,7 @@ def getFingerprint(sourceHtml):
     return _fingerprint
 
 if __name__=="__main__":
-    sourceHtml = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","rb",encoding="utf8").read()
-    # sourceHtml = "abcddafafffffffffffffffffffffffff你"
+    # sourceHtml = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","rb",encoding="utf8").read()
+    sourceHtml = "天全县农村敬老院护理能力提升改造项目初步设计及概算审批公示"+'<div> <div> <div> 天全县农村敬老院护理能力提升改造项目初步设计及概算审批公示 </div> <div> <div> <p>一、办理事项:天全县农村敬老院护理能力提升改造项目初步设计及概算审批</p> <p>二、项目业主:天全县民政局</p> <p>三、项目代码:2107-511825-04-01-642123</p> <p>四、办理状态:办结。</p> <p>五、办理时间:2024年5月14日</p> </div> </div> </div> </div>'
+    sourceHtml = "天全县农村敬老院护理能力提升改造项目初步设计及概算审批公示"+'审批项目'
     print(getFingerprint(sourceHtml))
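
For context, a sketch of the fingerprint idea only (the real getFingerprint is defined earlier in documentFingerprint.py and is not shown in this hunk; the whitespace normalization below is an assumption):

    import hashlib
    import re

    def sketch_fingerprint(source_html):
        # Assumed behaviour: strip whitespace, then hash, so near-identical pages collide.
        normalized = re.sub(r"\s", "", source_html)
        return hashlib.md5(normalized.encode("utf8")).hexdigest()

    print(sketch_fingerprint("天全县农村敬老院护理能力提升改造项目初步设计及概算审批公示" + "审批项目"))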

+ 2 - 2
BaseDataMaintenance/common/ossUtils.py

@@ -108,7 +108,7 @@ def test_download(filemd5):
 
 
 if __name__=="__main__":
-    # print(getMDFFromFile('8a9c96a68803c2ad01881d0ee93618e5.pdf'))
-    test_download("892bde698088f1d61b5310782550d0e1")
+    print(getMDFFromFile(r'G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-09\中国区超低氮锅炉电锅炉招标文件与附件(1).zip'))
+    # test_download("892bde698088f1d61b5310782550d0e1")
     # print(bucket.sign_url("GET","0015//20220623/2022-06-22/WGH001018/1655926900020.png",86500*30))
     # print(time.strftime("%Y-%m-%d",time.localtime(1658655178)))

+ 152 - 74
BaseDataMaintenance/dataMonitor/data_monitor.py

@@ -1,20 +1,29 @@
-import os, sys
+
+
+import os,sys
 import subprocess
-from datetime import datetime, timedelta
+from datetime import datetime,timedelta
+
 import psutil
 from apscheduler.schedulers.blocking import BlockingScheduler
-from BaseDataMaintenance.dataSource.source import getConnect_ots, getConnect_activateMQ
+
+from BaseDataMaintenance.dataSource.source import getConnect_ots,getConnect_activateMQ
+
 from BaseDataMaintenance.dataSource.interface import *
 from BaseDataMaintenance.common.Utils import *
+
 from tablestore import *
 from BaseDataMaintenance.dataSource.setttings import *
 from queue import Queue
 from BaseDataMaintenance.common.multiThread import MultiThreadHandler
 
+
 from BaseDataMaintenance.maintenance.dataflow_settings import *
 
+
 import pandas as pd
 
+
 flow_attachment_log_path = "/data/python/flow_attachment.log"
 
 flow_extract_log_path = "/data/python/flow_extract.log"
@@ -27,6 +36,48 @@ flow_init_check_dir = "/data/python/flow_init_check"
 flow_dumplicate_log_path = "/home/appuser/python/flow_dumplicate.log"
 
 
+def fixDoc_to_queue_init(filename=""):
+    import pandas as pd
+    from BaseDataMaintenance.model.oracle.GongGaoTemp import dict_oracle2ots
+    from BaseDataMaintenance.model.oracle.TouSuTemp import dict_oracle2ots as dict_oracle2ots_tousu
+
+    from BaseDataMaintenance.dataSource.source import getConnection_oracle
+    current_path = os.path.abspath(os.path.dirname(__file__))
+    if filename=="":
+        filename = os.path.join(current_path,"check.xlsx")
+    df = pd.read_excel(filename)
+    if "docchannel" in dict_oracle2ots:
+        dict_oracle2ots.pop("docchannel")
+    row_name = ",".join(list(dict_oracle2ots.keys()))
+
+    list_tousu_keys = []
+    for k,v in dict_oracle2ots_tousu.items():
+        if str(k).isupper():
+            list_tousu_keys.append(k)
+    row_name_tousu = ",".join(list(list_tousu_keys))
+    conn = getConnection_oracle()
+    cursor = conn.cursor()
+    _count = 0
+    for uuid,tablename,_exists,_toolong in zip(df["uuid"],df["tablename"],df["exists"],df["tolong"]):
+        if _exists==0 and _toolong==0:
+            _count += 1
+            is_tousu = False
+            if tablename in ('bxkc.t_wei_fa_ji_lu_temp','bxkc.t_tou_su_chu_li_temp','bxkc.t_qi_ta_shi_xin_temp'):
+                is_tousu = True
+            _source = str(tablename).replace("_TEMP","")
+            if is_tousu:
+                _source = str(tablename).replace("_temp","")
+            _rowname = row_name_tousu if is_tousu else row_name
+
+            sql = " insert into %s(%s) select %s from %s where id='%s' "%(tablename,_rowname,_rowname,_source,uuid)
+            log("%d:%s"%(_count,sql))
+            cursor.execute(sql)
+
+    conn.commit()
+    conn.close()
+
+    return _count
+
 class BaseDataMonitor():
 
     def __init__(self):
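
For reference, the statement that the fixDoc_to_queue_init helper added above builds for one missed complaint-type row looks roughly like this (table name and uuid are illustrative placeholders, not values from the commit):

    # Illustrative only: reproducing the SQL string built inside fixDoc_to_queue_init.
    tablename = "bxkc.t_wei_fa_ji_lu_temp"        # hypothetical missed row from check.xlsx
    uuid = "<uuid from the check report>"         # placeholder
    row_name = "ID,PAGE_TIME,..."                 # upper-case keys of dict_oracle2ots_tousu (elided)
    _source = tablename.replace("_temp", "")
    sql = " insert into %s(%s) select %s from %s where id='%s' " % (tablename, row_name, row_name, _source, uuid)
    # -> " insert into bxkc.t_wei_fa_ji_lu_temp(ID,PAGE_TIME,...) select ID,PAGE_TIME,... from bxkc.t_wei_fa_ji_lu where id='<uuid>' "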
@@ -43,35 +94,32 @@ class BaseDataMonitor():
     def get_last_tenmin_time(self, nums=15):
         current_time = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
 
-        last_ten_minite_time = timeAdd(current_time, 0, "%Y-%m-%d %H:%M:%S", -10)
+        last_ten_minite_time = timeAdd(current_time,0,"%Y-%m-%d %H:%M:%S",-10)
         return last_ten_minite_time[:nums]
 
-    def check_document_uuid(self, log_filename):
+    def check_document_uuid(self,log_filename):
 
-        def _handle(_item, result_queue):
-            bool_query = BoolQuery(must_queries=[TermQuery("uuid", _item.get("uuid"))])
+        def _handle(_item,result_queue):
+            bool_query = BoolQuery(must_queries=[TermQuery("uuid",_item.get("uuid"))])
 
-            rows, next_token, total_count, is_all_succeed = ots_client.search("document_tmp", "document_tmp_index",
-                                                                              SearchQuery(bool_query,
-                                                                                          get_total_count=True),
-                                                                              columns_to_get=ColumnsToGet(
-                                                                                  return_type=ColumnReturnType.NONE))
+            rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
+                                                                           SearchQuery(bool_query,get_total_count=True),
+                                                                           columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
 
             _item["exists"] = total_count
-
-        check_filename = "%s_check.xlsx" % (log_filename)
+        check_filename = "%s_check.xlsx"%(log_filename)
         list_uuid = []
         task_queue = Queue()
         dict_tolong = {}
         if not os.path.exists(check_filename) and os.path.exists(log_filename):
             _regrex = "delete\s+(?P<tablename>bxkc[^\s]+)\s+.*ID='(?P<uuid>.+)'"
             _regrex_tolong = "msg too long:(?P<uuid>[^,]+),\d+"
-            with open(log_filename, "r", encoding="utf8") as f:
+            with open(log_filename,"r",encoding="utf8") as f:
                 while 1:
                     _line = f.readline()
                     if not _line:
                         break
-                    _match = re.search(_regrex, _line)
+                    _match = re.search(_regrex,_line)
                     if _match is not None:
                         _uuid = _match.groupdict().get("uuid")
                         tablename = _match.groupdict().get("tablename")
@@ -99,36 +147,44 @@ class BaseDataMonitor():
                        "tolong": []}
 
             for _data in list_uuid:
-                for k, v in df_data.items():
-                    if k != "tolong":
+                for k,v in df_data.items():
+                    if k!="tolong":
                         v.append(_data.get(k))
-                df_data["tolong"].append(dict_tolong.get(_data["uuid"], 0))
+                df_data["tolong"].append(dict_tolong.get(_data["uuid"],0))
             df2 = pd.DataFrame(df_data)
             df2.to_excel(check_filename)
 
     def monitor_init(self):
 
-        def _handle(_item, result_queue):
-            bool_query = BoolQuery(must_queries=[TermQuery("uuid", _item.get("uuid"))])
+        def _handle(_item,result_queue):
+            bool_query = BoolQuery(must_queries=[TermQuery("uuid",_item.get("uuid"))])
 
-            rows, next_token, total_count, is_all_succeed = ots_client.search("document_tmp", "document_tmp_index",
-                                                                              SearchQuery(bool_query,
-                                                                                          get_total_count=True),
-                                                                              columns_to_get=ColumnsToGet(
-                                                                                  return_type=ColumnReturnType.NONE))
+            rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
+                                                                           SearchQuery(bool_query,get_total_count=True),
+                                                                           columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
+
+            if total_count>0:
+                _item["exists"] = total_count
+            else:
+                bool_query = BoolQuery(must_queries=[TermQuery("uuid",_item.get("uuid"))])
+
+                rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                               SearchQuery(bool_query,get_total_count=True),
+                                                                               columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
+
+                _item["exists"] = total_count
 
-            _item["exists"] = total_count
 
         try:
             current_date = getCurrent_date("%Y-%m-%d")
 
-            last_date = timeAdd(current_date, -1, "%Y-%m-%d")
+            last_date = timeAdd(current_date,-1,"%Y-%m-%d")
 
             if not os.path.exists(flow_init_check_dir):
                 os.mkdir(flow_init_check_dir)
 
-            log_filename = os.path.join(flow_init_log_dir, "flow_init_%s.log" % (last_date))
-            check_filename = os.path.join(flow_init_check_dir, "flow_init_%s.xlsx" % (last_date))
+            log_filename = os.path.join(flow_init_log_dir,"flow_init_%s.log"%(last_date))
+            check_filename = os.path.join(flow_init_check_dir,"flow_init_%s.xlsx"%(last_date))
 
             list_uuid = []
             task_queue = Queue()
@@ -136,66 +192,76 @@ class BaseDataMonitor():
             if not os.path.exists(check_filename) and os.path.exists(log_filename):
                 _regrex = "delete\s+(?P<tablename>bxkc[^\s]+)\s+.*ID='(?P<uuid>.+)'"
                 _regrex_tolong = "msg too long:(?P<uuid>[^,]+),\d+"
-                with open(log_filename, "r", encoding="utf8") as f:
+                with open(log_filename,"r",encoding="utf8") as f:
                     while 1:
                         _line = f.readline()
                         if not _line:
                             break
-                        _match = re.search(_regrex, _line)
+                        _match = re.search(_regrex,_line)
                         if _match is not None:
                             _uuid = _match.groupdict().get("uuid")
                             tablename = _match.groupdict().get("tablename")
                             if _uuid is not None:
-                                list_uuid.append({"uuid": _uuid, "tablename": tablename})
-                        _match = re.search(_regrex_tolong, _line)
+                                list_uuid.append({"uuid":_uuid,"tablename":tablename})
+                        _match = re.search(_regrex_tolong,_line)
                         if _match is not None:
                             _uuid = _match.groupdict().get("uuid")
                             dict_tolong[_uuid] = 1
 
-                if list_uuid == 0:
+
+                if list_uuid==0:
                     _msg = "数据遗漏检查出错"
-                    sentMsgToDD(_msg, ACCESS_TOKEN_DATAWORKS, atAll=True)
+                    sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=True)
                     # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
 
                 ots_client = getConnect_ots()
 
                 for _d in list_uuid:
                     task_queue.put(_d)
-                mt = MultiThreadHandler(task_queue, _handle, None, 30)
+                mt = MultiThreadHandler(task_queue,_handle,None,30)
                 mt.run()
-                df_data = {"uuid": [],
-                           "tablename": [],
-                           "exists": [],
-                           "tolong": []}
+                df_data = {"uuid":[],
+                           "tablename":[],
+                           "exists":[],
+                           "tolong":[]}
 
                 for _data in list_uuid:
-                    for k, v in df_data.items():
-                        if k != "tolong":
+                    for k,v in df_data.items():
+                        if k!="tolong":
                             v.append(_data.get(k))
-                    df_data["tolong"].append(dict_tolong.get(_data["uuid"], 0))
+                    df_data["tolong"].append(dict_tolong.get(_data["uuid"],0))
                 df2 = pd.DataFrame(df_data)
                 df2.to_excel(check_filename)
 
             counts = 0
             df_data = pd.read_excel(check_filename)
-            for _exists, _tolong in zip(df_data["exists"], df_data["tolong"]):
-                if _exists == 0 and _tolong == 0:
+            for _exists,_tolong in zip(df_data["exists"],df_data["tolong"]):
+                if _exists==0 and _tolong==0:
                     counts += 1
-            if counts > 0:
-                _msg = "数据遗漏检查报警,%s有%s条公告遗漏,详见%s" % (last_date, str(counts), check_filename)
-                sentMsgToDD(_msg, ACCESS_TOKEN_DATAWORKS, atAll=True)
+            if counts>0:
+                _msg = "数据遗漏检查报警,%s有%s条公告遗漏,详见%s"%(last_date,str(counts),check_filename)
+                sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=True)
                 # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
 
+                _count = fixDoc_to_queue_init(check_filename)
+                if _count>0:
+                    _msg = "数据遗漏检查报警%d条公告已重新同步"%(_count)
+                    sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=True)
+                    df_data.to_excel("%s_bak.xlsx"%check_filename)
+                    os.remove(check_filename)
+
+
 
 
         except Exception as e:
             _msg = "数据遗漏检查报错"
-            sentMsgToDD(_msg, ACCESS_TOKEN_DATAWORKS, atAll=True)
+            sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=True)
             # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
             traceback.print_exc()
 
+
     def monitor_attachment(self):
-        from BaseDataMaintenance.java.MQInfo import getAllQueueSize, getQueueSize
+        from BaseDataMaintenance.java.MQInfo import getAllQueueSize,getQueueSize
         try:
             # query = BoolQuery(must_queries=[
             #     RangeQuery("status",0,11),
@@ -206,7 +272,7 @@ class BaseDataMonitor():
             #                                                                            columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
             total_count_todeal = getQueueSize("dataflow_attachment")
 
-            if total_count_todeal > 1000:
+            if total_count_todeal>1000:
                 # query = BoolQuery(must_queries=[
                 #     RangeQuery("crtime",self.get_last_tenmin_time(16))
                 # ])
@@ -224,7 +290,11 @@ class BaseDataMonitor():
                 #                                                                            SearchQuery(query,None,True),
                 #                                                                            columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
 
-                # 通过命令行获取日志情况
+
+
+
+
+                #通过命令行获取日志情况
                 # _cmd = 'cat %s | grep -c "%s.*process filemd5"'%(flow_attachment_log_path,self.get_last_tenmin_time())
                 # log(_cmd)
                 # process_count = self.cmd_execute(_cmd)
@@ -239,16 +309,15 @@ class BaseDataMonitor():
 
                 # _msg = "附件提取队列报警:队列堆积%s条公告,最近十分钟处理公告附件数:%s,处理成功数:%s"%(str(total_count_todeal),str(process_count),str(process_succeed_count))
 
-                # 通过读取文件获取日志情况
+                #通过读取文件获取日志情况
                 dict_type = {}
-                _pattern = "%s.*process filemd5\:[^\s]* (?P<result>(True|False)) of type\:(?P<type>[^\s]*).*download:(?P<downloadtime>\d+\.\d+)s recognize takes (?P<costtime>\d+)s upload takes (?P<uploadtime>\d+\.\d+)s" % (
-                    re.escape(self.get_last_tenmin_time()))
-                with open(flow_attachment_log_path, "r", encoding="utf8") as f:
+                _pattern = "%s.*process filemd5\:[^\s]* (?P<result>(True|False)) of type\:(?P<type>[^\s]*).*download:(?P<downloadtime>\d+\.\d+)s recognize takes (?P<costtime>\d+)s upload takes (?P<uploadtime>\d+\.\d+)s"%(re.escape(self.get_last_tenmin_time()))
+                with open(flow_attachment_log_path,"r",encoding="utf8") as f:
                     while True:
                         line = f.readline()
                         if not line:
                             break
-                        _match = re.search(_pattern, str(line))
+                        _match = re.search(_pattern,str(line))
                         if _match is not None:
                             _type = _match.groupdict().get("type")
                             _result = _match.groupdict().get("result")
@@ -333,9 +402,12 @@ class BaseDataMonitor():
 
             total_count_todeal = getQueueSize("dataflow_extract")
 
-            if total_count_todeal > 500:
-                _cmd = 'cat %s | grep "%s" | grep -c "process.*docid"' % (
-                flow_extract_log_path, self.get_last_tenmin_time())
+            if total_count_todeal>1000:
+                _cmd = 'cat %s | grep "%s" | grep -c "要素提取失败:docid"'%(flow_extract_log_path,self.get_last_tenmin_time())
+                log(_cmd)
+                process_failed_count = self.cmd_execute(_cmd)
+
+                _cmd = 'cat %s | grep "%s" | grep -c "process.*docid"'%(flow_extract_log_path,self.get_last_tenmin_time())
                 log(_cmd)
                 process_count = self.cmd_execute(_cmd)
                 _cmd = 'cat %s | grep "%s" | grep -c "process.*docid.*1$"' % (
@@ -369,8 +441,8 @@ class BaseDataMonitor():
                 #                                                                              SearchQuery(query,None,True),
                 #                                                                              columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
 
-                _msg = "要素提取队列报警:队列堆积%s条公告,最近十分钟入库数:%s,最近十分钟处理公告数:%s,其中成功处理数:%s,查库免提取数:%s" % (
-                str(total_count_todeal), str(init_count), str(process_count), str(success_count), str(exists_count))
+                _msg = "要素提取队列报警:队列堆积%s条公告,最近十分钟入库数:%s,最近十分钟处理公告数:%s,其中成功处理数:%s,处理失败数:%s,查库免提取数:%s" % (
+                str(total_count_todeal), str(init_count), str(process_count), str(success_count), str(process_failed_count),str(exists_count))
                 log(_msg)
                 atAll = False
                 if success_count == 0:
@@ -432,9 +504,9 @@ class BaseDataMonitor():
                                                                                columns_to_get=ColumnsToGet(
                                                                                    return_type=ColumnReturnType.NONE))
 
-        if total_count >= 200:
-            _msg = "数据流报警:待同步到成品表公告数为:%d" % (total_count)
-            sentMsgToDD(_msg, ACCESS_TOKEN_DATAWORKS)
+        if total_count>=2000:
+            _msg = "数据流报警:待同步到成品表公告数为:%d"%(total_count)
+            sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS)
             # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
 
     def monitor_preproject(self):
@@ -681,26 +753,28 @@ class BaseDataMonitor():
             sentMsgToDD(_msg, ACCESS_TOKEN_DATAWORKS)
             # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
 
+
     def start_monitor(self):
-        # 数据监控
+        #数据监控
 
         scheduler = BlockingScheduler()
 
         # scheduler.add_job(self.monitor_attachment,"cron",minute="*/10")
-        scheduler.add_job(self.monitor_extract, "cron", minute="*/10")
-        scheduler.add_job(self.monitor_proposedBuilding, "cron", hour="*/11")
+        scheduler.add_job(self.monitor_extract,"cron",minute="*/10")
+        scheduler.add_job(self.monitor_proposedBuilding,"cron",hour="*/11")
         # scheduler.add_job(self.monitor_dumplicate,"cron",minute="*/10")
-        scheduler.add_job(self.monitor_sychr, "cron", minute="*/10")
+        scheduler.add_job(self.monitor_merge,"cron",hour="*/2")
+        scheduler.add_job(self.monitor_sychr, "cron", minute="*/30")
         scheduler.add_job(self.monitor_preproject, "cron", hour="8")
-        scheduler.add_job(self.monitor_merge, "cron", minute="*/60")
         scheduler.add_job(self.monitor_init, "cron", hour="*/3")
         scheduler.start()
 
+
     def start_attach_monitor(self):
-        # 附件监控
+        #附件监控
         scheduler = BlockingScheduler()
 
-        scheduler.add_job(self.monitor_attachment, "cron", minute="*/10")
+        scheduler.add_job(self.monitor_attachment,"cron",minute="*/10")
         scheduler.start()
 
 
@@ -775,11 +849,15 @@ def monitor_convert_interface():
 
 
 if __name__ == '__main__':
+
     # dm = BaseDataMonitor()
     # # dm.start_monitor()
     # log_filename = "C:\\Users\\Administrator\\Desktop\\flow_init_2023-02-03.log"
     # dm.check_document_uuid(log_filename)
 
-    sentMsgToDD("报警test_msg", ACCESS_TOKEN_DATAWORKS)
+    sentMsgToDD("报警test_msg",ACCESS_TOKEN_DATAWORKS)
     # dm.monitor_proposedBuilding()
     # print(dm.get_last_tenmin_time(16))
+
+
+

+ 5 - 5
BaseDataMaintenance/dataSource/setttings.py

@@ -43,12 +43,12 @@ oracle_host = "121.46.18.113"
 oracle_port = 10522
 oracle_host = "192.168.0.150"
 oracle_port = 1522
-# oracle_user = "bxkc_data_readonly"
-# oracle_pass = "P7WUrgcz0@#j8pjg"
-oracle_user = "BXKC_WRITE"
+# oracle_user = "BXKC_DATA_READONLY"
+# oracle_pass = "nXcQG3Z8DW=Hzr!h"
+# oracle_user = "BXKC_WRITE"
+# oracle_pass = "PHNhX3%rVy4@fDB&"
+oracle_user = "bxkc_db"
 oracle_pass = "PHNhX3%rVy4@fDB&"
-# oracle_user = "bxkc_db"
-# oracle_pass = "xb9F#24Hd#5rStr9"
 oracle_db = "yanphone"
 
 ots_AccessKeyId = 'LTAI5tFuoxHm8Uxrr5nT8wTZ'

+ 3 - 2
BaseDataMaintenance/fixDoc_to_queue_extract.py

@@ -3,9 +3,10 @@ import sys,os
 
 sys.path.append(os.path.dirname(__file__)+"/..")
 
-from BaseDataMaintenance.maintenance.dataflow_mq import fixDoc_to_queue_extract,fixDoc_to_queue_init
+from BaseDataMaintenance.maintenance.dataflow_mq import fixDoc_to_queue_extract
+from BaseDataMaintenance.dataMonitor.data_monitor import fixDoc_to_queue_init
 
 
 if __name__ == '__main__':
     # fixDoc_to_queue_extract()
-    fixDoc_to_queue_init(filename="/data/python/flow_init_check/flow_init_2023-12-28.xlsx")
+    fixDoc_to_queue_init(filename="/data/python/flow_init_check/flow_init_2024-12-02.xlsx")

+ 1 - 1
BaseDataMaintenance/maintenance/attachment/attachmentProcess.py

@@ -811,7 +811,7 @@ class AttachmentRec():
                             attach.setValue(attachment_process_time,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
                             attach.setValue(attachment_status,ATTACHMENT_PROCESSED_FAILED)
                             log("attach interface failed of docid:%s filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
-                            sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
+                            # sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
 
 
                 attach.update_row(self.ots_client)

+ 238 - 136
BaseDataMaintenance/maintenance/dataflow.py

@@ -260,7 +260,7 @@ class Dataflow():
                         log("process filemd5:%s of type:%s with size:%.3fM download:%ds recognize takes %ds,ret_size:%d"%(filemd5,_filetype,round(_size/1024/1024,4),time_download,time.time()-start_time,len(_html)))
                     else:
                         log("attach interface failed of docid:%s filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
-                        sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
+                        # sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
                         _html = ""
                         return False
 
@@ -350,8 +350,8 @@ class Dataflow():
 
 
     def generate_dumplicate_query(self,_dict,_dict_must_not,set_match=set(["project_code","project_codes","product"]),set_nested=set(["win_tenderer","bidding_budget","win_bid_price"]),
-                                  set_term=set(["project_name","doctitle_refine","docchannel","tenderee","agency","web_source_no","fingerprint","save","docid"]),
-                                  set_range=set(["page_time","status"]),set_phrase=set(["doctitle"])):
+                                  set_term=set(["doctitle_refine","docchannel","tenderee","agency","web_source_no","fingerprint","save","docid"]),
+                                  set_range=set(["page_time","status"]),set_phrase=set(["doctitle","project_name"])):
         list_must_queries = []
         list_must_no_queries = []
         for k,v in _dict.items():
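
The signature change above moves project_name (and doctitle) out of the exact-term set into the phrase set; a minimal sketch of the difference with the tablestore SDK, assuming the index analyzes these fields and that set_phrase maps to MatchPhraseQuery:

    from tablestore import BoolQuery, TermQuery, MatchPhraseQuery

    # TermQuery only hits when the stored value equals the query string exactly;
    # MatchPhraseQuery matches the tokenized phrase, so slightly re-worded project
    # names can still be grouped together during dedup.
    exact_q  = TermQuery("tenderee", "天全县民政局")
    phrase_q = MatchPhraseQuery("project_name", "农村敬老院护理能力提升改造项目")
    bool_query = BoolQuery(must_queries=[exact_q, phrase_q])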
@@ -415,7 +415,10 @@ class Dataflow():
         if agency is not None and agency!="":
             extract_count += 1
         if sub_docs_json is not None:
-            sub_docs = json.loads(sub_docs_json)
+            try:
+                sub_docs = json.loads(sub_docs_json)
+            except Exception as e:
+                sub_docs = []
             sub_docs.sort(key=lambda x:float(x.get("bidding_budget",0)),reverse=True)
             sub_docs.sort(key=lambda x:float(x.get("win_bid_price",0)),reverse=True)
             # log("==%s"%(str(sub_docs)))
@@ -2203,7 +2206,7 @@ class Dataflow_dumplicate(Dataflow):
                 createComsumer(listener,self.doc_delete_queue)
 
 
-    def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart","time_release"]):
+    def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart"]):
         dict_time = {}
         for k in keys:
             dict_time[k] = _extract.get(k)
@@ -2231,10 +2234,12 @@ class Dataflow_dumplicate(Dataflow):
         _dict["moneys_attachment"] = set(_extract.get("moneys_attachment",[]))
         _dict["nlp_enterprise"] = json.dumps({"indoctextcon":_extract.get("nlp_enterprise",[]),
                                        "notindoctextcon":_extract.get("nlp_enterprise_attachment",[])},ensure_ascii=False)
-        _dict["extract_count"] = self.c_f_get_extractCount.evaluate(extract_json)
+        _dict["extract_count"] = _extract.get("extract_count",0)
         _dict["package"] = self.c_f_get_package.evaluate(extract_json)
         _dict["project_name"] = _extract.get("name","")
         _dict["dict_time"] = self.get_dict_time(_extract)
+        _dict["punish"] = _extract.get("punish",{})
+        _dict["approval"] = _extract.get("approval",[])
 
     def dumplicate_fianl_check(self,base_list,b_log=False):
         the_group = base_list
@@ -2272,22 +2277,22 @@ class Dataflow_dumplicate(Dataflow):
     def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
         document_less = _dict1
         docid_less = _dict1["docid"]
-        docchannel_less = document_less["docchannel"]
-        page_time_less = document_less["page_time"]
+        docchannel_less = document_less.get("docchannel",0)
+        page_time_less = document_less.get("page_time")
         doctitle_refine_less = document_less["doctitle_refine"]
-        project_codes_less = document_less["project_codes"]
+        project_codes_less = document_less.get("project_codes")
         nlp_enterprise_less = document_less["nlp_enterprise"]
-        tenderee_less = document_less["tenderee"]
-        agency_less = document_less["agency"]
+        tenderee_less = document_less.get("tenderee","")
+        agency_less = document_less.get("agency")
         win_tenderer_less = document_less["win_tenderer"]
         bidding_budget_less = document_less["bidding_budget"]
         win_bid_price_less = document_less["win_bid_price"]
-        product_less = document_less["product"]
-        package_less = document_less["package"]
-        json_time_less = document_less["dict_time"]
-        project_name_less = document_less["project_name"]
-        fingerprint_less = document_less["fingerprint"]
-        extract_count_less = document_less["extract_count"]
+        product_less = document_less.get("product")
+        package_less = document_less.get("package")
+        json_time_less = document_less.get("dict_time")
+        project_name_less = document_less.get("project_name")
+        fingerprint_less = document_less.get("fingerprint")
+        extract_count_less = document_less.get("extract_count",0)
         web_source_no_less = document_less.get("web_source_no")
         province_less = document_less.get("province")
         city_less = document_less.get("city")
@@ -2295,26 +2300,29 @@ class Dataflow_dumplicate(Dataflow):
         moneys_less = document_less.get("moneys")
         moneys_attachment_less = document_less.get("moneys_attachment")
         page_attachments_less = document_less.get(document_tmp_attachment_path,"[]")
+        punish_less = document_less.get("punish",{})
+        approval_less = document_less.get("approval",[])
+        source_type_less = document_less.get("source_type")
 
 
         document_greater = _dict2
         docid_greater = _dict2["docid"]
         page_time_greater = document_greater["page_time"]
-        docchannel_greater = document_greater["docchannel"]
-        doctitle_refine_greater = document_greater["doctitle_refine"]
+        docchannel_greater = document_greater.get("docchannel",0)
+        doctitle_refine_greater = document_greater.get("doctitle_refine","")
         project_codes_greater = document_greater["project_codes"]
         nlp_enterprise_greater = document_greater["nlp_enterprise"]
-        tenderee_greater = document_greater["tenderee"]
-        agency_greater = document_greater["agency"]
+        tenderee_greater = document_greater.get("tenderee","")
+        agency_greater = document_greater.get("agency","")
         win_tenderer_greater = document_greater["win_tenderer"]
         bidding_budget_greater = document_greater["bidding_budget"]
         win_bid_price_greater = document_greater["win_bid_price"]
-        product_greater = document_greater["product"]
-        package_greater = document_greater["package"]
+        product_greater = document_greater.get("product")
+        package_greater = document_greater.get("package")
         json_time_greater = document_greater["dict_time"]
-        project_name_greater = document_greater["project_name"]
-        fingerprint_greater = document_greater["fingerprint"]
-        extract_count_greater = document_greater["extract_count"]
+        project_name_greater = document_greater.get("project_name")
+        fingerprint_greater = document_greater.get("fingerprint")
+        extract_count_greater = document_greater.get("extract_count",0)
         web_source_no_greater = document_greater.get("web_source_no")
         province_greater = document_greater.get("province")
         city_greater = document_greater.get("city")
@@ -2324,12 +2332,16 @@ class Dataflow_dumplicate(Dataflow):
         moneys_attachment_greater = document_greater.get("moneys_attachment")
         page_attachments_greater = document_greater.get(document_tmp_attachment_path,"[]")
 
+        punish_greater = document_greater.get("punish",{})
+        approval_greater = document_greater.get("approval",[])
+        source_type_greater = document_greater.get("source_type")
+
         hard_level=1
         if web_source_no_less==web_source_no_greater=="17397-3":
             hard_level=2
 
         if self.check_rule==1:
-            _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
+            _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater,punish_less=punish_less,punish_greater=punish_greater,approval_less=approval_less,approval_greater=approval_greater,source_type_less=source_type_less,source_type_greater=source_type_greater)
         else:
             _prob = check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater)
 
@@ -2559,7 +2571,7 @@ class Dataflow_dumplicate(Dataflow):
                 else:
                     bool_query = _query
                 rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
-                                                                                    SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=30,get_total_count=True),
+                                                                                    SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=100,get_total_count=True),
                                                                                     ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
                 list_dict = getRow_ots(rows)
                 list_data = []
@@ -2854,7 +2866,7 @@ class Dataflow_dumplicate(Dataflow):
 
         return list_rules,table_name,table_index
 
-    def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_name]):
+    def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]):
         q_size = self.queue_dumplicate.qsize()
         log("dumplicate queue size %d"%(q_size))
 
@@ -2939,7 +2951,7 @@ class Dataflow_dumplicate(Dataflow):
         # mt.run()
 
 
-    def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_nlp_enterprise,document_nlp_enterprise_attachment]):
+    def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_nlp_enterprise,document_nlp_enterprise_attachment,document_tenderee_code,document_agency_code,document_candidates]):
         '''
         根据docid查询公告内容,先查询document_tmp,再查询document
         :param list_docids:
@@ -3049,7 +3061,7 @@ class Dataflow_dumplicate(Dataflow):
                     continue
             if v is None or v=="" or v=="[]" or v=="未知":
                 continue
-            if k in (project_project_dynamics,project_product,project_project_codes,project_docids):
+            if k in (project_project_dynamics,project_product,project_project_codes,project_docids,project_candidates):
                 continue
             _dict[k] = v
         for _proj in projects:
@@ -3058,14 +3070,19 @@ class Dataflow_dumplicate(Dataflow):
             if _proj.get(project_page_time,"")<project_dict.get(project_page_time,""):
                 _proj[project_page_time] = project_dict.get(project_page_time,"")
 
-        #拼接属性
-        append_dict = {}
-        set_docid = set()
-        set_product = set()
-        set_code = set()
-        set_nlp_enterprise = set()
-        set_nlp_enterprise_attachment = set()
+
         for _proj in projects:
+            #拼接属性
+            append_dict = {}
+            set_docid = set()
+            set_product = set()
+            set_code = set()
+            set_nlp_enterprise = set()
+            set_nlp_enterprise_attachment = set()
+            set_candidates = set()
+
+
+
             _docids = _proj.get(project_docids,"")
             _codes = _proj.get(project_project_codes,"")
             _product = _proj.get(project_product,"")
@@ -3081,15 +3098,22 @@ class Dataflow_dumplicate(Dataflow):
             try:
                 set_nlp_enterprise |= set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
                 set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
-            except Exception as e:
-                pass
+                list_candidates = json.loads(project_dict.get(project_candidates,"[]"))
+                for item in list_candidates:
+                    if item.get("name") is not None and item.get("name") not in set_candidates:
+                        set_candidates.add(item.get("name"))
 
-            set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
-            set_product = set_product | set(project_dict.get(project_product,"").split(","))
 
-            try:
+                set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
+                set_product = set_product | set(project_dict.get(project_product,"").split(","))
+
                 set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
                 set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
+
+                for item in json.loads(_proj.get(project_candidates,"[]")):
+                    if item.get("name") is not None and item.get("name") not in set_candidates:
+                        set_candidates.add(item.get("name"))
+                        list_candidates.append(item)
             except Exception as e:
                 pass
 
@@ -3101,6 +3125,7 @@ class Dataflow_dumplicate(Dataflow):
 
             append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
             append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
+            append_dict[project_candidates] = json.dumps(list_candidates,ensure_ascii=False)
 
 
             dict_dynamic = {}
@@ -3119,6 +3144,7 @@ class Dataflow_dumplicate(Dataflow):
             list_dynamics.sort(key=lambda x:x.get(document_page_time,""))
 
             append_dict[project_project_dynamics] = json.dumps(list_dynamics[:100],ensure_ascii=False)
+
             _proj.update(append_dict)
 
 
@@ -3151,74 +3177,84 @@ class Dataflow_dumplicate(Dataflow):
 
 
         #更新私有属性
-        for _pp in list_package_properties:
-
-            flag_update = False
-            sub_project_name = _pp.get(project_sub_project_name,"")
-            if sub_project_name=="Project":
-                sub_project_name = ""
-            win_tenderer = _pp.get(project_win_tenderer,"")
-            win_bid_price = _pp.get(project_win_bid_price,0)
-            bidding_budget = _pp.get(project_bidding_budget,0)
-            if win_tenderer!="" and bidding_budget!=0:
-                _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
-                if _key in dict_package:
-                    if self.is_same_package(_pp,dict_package[_key]):
-                        ud = self.getUpdate_dict(_pp)
-                        self.set_project_uuid(ud,dict_package[_key].get("uuid"))
-                        dict_package[_key].update(ud)
-                        flag_update = True
-                        continue
-            if win_tenderer!="" and  win_bid_price!=0:
-                _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
-                if _key in dict_package:
-                    if self.is_same_package(_pp,dict_package[_key]):
-                        ud = self.getUpdate_dict(_pp)
-                        self.set_project_uuid(ud,dict_package[_key].get("uuid"))
-                        dict_package[_key].update(ud)
-                        flag_update = True
-                        continue
-            if win_tenderer!="":
-                _key = "%s-%s"%(sub_project_name,win_tenderer)
-                if _key in dict_package:
-                    if self.is_same_package(_pp,dict_package[_key]):
-                        ud = self.getUpdate_dict(_pp)
-                        self.set_project_uuid(ud,dict_package[_key].get("uuid"))
-                        dict_package[_key].update(ud)
-                        flag_update = True
-                        continue
-            if bidding_budget!=0:
-                _key = "%s-%s"%(sub_project_name,str(bidding_budget))
-                if _key in dict_package:
-                    if self.is_same_package(_pp,dict_package[_key]):
-                        ud = self.getUpdate_dict(_pp)
-                        self.set_project_uuid(ud,dict_package[_key].get("uuid"))
-                        dict_package[_key].update(ud)
-                        flag_update = True
-                        continue
-            if not flag_update:
-                _pp.update(project_dict)
-                projects.append(_pp)
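+        # fast path: a single existing project with a single set of package properties is merged directly, without key matching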
+        if len(projects)==1 and len(list_package_properties)==1:
+            _pp = list_package_properties[0]
+            pp = projects[0]
+            ud = self.getUpdate_dict(_pp)
+            self.set_project_uuid(ud,pp.get("uuid"))
+            pp.update(_pp)
+        else:
 
+            for _pp in list_package_properties:
 
-                _counts = 0
+                flag_update = False
+                sub_project_name = _pp.get(project_sub_project_name,"")
+                if sub_project_name=="Project":
+                    sub_project_name = ""
+                win_tenderer = _pp.get(project_win_tenderer,"")
+                win_bid_price = _pp.get(project_win_bid_price,0)
+                bidding_budget = _pp.get(project_bidding_budget,0)
                 if win_tenderer!="" and bidding_budget!=0:
                     _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
-                    dict_package[_key] = _pp
-                _counts += 1
+                    if _key in dict_package:
+                        if self.is_same_package(_pp,dict_package[_key]):
+                            ud = self.getUpdate_dict(_pp)
+                            self.set_project_uuid(ud,dict_package[_key].get("uuid"))
+                            dict_package[_key].update(ud)
+                            flag_update = True
+                            continue
                 if win_tenderer!="" and  win_bid_price!=0:
                     _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
-                    dict_package[_key] = _pp
-                    _counts +=1
-                if _counts==0:
-                    if win_tenderer!="":
-                        _key = "%s-%s"%(sub_project_name,win_tenderer)
+                    if _key in dict_package:
+                        if self.is_same_package(_pp,dict_package[_key]):
+                            ud = self.getUpdate_dict(_pp)
+                            self.set_project_uuid(ud,dict_package[_key].get("uuid"))
+                            dict_package[_key].update(ud)
+                            flag_update = True
+                            continue
+                if win_tenderer!="":
+                    _key = "%s-%s"%(sub_project_name,win_tenderer)
+                    if _key in dict_package:
+                        if self.is_same_package(_pp,dict_package[_key]):
+                            ud = self.getUpdate_dict(_pp)
+                            self.set_project_uuid(ud,dict_package[_key].get("uuid"))
+                            dict_package[_key].update(ud)
+                            flag_update = True
+                            continue
+                if bidding_budget!=0:
+                    _key = "%s-%s"%(sub_project_name,str(bidding_budget))
+                    if _key in dict_package:
+                        if self.is_same_package(_pp,dict_package[_key]):
+                            ud = self.getUpdate_dict(_pp)
+                            self.set_project_uuid(ud,dict_package[_key].get("uuid"))
+                            dict_package[_key].update(ud)
+                            flag_update = True
+                            continue
+                if not flag_update:
+                    _pp.update(project_dict)
+                    projects.append(_pp)
+
+
+                    _counts = 0
+                    if win_tenderer!="" and bidding_budget!=0:
+                        _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
                         dict_package[_key] = _pp
-                        _counts += 1
-                    if bidding_budget!=0:
-                        _key = "%s-%s"%(sub_project_name,str(bidding_budget))
+                    _counts += 1
+                    if win_tenderer!="" and  win_bid_price!=0:
+                        _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
                         dict_package[_key] = _pp
-                        _counts += 1
+                        _counts +=1
+                    if _counts==0:
+                        if win_tenderer!="":
+                            _key = "%s-%s"%(sub_project_name,win_tenderer)
+                            dict_package[_key] = _pp
+                            _counts += 1
+                        if bidding_budget!=0:
+                            _key = "%s-%s"%(sub_project_name,str(bidding_budget))
+                            dict_package[_key] = _pp
+                            _counts += 1
+
+
 
 
 
@@ -3255,33 +3291,42 @@ class Dataflow_dumplicate(Dataflow):
             list_projects = dumplicate_projects(list_projects)
         list_projects.extend(list_delete_projects)
         project_json = to_project_json(list_projects)
-        print("delete_json",project_json)
         return project_json
 
 
     def delete_doc_handle(self,_dict,result_queue):
         headers = _dict.get("frame")
         conn = _dict.get("conn")
-        log("==========delete")
+
         if headers is not None:
             message_id = headers.headers["message-id"]
             body = headers.body
             item = json.loads(body)
             docid = item.get("docid")
+            log("==========start delete docid:%s"%(str(docid)))
             if docid is None:
-                return
+                ackMsg(conn,message_id)
+                return
             delete_result = self.delete_projects_by_document(docid)
 
+            log("1")
             _uuid = uuid4().hex
             _d = {PROJECT_PROCESS_UUID:_uuid,
                   PROJECT_PROCESS_CRTIME:1,
                   PROJECT_PROCESS_PROJECTS:delete_result}
             _pp = Project_process(_d)
-            if _pp.update_row(self.ots_client):
+            log("2")
+            try:
+                if _pp.update_row(self.ots_client):
+                    ackMsg(conn,message_id)
+            except Exception as e:
                 ackMsg(conn,message_id)
+            log("3")
             #no longer push the result to the result queue; insert into the project_process table instead
             # if send_msg_toacmq(self.pool_mq_ali,delete_result,self.doc_delete_result):
             #     ackMsg(conn,message_id)
+            log("==========end delete docid:%s"%(str(docid)))
+        else:
+            log("has not headers")
 
     def generate_common_properties(self,list_docs):
         '''
@@ -3539,6 +3584,9 @@ class Dataflow_dumplicate(Dataflow):
             project_info_source,
             project_nlp_enterprise,
             project_nlp_enterprise_attachment,
+            project_tenderee_code,
+            project_agency_code,
+            project_candidates
         ],sort="page_time",table_name="project2",table_index="project2_index")
 
         return list_project_dict
@@ -3654,6 +3702,14 @@ class Dataflow_dumplicate(Dataflow):
                       should_q_cod]
             list_query.append([_query,2])
 
+        if win_tenderer!="" and sub_project_name!="":
+            _query = [TermQuery(project_win_tenderer,win_tenderer),
+                      TermQuery(project_sub_project_name,sub_project_name)
+                                             ]
+            list_query.append([_query,2])
+
+
+
         if win_tenderer!="" and float(win_bid_price)>0:
             _query = [TermQuery(project_win_tenderer,win_tenderer),
                                              TermQuery(project_win_bid_price,win_bid_price)]
@@ -3710,10 +3766,7 @@ class Dataflow_dumplicate(Dataflow):
                 _uuid = _proj.get("uuid")
                 if _uuid is not None:
                     set_uuid = set_uuid | set(_uuid.split(","))
-            must_not_q = []
-            for _uuid in list(set_uuid):
-                must_not_q.append(TermQuery("uuid",_uuid))
-                print("must_not_q uuid:%s"%(_uuid))
+
 
 
             projects_merge_count = 0
@@ -3729,6 +3782,10 @@ class Dataflow_dumplicate(Dataflow):
             docids = ""
             for _proj in list_projects[:30]:
 
+                must_not_q = []
+                for _uuid in list(set_uuid):
+                    must_not_q.append(TermQuery("uuid",_uuid))
+
                 docids = _proj.get(project_docids,"")
                 page_time = _proj.get(project_page_time,"")
                 project_codes = _proj.get(project_project_codes,"")
@@ -3754,8 +3811,8 @@ class Dataflow_dumplicate(Dataflow):
                 district = _proj.get(project_district,"")
 
                 if is_yanshou:
-                    page_time_less = timeAdd(page_time,-750)
-                    page_time_greater = timeAdd(page_time,720)
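+                    # acceptance (验收) projects merge over a wider page_time window (-850/+820 days) than other projects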
+                    page_time_less = timeAdd(page_time,-850)
+                    page_time_greater = timeAdd(page_time,820)
                 else:
                     page_time_less = timeAdd(page_time,-450)
                     page_time_greater = timeAdd(page_time,420)
@@ -3784,6 +3841,7 @@ class Dataflow_dumplicate(Dataflow):
 
                 if page_time_less is not None and page_time_greater is not None:
                     must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
+                                    # RangeQuery("status",201,301)
                                 ]
 
                 #sub_project_name is optional rather than a required condition
@@ -3832,7 +3890,8 @@ class Dataflow_dumplicate(Dataflow):
                 list_merge_data.sort(key=lambda x:x.get(project_page_time,""))
                 list_merge_data.sort(key=lambda x:x.get(project_bidding_budget,-1))
                 # log(page_time_less+"=="+page_time_greater)
-                # log("list_merge_data:%s"%(str(list_merge_data)))
+                if b_log:
+                    log("list_merge_data count:%d"%(len(list_merge_data)))
                 list_check_data = []
                 for _data in list_merge_data:
                     _time = time.time()
@@ -3858,8 +3917,9 @@ class Dataflow_dumplicate(Dataflow):
                         update_projects_by_project(_data,[_proj])
                         projects_update_time += time.time()-_time
 
-            whole_time = time.time()-whole_time_start
-            log("%s %s merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(search_table,docids,whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
+                whole_time = time.time()-whole_time_start
+                log("%s %s merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(search_table,docids,whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
+
 
             return list_projects
         except Exception as e:
@@ -3892,10 +3952,9 @@ class Dataflow_dumplicate(Dataflow):
             list_docids = [a for a in list_docids if a is not None]
 
 
-
             _time = time.time()
             list_projects = self.search_projects_with_document(list_docids)
-            # log("search projects takes:%.3f"%(time.time()-_time))
+            log("search %d projects takes:%.3f"%(len(list_projects),time.time()-_time))
             if len(list_projects)==0:
                 # _time = time.time()
                 list_docs = self.search_docs(list_docids)
@@ -3914,7 +3973,6 @@ class Dataflow_dumplicate(Dataflow):
             list_projects = self.merge_projects(list_projects,b_log)
             # log("merge projects takes:%.3f"%(time.time()-_time))
 
-
             _time = time.time()
             list_merge_dump = dumplicate_document_in_merge(list_projects,dup_docid[:-1])
             # log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
@@ -3923,6 +3981,27 @@ class Dataflow_dumplicate(Dataflow):
                 list_projects = []
 
             _time = time.time()
+
+            projects = list_projects
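+            # drop docids whose update_document flag is "true" from dup_docid so updated documents are not treated as duplicates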
+            for _proj in projects:
+                dup_docid = _proj.get(project_dup_docid,"")
+                list_dup_docid = dup_docid.split(",")
+                new_dup_docid = []
+                for _docid in list_dup_docid:
+                    if _docid=="":
+                        continue
+                    docid = int(_docid)
+                    _d = {"partitionkey":docid%500+1,
+                          "docid":docid,
+                          }
+                    _doc = Document(_d)
+
+                    if _doc.fix_columns(self.ots_client,[document_update_document],True):
+                        if _doc.getProperties().get(document_update_document,"")!="true":
+                            new_dup_docid.append(str(docid))
+                _proj[project_dup_docid] = ",".join(new_dup_docid)
+            list_projects = projects
+
             project_json = to_project_json(list_projects)
             # log("json projects takes:%.3f"%(time.time()-_time))
             if b_log:
@@ -3957,6 +4036,11 @@ class Dataflow_dumplicate(Dataflow):
         has_before = False
         has_after = False
 
+        bidclose_time = page_time
+        web_source_name = item.get(document_tmp_web_source_name,"")
+
+
+
         if len(page_time)>0:
             l_page_time = timeAdd(page_time,days=-90)
             dict_time = item.get("dict_time",{})
@@ -3966,6 +4050,14 @@ class Dataflow_dumplicate(Dataflow):
                         has_before = True
                     if v>page_time:
                         has_after = True
+                    if k==document_tmp_time_bidclose:
+                        bidclose_time = v
+
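+        # for these two sources, a bid-close time earlier than the publish time fails the page_time check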
+        set_web_source = {"中国招标投标公共服务平台","比地招标"}
+
+        if web_source_name in set_web_source and bidclose_time<page_time:
+            return False
+
         log("check page_time has_before %s has_after %s"%(str(has_before),str(has_after)))
         if has_before:
             _query = BoolQuery(must_queries=[MatchPhraseQuery(document_doctitle,item.get(document_doctitle,""))],
@@ -4024,7 +4116,7 @@ class Dataflow_dumplicate(Dataflow):
                 singleNum_keys = _rule["singleNum_keys"]
                 contain_keys = _rule["contain_keys"]
                 multiNum_keys = _rule["multiNum_keys"]
-                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path],b_log=b_log)
+                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path,document_tmp_source_stage,document_tmp_source_type,document_update_document],b_log=b_log)
                 _i += step
 
 
@@ -4049,7 +4141,8 @@ class Dataflow_dumplicate(Dataflow):
 
             dup_docid = set()
             for _dict in final_list:
-                dup_docid.add(_dict.get(document_tmp_docid))
+                if _dict.get("update_document","")!="true":
+                    dup_docid.add(_dict.get(document_tmp_docid))
             if item.get(document_tmp_docid) in dup_docid:
                 dup_docid.remove(item.get(document_tmp_docid))
 
@@ -4057,7 +4150,7 @@ class Dataflow_dumplicate(Dataflow):
             remove_list = []
 
 
-            if self.check_page_time(item) and (len(final_list)==0 or best_docid==item.get(document_tmp_docid)):
+            if (self.check_page_time(item) and (len(final_list)==0 or best_docid==item.get(document_tmp_docid))) or item.get("update_document","")=="true":
                 dtmp.setValue(document_tmp_save,1,True)
                 # dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
                 dmp_docid = ",".join([str(a) for a in list(dup_docid)])
@@ -4071,6 +4164,7 @@ class Dataflow_dumplicate(Dataflow):
                     for _dict in final_list:
                         if _dict.get(document_tmp_docid) in dup_docid:
                             remove_list.append(_dict)
+
                     dmp_docid = ",".join([str(a) for a in list(dup_docid)])
                     dmp_docid = "%d,%s"%(best_docid,dmp_docid)
                 else:
@@ -4082,16 +4176,19 @@ class Dataflow_dumplicate(Dataflow):
             list_docids = list(dup_docid)
             list_docids.append(best_docid)
 
-            if item.get(document_update_document)=="true":
-                dtmp.setValue(document_tmp_save,1,True)
+            # if item.get(document_update_document)=="true":
+            #     dtmp.setValue(document_tmp_save,1,True)
 
             list_merge_dump = []
             if (exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0) or item.get(document_docchannel,0) in (301,302):
-                log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
+                if exist_finterprint:
+                    log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
                 dtmp.setValue(document_tmp_projects,"[]",True)
             else:
                 project_json,list_merge_dump = self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log)
-                if list_merge_dump is not None and str(item.get(document_tmp_docid)) in list_merge_dump:
+
+
+                if list_merge_dump is not None and str(item.get(document_tmp_docid)) in list_merge_dump and item.get("update_document","")!="true":
                     dtmp.setValue(document_tmp_save,0,True)
                 dtmp.setValue(document_tmp_projects,project_json,True)
             log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
@@ -4145,19 +4242,23 @@ class Dataflow_dumplicate(Dataflow):
 
 
 
+        current_date = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
+        before_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-20)
+        after_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-5)
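+        # only touch documents whose opertime is between 20 and 5 minutes old, presumably to skip documents still being processed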
         if self.fix_doc_docid is None:
-            current_date = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
-            before_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-5)
             bool_query = BoolQuery(must_queries=[
                 TermQuery(document_tmp_save,1),
                 RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
-                RangeQuery(document_tmp_opertime,before_date)
+                RangeQuery(document_tmp_docchannel,0,300),
+                RangeQuery(document_tmp_opertime,before_date,after_date)
             ])
         else:
             bool_query = BoolQuery(must_queries=[
                 TermQuery(document_tmp_save,1),
                 RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
-                RangeQuery(document_tmp_docid,self.fix_doc_docid)
+                RangeQuery(document_tmp_docchannel,0,300),
+                RangeQuery(document_tmp_docid,self.fix_doc_docid),
+                RangeQuery(document_tmp_opertime,before_date,after_date)
             ])
 
         list_data = []
@@ -4192,7 +4293,7 @@ class Dataflow_dumplicate(Dataflow):
         schedule.add_job(self.bdm.monitor_dumplicate,"cron",minute="*/10")
         schedule.add_job(self.flow_remove,"cron",hour="20")
         schedule.add_job(self.flow_remove_project_tmp,"cron",hour="20")
-        # schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
+        schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="*/10")
         schedule.start()
 
     def changeSaveStatus(self,list_dict):
@@ -4213,16 +4314,17 @@ class Dataflow_dumplicate(Dataflow):
                           document_tmp_save:0
                           }
                     _d_tmp = Document_tmp(_d)
-                    if _d_tmp.fix_columns(self.ots_client,["status"],True):
+                    if _d_tmp.fix_columns(self.ots_client,["status",document_update_document],True):
                         if _d_tmp.getProperties().get("status")==1:
-                            _d_tmp.setValue("status",0,True)
-                            _d_tmp.update_row(self.ots_client)
+                            if _d_tmp.getProperties().get(document_update_document,"")!="true":
+                                _d_tmp.setValue("status",0,True)
+                                _d_tmp.update_row(self.ots_client)
 
 
 
     def test_dumplicate(self,docid):
         # columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status]
-        columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_name]
+        columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]
         bool_query = BoolQuery(must_queries=[
             TermQuery("docid",docid)
         ])
@@ -4413,7 +4515,7 @@ if __name__ == '__main__':
     # test_attachment_interface()
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
-    df_dump.test_dumplicate(455485514
+    df_dump.test_dumplicate(576859812
                             )
     # compare_dumplicate_check()
     # df_dump.test_merge([391898061

+ 218 - 92
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -9,7 +9,7 @@ from BaseDataMaintenance.model.postgres.attachment import Attachment_postgres
 import os
 from BaseDataMaintenance.common.ossUtils import *
 from BaseDataMaintenance.dataSource.pool import ConnectorPool
-from BaseDataMaintenance.model.ots.document import Document
+from BaseDataMaintenance.model.ots.document import Document,document_attachment_path_filemd5
 
 from BaseDataMaintenance.common.Utils import article_limit
 from BaseDataMaintenance.common.documentFingerprint import getFingerprint
@@ -108,7 +108,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
 
     def start_attachment_listener(self):
         for _i in range(self.comsumer_count):
-            listener_attachment = self.AttachmentMQListener(getConnect_activateMQ(),self.attachment_listener_handler,_i)
+            listener_attachment = self.AttachmentMQListener(getConnect_activateMQ(),self.attachment_listener_handler ,_i)
             createComsumer(listener_attachment,self.mq_attachment)
             self.list_attachment_comsumer.append(listener_attachment)
 
@@ -254,26 +254,43 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
         '''
 
         try:
+            start_time = time.time()
+
             item = _dict.get("item")
             list_attach = _dict.get("list_attach")
             conn = _dict["conn"]
             message_id = _dict.get("message_id")
 
+            if "retry_times" not in item:
+                item["retry_times"] = 5
             _retry_times = item.get("retry_times",0)
+
+
             dhtml = Document_html({"partitionkey":item.get("partitionkey"),
                                    "docid":item.get("docid")})
 
             _dochtmlcon = item.get(document_tmp_dochtmlcon,"")
             dhtml.setValue(document_tmp_dochtmlcon,_dochtmlcon,True)
             dhtml.delete_bidi_a()
-            dtmp = Document_tmp(item)
-
 
-            start_time = time.time()
             #调用识别接口
             _succeed,list_html,swf_urls = self.rec_attachments_by_interface(list_attach,_dochtmlcon,save=True)
 
+            # write the attachment classification results back into the document's page_attachments
+            page_attachments = json.loads(item.get(document_tmp_attachment_path,"[]"))
+            if len(page_attachments)>0:
+                for _attachment in page_attachments:
+                    filemd5 = _attachment.get(document_attachment_path_filemd5,"")
+                    classification = None
+                    for _attach in list_attach:
+                        if _attach.getProperties().get(attachment_filemd5,"")==filemd5:
+                            classification = _attach.getProperties().get(attachment_classification,"")
+                            break
+                    if classification is not None:
+                        _attachment[attachment_classification] = classification
+                item[document_tmp_attachment_path] = json.dumps(page_attachments,ensure_ascii=False)
 
+            dtmp = Document_tmp(item)
 
             _to_ack = False
             if not _succeed and _retry_times<self.retry_times:
@@ -301,6 +318,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                     dhtml.updateSWFImages(swf_urls)
                     dhtml.updateAttachment(list_html)
 
+
                     dtmp.setValue(document_tmp_attachment_extract_status,1,True)
                     dtmp.setValue(document_tmp_dochtmlcon,dhtml.getProperties().get(document_tmp_dochtmlcon),True)
                     send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(dtmp.getProperties(),cls=MyEncoder),self.mq_extract)
@@ -435,7 +453,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                     if len(_html)>1:
                         _html = "interface return error"
                     else:
-                        sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
+                        # sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
                         _html = ""
 
                         return False
@@ -630,7 +648,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                     log("getAttachments search in ots:%s"%(_filemd5))
                     _attach = {attachment_filemd5:_filemd5}
                     _attach_ots = attachment(_attach)
-                    if _attach_ots.fix_columns(self.ots_client,[attachment_status,attachment_path,attachment_attachmenthtml,attachment_attachmentcon,attachment_filetype,attachment_swfUrls,attachment_process_time],True):
+                    if _attach_ots.fix_columns(self.ots_client,[attachment_status,attachment_path,attachment_attachmenthtml,attachment_attachmentcon,attachment_filetype,attachment_swfUrls,attachment_process_time,attachment_classification],True):
                         if _attach_ots.getProperties().get(attachment_status) is not None:
                             log("getAttachments find in ots:%s"%(_filemd5))
                             _attach_pg = Attachment_postgres(_attach_ots.getProperties())
@@ -828,14 +846,17 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
             self.list_extract_comsumer.append(listener_extract)
 
         while 1:
-            for _i in range(len(self.list_extract_comsumer)):
-                if self.list_extract_comsumer[_i].conn.is_connected():
-                    continue
-                else:
-                    listener = self.ExtractListener(getConnect_activateMQ(),self.comsumer_handle,_i)
-                    createComsumer(listener,self.mq_extract)
-                    self.list_extract_comsumer[_i] = listener
-            time.sleep(5)
+            try:
+                for _i in range(len(self.list_extract_comsumer)):
+                    if self.list_extract_comsumer[_i].conn.is_connected():
+                        continue
+                    else:
+                        listener = self.ExtractListener(getConnect_activateMQ(),self.comsumer_handle,_i)
+                        createComsumer(listener,self.mq_extract)
+                        self.list_extract_comsumer[_i] = listener
+                time.sleep(5)
+            except Exception as e:
+                traceback.print_exc()
 
     def monitor_listener(self):
         for i in range(len(self.list_extract_comsumer)):
@@ -978,6 +999,8 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
     def comsumer_handle(self,_dict,result_queue):
         try:
             log("start handle")
+            data = {}
+
             frame = _dict["frame"]
             conn = _dict["conn"]
             message_id = frame.headers["message-id"]
@@ -999,7 +1022,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                 log("docid %s dochtmlcon too long len %d "%(str(item.get("docid")),html_len))
                 try:
                     _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
-                    _soup = BeautifulSoup(_dochtmlcon,"html5lib")
+                    _soup = BeautifulSoup(_dochtmlcon,"lxml")
                     all_len = len(_soup.get_text()) # text length of the whole announcement
                     _attachment = _soup.find("div", attrs={"class": "richTextFetch"})
                     attachment_len = len(_attachment.get_text()) if _attachment else 0 # text length of the attachment part
@@ -1026,7 +1049,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
             _extract.setValue(document_extract2_docid,item.get(document_docid))
             all_done = 1
 
-            data = {}
+
             for k,v in item.items():
                 data[k] = v
             data["timeout"] = 440
@@ -1042,8 +1065,9 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
             data["web_source_no"] = item.get(document_tmp_web_source_no,"")
             data["web_source_name"] = item.get(document_tmp_web_source_name,"")
             data["original_docchannel"] = item.get(document_tmp_original_docchannel,"")
+            data["page_attachments"] = item.get(document_tmp_attachment_path,"[]")
 
-            _fingerprint = getFingerprint(str(data["title"])+str(data["content"]))
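+            # append original_docchannel so identical text published through different channels gets distinct fingerprints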
+            _fingerprint = getFingerprint(str(data["title"])+str(data["content"]))+str(data["original_docchannel"])
 
             if all_done>0:
                 _time = time.time()
@@ -1078,9 +1102,11 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
             # if all_done>0 and len(_extract.getProperties().get(document_extract2_extract_json,""))<=2:
             #     all_done = -4
             _extract.setValue(document_extract2_industry_json,"{}",True)
+            _to_ack = True
             try:
                 if all_done!=1:
-                    sentMsgToDD("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
+                    # sentMsgToDD("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
+                    log("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
                     if extract_times>=10:
                         #process as succeed
                         dtmp.setValue(document_tmp_dochtmlcon,"",False)
@@ -1138,15 +1164,20 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
 
             if _to_ack:
                 ackMsg(conn,message_id,subscription)
-            log("process %s docid:%d %s"%(str(_to_ack),data["doc_id"],str(all_done)))
+            else:
+                item["extract_times"] -= 1
+                send_msg_toacmq(self.pool_mq,json.dumps(item,ensure_ascii=False),self.mq_extract)
+                ackMsg(conn,message_id,subscription)
+            log("process %s docid:%d %s"%(str(_to_ack),data.get("doc_id"),str(all_done)))
         except requests.ConnectionError as e1:
             item["extract_times"] -= 1
             if send_msg_toacmq(self.pool_mq,json.dumps(item,ensure_ascii=False),self.mq_extract):
                 ackMsg(conn,message_id,subscription)
         except Exception as e:
             traceback.print_exc()
-            sentMsgToDD("要素提取失败:docid:%d with result:%s"%(item.get(document_tmp_docid),str(e)))
-            log("process %s docid: failed message_id:%s"%(data["doc_id"],message_id))
+            # sentMsgToDD("要素提取失败:docid:%d with result:%s"%(item.get(document_tmp_docid),str(e)))
+            log("要素提取失败:docid:%d with result:%s"%(item.get(document_tmp_docid),str(e)))
+            log("process %s docid: failed message_id:%s"%(data.get("doc_id"),message_id))
             if extract_times>=10:
                 #process as succeed
                 dtmp.setValue(document_tmp_dochtmlcon,"",False)
@@ -1360,7 +1391,7 @@ class Dataflow_init(Dataflow):
         conn_oracle = self.pool_oracle.getConnector()
 
         try:
-            list_obj = object.select_rows(conn_oracle,type(object),object.table_name,[],limit=1000)
+            list_obj = object.select_rows(conn_oracle,type(object),object.table_name,[])
             for _obj in list_obj:
                 ots_dict = _obj.getProperties_ots()
 
@@ -1379,7 +1410,7 @@ class Dataflow_init(Dataflow):
             traceback.print_exc()
             self.pool_oracle.decrease()
 
-    def shengpi2mq(self):
+    def shenpi2mq(self):
 
         conn_oracle = self.pool_oracle.getConnector()
 
@@ -1395,36 +1426,131 @@ class Dataflow_init(Dataflow):
                     if max_shenpi_id>self.base_shenpi_id:
                         max_shenpi_id -= self.base_shenpi_id
                     self.max_shenpi_id = max_shenpi_id
-            if self.max_shenpi_id is not None:
-                # select data in order
-                list_data = T_SHEN_PI_XIANG_MU.select_rows(conn_oracle,self.max_shenpi_id,)
 
-                # send data to mq one by one with max_shenpi_id updated
-                for _data in list_data:
-                    _id = _data.getProperties().get(T_SHEN_PI_XIANG_MU_ID)
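+                # 60383953 appears to be a historical watermark; ids at or below it are assumed to be already imported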
+                if self.max_shenpi_id<60383953:
+                    self.max_shenpi_id = 60383953
 
-                    ots_dict = _data.getProperties_ots()
-                    if ots_dict["docid"]<self.base_shenpi_id:
-                        ots_dict["docid"] += self.base_shenpi_id
 
-                    if ots_dict.get(T_SHEN_PI_XIANG_MU_PAGE_ATTACHMENTS,"") !='[]':
-                        if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_attachment):
-                            self.max_shenpi_id = _id
-                        else:
-                            log("sent shenpi message to mq failed %s"%(_id))
-                            break
-                    else:
-                        if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_extract):
-                            self.max_shenpi_id = _id
-                        else:
-                            log("sent shenpi message to mq failed %s"%(_id))
-                            break
+            if self.max_shenpi_id is not None:
+                # select data in order
+
+                origin_max_shenpi_id = T_SHEN_PI_XIANG_MU.get_max_id(conn_oracle)
+
+                if origin_max_shenpi_id is not None:
+                    log("shenpi origin_max_shenpi_id:%d current_id:%d"%(origin_max_shenpi_id,self.max_shenpi_id))
+                    for _id_i in range(self.max_shenpi_id+1,origin_max_shenpi_id+1):
+                        list_data = T_SHEN_PI_XIANG_MU.select_rows(conn_oracle,_id_i)
+
+                        # send data to mq one by one with max_shenpi_id updated
+                        for _data in list_data:
+
+                            _id = _data.getProperties().get(T_SHEN_PI_XIANG_MU_ID)
+
+                            ots_dict = _data.getProperties_ots()
+                            if ots_dict["docid"]<self.base_shenpi_id:
+                                ots_dict["docid"] += self.base_shenpi_id
+                                ots_dict["partitionkey"] = ots_dict["docid"]%500+1
+
+                            if ots_dict.get(T_SHEN_PI_XIANG_MU_PAGE_ATTACHMENTS,"") !='[]':
+                                if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_attachment):
+                                    self.max_shenpi_id = _id
+                                else:
+                                    log("sent shenpi message to mq failed %s"%(_id))
+                                    break
+                            else:
+                                if send_msg_toacmq(self.pool_mq,json.dumps(ots_dict,cls=MyEncoder),self.mq_extract):
+                                    self.max_shenpi_id = _id
+                                else:
+                                    log("sent shenpi message to mq failed %s"%(_id))
+                                    break
+            self.pool_oracle.putConnector(conn_oracle)
 
         except Exception as e:
+            log("shenpi error")
             traceback.print_exc()
             self.pool_oracle.decrease()
 
+    def fix_shenpi(self):
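+        # backfill approval (审批) documents: scan oracle ids in [begin_id, end_id) across threads and write any rows missing from the document table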
+
+        pool_oracle = ConnectorPool(10,15,getConnection_oracle)
+        begin_id = 0
+        end_id = 64790010
+        thread_num = 15
+        step = (end_id-begin_id)//thread_num
+        list_items = []
+        for _i in range(thread_num):
+            _begin = _i*step
+            _end = (_i+1)*step-1
+            if _i==thread_num-1:
+                _end = end_id
+            list_items.append((_begin,_end,_i))
+        task_queue = Queue()
+        for item in list_items:
+            task_queue.put(item)
+
+        fix_count_list = []
+
+        def _handle(item,result_queue):
+            conn_oracle = pool_oracle.getConnector()
+            (begin_id,end_id,thread_id) = item
+
+            _count = 0
+            for _id_i in range(begin_id,end_id):
+                try:
+                    bool_query = BoolQuery(must_queries=[
+                        TermQuery("docchannel",302),
+                        TermQuery("original_id",_id_i)
+                    ])
+                    rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
+                                                                                        SearchQuery(bool_query,get_total_count=True))
+                    if total_count>0:
+                        continue
+
+                    # bool_query = BoolQuery(must_queries=[
+                    #     TermQuery("id",_id_i),
+                    # ])
+                    # rows,next_token,total_count,is_all_succeed = self.ots_client.search("t_shen_pi_xiang_mu","t_shen_pi_xiang_mu_index",
+                    #                                                                     SearchQuery(bool_query,get_total_count=True))
+                    # if total_count>0:
+                    #     continue
+
+                    try:
+                        list_data = T_SHEN_PI_XIANG_MU.select_rows(conn_oracle,_id_i)
+                    except Exception as e:
+                        continue
+
+                    # send data to mq one by one with max_shenpi_id updated
+                    for _data in list_data:
+
+                        _id = _data.getProperties().get(T_SHEN_PI_XIANG_MU_ID)
+
+                        ots_dict = _data.getProperties_ots()
+                        if ots_dict["docid"]<self.base_shenpi_id:
+                            ots_dict["docid"] += self.base_shenpi_id
+                            ots_dict["partitionkey"] = ots_dict["docid"]%500+1
+                        ots_dict["status"] = 201
+                        dict_1 = {}
+                        dict_2 = {}
+                        for k,v in ots_dict.items():
+                            if k!="dochtmlcon":
+                                dict_1[k] = v
+                            if k in ('partitionkey',"docid","dochtmlcon"):
+                                dict_2[k] = v
+                        d_1 = Document(dict_1)
+                        d_2 = Document(dict_2)
+                        d_1.update_row(self.ots_client)
+                        d_2.update_row(self.ots_capacity)
+                        _count += 1
+                except Exception as e:
+                    traceback.print_exc()
+
+                log("thread_id:%d=%d/%d/%d"%(thread_id,_id_i-begin_id,_count,end_id-begin_id))
+            fix_count_list.append(_count)
+            pool_oracle.putConnector(conn_oracle)
 
+        mt = MultiThreadHandler(task_queue,_handle,None,thread_count=thread_num)
+        mt.run()
+        print(fix_count_list,sum(fix_count_list))
 
     def ots2mq(self):
         try:
@@ -1432,13 +1558,34 @@ class Dataflow_init(Dataflow):
 
             rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
                                                                                 SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(document_docid)]),get_total_count=True,limit=100),
-                                                                                ColumnsToGet(return_type=ColumnReturnType.ALL))
+                                                                                ColumnsToGet(return_type=ColumnReturnType.NONE))
             list_data = getRow_ots(rows)
+            task_queue = Queue()
             for _data in list_data:
+                task_queue.put(_data)
+
+
+            while next_token:
+                rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
+                                                                                    SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
+                                                                                    ColumnsToGet(return_type=ColumnReturnType.NONE))
+                list_data = getRow_ots(rows)
+
+                for _data in list_data:
+                    task_queue.put(_data)
+
+                if task_queue.qsize()>=1000:
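+                    # cap each pass at roughly 1000 documents; the remainder is picked up on the next scheduled run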
+                    break
+
+            def _handle(_data,result_queue):
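+                # fetch the full row, then route it to the attachment queue or the extract queue depending on page_attachments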
+
                 _d = {document_tmp_partitionkey:_data.get(document_tmp_partitionkey),
                       document_tmp_docid:_data.get(document_tmp_docid),
                       document_tmp_status:0}
                 _document = Document(_d)
+                _document.fix_columns(self.ots_client,None,True)
+                _data = _document.getProperties()
+
                 page_attachments = _data.get(document_tmp_attachment_path,"[]")
 
                 _document_html = Document(_data)
@@ -1453,36 +1600,16 @@ class Dataflow_init(Dataflow):
                     _data[document_tmp_status] = status
                     send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(_document_html.getProperties(),cls=MyEncoder),self.mq_extract)
                 if send_succeed:
+                    _document.setValue(document_tmp_status,0,True)
                     _document.update_row(self.ots_client)
                 else:
                     log("send_msg_error2222")
-            while next_token:
-                rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
-                                                                                    SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
-                                                                                    ColumnsToGet(return_type=ColumnReturnType.ALL))
-                list_data = getRow_ots(rows)
-                for _data in list_data:
-                    _d = {document_tmp_partitionkey:_data.get(document_tmp_partitionkey),
-                          document_tmp_docid:_data.get(document_tmp_docid),
-                          document_tmp_status:0}
-                    _document = Document(_d)
-                    page_attachments = _data.get(document_tmp_attachment_path,"[]")
 
-                    _document_html = Document(_data)
-                    _document_html.fix_columns(self.ots_capacity,[document_tmp_dochtmlcon],True)
+            if task_queue.qsize()>0:
+                mt = MultiThreadHandler(task_queue,_handle,None,15)
+                mt.run()
+
 
-                    if page_attachments!="[]":
-                        status = random.randint(1,10)
-                        _data[document_tmp_status] = status
-                        send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(_document_html.getProperties(),cls=MyEncoder),self.mq_attachment)
-                    else:
-                        status = random.randint(11,50)
-                        _data[document_tmp_status] = status
-                        send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(_document_html.getProperties(),cls=MyEncoder),self.mq_extract)
-                    if send_succeed:
-                        _document.update_row(self.ots_client)
-                    else:
-                        log("send_msg_error2222")
         except Exception as e:
             traceback.print_exc()
 
@@ -1501,6 +1628,8 @@ class Dataflow_init(Dataflow):
                 _document = Document_tmp(_d)
                 page_attachments = _data.get(document_tmp_attachment_path,"[]")
 
+                log("refix doc %s from document_tmp"%(str(_data.get(document_tmp_docid))))
+
                 _document_html = Document_html(_data)
                 _document_html.fix_columns(self.ots_client,[document_tmp_dochtmlcon],True)
 
@@ -1593,7 +1722,14 @@ class Dataflow_init(Dataflow):
         from BaseDataMaintenance.model.oracle.TuDiKuangChanTemp import TuDiKuangChanTemp
         from BaseDataMaintenance.model.oracle.ZhaoBiaoDaYiTemp import ZhaoBiaoDaYiTemp
         from BaseDataMaintenance.model.oracle.ZhaoBiaoWenJianTemp import ZhaoBiaoWenJianTemp
+
+        from BaseDataMaintenance.model.oracle.TouSuChuLiTemp import TouSuChuLiTemp
+        from BaseDataMaintenance.model.oracle.WeiFaJiLuTemp import WeiFaJiLuTemp
+        from BaseDataMaintenance.model.oracle.QiTaShiXinTemp import QiTaShiXin
+
+
         schedule = BlockingScheduler()
+
         schedule.add_job(self.temp2mq,"cron",args=(CaiGouYiXiangTemp({}),),second="*/10")
         schedule.add_job(self.temp2mq,"cron",args=(PaiMaiChuRangTemp({}),),second="*/10")
         schedule.add_job(self.temp2mq,"cron",args=(ZhaoBiaoGongGaoTemp({}),),second="*/10")
@@ -1606,14 +1742,24 @@ class Dataflow_init(Dataflow):
         schedule.add_job(self.temp2mq,"cron",args=(TuDiKuangChanTemp({}),),second="*/10")
         schedule.add_job(self.temp2mq,"cron",args=(ZhaoBiaoDaYiTemp({}),),second="*/10")
         schedule.add_job(self.temp2mq,"cron",args=(ZhaoBiaoWenJianTemp({}),),second="*/10")
+
+        schedule.add_job(self.temp2mq,"cron",args=(TouSuChuLiTemp({}),),second="*/10")
+        schedule.add_job(self.temp2mq,"cron",args=(WeiFaJiLuTemp({}),),second="*/10")
+        schedule.add_job(self.temp2mq,"cron",args=(QiTaShiXin({}),),second="*/10")
+
         schedule.add_job(self.ots2mq,"cron",second="*/10")
         schedule.add_job(self.otstmp2mq,"cron",second="*/10")
         schedule.add_job(self.monitor_listener,"cron",minute="*/1")
+
+        schedule.add_job(self.shenpi2mq,"cron",minute="*/1")
         schedule.start()
 
 
 
 
+
+
+
 def transform_attachment():
     from BaseDataMaintenance.model.ots.attachment import attachment
     from BaseDataMaintenance.model.postgres.attachment import Attachment_postgres
@@ -1802,27 +1948,7 @@ def check_data_synchronization():
 
 current_path = os.path.abspath(os.path.dirname(__file__))
 
-def fixDoc_to_queue_init(filename=""):
-    import pandas as pd
-    from BaseDataMaintenance.model.oracle.GongGaoTemp import dict_oracle2ots
-    if filename=="":
-        filename = os.path.join(current_path,"check.xlsx")
-    df = pd.read_excel(filename)
-    if "docchannel" in dict_oracle2ots:
-        dict_oracle2ots.pop("docchannel")
-    row_name = ",".join(list(dict_oracle2ots.keys()))
-    conn = getConnection_oracle()
-    cursor = conn.cursor()
-    _count = 0
-    for uuid,tablename,_exists,_toolong in zip(df["uuid"],df["tablename"],df["exists"],df["tolong"]):
-        if _exists==0 and _toolong==0:
-            _count += 1
-            _source = str(tablename).replace("_TEMP","")
-            sql = " insert into %s(%s) select %s from %s where id='%s' "%(tablename,row_name,row_name,_source,uuid)
-            cursor.execute(sql)
-            log("%d:%s"%(_count,sql))
-    conn.commit()
-    conn.close()
+
 
 if __name__ == '__main__':
     # di = Dataflow_init()

+ 646 - 0
BaseDataMaintenance/maintenance/document/ApprovalData.py

@@ -0,0 +1,646 @@
+
+from BaseDataMaintenance.common.Utils import *
+from BaseDataMaintenance.dataSource.source import getConnect_ots,getConnect_ots_capacity
+from tablestore import *
+import pandas as pd
+from queue import Queue
+from BaseDataMaintenance.common.multiThread import MultiThreadHandler
+from BaseDataMaintenance.model.ots.document import Document
+
+import json
+from uuid import uuid4
+from bs4 import BeautifulSoup
+
+'''
+"approval": [
+        {
+            "approval_items": "", # approval items
+            "approval_result": "", # approval result
+            "approver": "", # approving department
+            "city": "深圳",
+            "construct_company": "深圳市赛孚电子科技有限公司", # construction unit
+            "construction_scale": "", # construction scale
+            "declare_company": "", # declaring unit
+            "district": "光明",
+            "doc_num": "", # approval document number
+            "evaluation_agency": "", # environmental impact assessment agency
+            "legal_person": "陈雷", # project legal person
+            "moneysource": "", # source of funds
+            "phone": "",
+            "pro_type": "", # declaration type
+            "project_addr": "广东省深圳市光明区玉塘街道田寮社区第七工业区26栋301",
+            "project_code": "",
+            "project_name": "深圳市赛孚电子科技有限公司销售医用射线装置项目",
+            "properties": "新建", # construction nature (e.g. new construction)
+            "province": "广东",
+            "time_commencement": "", # commencement time
+            "time_completion": "", # completion time
+            "time_declare": "", # declaration time
+            "total_tendereeMoney": "200000", # total investment
+            "year_limit": "", # construction period (years)
+            "compilation_unit": "编制单位",  # compiling unit
+            "publisher": "发布单位",  # publishing unit
+            "time_approval": "审批时间",  # approval time
+            "time_release": "发布日期"  # release date
+        }
+    ]
+'''
+
+
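+# maps approval field names to their Chinese display labels, apparently used when exporting approval records to tables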
+key_trans = {
+    "doctitle":"公告标题",
+    "page_time":"公告时间",
+    "province": "省份",
+    "city": "城市",
+    "district": "地区",
+
+    "approval_items": "审批事项",
+    "approval_result": "审批结果",
+    "declare_company": "申报单位",
+    "construct_company": "建设单位",
+    "evaluation_agency": "环评机构",
+    "approver": "审批部门",
+    "compilation_unit": "编制单位",
+    "publisher": "发布单位",
+
+    "total_tendereeMoney": "总投资",
+    "construction_scale": "建设规模",
+    "proportion":"建筑面积",
+    "usearea":"用地面积",
+
+    "doc_num": "审批文号",
+
+    "legal_person": "项目法人",
+    "moneysource": "资金来源",
+    "moneyuse":"资金构成",
+    "env_invest":"环保投资",
+    "phone": "电话",
+    "pro_type": "申报类型",
+    "project_addr": "项目地址",
+    "project_code": "项目编号",
+    "project_name": "项目名称",
+    "properties": "建设性质",
+    "time_commencement": "开工时间",
+    "time_completion": "竣工时间",
+    "time_declare": "申报时间",
+
+    "year_limit": "建设年限",
+
+    "time_approval":"审批时间",
+    "time_release": "发布日期"
+}
+
+key_trans_d = {"docid":"公告id"}
+key_trans_d.update(key_trans)
+
+
+
+
+def extract_proportion(content, has_preffix=True):
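+    # extract a construction-area / length expression (e.g. "建筑面积约1200平方米") and return (raw match, normalized value with unit)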
+    if not content:
+        return "", ""
+    # log("content")
+    # log(content)
+    suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
+    reg_dict = {
+        0: "(?P<proportion>(总((建筑|建设)(面积|规模)|长|长度))" + suffix,
+        1: "(?P<proportion>((建筑|建设)(面积|规模)|全长)" + suffix,
+        2: "(?P<proportion>((建筑|建设|区域)?面积|全长|项目规模)" + suffix
+    }
+
+    if not has_preffix:
+        reg_dict[3] = "(?P<proportion>" + suffix
+
+    _proportion = ""
+    for i in range(len(list(reg_dict.keys()))):
+        if _proportion:
+            break
+        _pattern = reg_dict.get(i)
+        # logging.info('content ' + str(content))
+        match = re.search(_pattern, str(content))
+        if match:
+            _proportion = match.groupdict().get("proportion", "")
+
+    if not _proportion:
+        return "", ""
+
+    # normalize the matched expression into a numeric value with a unit
+    multiple_cnt = 1
+    digit = ""
+
+    # extract the numeric part
+    match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
+    if match:
+        # logging.info(str(_proportion) + '  ' + str(match.group()))
+        d1 = match.group('d1')
+        d2 = match.group('d2')
+        try:
+            d1 = int(re.sub(',', '', d1))
+        except:
+            return "", ""
+        if d2:
+            d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
+            # print('d1, d2', d1, d2)
+            d1 += d2
+        digit = d1
+    # print('digit', digit)
+
+    # resolve the Chinese multiplier (十/百/千/万/亿)
+    _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
+    match = re.search('[十百千万亿]+', _proportion2)
+    _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
+    if match:
+        for c in match.group():
+            multiple_cnt *= _dict.get(c)
+        _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
+    else:
+        _proportion3 = _proportion2
+    # print('multiple_cnt2', multiple_cnt)
+
+    # decide whether the unit is an area (㎡) or a length (m)
+    match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
+    if match:
+        unit = '㎡'
+    else:
+        unit = 'm'
+
+    # resolve the unit multiplier (千/公里/亩/顷 etc.)
+    match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
+    if match:
+        if unit == 'm':
+            if re.search('[kK千公]', match.group()):
+                multiple_cnt *= 1000
+            elif re.search('[里]', match.group()):
+                multiple_cnt *= Decimal(str(500))
+        else:
+            if '亩' in match.group():
+                multiple_cnt *= Decimal(str(666.67))
+            elif '顷' in match.group():
+                multiple_cnt *= 10000
+            elif re.search('千米|公里|k[mM㎡]', match.group()):
+                multiple_cnt *= 1000000
+    # print('multiple_cnt1', multiple_cnt)
+
+    # 拼接
+    digit = str(digit * multiple_cnt) + unit
+
+    return _proportion, digit
+
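+# extract_usearea: same normalization steps as extract_proportion above, but keyed on
+# land-use phrases (总用地面积/占地面积/使用面积 ...); returns (matched phrase, normalized value).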
+def extract_usearea(content, has_preffix=True):
+    if not content:
+        return "", ""
+    # log("content")
+    # log(content)
+    suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
+    reg_dict = {
+        0: "(?P<proportion>(总((用地|占地|使用)(面积|规模)|长|长度))" + suffix,
+        1: "(?P<proportion>((用地|占地|使用)(面积|规模)|全长)" + suffix,
+        2: "(?P<proportion>((用地|占地|使用)?面积)" + suffix
+    }
+
+    if not has_preffix:
+        reg_dict[3] = "(?P<proportion>" + suffix
+
+    _proportion = ""
+    for i in range(len(list(reg_dict.keys()))):
+        if _proportion:
+            break
+        _pattern = reg_dict.get(i)
+        # logging.info('content ' + str(content))
+        match = re.search(_pattern, str(content))
+        if match:
+            _proportion = match.groupdict().get("proportion", "")
+
+    if not _proportion:
+        return "", ""
+
+    # 统一格式
+    multiple_cnt = 1
+    digit = ""
+
+    # 确定具体数字
+    match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
+    if match:
+        # logging.info(str(_proportion) + '  ' + str(match.group()))
+        d1 = match.group('d1')
+        d2 = match.group('d2')
+        try:
+            d1 = int(re.sub(',', '', d1))
+        except:
+            return "", ""
+        if d2:
+            d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
+            # print('d1, d2', d1, d2)
+            d1 += d2
+        digit = d1
+    # print('digit', digit)
+
+    # 确定中文倍数
+    _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
+    match = re.search('[十百千万亿]+', _proportion2)
+    _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
+    if match:
+        for c in match.group():
+            multiple_cnt *= _dict.get(c)
+        _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
+    else:
+        _proportion3 = _proportion2
+    # print('multiple_cnt2', multiple_cnt)
+
+    # 确定面积/长度
+    match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
+    if match:
+        unit = '㎡'
+    else:
+        unit = 'm'
+
+    # 确定单位倍数
+    match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
+    if match:
+        if unit == 'm':
+            if re.search('[kK千公]', match.group()):
+                multiple_cnt *= 1000
+            elif re.search('[里]', match.group()):
+                multiple_cnt *= Decimal(str(500))
+        else:
+            if '亩' in match.group():
+                multiple_cnt *= Decimal(str(666.67))
+            elif '顷' in match.group():
+                multiple_cnt *= 10000
+            elif re.search('千米|公里|k[mM㎡]', match.group()):
+                multiple_cnt *= 1000000
+    # print('multiple_cnt1', multiple_cnt)
+
+    # 拼接
+    digit = str(digit * multiple_cnt) + unit
+
+    return _proportion, digit
+
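+# extract_env_invest: grab the environmental-protection investment ("环保投资…元") and return it
+# as a unified money value via getUnifyMoney, or "" when absent or zero.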
+def extract_env_invest(content):
+    pattern = "环保投资[大概约为是::]*(?P<invs>\d+(\.\d+)?万?元)"
+
+    match = re.search(pattern,content)
+    if match is not None:
+        invest =  match.groupdict().get("invs","")
+        money = getUnifyMoney(invest)
+        if money>0:
+            return money
+    return ""
+
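+# extract_moneyuse: keep the short sentences that state a fee/expense amount ("…费…万元")
+# and join them with "," as the 资金构成 field.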
+def extract_moneyuse(content):
+    list_sentences = re.split(",|。",content)
+    list_data = []
+    pattern = "^.{,20}[费用|预备费|费][大概约为是::]*\d+(\.\d+)?万?元.{,20}$"
+    for sentence in list_sentences:
+        match = re.search(pattern,sentence)
+        if match is not None:
+            list_data.append(sentence)
+    return ",".join(list_data)
+
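+# get_approval_data: load one document by docid, run the extractors above over its html text,
+# and copy page_time/proportion/usearea/env_invest/moneyuse plus area info onto every entry
+# of its approval_json; returns the enriched list (None if the document has no approval_json).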
+def get_approval_data(ots_client,ots_capacity,docid):
+
+    bool_query = BoolQuery(must_queries=[
+        TermQuery("docid",docid)
+    ])
+    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query),
+                                                                   ColumnsToGet(["doctitle","project_name","page_time","project_code","approval_json","extract_json"],return_type=ColumnReturnType.SPECIFIED))
+    list_data = getRow_ots(rows)
+    for _d in list_data:
+        approval_json = _d.get("approval_json")
+        partitionkey = _d.get("partitionkey")
+        docid = _d.get("docid")
+        doctitle = _d.get("doctitle")
+        project_name = _d.get("project_name")
+        page_time = _d.get("page_time")
+        extract_json = _d.get("extract_json")
+
+        _d_html = {"partitionkey":partitionkey,"docid":docid}
+        _html = Document(_d_html)
+        _html.fix_columns(ots_capacity,["dochtmlcon"],True)
+        dochtml = _html.getProperties().get("dochtmlcon","")
+        doctextcon = BeautifulSoup(dochtml,"lxml").get_text()
+        attachmenttextcon = ""
+        try:
+            _extract = json.loads(extract_json)
+        except Exception  as e:
+            _extract = {}
+        proportion = _extract.get("pb",{}).get("proportion")
+        _,usearea = extract_usearea(doctextcon+attachmenttextcon)
+        env_invest = extract_env_invest(doctextcon+attachmenttextcon)
+        moneyuse = extract_moneyuse(doctextcon+attachmenttextcon)
+
+        if approval_json:
+            list_approval = json.loads(approval_json)
+            for _appr in list_approval:
+                _appr["partitionkey"] = partitionkey
+                _appr["docid"] = docid
+                _appr["doctitle"] = doctitle
+                _appr["page_time"] = page_time
+                _appr["proportion"] = proportion
+                _appr["usearea"] = usearea
+                _appr["env_invest"] = env_invest
+                _appr["moneyuse"] = moneyuse
+
+                fix_area(ots_client,_appr)
+
+                construction_scale = _appr.get("construction_scale","")
+                proportion,_ = extract_proportion(construction_scale)
+                if proportion!="":
+                    _appr["proportion"] = proportion
+                _,usearea = extract_usearea(construction_scale)
+                if usearea!="":
+                    _appr["usearea"] = usearea
+                env_invest = extract_env_invest(construction_scale)
+                if env_invest!="":
+                    _appr["env_invest"] = env_invest
+                moneyuse = extract_moneyuse(construction_scale)
+                if moneyuse!="":
+                    _appr["moneyuse"] = moneyuse
+
+            return list_approval
+
+
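+# check_approval: two approval records are treated as the same project when at least one of the
+# key fields (companies, total money, proportion, usearea, doc_num, project_code) is non-empty
+# and identical on both sides.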
+def check_approval(appr1,appr2):
+    check_keys = ["declare_company","construct_company","total_tendereeMoney","proportion","usearea","doc_num","project_code"]
+    same_count = 0
+    for k in check_keys:
+        if k in appr1 and k in appr2:
+            if appr1[k]==appr2[k] and appr1[k] is not None and appr1[k]!="":
+                same_count += 1
+
+    if same_count>=1:
+        return True
+    return False
+
+
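+# merge_approval_real: phrase-search other published documents (status 201-301) whose title, body
+# or attachment text mentions this approval's doc_num, doctitle, project_name or project_code,
+# enrich their approval entries the same way as get_approval_data, and keep those passing check_approval.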
+def merge_approval_real(ots_client,ots_capacity,approval):
+    doc_num = approval.get("doc_num","")
+    doctitle = approval.get("doctitle","")
+    project_name = approval.get("project_name","")
+    project_code = approval.get("project_code","")
+
+    docid = approval.get("docid")
+    should_queries = []
+
+    if doc_num!="":
+        should_queries.append(MatchPhraseQuery("doctitle",doc_num))
+        should_queries.append(MatchPhraseQuery("doctextcon",doc_num))
+        should_queries.append(MatchPhraseQuery("attachmenttextcon",doc_num))
+    if doctitle!="":
+        should_queries.append(MatchPhraseQuery("doctitle",doctitle))
+        should_queries.append(MatchPhraseQuery("doctextcon",doctitle))
+        should_queries.append(MatchPhraseQuery("attachmenttextcon",doctitle))
+    if project_name!="":
+        should_queries.append(MatchPhraseQuery("doctitle",project_name))
+        should_queries.append(MatchPhraseQuery("doctextcon",project_name))
+        should_queries.append(MatchPhraseQuery("attachmenttextcon",project_name))
+    if project_code!="":
+        should_queries.append(MatchPhraseQuery("doctitle",project_code))
+        should_queries.append(MatchPhraseQuery("doctextcon",project_code))
+        should_queries.append(MatchPhraseQuery("attachmenttextcon",project_code))
+
+
+    _query = BoolQuery(should_queries=should_queries,must_not_queries=[TermQuery("docid",docid)])
+    bool_query = BoolQuery(must_queries=[
+        RangeQuery("status",201,301),
+        _query
+    ])
+    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query),
+                                                                   ColumnsToGet(["doctitle","page_time","project_name","project_code","approval_json","extract_json"],return_type=ColumnReturnType.SPECIFIED))
+    list_data = getRow_ots(rows)
+    approvals = [approval]
+    for _d in list_data:
+        approval_json = _d.get("approval_json")
+        partitionkey = _d.get("partitionkey")
+        docid = _d.get("docid")
+        doctitle = _d.get("doctitle")
+        project_name = _d.get("project_name")
+        page_time = _d.get("page_time")
+        extract_json = _d.get("extract_json")
+
+
+        _d_html = {"partitionkey":partitionkey,"docid":docid}
+        _html = Document(_d_html)
+        _html.fix_columns(ots_capacity,["dochtmlcon"],True)
+        dochtml = _html.getProperties().get("dochtmlcon","")
+        doctextcon = BeautifulSoup(dochtml,"lxml").get_text()
+        attachmenttextcon = ""
+
+        try:
+            _extract = json.loads(extract_json)
+        except Exception  as e:
+            _extract = {}
+        proportion = _extract.get("pb",{}).get("proportion")
+        _,usearea = extract_usearea(doctextcon+attachmenttextcon)
+        env_invest = extract_env_invest(doctextcon+attachmenttextcon)
+        moneyuse = extract_moneyuse(doctextcon+attachmenttextcon)
+        if approval_json:
+            list_approval = json.loads(approval_json)
+            for _appr in list_approval:
+                _appr["partitionkey"] = partitionkey
+                _appr["docid"] = docid
+                _appr["doctitle"] = doctitle
+                _appr["page_time"] = page_time
+                _appr["usearea"] = usearea
+                _appr["env_invest"] = env_invest
+                _appr["moneyuse"] = moneyuse
+
+                fix_area(ots_client,_appr)
+
+                construction_scale = _appr.get("construction_scale","")
+                proportion,_ = extract_proportion(construction_scale)
+                if proportion!="":
+                    _appr["proportion"] = proportion
+                _,usearea = extract_usearea(construction_scale)
+                if usearea!="":
+                    _appr["usearea"] = usearea
+                env_invest = extract_env_invest(construction_scale)
+                if env_invest!="":
+                    _appr["env_invest"] = env_invest
+                moneyuse = extract_moneyuse(construction_scale)
+                if moneyuse!="":
+                    _appr["moneyuse"] = moneyuse
+                if check_approval(approval,_appr):
+                    approvals.append(_appr)
+    return approvals
+
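+# get_enterprise_area: look up province/city/district of a company name in the enterprise index;
+# returns an empty dict when the company is unknown.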
+def get_enterprise_area(ots_client,name):
+    bool_query = BoolQuery(must_queries=[
+        TermQuery("name",name)
+    ])
+    rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
+                                                                   SearchQuery(bool_query),
+                                                                   ColumnsToGet(["province","city","district"],return_type=ColumnReturnType.SPECIFIED))
+    list_data = getRow_ots(rows)
+    _d = {}
+    if len(list_data)>0:
+        _d["province"] = list_data[0].get("province","")
+        _d["city"] = list_data[0].get("city","")
+        _d["district"] = list_data[0].get("district","")
+    return _d
+
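+# area_count: number of province/city/district fields carrying a meaningful value
+# (i.e. not empty, 全国 or 未知).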
+def area_count(_d):
+    keys = ["province","city","district"]
+    return sum([1 if _d.get(k,"") not in ("","全国","未知") else 0 for k in keys])
+
+def fix_area(ots_client,appr):
+    if appr.get("district","")!="":
+        return
+    declare_company = appr.get("declare_company","")
+    _d = get_enterprise_area(ots_client,declare_company)
+    if area_count(_d)>area_count(appr):
+        appr.update(_d)
+
+    construct_company = appr.get("construct_company","")
+    _d = get_enterprise_area(ots_client,construct_company)
+    if area_count(_d)>area_count(appr):
+        appr.update(_d)
+
+    approver = appr.get("approver","")
+    _d = get_enterprise_area(ots_client,approver)
+    if area_count(_d)>area_count(appr):
+        appr.update(_d)
+
+    compilation_unit = appr.get("compilation_unit","")
+    _d = get_enterprise_area(ots_client,compilation_unit)
+    if area_count(_d)>area_count(appr):
+        appr.update(_d)
+
+    publisher = appr.get("publisher","")
+    _d = get_enterprise_area(ots_client,publisher)
+    if area_count(_d)>area_count(appr):
+        appr.update(_d)
+
+
+
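+# generate_projects: fold the matched approval records (sorted by page_time ascending) into one
+# project dict under a fresh uuid; later records overwrite non-empty fields, while the
+# province/city/district triple is only refreshed when one of its levels is still missing.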
+def generate_projects(approvals):
+    project_id = str(uuid4())
+    approvals.sort(key=lambda x:x.get("page_time",""),reverse=False)
+    _dict = {}
+    for appr in approvals:
+        _d = {}
+        _d_area = {}
+        for k,v in appr.items():
+            if v is not None and v!="":
+                if k in ("province","city","district"):
+                    _d_area[k] = v
+                else:
+                    _d[k] = v
+        if _dict.get("province","")=="" and _d_area.get("province","")!="":
+            _dict.update(_d_area)
+        if _dict.get("city","")=="" and _d_area.get("city","")!="":
+            _dict.update(_d_area)
+        if _dict.get("district","")=="" and _d_area.get("district","")!="":
+            _dict.update(_d_area)
+        _dict.update(_d)
+    _dict["id"] = project_id
+    return _dict
+
+
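+# merge_approval: driver for a one-off run. Collect the docids of approval documents
+# (docchannel 302) published on 2024-11-04, merge their approval records across documents with
+# 30 worker threads, group them into projects and export the results to a.xlsx / b.xlsx.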
+def merge_approval():
+    ots_client = getConnect_ots()
+    ots_capacity = getConnect_ots_capacity()
+
+    list_data = []
+
+    # filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-11\20241104审批项目公告_审批要素.xlsx"
+    # df = pd.read_excel(filename)
+    # _count = 0
+    # for docid in df["公告id"]:
+    #     docid = int(docid)
+    #     _count += 1
+    #     # if _count>3000:
+    #     #     break
+    #     # if docid!=400066972170 and docid!=400066972181:
+    #     #     continue
+    #     # list_approval = get_approval_data(ots_client,docid)
+    #     # if list_approval:
+    #     #     list_data.extend(list_approval)
+    #     list_data.append(docid)
+
+    bool_query = BoolQuery(must_queries=[
+        RangeQuery("status",201,301),
+        TermQuery("page_time","2024-11-04"),
+        TermQuery("docchannel",302),
+    ])
+    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time")]),limit=100,get_total_count=True),
+                                                                   ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
+    list_row = getRow_ots(rows)
+    for _data in list_row:
+        list_data.append(_data.get("docid"))
+
+    while next_token:
+        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                   ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
+        list_row = getRow_ots(rows)
+        for _data in list_row:
+            list_data.append(_data.get("docid"))
+        print("%d/%d"%(len(list_data),total_count))
+        # if len(list_data)>=2000:
+        #     break
+
+    task_queue = Queue()
+    for _data in list_data:
+        task_queue.put(_data)
+
+    result_queue = Queue()
+
+    def merge_approval_handle(docid,result_queue):
+        print("docid",docid)
+        list_approval = get_approval_data(ots_client,ots_capacity,docid)
+        if list_approval:
+            for appr in list_approval:
+                approvals = merge_approval_real(ots_client,ots_capacity,appr)
+                result_queue.put(approvals)
+
+    mt = MultiThreadHandler(task_queue,merge_approval_handle,result_queue,30)
+    mt.run()
+
+    list_approvals = []
+    try:
+        while 1:
+            item = result_queue.get(timeout=1)
+            list_approvals.append(item)
+    except:
+        pass
+
+    data_approval = []
+    data_approvals_p = []
+    for approvals in list_approvals:
+        _project = generate_projects(approvals)
+        _project_id = _project.get("id")
+
+        for _approval in approvals:
+
+            _d = {"项目id":_project_id}
+            for k,v in key_trans_d.items():
+                if k in _approval:
+                    _d[v] = _approval[k]
+                else:
+                    _d[v] = ""
+            data_approval.append(_d)
+        _d = {"项目id":_project_id}
+        for k,v in key_trans.items():
+            if k in _project:
+                _d[v] = _project[k]
+            else:
+                _d[v] = ""
+        data_approvals_p.append(_d)
+
+
+
+    df_approval = pd.DataFrame(data_approval)
+    df_approvals_p = pd.DataFrame(data_approvals_p)
+    df_approval.to_excel("a.xlsx")
+    df_approvals_p.to_excel("b.xlsx")
+
+
+
+
+
+if __name__ == '__main__':
+    merge_approval()

+ 98 - 19
BaseDataMaintenance/maintenance/enterprise/enterprise2Redis.py

@@ -64,8 +64,7 @@ class enterprise2Redis():
                                   http_auth=('elastic', 'WWBu9#1HWHo$$gJm'),
                                   port=9200)
         body = {
-            "_source": ["name", "history_names", 'legal_person', 'reg_capital', 'credit_code', 'tax_number',
-                        'reg_number', 'org_number',
+            "_source": ["name", 'credit_code',
                         "zhao_biao_number", "zhong_biao_number", "dai_li_number", "bid_number"],
             'query': {  # 查询命令
                 "bool": {
@@ -86,7 +85,7 @@ class enterprise2Redis():
                 }
             },
             "sort": [
-                {"create_time": "desc"}
+                {"update_time": "desc"}
             ]
         }
 
@@ -114,21 +113,34 @@ class enterprise2Redis():
         for item in result:
             item = item['_source']
             name = item['name']
-            history_names = item.get("history_names", "")
-            legal_person = item.get("legal_person", "")
-            reg_capital = item.get("reg_capital", "")
+            # history_names = item.get("history_names", "")
+            # legal_person = item.get("legal_person", "")
+            # reg_capital = item.get("reg_capital", "")
             credit_code = item.get("credit_code", "")
-            tax_number = item.get("tax_number", "")
-            reg_number = item.get("reg_number", "")
-            org_number = item.get("org_number", "")
+            credit_code = re.sub("\s","",credit_code)
+            credit_code = credit_code if re.search("^[\dA-Z]{2}\d{6}[\dA-Z]{10}$",credit_code) else ""
+            # tax_number = item.get("tax_number", "")
+            # tax_number = re.sub("\s","",tax_number)
+            # tax_number = tax_number if len(tax_number)>=15 and not re.search("@|\d{4}-\d{2}-\d{2}|\.(com|cn|COM|CN)",tax_number) else ""
+            # reg_number = item.get("reg_number", "")
+            # reg_number = re.sub("\s","",reg_number)
+            # reg_number = reg_number if len(reg_number)>=10 and not re.search("@|\d{4}-\d{2}-\d{2}|\.(com|cn|COM|CN)",reg_number) else ""
+            # org_number = item.get("org_number", "") # 已弃用,统一社会信用代码取代组织机构代码
+            # org_number = org_number if not re.search("@|\d{4}-\d{2}-\d{2}|\.(com|cn|COM|CN)",org_number) else ""
             zhao_biao_number = item.get("zhao_biao_number", 0)
+            zhao_biao_number = zhao_biao_number if zhao_biao_number else 0
             zhong_biao_number = item.get("zhong_biao_number", 0)
+            zhong_biao_number = zhong_biao_number if zhong_biao_number else 0
             dai_li_number = item.get("dai_li_number", 0)
+            dai_li_number = dai_li_number if dai_li_number else 0
             bid_number = item.get("bid_number", 0)
+            bid_number = bid_number if bid_number else 0
 
             num = 0
-            for business in [history_names, legal_person, reg_capital, credit_code, tax_number, reg_number, org_number]:
-                if len(str(business).replace("-", "")) > 1:
+            for business in [credit_code]: # 新增实体只判断credit_code
+                business = re.sub("\s-","",str(business))
+                business = re.sub("^nan$","",business)
+                if len(business) > 1:
                     num += 1
             isLegal = isLegalNewName(name)
             if isLegal >= 0:
@@ -136,14 +148,16 @@ class enterprise2Redis():
                     legal_name_num += 1
                     _json = {"have_business": 1, "zhao_biao_number": zhao_biao_number,
                              "zhong_biao_number": zhong_biao_number,
-                             "dai_li_number": dai_li_number, "bid_number": bid_number}
+                             "dai_li_number": dai_li_number, "bid_number": bid_number,
+                             "credit_code":credit_code}
                     _json = json.dumps(_json, ensure_ascii=False)
                     add_redis_list.append((name, _json))
                 elif num == 0 and bid_number > 0 and len(name) > 4:
                     legal_name_num += 1
                     _json = {"have_business": 0, "zhao_biao_number": zhao_biao_number,
                              "zhong_biao_number": zhong_biao_number,
-                             "dai_li_number": dai_li_number, "bid_number": bid_number}
+                             "dai_li_number": dai_li_number, "bid_number": bid_number,
+                             "credit_code":credit_code}
                     _json = json.dumps(_json, ensure_ascii=False)
                     add_redis_list.append((name, _json))
 
@@ -259,13 +273,21 @@ def isLegalNewName(enterprise_name):
         return -1
     if re.search("[区市镇乡县洲州路街]$", enterprise_name) and not re.search("(超市|门市|保护区|园区|景区|校区|社区|服务区|工区|小区|集市|花市|夜市|学区|旅游区|矿区|林区|度假区|示范区|菜市)$", enterprise_name):
         return -1
-    if re.search("^个人|^个体|测试$", enterprise_name):
+    # if re.search("^.?(个人|个体|测试)|(个人|个体|测试).?$", enterprise_name):
+    if re.search("^.?测试|测试.?$", enterprise_name):
         return -1
     if re.search("个人|个体", enterprise_name):
-        _split = re.split("个人|个体", enterprise_name)
-        if len(_split[0]) <= 5:
-            return -1
-    if re.search("测试", enterprise_name) and len(enterprise_name) < 8:
+        if re.search("(个人|个体).?工商户",enterprise_name):
+            #  按照字数过滤,比如剔除个体工商户这些字眼之后还有6个字以上的,可能是有用的 2024/12/5新增
+            _name = re.sub("(个人|个体).?[工商户]*|[\(\)(){}\{\}\[\]【】]","",enterprise_name)
+            if len(re.findall("[\u4e00-\u9fa5]", _name))<=4:
+                return -1
+        else:
+            _split = re.split("个人|个体", enterprise_name)
+            if len(_split[0]) <= 5:
+                return -1
+
+    if (re.search("测试", enterprise_name) and len(enterprise_name) < 8) or len(re.findall("测试", enterprise_name))>1:
         return -1
     if re.search("^(省|自治[县州区]|市|县|区|镇|乡|街道)", enterprise_name) and not re.search(
             "^(镇江|乡宁|镇原|镇海|镇安|镇巴|镇坪|镇赉|镇康|镇沅|镇雄|镇远|镇宁|乡城|镇平|市中|市南|市北)", enterprise_name):
@@ -281,7 +303,61 @@ def isLegalNewName(enterprise_name):
         return 0
     return 1
 
-
+def test1():
+    legal_name_num = 0
+    add_redis_list = []
+    result =[{'_source': {'reg_number': '230602601147025', 'org_number': 'MADB2NAN6', 'update_time': '2024-08-01 07:28:14', 'credit_code': '92230602MADB2NAN6N', 'name': '大庆萨尔图区若飞物资经销处(个体工商户)'}, '_score': None, 'sort': ['2024-08-01 07:28:14'], '_index': 'enterprise_v3', '_type': '_doc', '_id': '大庆萨尔图区若飞物资经销处(个体工商户)'}]
+    for item in result:
+        item = item['_source']
+        name = item['name']
+        # history_names = item.get("history_names", "")
+        # legal_person = item.get("legal_person", "")
+        # reg_capital = item.get("reg_capital", "")
+        credit_code = item.get("credit_code", "")
+        credit_code = re.sub("\s", "", credit_code)
+        credit_code = credit_code if re.search("^[\dA-Z]{2}\d{6}[\dA-Z]{10}$", credit_code) else ""
+        # tax_number = item.get("tax_number", "")
+        # tax_number = re.sub("\s","",tax_number)
+        # tax_number = tax_number if len(tax_number)>=15 and not re.search("@|\d{4}-\d{2}-\d{2}|\.(com|cn|COM|CN)",tax_number) else ""
+        # reg_number = item.get("reg_number", "")
+        # reg_number = re.sub("\s","",reg_number)
+        # reg_number = reg_number if len(reg_number)>=10 and not re.search("@|\d{4}-\d{2}-\d{2}|\.(com|cn|COM|CN)",reg_number) else ""
+        # org_number = item.get("org_number", "") # 已弃用,统一社会信用代码取代组织机构代码
+        # org_number = org_number if not re.search("@|\d{4}-\d{2}-\d{2}|\.(com|cn|COM|CN)",org_number) else ""
+        zhao_biao_number = item.get("zhao_biao_number", 0)
+        zhao_biao_number = zhao_biao_number if zhao_biao_number else 0
+        zhong_biao_number = item.get("zhong_biao_number", 0)
+        zhong_biao_number = zhong_biao_number if zhong_biao_number else 0
+        dai_li_number = item.get("dai_li_number", 0)
+        dai_li_number = dai_li_number if dai_li_number else 0
+        bid_number = item.get("bid_number", 0)
+        bid_number = bid_number if bid_number else 0
+
+        num = 0
+        for business in [credit_code]:  # 新增实体只判断credit_code
+            business = re.sub("\s-", "", str(business))
+            business = re.sub("^nan$", "", business)
+            if len(business) > 1:
+                num += 1
+        isLegal = isLegalNewName(name)
+        if isLegal >= 0:
+            if num >= 1 and len(name) > 4:
+                legal_name_num += 1
+                _json = {"have_business": 1, "zhao_biao_number": zhao_biao_number,
+                         "zhong_biao_number": zhong_biao_number,
+                         "dai_li_number": dai_li_number, "bid_number": bid_number,
+                         "credit_code": credit_code}
+                _json = json.dumps(_json, ensure_ascii=False)
+                add_redis_list.append((name, _json))
+            elif num == 0 and bid_number > 0 and len(name) > 4:
+                legal_name_num += 1
+                _json = {"have_business": 0, "zhao_biao_number": zhao_biao_number,
+                         "zhong_biao_number": zhong_biao_number,
+                         "dai_li_number": dai_li_number, "bid_number": bid_number,
+                         "credit_code": credit_code}
+                _json = json.dumps(_json, ensure_ascii=False)
+                add_redis_list.append((name, _json))
+    print(add_redis_list)
 
 if __name__ == '__main__':
 
@@ -308,6 +384,9 @@ if __name__ == '__main__':
 
     # e = enterprise2Redis()
     # e.monitor_enterprise2redis()
+
+    # print(isLegalNewName('大庆萨尔图区若飞物资经销处(个体工商户)'))
+
     pass
 
 

+ 164 - 0
BaseDataMaintenance/maintenance/gpt_extract.py

@@ -0,0 +1,164 @@
+#coding:utf8
+
+from BaseDataMaintenance.chat.ERNIE_utils import *
+
+from BaseDataMaintenance.dataSource.source import getConnect_ots
+from BaseDataMaintenance.chat.chatUtil import *
+
+from tablestore import *
+from BaseDataMaintenance.common.Utils import getRow_ots,getCurrent_date,timeAdd
+from bs4 import BeautifulSoup
+import json
+import re
+import pandas as pd
+import time
+
+
+
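+# get_columns: fetch the requested columns of a single document from the document index.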
+def get_columns(ots_client,docid,columns):
+
+    bool_query = BoolQuery(must_queries=[TermQuery("docid",docid)])
+    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query),
+                                                                   ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+    list_data = getRow_ots(rows)
+    _dict = {}
+    if len(list_data)==1:
+        _dict = list_data[0]
+    return _dict
+
+
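+# jsonchat: send the prompt to the chat model with up to try_times attempts, back off when the
+# response carries an error_msg, and return the JSON string found inside a ```json ... ``` block
+# of the reply (None if no attempt yields parseable JSON).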
+def jsonchat(msg,try_times):
+
+    try:
+        print(msg)
+    except Exception as e:
+        pass
+    while try_times>0:
+        try:
+            try_times -= 1
+            resp = chat(msg)
+            time.sleep(1)
+
+            if resp.status_code == 200:
+                result_dict = json.loads(resp.text)
+                result = result_dict.get("result", "")
+                error_msg = result_dict.get("error_msg")
+                if error_msg is not None:
+                    print("error_msg",error_msg)
+                    time.sleep(10)
+                    continue
+                _pattern = "```json(?P<json>.*)```"
+                _search = re.search(_pattern, result, re.DOTALL)
+                if _search is not None:
+                    _json = _search.groupdict().get("json")
+                    _d = json.loads(_json)
+                    return _json
+        except Exception as e:
+            pass
+
+
+def extract_tenderee():
+    filename = r'F:\Workspace2016\DataMining\data\2024-11-26_174430_数据导出.xlsx'
+    df = pd.read_excel(filename)
+
+    ots_client = getConnect_ots()
+
+    list_data = []
+
+    for docid in df["docid"]:
+        docid = int(docid)
+        # if docid!=559799502:
+        #     continue
+        _dict = get_columns(ots_client,docid,["doctextcon","attachmenttextcon","nlp_enterprise","nlp_enterprise_attachment"])
+        doctextcon = _dict.get("doctextcon","")
+        attachmenttextcon = _dict.get("attachmenttextcon","")
+        nlp_enterprise = _dict.get("nlp_enterprise","")
+        nlp_enterprise_attachment = _dict.get("nlp_enterprise_attachment","")
+
+        pre_tenderee = ""
+        if len(nlp_enterprise)>2:
+            _ent = json.loads(nlp_enterprise)
+            pre_tenderee = _ent[0]
+        if len(nlp_enterprise_attachment)>2:
+            _ent = json.loads(nlp_enterprise_attachment)
+            pre_tenderee = _ent[0]
+
+        msg = '''从内容中提取出招标人,招标人应该是公司实体,如果没有则返回"",返回结果为json格式{"tenderee":""}\n%s\n%s''' % (str(doctextcon),str(attachmenttextcon))
+        _json = jsonchat(msg,3)
+        new_tenderee = ""
+        if _json is not None:
+            _d = json.loads(_json)
+            new_tenderee = _d.get("tenderee")
+        new_d = {"docid":docid,"nlp_enterprise":nlp_enterprise,"nlp_enterprise_attachment":nlp_enterprise_attachment,
+                 "pre_tenderee":pre_tenderee,"new_tenderee":new_tenderee}
+        list_data.append(new_d)
+        print(new_d)
+    df1 = pd.DataFrame(list_data)
+    df1.to_excel("tenderee_extract.xlsx",columns=["docid","nlp_enterprise","nlp_enterprise_attachment","pre_tenderee","new_tenderee"])
+
+def prompt_tenderee():
+    _prompt = '招标人,招标人应该是公司实体,如果没有则返回""'
+    _ret = {"招标人":""}
+    return _prompt,_ret
+
+def prompt_budget():
+    _prompt = "预算金额,如果没有则默认0"
+    _ret = {"预算金额":0}
+    return _prompt,_ret
+
+def prompt_win_tenderer():
+    _prompt = '中标人及其中标金额,中标人应该是公司实体,中标金额没有则默认0,中标人与中标金额放到一个字典中,如果有多个,则在数组中分别返回,如果没有则返回空数组'
+    _ret = {"中标人及金额":[{"中标人":"","中标金额":0}]}
+    return _prompt,_ret
+
+def extract_bidding_budget():
+    pass
+
+def extract_win_tenderer():
+    pass
+
+
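+# get_data_to_qualify: page through yesterday's published documents of channels 52/101/119/120
+# and collect their extract_json, stopping at `count` rows (or roughly 300k when count is not set).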
+def get_data_to_qualify(ots_client,count=-1):
+    current_date = getCurrent_date('%Y-%m-%d')
+    last_date = timeAdd(current_date,-1)
+    bool_query = BoolQuery(
+        must_queries=[
+            RangeQuery("crtime",last_date,current_date),
+            RangeQuery("status",201,301),
+            TermsQuery("docchannel",[52,101,119,120])
+        ]
+    )
+
+    list_data = []
+    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query,limit=100,get_total_count=True),
+                                                                   ColumnsToGet(["extract_json"],return_type=ColumnReturnType.SPECIFIED))
+    list_data.extend(getRow_ots(rows))
+    while 1:
+        if next_token is None or len(list_data)>=30*10000:
+            break
+        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                   SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                   ColumnsToGet(["extract_json"],return_type=ColumnReturnType.SPECIFIED))
+        list_data.extend(getRow_ots(rows))
+        if count>0 and len(list_data)>=count:
+            break
+    return list_data
+
+
+def quality_inspection():
+    # stub: quality inspection over the collected extract_json, not implemented yet
+    pass
+
+
+def merge_extract_json():
+    # stub: merging of extract_json results, not implemented yet
+    pass
+
+
+if __name__ == '__main__':
+    extract_tenderee()
+
+
+
+
+
+
+

+ 3 - 3
BaseDataMaintenance/maintenance/preproject/fillColumns.py

@@ -232,7 +232,7 @@ class PreprojectFill():
             win_tenderer_concat = _row.get(preproject_last_win_tenderer_contact)
             win_tenderer_phone = _row.get(preproject_last_win_tenderer_phone)
 
-
+            # 有联系人先根据联系人取电话
             if tenderee is not None and tenderee!="":
                 # if (tenderee_concat is None or tenderee_concat=="") and (tenderee_phone is None or tenderee_phone==""):
                 if tenderee_phone is None or tenderee_phone=="":
@@ -543,8 +543,8 @@ class PreprojectFill():
                         # in_doctextcon, last_doctitle, last_tenderee_contact, last_tenderee_phone
 
                         _preproject = Preproject(result_row)
-                        if not _preproject.exists_row(self.ots_client):
-                            _preproject.update_row(self.ots_client)
+                        # if not _preproject.exists_row(self.ots_client):
+                        _preproject.update_row(self.ots_client)
 
         _mul = MultiThreadHandler(self.purchaseIntention_process_queue,comsumer_handle,None,20)
         _mul.run()

File diff suppressed because it is too large
+ 6 - 3
BaseDataMaintenance/maintenance/product/extract_data.py


+ 4 - 3
BaseDataMaintenance/maintenance/product/htmlparser.py

@@ -120,11 +120,12 @@ class ParseDocument():
             _html = ""
         self.html = _html
 
-        # self.soup = BeautifulSoup(self.html,"lxml")
+
         # self.soup = BeautifulSoup(self.html,"html.parser")
         self.auto_merge_table = auto_merge_table
 
-        self.soup = BeautifulSoup(self.html,"html5lib")
+        self.soup = BeautifulSoup(self.html,"lxml")
+        # self.soup = BeautifulSoup(self.html,"html5lib")
         _body = self.soup.find("body")
         if _body is not None:
             self.soup = _body
@@ -199,7 +200,7 @@ class ParseDocument():
                 if v is not None:
                     groups.append((k,v))
         if len(groups):
-            # groups.sort(key=lambda x:x[0])
+            groups.sort(key=lambda x:x[0])
             return groups
         return None
 

File diff suppressed because it is too large
+ 21 - 1
BaseDataMaintenance/maxcompute/1.py


+ 218 - 13
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -237,6 +237,7 @@ class f_get_extractCount(object):
     def evaluate(self, extractjson):
         if extractjson is not None:
             _extract = json.loads(extractjson)
+            return _extract.get("extract_count",0)
         else:
             _extract = {}
         dict_pack = _extract.get("prem",{})
@@ -776,25 +777,33 @@ def getSimLevel(str1,str2):
     return _v
 
 def getLength(_str):
-    return len(_str if _str is not None else "")
+    return len(str(_str) if _str is not None else "")
 
 def check_money(bidding_budget_less,bidding_budget_greater,
                 win_bid_price_less,win_bid_price_greater,
                 moneys_less,moneys_greater,
                 moneys_attachment_less,moneys_attachment_greater):
 
+    bidding_budget_less_source = bidding_budget_less
+    bidding_budget_greater_source = bidding_budget_greater
+    win_bid_price_less_source = win_bid_price_less
+    win_bid_price_greater_source = win_bid_price_greater
     #只判断最高前六位
     if getLength(bidding_budget_less)>0:
+        bidding_budget_less_source = float(bidding_budget_less_source)
         bidding_budget_less = round(float(bidding_budget_less))
         bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
     if getLength(bidding_budget_greater)>0:
+        bidding_budget_greater_source = float(bidding_budget_greater_source)
         bidding_budget_greater = round(float(bidding_budget_greater))
         bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
 
     if getLength(win_bid_price_less)>0:
+        win_bid_price_less_source = float(win_bid_price_less_source)
         win_bid_price_less = round(float(win_bid_price_less))
         win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
     if getLength(win_bid_price_greater)>0:
+        win_bid_price_greater_source = float(win_bid_price_greater_source)
         win_bid_price_greater = round(float(win_bid_price_greater))
         win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
 
@@ -815,14 +824,21 @@ def check_money(bidding_budget_less,bidding_budget_greater,
                 budget_is_same = True
             if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
                 budget_is_same = True
+            if bidding_budget_less_source in moneys_greater or bidding_budget_less_source in moneys_attachment_greater:
+                budget_is_same = True
             if budget_greater in moneys_less or budget_greater in moneys_attachment_less:
                 budget_is_same = True
+            if bidding_budget_greater_source in moneys_less or bidding_budget_greater_source in moneys_attachment_less:
+                budget_is_same = True
             if budget_is_same=="":
                 return False
 
     if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
+
+
         price_less = float(win_bid_price_less)
         price_greater = float(win_bid_price_greater)
+
         if price_less!=price_greater:
 
             if min(price_less,price_greater)>0:
@@ -832,8 +848,12 @@ def check_money(bidding_budget_less,bidding_budget_greater,
                 price_is_same = True
             if price_less in moneys_greater or price_less in moneys_attachment_greater:
                 price_is_same = True
+            if win_bid_price_less_source in moneys_greater or win_bid_price_less_source in moneys_attachment_greater:
+                price_is_same = True
             if price_greater in moneys_less or price_greater in moneys_attachment_less:
                 price_is_same = True
+            if win_bid_price_greater_source in moneys_less or win_bid_price_greater_source in moneys_attachment_less:
+                price_is_same = True
             if price_is_same=="":
                 return False
     return True
@@ -867,6 +887,85 @@ def check_entity(nlp_enterprise_less,nlp_enterprise_greater,
         return False
     return True
 
+
+def check_punish(punish_less,punish_greater):
+    same_count = 0
+    not_same_count = 0
+    _flag = True
+    keys = list(set(list(punish_less.keys())) | set(list(punish_greater.keys())))
+    for k in keys:
+        v1 = punish_less.get(k)
+        v2 = punish_greater.get(k)
+        if getLength(v1)>0 and getLength(v2)>0:
+            if k=="punish_code":
+                if not check_codes([v1],[v2]):
+                    not_same_count += 1
+                    _flag = False
+                else:
+                    same_count += 1
+            if k=="punishDecision":
+                if getSimilarityOfString(v1,v2)>0.8:
+                    same_count += 1
+            if k in ("complainants","punishPeople","institutions"):
+                if v1==v2:
+                    same_count += 1
+                else:
+                    not_same_count += 1
+                    _flag = False
+    return _flag,same_count,not_same_count
+
+def check_source_type(source_type_less,source_type_greater):
+    if getLength(source_type_less)>0 and getLength(source_type_greater)>0:
+        if source_type_less!=source_type_greater:
+            return False
+    return True
+def check_approval(approval_less,approval_greater,b_log):
+
+    if b_log:
+        logging.info("approval_less %s==approval_greater %s"%(approval_less,approval_greater))
+    for _less in approval_less:
+        for _greater in approval_greater:
+            same_count = 0
+            not_same_count = 0
+            flag = True
+            keys = ["source_stage","source_type","doc_num","project_code","project_name","approval_items","approval_result","approver","construct_company","construction_scale","declare_company","evaluation_agency","legal_person","compilation_unit","time_approval"]
+            for k in keys:
+                v1 = _less.get(k)
+                v2 = _greater.get(k)
+                if getLength(v1)>0 and getLength(v2)>0:
+                    if k in ("source_stage","source_type"):
+                        if v1!=v2:
+                            flag = False
+
+                    if k in ("project_code","doc_num"):
+                        if check_codes([v1],[v2]):
+                            same_count += 1
+                        else:
+                            not_same_count += 1
+                            if b_log:
+                                logging.info("check approval %s false %s-%s"%(k,v1,v2))
+                            flag = False
+                    if k in ("approval_items","approval_result","project_name"):
+                        if getSimilarityOfString(v1,v2)>0.8:
+                            same_count += 1
+                        else:
+                            not_same_count += 1
+                    if k in ("approver","construct_company","declare_company","evaluation_agency","legal_person","compilation_unit"):
+                        if v1==v2:
+                            same_count += 1
+                        else:
+                            not_same_count += 1
+                            if b_log:
+                                logging.info("check approval %s false %s-%s"%(k,v1,v2))
+                            flag = False
+            if flag and same_count>1:
+                return flag,same_count,not_same_count
+    flag = True
+    if len(approval_less)>0 and len(approval_greater)>0:
+        flag = False
+    return flag,0,0
+
+
 def check_codes(project_codes_less,project_codes_greater):
     #check the similarity
     is_same = False
@@ -875,6 +974,8 @@ def check_codes(project_codes_less,project_codes_greater):
 
     for project_code_less in project_codes_less:
         for project_code_greater in project_codes_greater:
+            project_code_less = str(project_code_less).upper()
+            project_code_greater = str(project_code_greater).upper()
             code_sim = getSimilarityOfString(project_code_less,project_code_greater)
             if project_code_less is not None and project_code_greater is not None:
                 if code_sim>0.6:
@@ -900,6 +1001,7 @@ num_pattern = re.compile("^\d+(?:\.\d+)?$")
 num1_pattern = re.compile("[一二三四五六七八九A-Za-z]+")
 location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
 building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
+rebid_pattern = "再次|重新招标|[一二三四五六七八九十]+次"
 date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
 def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[]):
     if code_greater is None:
@@ -961,7 +1063,7 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
                 return False
 
     #check location and keywords
-    for _p in [num1_pattern,building_pattern]:
+    for _p in [num1_pattern,building_pattern,rebid_pattern]:
         num_all_l = re.findall(_p,doctitle_refind_less)
         num_all_g = re.findall(_p,doctitle_refind_greater)
         set_num_l = set(num_all_l)
@@ -995,19 +1097,70 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
                     return False
     return True
 
+
+def product_dump(list_product):
+    _product_l_l = []
+    list_product.sort(key=lambda x:len(x))
+    for _l in list_product:
+        _exists = False
+        for l1 in _product_l_l:
+            if l1 in _l:
+                _exists = True
+                break
+        if not _exists:
+            _product_l_l.append(_l)
+    return _product_l_l
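+# product_dump keeps only the shortest non-overlapping product names, e.g. (illustrative)
+# product_dump(["中央空调", "空调"]) -> ["空调"], since the longer name contains the shorter one.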
 def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
     if getLength(product_less)>0 and getLength(product_greater)>0:
 
         _product_l = product_less.split(split_char)
+        _product_l = product_dump(_product_l)
         _product_g = product_greater.split(split_char)
+        _product_g = product_dump(_product_g)
+        _title_l = doctitle_refine_less
+        _title_g = doctitle_refine_greater
         same_count = 0
         if len(_product_l)>len(_product_g):
             a = _product_g
             _product_g = _product_l
             _product_l = a
+            _title_l = doctitle_refine_greater
+            _title_g = doctitle_refine_less
+        set_product_l_in_title = set()
+        set_product_g_in_title = set()
+        for _l in _product_l:
+            if _title_l.find(_l)>=0:
+                set_product_l_in_title.add(_l)
+        for _g in _product_g:
+            if _title_g.find(_g)>=0:
+                set_product_g_in_title.add(_g)
+        # 限制标题出现的产品要有重叠
+        if len(set_product_l_in_title)>0 and len(set_product_g_in_title)>0:
+
+            
+            _set_union = set_product_l_in_title & set_product_g_in_title
+
+            # 不同的部门若有重叠则通过
+            diff_l = set_product_l_in_title-_set_union
+            diff_g = set_product_g_in_title-_set_union
+
+            diff_dump = product_dump(list(diff_l.union(diff_g)))
+            if not(len(diff_dump)<=len(diff_l) or len(diff_dump)<=len(diff_g)):
+                return False
+
+            # 过于严格,暂时取消
+            # if len(_set_union)==0:
+            #     return False
+            # if len(_set_union)!=len(set_product_l_in_title) and len(_set_union)!=len(set_product_g_in_title):
+            #     _l1 = list(set_product_l_in_title)
+            #     _l2 = list(set_product_g_in_title)
+            #     _l1.extend(_l2)
+            #     _l1 = product_dump(_l1)
+            #     if len(_l1)!=len(_set_union):
+            #         return False
         for _l in _product_l:
             for _g in _product_g:
-                if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>-0 or doctitle_refine_less.find(_g)>=0:
+                if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0 or doctitle_refine_less.find(_g)>=0:
                     same_count += 1
                     break
         if same_count/len(_product_l)>=0.5:
@@ -1020,12 +1173,15 @@ def check_package(package_less,package_greater,split_char=","):
 
         _product_l = package_less.split(split_char)
         _product_g = package_greater.split(split_char)
+        same_level = False
         for _l in _product_l:
             for _g in _product_g:
+                if abs(len(_l)-len(_g))<=2:
+                    same_level = True
                 if _l==_g:
                     return True
-
-        return False
+        if same_level:
+            return False
     return True
 
 def check_time(json_time_less,json_time_greater):
@@ -1056,7 +1212,7 @@ def check_time(json_time_less,json_time_greater):
         return 0
     return 1
 
-def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater="",moneys_less=set(),moneys_greater=set(),moneys_attachment_less=set(),moneys_attachment_greater=set(),page_attachments_less="[]",page_attachments_greater="[]"):
+def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater="",moneys_less=set(),moneys_greater=set(),moneys_attachment_less=set(),moneys_attachment_greater=set(),page_attachments_less="[]",page_attachments_greater="[]",punish_less = {},punish_greater = {},approval_less = [],approval_greater = [],source_type_less = None,source_type_greater=None):
     if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
         return 1
 
@@ -1100,6 +1256,11 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
             if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
                 return 1
 
+    #同一个站源,都有附件但附件没有重叠则不去重
+    if web_source_no_less==web_source_no_greater and len(set_md5_less)>0 and len(set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==0:
+        if b_log:
+            logging.info("same web_site,both has attach but not same web_source_no_less:%s,web_source_no_greater:%s"%(web_source_no_less,web_source_no_greater))
+        return 0
 
     if isinstance(project_codes_less,str):
         project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
@@ -1130,6 +1291,33 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
         same_count += 1
     if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
         same_count += 1
+
+    _flag,_c1,_c2 = check_punish(punish_less,punish_greater)
+    if not _flag:
+        if b_log:
+            logging.info("check_punish failed")
+        return 0
+    else:
+        if b_log:
+            logging.info("check_punish true %d"%(_c1))
+        same_count += _c1
+
+    _flag,_c1,_c2 = check_approval(approval_less,approval_greater,b_log)
+    if not _flag:
+        if b_log:
+            logging.info("check approval failed")
+        return 0
+    else:
+        if b_log:
+            logging.info("check approval true %d"%(_c1))
+        same_count += _c1
+
+    _flag = check_source_type(source_type_less,source_type_greater)
+    if not _flag:
+        if b_log:
+            logging.info("check source type failed")
+        return 0
+
     base_prob = 0
     if min_counts<3:
         base_prob = 0.9
@@ -1140,12 +1328,17 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
     else:
         base_prob = 0.6
     _prob = base_prob*same_count/all_count
-    if min(extract_count_less,extract_count_greater)<=3:
-        if _prob<0.1:
-            _prob = 0.15
+    if min(extract_count_less,extract_count_greater)<=3 and max(extract_count_less,extract_count_greater)>=5:
+        if _prob<0.1 and str(page_time_less)==str(page_time_greater):
+            if str(docchannel_less) not in ("302","303"):
+                _prob = 0.15
         if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
+            if b_log:
+                logging.info("province not same:%s-%s"%(province_less,province_greater))
             return 0
     if _prob<0.1:
+        if b_log:
+            logging.info("prob too low:%f"%(_prob))
         return _prob
 
     check_result = {"pass":1}
@@ -1207,8 +1400,7 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
         else:
             check_result["entity"] = 1
 
-    logging.info("moneys_less"+str(moneys_less)+"---"+str(moneys_attachment_less))
-    logging.info("moneys_less"+str(moneys_greater)+"---"+str(moneys_attachment_greater))
+
     if not check_money(bidding_budget_less,bidding_budget_greater,
                        win_bid_price_less,win_bid_price_greater,
                        moneys_less,moneys_greater,
@@ -1266,6 +1458,8 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
             check_result["time"] = 1
 
     if hard_level==2 and check_result["product"]<=1:
+        if b_log:
+            logging.info("hard_level %s and check_product less than 2"%(str(hard_level)))
         return 0
     if check_result.get("pass",0)==0:
         if b_log:
@@ -1506,7 +1700,11 @@ class f_dumplicate_check(BaseUDTF):
             page_attachments_less = '[]'
         if page_attachments_greater is None:
             page_attachments_greater = '[]'
-        _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
+        punish_less = _extract_less.get("punish",{})
+        punish_greater = _extract_greater.get("punish",{})
+        approval_less = _extract_less.get("approval",[])
+        approval_greater = _extract_greater.get("approval",[])
+        _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater,punish_less = punish_less,punish_greater = punish_greater,approval_less = approval_less,approval_greater = approval_greater)
         self.forward(_prob)
 
 @annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
@@ -1686,6 +1884,8 @@ class f_redump_probability_final_check(BaseUDAF):
                 web_source_no_greater = document_greater["web_source_no"]
                 extract_json_greater = document_greater["extract_json"]
                 page_attachments_greater = document_greater["page_attachments"]
+
+
                 _pass = True
 
                 for document_less in final_group:
@@ -1730,7 +1930,12 @@ class f_redump_probability_final_check(BaseUDAF):
                     if page_attachments_greater is None:
                         page_attachments_greater = '[]'
 
-                    _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,len(the_group),b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
+                    punish_less = _extract_less.get("punish",{})
+                    punish_greater = _extract_greater.get("punish",{})
+                    approval_less = _extract_less.get("approval",[])
+                    approval_greater = _extract_greater.get("approval",[])
+
+                    _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,len(the_group),b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater,punish_less = punish_less,punish_greater = punish_greater,approval_less = approval_less,approval_greater = approval_greater)
 
                     if _prob<0.1:
                         _pass = False
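
A minimal, standalone sketch of how the new punish/approval arguments can be pulled out of the two extract_json payloads with safe defaults before being handed to the rule check; the "punish"/"approval" keys and the keyword names match the hunk above, while the helper itself is illustrative:

    import json

    def load_punish_approval(extract_json_less, extract_json_greater):
        # Parse both extract_json payloads, falling back to empty containers so
        # a missing or malformed field never breaks the duplicate check.
        def _load(raw):
            try:
                return json.loads(raw) if raw else {}
            except Exception:
                return {}
        _less, _greater = _load(extract_json_less), _load(extract_json_greater)
        return dict(punish_less=_less.get("punish", {}),
                    punish_greater=_greater.get("punish", {}),
                    approval_less=_less.get("approval", []),
                    approval_greater=_greater.get("approval", []))

    # usage (same keyword names as the call in the hunk above):
    # _prob = check_dumplicate_rule(..., **load_punish_approval(extract_json_less, extract_json_greater))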

+ 237 - 26
BaseDataMaintenance/maxcompute/documentMerge.py

@@ -87,6 +87,15 @@ project_nlp_enterprise = "nlp_enterprise"
 project_nlp_enterprise_attachment = "nlp_enterprise_attachment"
 project_update_time = "update_time"
 project_tmp_attrs = "tmp_attrs"
+project_tenderee_code = "tenderee_code"
+project_agency_code = "agency_code"
+project_candidates = "candidates"
+
+project_win_tenderer_code = "win_tenderer_code"
+project_second_tenderer_code = "second_tenderer_code"
+project_third_tenderer_code = "third_tenderer_code"
+project_win_tenderer_joints = "win_tenderer_joints"
+project_multi_winners = "multi_winners"
 
 document_partitionkey = "partitionkey"
 document_docid = "docid"
@@ -148,6 +157,9 @@ document_time_release = "time_release"
 document_info_source = "info_source"
 document_nlp_enterprise = "nlp_enterprise"
 document_nlp_enterprise_attachment = "nlp_enterprise_attachment"
+document_tenderee_code = "tenderee_code"
+document_agency_code = "agency_code"
+document_candidates = "candidates"
 
 document_tmp_partitionkey = "partitionkey"
 document_tmp_docid = "docid"
@@ -183,6 +195,9 @@ document_tmp_opertime = "opertime"
 document_tmp_docchannel = "docchannel"
 document_tmp_original_docchannel = "original_docchannel"
 
+document_tmp_source_stage = "source_stage"
+document_tmp_source_type = "source_type"
+
 document_tmp_extract_json = "extract_json"
 document_tmp_industry_json = "industry_json"
 document_tmp_other_json = "other_json"
@@ -1516,7 +1531,7 @@ def generate_common_properties(list_docs):
     #计数法选择
     choose_dict = {}
     project_dict = {}
-    for _key in [document_bidway,document_industry,document_info_type,document_info_source,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count]:
+    for _key in [document_bidway,document_industry,document_info_type,document_info_source,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_tenderee_code,document_agency_code]:
         for _doc in list_docs:
             _value = _doc.get(_key,"")
             if _value!="":
@@ -1616,6 +1631,9 @@ def generate_common_properties(list_docs):
     remove_docids = set()
     set_nlp_enterprise = set()
     set_nlp_enterprise_attachment = set()
+
+    set_candidates = set()
+    list_candidates = []
     for _doc in list_docs:
         table_name = _doc.get("table_name")
         status = _doc.get(document_status,0)
@@ -1632,13 +1650,30 @@ def generate_common_properties(list_docs):
 
         is_multipack = True if len(sub_docs)>1 else False
         extract_count = _doc.get(document_tmp_extract_count,0)
+        candidates = _doc.get(document_candidates,"[]")
+
+        _province = _doc.get(document_province,"")
+        _city = _doc.get(document_city,"")
+        _district = _doc.get(document_district,"")
+
+        tenderee = _doc.get(document_tenderee,"")
+        agency = _doc.get(document_agency,"")
+
 
         try:
             set_nlp_enterprise |= set(json.loads(_doc.get(document_nlp_enterprise,"[]")))
             set_nlp_enterprise_attachment |= set(json.loads(_doc.get(document_nlp_enterprise_attachment,"[]")))
+
+            for item in json.loads(candidates):
+                if item.get("name") is not None and item.get("name") not in set_candidates:
+                    list_candidates.append(item)
+                    set_candidates.add(item.get("name"))
+
         except Exception as e:
             traceback.print_exc()
 
+
+
         if product is not None:
             list_product.extend(product.split(","))
 
@@ -1651,7 +1686,7 @@ def generate_common_properties(list_docs):
 
         if zhao_biao_page_time=="" and _docchannel in (51,52,102,103,114):
             zhao_biao_page_time = page_time
-        if zhong_biao_page_time=="" and _docchannel in (101,118,119,120):
+        if zhong_biao_page_time=="" and _docchannel in (101,118,119,120,121,122):
             zhong_biao_page_time = page_time
         is_visuable = 0
         if table_name=="document":
@@ -1675,7 +1710,12 @@ def generate_common_properties(list_docs):
                               document_page_time:page_time,
                               document_status:201 if is_visuable==1 else 401,
                               "is_multipack":is_multipack,
-                              document_tmp_extract_count:extract_count
+                              document_tmp_extract_count:extract_count,
+                              document_tenderee:tenderee,
+                              document_agency:agency,
+                              document_province:_province,
+                              document_city:_city,
+                              document_district:_district
                               }
                              )
 
@@ -1691,6 +1731,7 @@ def generate_common_properties(list_docs):
     project_dict[project_product] = ",".join(list(set(list_product)))
     project_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
     project_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
+    project_dict[project_candidates] = json.dumps(list_candidates[:100],ensure_ascii=False)
 
     return project_dict
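
A standalone sketch of the candidate-merging step added in this hunk: candidate lists from several documents are unioned, keeping only the first occurrence of each "name" and capping the result at 100 entries. The item shape is taken from the code above; the helper name is illustrative.

    import json

    def merge_candidates(json_lists, limit=100):
        # Union candidate lists, deduplicated by "name", keeping first-seen order
        # and capping the merged result, as in generate_common_properties above.
        seen, merged = set(), []
        for raw in json_lists:
            try:
                items = json.loads(raw or "[]")
            except Exception:
                continue
            for item in items:
                name = item.get("name")
                if name is not None and name not in seen:
                    seen.add(name)
                    merged.append(item)
        return json.dumps(merged[:limit], ensure_ascii=False)

    print(merge_candidates(['[{"name":"A"}]', '[{"name":"A"},{"name":"B"}]']))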
 
@@ -1716,6 +1757,7 @@ def generate_packages_properties(list_docs):
                 win_tenderer = _d.get(project_win_tenderer,"")
                 win_bid_price = _d.get(project_win_bid_price,"")
 
+
                 if sub_project_name=="Project":
 
                     win_exists = False
@@ -1937,7 +1979,7 @@ class f_generate_projects_from_document(BaseUDTF):
                 _product = list_product[_i%len(list_product)]
                 self.forward(_uuid,page_time,page_time_stamp,docids,project_name,_project_code,tenderee,agency,bidding_budget,win_tenderer,win_bid_price,_product,attrs_json)
 
-@annotate('string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,double,string,double,string,string,string,double,string,string,string,double,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string -> string,string,bigint,string,string,string,string,string,double,string,double,string,string')
+@annotate('string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,double,string,double,string,string,string,double,string,string,string,double,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string -> string,string,bigint,string,string,string,string,string,double,string,double,string,string')
 class f_generate_projects_from_project(BaseUDTF):
 
     def __init__(self):
@@ -2012,7 +2054,16 @@ class f_generate_projects_from_project(BaseUDTF):
                 info_source,
                 nlp_enterprise,
                 nlp_enterprise_attachment,
-                update_time):
+                update_time,
+                tenderee_code,
+                agency_code,
+                candidates,
+                win_tenderer_code,
+                second_tenderer_code,
+                third_tenderer_code,
+                win_tenderer_joints,
+                multi_winners
+                ):
         attrs_dict = {}
 
         attrs_dict[project_uuid] = uuid
@@ -2081,9 +2132,18 @@ class f_generate_projects_from_project(BaseUDTF):
         attrs_dict[project_nlp_enterprise_attachment] = nlp_enterprise_attachment
         attrs_dict[project_update_time] = update_time
 
+        attrs_dict[project_tenderee_code] = tenderee_code
+        attrs_dict[project_agency_code] = agency_code
+        attrs_dict[project_candidates] = candidates
+        attrs_dict[project_win_tenderer_code] = win_tenderer_code
+        attrs_dict[project_second_tenderer_code] = second_tenderer_code
+        attrs_dict[project_third_tenderer_code] = third_tenderer_code
+        attrs_dict[project_win_tenderer_joints] = win_tenderer_joints
+        attrs_dict[project_multi_winners] = multi_winners
 
         popNoneFromDict(attrs_dict)
 
+
         attrs_json = json.dumps(attrs_dict,ensure_ascii=False)
         if bidding_budget is None:
             bidding_budget = -1
@@ -2129,7 +2189,7 @@ def dumplicate_projects(list_projects,b_log=False):
     appendKeyvalueCount(list_projects)
     list_projects.sort(key=lambda x:str(x.get(project_page_time,"")))
     list_projects.sort(key=lambda x:x.get("keyvaluecount",0),reverse=True)
-    cluster_projects = list_projects[:10]
+    cluster_projects = list_projects[:100]
     _count = 10
     log("dumplicate projects rest %d"%len(cluster_projects))
     while _count>0:
@@ -2170,7 +2230,7 @@ def update_projects_by_project(project_dict,projects):
     _dict = {}
     #更新公共属性
     for k,v in project_dict.items():
-        if k in (project_project_dynamics,project_page_time,project_sub_project_name,project_product,project_project_codes,project_docids,project_uuid,project_nlp_enterprise,project_nlp_enterprise_attachment):
+        if k in (project_project_dynamics,project_page_time,project_sub_project_name,project_product,project_project_codes,project_docids,project_uuid,project_nlp_enterprise,project_nlp_enterprise_attachment,project_candidates):
             continue
         for _proj in projects:
             if k not in _proj:
@@ -2203,20 +2263,40 @@ def update_projects_by_project(project_dict,projects):
     set_delete_uuid = set()
     set_nlp_enterprise = set()
     set_nlp_enterprise_attachment = set()
+    set_update_uuid = set()
+
+    set_candidates = set()
+
+
+    list_candidates = []
+    try:
+        set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
+        set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
+        list_candidates = json.loads(project_dict.get(project_candidates,"[]"))
+    except Exception as e:
+        pass
+
     for _proj in projects:
         _docids = _proj.get(project_docids,"")
         _codes = _proj.get(project_project_codes,"")
         _product = _proj.get(project_product,"")
         _uuid = _proj.get(project_uuid,"")
+        update_uuid = _proj.get("project_uuid","")
         delete_uuid = _proj.get(project_delete_uuid,"")
         set_docid = set_docid | set(_docids.split(","))
         set_code = set_code | set(_codes.split(","))
         set_product = set_product | set(_product.split(","))
         set_uuid = set_uuid | set(_uuid.split(","))
+        set_update_uuid = set_update_uuid | set(update_uuid.split(","))
         set_delete_uuid = set_delete_uuid | set(delete_uuid.split(","))
         try:
             set_nlp_enterprise |= set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
             set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
+
+            for item in json.loads(_proj.get(project_candidates,"[]")):
+                if item.get("name") is not None and item.get("name") not in set_candidates:
+                    list_candidates.append(item)
+                    set_candidates.add(item.get("name"))
+
         except Exception as e:
             pass
     set_docid = set_docid | set(project_dict.get(project_docids,"").split(","))
@@ -2225,12 +2305,9 @@ def update_projects_by_project(project_dict,projects):
 
     set_uuid = set_uuid | set(project_dict.get(project_uuid,"").split(","))
     set_delete_uuid = set_delete_uuid | set(project_dict.get(project_delete_uuid,"").split(","))
+    set_update_uuid = set_update_uuid | set(project_dict.get("project_uuid","").split(","))
+
 
-    try:
-        set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
-        set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
-    except Exception as e:
-        pass
 
     append_dict[project_docids] = ",".join([a for a in list(set_docid) if a!=""])
     append_dict[project_docid_number] = len(set_docid)
@@ -2238,8 +2315,10 @@ def update_projects_by_project(project_dict,projects):
     append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""][:30])
     append_dict[project_uuid] = ",".join([a for a in list(set_uuid) if a!=""])
     append_dict[project_delete_uuid] = ",".join([a for a in list(set_delete_uuid) if a!=""])
+    append_dict["update_uuid"] = ",".join([a for a in list(set_update_uuid) if a!=""])
     append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
     append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
+    append_dict[project_candidates] = json.dumps(list_candidates,ensure_ascii=False)
 
     dict_dynamic = {}
     set_docid = set()
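
The docids / project codes / uuid / update_uuid fields handled above are plain comma-separated strings, so each merge boils down to a set union that drops empty entries; a small illustrative helper (the name is an assumption):

    def merge_csv_field(*values):
        # Union several comma-separated fields, dropping empty entries.
        merged = set()
        for v in values:
            merged |= set((v or "").split(","))
        return ",".join(a for a in merged if a != "")

    print(merge_csv_field("101,102", "102,103", ""))   # "101,102,103" in some order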
@@ -2568,6 +2647,7 @@ def check_project_codes_merge(list_code,list_code_to_merge,b_log):
 
 
 def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=False,simple_check=False):
+
     docids = _proj.get(project_docids,"")
     page_time = _proj.get(project_page_time,"")
     project_codes = _proj.get(project_project_codes,"")
@@ -2620,6 +2700,14 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=Fa
 
     project_dynamics_to_merge = _dict.get(project_project_dynamics)
 
+    # if len(set([docids,docids_to_merge])&set(["576859812","545764033"]))==2:
+    #     if return_prob:
+    #         return True,1
+    #     return True
+
+    if b_log:
+        log("check %s-%s ,%s-%s"%(docids,docids_to_merge,sub_project_name,sub_project_name_to_merge))
+
     is_few = False
     if (0 if project_codes=="" else 1) + (0 if project_name=="" else 1) + (0 if bidding_budget<0 else 1) +(0 if tenderee=="" else 1) + (0 if win_bid_price<0 else 1) + (0 if win_tenderer=="" else 1)<=1:
         is_few = True
@@ -2678,21 +2766,20 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=Fa
 
     prob_count += _codes_check
 
-    if is_few:
-        if _codes_check!=1:
-            if _title_check!=1:
-                if return_prob:
-                    return False,0
-                return False
-            if len(enterprise)>0 and len(enterprise_to_merge)>0:
-                if len(enterprise & enterprise_to_merge)==0:
-                    if return_prob:
-                        return False,0
-                    return False
-            if _product_check==-1:
+    if _codes_check!=1:
+        if _title_check!=1:
+            if return_prob:
+                return False,0
+            return False
+        if len(enterprise)>0 and len(enterprise_to_merge)>0:
+            if len(enterprise & enterprise_to_merge)==0:
                 if return_prob:
                     return False,0
                 return False
+        if _product_check==-1:
+            if return_prob:
+                return False,0
+            return False
 
     min_count = 2
     if product=="" or product_to_merge=="":
@@ -2737,8 +2824,7 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*300,return_prob=Fa
 
     _prob = prob_count/8
 
-    if b_log:
-        log("check %s-%s result%s"%(docids,docids_to_merge,str(check_dict)))
+
     if _prob<0.15:
         if b_log:
             log("prob less than 0.15 prob_count:%d"%(prob_count))
@@ -2923,17 +3009,128 @@ class MyEncoder(json.JSONEncoder):
             return obj
         return json.JSONEncoder.default(self, obj)
 
+def update_document_from_dynamic(_proj):
+    try:
+        list_dynamic = []
+        try:
+            list_dynamic = json.loads(_proj.get(project_project_dynamics,"[]"))
+        except Exception as e:
+            pass
+
+        dict_update_dict = {}
+        dict_column_count = {}
+        dict_addr_count = {}
+        for _dynamic in list_dynamic:
+            docid = _dynamic.get(document_docid)
+            tenderee = _dynamic.get(document_tenderee)
+            agency = _dynamic.get(document_agency)
+            province = _dynamic.get(document_province)
+            city = _dynamic.get(document_city)
+            district = _dynamic.get(document_district)
+
+
+            if getLength(tenderee)>0:
+                if tenderee not in dict_column_count:
+                    dict_column_count[tenderee] = {"count":1,"type":document_tenderee,"value":tenderee}
+                else:
+                    dict_column_count[tenderee]["count"] += 1
+            if getLength(agency)>0:
+                if agency not in dict_column_count:
+                    dict_column_count[agency] = {"count":1,"type":document_agency,"value":agency}
+                else:
+                    dict_column_count[agency]["count"] += 1
+
+            if province is not None and city is not None and district is not None:
+                addr = "%s%s%s"%(province,city,district)
+                if addr not in dict_addr_count:
+                    dict_addr_count[addr] = {"count":1}
+                    dict_addr_count[addr][document_province] = province
+                    dict_addr_count[addr][document_city] = city
+                    dict_addr_count[addr][document_district] = district
+                    if district!="":
+                        dict_addr_count[addr]["level"] = 3
+                    elif city!="":
+                        dict_addr_count[addr]["level"] = 2
+                    else:
+                        dict_addr_count[addr]["level"] = 1
+                else:
+                    dict_addr_count[addr]["count"] += 1
+
+        dict_list_v = {}
+        for k,v in dict_column_count.items():
+            _type = v.get("type")
+            if _type not in dict_list_v:
+                dict_list_v[_type] = []
+            dict_list_v[_type].append(v)
+        for k,v in dict_list_v.items():
+            v.sort(key=lambda x:x["count"],reverse=True)
+            if len(v)>0:
+                _proj[k] = v[0]["value"]
+                for _dynamic in list_dynamic:
+                    docid = _dynamic.get(document_docid)
+                    _v = _dynamic.get(k)
+                    if _v is not None and _v!="":
+                        if _v!=v[0]["value"]:
+                            if docid not in dict_update_dict:
+                                dict_update_dict[docid] = {document_docid:docid}
+                            dict_update_dict[docid][k] = v[0]["value"]
+        list_v = []
+        for k,v in dict_addr_count.items():
+            list_v.append(v)
+        list_v.sort(key=lambda x:x.get("count",0),reverse=True)
+        list_v.sort(key=lambda x:x.get("level",0),reverse=True)
+        if len(list_v)>0:
+            province = list_v[0].get(document_province)
+            city = list_v[0].get(document_city)
+            district = list_v[0].get(document_district)
+
+            _proj[document_province] = province
+            _proj[document_city] = city
+            _proj[document_district] = district
+            for _dynamic in list_dynamic:
+                docid = _dynamic.get(document_docid)
+
+                if document_province in _dynamic:
+                    if _dynamic.get(document_province,"")==province or _dynamic.get(document_province,"") in ("全国","未知",""):
+                        if province!=_dynamic.get(document_province,"") or city!=_dynamic.get(document_city,"") or district!=_dynamic.get(document_district,""):
+                            if docid not in dict_update_dict:
+                                dict_update_dict[docid] = {document_docid:docid}
+                            dict_update_dict[docid][document_province] = province
+                            dict_update_dict[docid][document_city] = city
+                            dict_update_dict[docid][document_district] = district
+        update_v = []
+        for k,v in dict_update_dict.items():
+            update_v.append(v)
+        _proj["document_update"] = update_v
+    except Exception as e:
+        pass
+
+
+
+
 def to_project_json(projects):
 
     list_proj = []
     for _proj in projects:
         _uuid = _proj.get(project_uuid,"")
+        update_uuid = _proj.get("update_uuid","")
+        _project_uuid = _proj.get("project_uuid","")
         if "enterprise" in _proj:
             _proj.pop("enterprise")
         list_uuid = [a for a in _uuid.split(",") if a!=""]
+        list_update_uuid = [a for a in update_uuid.split(",") if a!=""]
+        if _project_uuid:
+            list_update_uuid.append(_project_uuid)
+        list_update_uuid = list(set(list_update_uuid))
         if len(list_uuid)>0:
             _proj["keep_uuid"] = list_uuid[0]
             _proj["delete_uuid"] = ",".join(list_uuid[1:])
+            list_update_uuid.extend(list_uuid[1:])
+            _proj["update_uuid"] = ",".join(list_update_uuid)
+        elif len(list_update_uuid)>0:
+            _proj["keep_uuid"] = list_update_uuid[0]
+            _proj["delete_uuid"] = _proj.get("delete_uuid","")
+            _proj["update_uuid"] = ",".join(list_update_uuid[1:])
         else:
             _proj["keep_uuid"] = _proj.get("keep_uuid","")
             to_delete = _proj.get("to_delete","")
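
update_document_from_dynamic above picks a project-level tenderee/agency and address by counting occurrences across the dynamics and preferring the most specific address. A compressed, standalone sketch of that selection (function and variable names are illustrative):

    from collections import Counter

    def pick_majority(values):
        # Most frequent non-empty value wins; ties keep first-seen order.
        counts = Counter(v for v in values if v)
        return counts.most_common(1)[0][0] if counts else ""

    def pick_addr(addrs):
        # addrs: list of (province, city, district) tuples. Prefer the most
        # specific level (district > city > province), then the highest count,
        # which is the ordering the two sorts above produce.
        counts = Counter(addrs)
        def level(addr):
            province, city, district = addr
            return 3 if district else 2 if city else 1
        ranked = sorted(counts.items(), key=lambda kv: (level(kv[0]), kv[1]), reverse=True)
        return ranked[0][0] if ranked else ("", "", "")

    print(pick_majority(["甲公司", "甲公司", "乙公司"]))                  # 甲公司
    print(pick_addr([("广东", "珠海", ""), ("广东", "珠海", "金湾区")]))   # ('广东', '珠海', '金湾区')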
@@ -2944,6 +3141,9 @@ def to_project_json(projects):
         list_proj.append(_proj)
         if project_uuid in _proj:
             _proj.pop(project_uuid)
+        if "project_uuid" in _proj:
+            _proj.pop("project_uuid")
+        update_document_from_dynamic(_proj)
     return json.dumps(list_proj,cls=MyEncoder,ensure_ascii=False)
 
 def get_page_time_dis(page_time,n_page_time):
@@ -2964,6 +3164,15 @@ def check_page_time_dup(page_time,n_page_time):
     return False
 
 
+def check_fix_document(doctitle,n_doctitle):
+    _fix = re.search("更正|更新|变更|澄清",doctitle)
+    _n_fix = re.search("更正|更新|变更|澄清",n_doctitle)
+    if _fix is not None and _n_fix is not None:
+        return True
+    if _fix is None and _n_fix is None:
+        return True
+    return False
+
 def dumplicate_document_in_merge(list_projects,dup_docid):
     '''
     合并时去重
@@ -3013,6 +3222,8 @@ def dumplicate_document_in_merge(list_projects,dup_docid):
                             continue
                         if is_multipack or n_is_multipack:
                             continue
+                        if not check_fix_document(doctitle,n_doctitle):
+                            continue
                         n_title_search = re.search("[一二三四五六七八九十1-9]+(?:次|标|包)",n_doctitle)
                         if title_search is None and n_title_search is None:
                             pass
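
The new check_fix_document guard only lets two documents deduplicate against each other when both titles, or neither, look like a correction/clarification notice; restated compactly with a quick check (the regex is copied from the hunk above):

    import re

    _FIX = re.compile("更正|更新|变更|澄清")

    def check_fix_document(doctitle, n_doctitle):
        # True only when both titles, or neither, contain a correction keyword.
        return (_FIX.search(doctitle) is None) == (_FIX.search(n_doctitle) is None)

    print(check_fix_document("某项目更正公告", "某项目澄清公告"))   # True
    print(check_fix_document("某项目更正公告", "某项目中标公告"))   # False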

+ 17 - 0
BaseDataMaintenance/model/oracle/QiTaShiXinTemp.py

@@ -0,0 +1,17 @@
+
+import traceback
+from BaseDataMaintenance.model.oracle.TouSuTemp import SouSuTemp
+
+dict_replace = {""}
+
+class QiTaShiXin(SouSuTemp):
+
+    def __init__(self,_dict):
+        SouSuTemp.__init__(self,_dict)
+        self.table_name = "bxkc.t_qi_ta_shi_xin_temp"
+        self.setValue("docchannel",303,True)
+        self.setValue("original_type","qi_ta_shi_xin",True)
+
+    def getPrimary_keys(self):
+        return ["ID"]
+

+ 38 - 10
BaseDataMaintenance/model/oracle/T_SHEN_PI_XIANG_MU.py

@@ -41,6 +41,8 @@ class T_SHEN_PI_XIANG_MU(BaseModel):
     def getProperties_ots(self):
         new_dict = {}
         for k,v in self.__dict__.items():
+            if k=="all_columns":
+                continue
             if v is not None:
                 if isinstance(v,(str,int,float)):
                     pass
@@ -52,12 +54,20 @@ class T_SHEN_PI_XIANG_MU(BaseModel):
         docid = int(new_dict.get("id",0))
         partition_key = docid%500+1
 
-        new_dict["partition_key"] = partition_key
+        new_dict["partitionkey"] = partition_key
         new_dict["docid"] = docid
         new_dict["original_id"] = str(new_dict.get(T_SHEN_PI_XIANG_MU_ID))
+        new_dict["uuid"] = str(new_dict.get(T_SHEN_PI_XIANG_MU_ID))
         new_dict.pop(T_SHEN_PI_XIANG_MU_ID)
 
-        new_dict["uuid"] = str(new_dict.get(T_SHEN_PI_XIANG_MU_ID))
+        try:
+            if new_dict.get(T_SHEN_PI_XIANG_MU_SOURCE_STAGE) is not None:
+                new_dict[T_SHEN_PI_XIANG_MU_SOURCE_STAGE] = int(new_dict.get(T_SHEN_PI_XIANG_MU_SOURCE_STAGE,0))
+            if new_dict.get(T_SHEN_PI_XIANG_MU_SOURCE_TYPE) is not None:
+                new_dict[T_SHEN_PI_XIANG_MU_SOURCE_TYPE] = int(new_dict.get(T_SHEN_PI_XIANG_MU_SOURCE_TYPE,0))
+        except Exception as e:
+            pass
+
 
         new_dict["crtime"] = new_dict.get(T_SHEN_PI_XIANG_MU_CREATE_TIME)
         new_dict["docchannel"] = 302
@@ -65,11 +75,13 @@ class T_SHEN_PI_XIANG_MU(BaseModel):
         new_dict["doctitle"] = new_dict.get(T_SHEN_PI_XIANG_MU_PAGE_TITLE,"")
         new_dict.pop(T_SHEN_PI_XIANG_MU_PAGE_TITLE)
 
-        new_dict["dochtmlcon"] = new_dict.get(T_SHEN_PI_XIANG_MU_PAGE_CONTENT)
-        new_dict.pop(T_SHEN_PI_XIANG_MU_PAGE_CONTENT)
+        new_dict["dochtmlcon"] = new_dict.get(T_SHEN_PI_XIANG_MU_PAGE_CONTENT,"")
+        if T_SHEN_PI_XIANG_MU_PAGE_CONTENT in new_dict:
+            new_dict.pop(T_SHEN_PI_XIANG_MU_PAGE_CONTENT)
 
-        new_dict["detail_link"] = new_dict.get(T_SHEN_PI_XIANG_MU_DETAILLINK)
-        new_dict.pop(T_SHEN_PI_XIANG_MU_DETAILLINK)
+        new_dict["detail_link"] = new_dict.get(T_SHEN_PI_XIANG_MU_DETAILLINK,"")
+        if T_SHEN_PI_XIANG_MU_DETAILLINK in new_dict:
+            new_dict.pop(T_SHEN_PI_XIANG_MU_DETAILLINK)
 
         new_dict[T_SHEN_PI_XIANG_MU_PAGE_ATTACHMENTS] = new_dict.get(T_SHEN_PI_XIANG_MU_ATTACHMENT_PATH,"[]")
 
@@ -81,15 +93,31 @@ class T_SHEN_PI_XIANG_MU(BaseModel):
             new_dict["original_docchannel"] = new_dict["docchannel"]
         return new_dict
 
-    def select_rows(conn,max_shenpi_id,limit=500):
+    @staticmethod
+    def get_max_id(conn):
+        cursor = conn.cursor()
+        sql = "select max(id) from %s"%("bxkc.t_shen_pi_xiang_mu_new")
+
+        cursor.execute(sql)
+        rows = cursor.fetchall()
+
+        if len(rows)>0 and rows[0][0] is not None:
+            max_id = rows[0][0]
+            log("select_max_id:%d"%(max_id))
+            return max_id
+        return None
+
+
+    @staticmethod
+    def select_rows(conn,_id,limit=500):
         list_result = []
         s_limit = ""
         if limit is not None:
             s_limit = "limit %d"%limit
-        s_where = " where id>%d "%(max_shenpi_id)
+        s_where = " where id=%d "%(_id)
 
         cursor = conn.cursor()
-        sql = "select %s from %s %s %s order by id asc"%("*","t_shen_pi_xiang_mu_new",s_where,s_limit)
+        sql = "select %s from %s %s "%("*","bxkc.t_shen_pi_xiang_mu_new",s_where)
         log("select rows:%s"%(sql))
         cursor.execute(sql)
 
@@ -98,7 +126,7 @@ class T_SHEN_PI_XIANG_MU(BaseModel):
         for row in rows:
             _dict = {}
             for _vol,_val in zip(vol,row):
-                _name = _vol[0]
+                _name = str(_vol[0]).lower()
                 _dict[_name] = _val
             list_result.append(T_SHEN_PI_XIANG_MU(_dict))
         return list_result
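
select_rows now lower-cases the Oracle column names read from cursor.description so they line up with the lowercase keys expected by getProperties_ots; the conversion in isolation (a DB-API cursor is assumed):

    def rows_to_dicts(cursor):
        # cursor.description yields (name, type, ...) tuples; Oracle reports the
        # names in upper case, so normalise them before building the row dicts.
        cols = [str(col[0]).lower() for col in cursor.description]
        return [dict(zip(cols, row)) for row in cursor.fetchall()]

    # usage: list_result = [T_SHEN_PI_XIANG_MU(d) for d in rows_to_dicts(cursor)]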

+ 17 - 0
BaseDataMaintenance/model/oracle/TouSuChuLiTemp.py

@@ -0,0 +1,17 @@
+
+import traceback
+from BaseDataMaintenance.model.oracle.TouSuTemp import SouSuTemp
+
+dict_replace = {""}
+
+class TouSuChuLiTemp(SouSuTemp):
+
+    def __init__(self,_dict):
+        SouSuTemp.__init__(self,_dict)
+        self.table_name = "bxkc.t_tou_su_chu_li_temp"
+        self.setValue("docchannel",303,True)
+        self.setValue("original_type","tou_su_chu_li",True)
+
+    def getPrimary_keys(self):
+        return ["ID"]
+

+ 215 - 0
BaseDataMaintenance/model/oracle/TouSuTemp.py

@@ -0,0 +1,215 @@
+
+import traceback
+from BaseDataMaintenance.model.oracle.BaseModel import BaseModel
+from datetime import datetime
+from BaseDataMaintenance.common.Utils import getCurrent_date,log
+
+dict_oracle2ots = {"WEB_SOURCE_NO":"web_source_no",
+                    "AREA":"area",
+                    "PROVINCE":"province",
+                    "CITY":"city",
+                    "WEB_SOURCE_NAME":"web_source_name",
+                    "INFO_SOURCE":"info_source",
+                    "INFO_TYPE":"info_type",
+                    "INDUSTRY":"industry",
+                    "ID":"uuid",
+                    "PAGE_TITLE":"doctitle",
+                    "PAGE_TIME":"page_time",
+                    "PAGE_CONTENT":"dochtmlcon",
+                    "ATTACHMENT_PATH":"page_attachments",
+                    "CREATE_TIME":"crtime",
+                    "DISTRICT":"district",
+                    "DETAILLINK":"detail_link",
+                   "RECORD_ID":"record_id",
+                   "PUNISHNO":"punishno",
+                   "INSTITUTION":"institution",
+                   "PUNISHTIME":"punish_time",
+                   "PUNISHTYPE":"punish_type",
+                   "COMPLAINANT":"complainant",
+                   "PUNISHPERPLE":"punish_perple",
+                   "PUNISHWHETHER":"punish_whether",
+                   "PUNISHDECISION":"punish_decision",
+                   "docchannel":"docchannel",
+                   "original_type":"original_type"}
+
+
+class SouSuTemp(BaseModel):
+
+    def __init__(self,_dict):
+        self.all_columns = []
+        for k,v in _dict.items():
+            self.setValue(k,v,True)
+
+    def getPrimary_keys(self):
+        raise NotImplementedError()
+
+    def getProperties(self):
+        return self.__dict__
+
+    def getProperties_ots(self):
+        new_dict = {}
+        for k,v in self.__dict__.items():
+            if k in dict_oracle2ots:
+                n_k = dict_oracle2ots[k]
+                if v is not None:
+                    if isinstance(v,(str,int,float)):
+                        pass
+                    elif isinstance(v,(datetime)):
+                        v = v.strftime("%Y-%m-%d %H:%M:%S")
+                    else:
+                        v = str(v)
+                    new_dict[n_k] = v
+        opertime = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
+        publishtime = "%s %s"%(new_dict.get("page_time",""),opertime.split(" ")[1])
+        new_dict["opertime"] = opertime
+        new_dict["publishtime"] = publishtime
+        if "docchannel" in new_dict:
+            new_dict["original_docchannel"] = new_dict["docchannel"]
+        return new_dict
+
+    def setValue(self,k,v,isColumn=False):
+        if "all_columns" not in self.__dict__:
+            self.all_columns = []
+        self.__dict__[k] = v
+        if isColumn:
+            if k not in (set(self.all_columns)):
+                self.all_columns.append(k)
+
+    def delete_row(self,conn):
+        try:
+            cursor = conn.cursor()
+            sql = "delete %s  "%(self.table_name)
+            s_where = " where 1=1 "
+            _set_keys = set(self.getPrimary_keys())
+            has_key = False
+            if len(_set_keys)==0:
+                return
+            for k,v in self.__dict__.items():
+                if k in _set_keys:
+                    if v is None or str(v)=="":
+                        raise RuntimeError("主键%s为空"%k)
+                    s_where += " and %s="%k
+                    if isinstance(v,str):
+                        s_where += "'%s' "%v
+                    else:
+                        s_where += "%d "%v
+                    has_key = True
+            log("delete sql:%s-%s %s"%(str(has_key),sql,s_where))
+            if has_key:
+                sql = "%s %s"%(sql,s_where)
+                update_rows = cursor.execute(sql)
+                conn.commit()
+                return update_rows
+        except Exception as e:
+            traceback.print_exc()
+        return 0
+
+    def insert_row(self,conn):
+        try:
+            cursor = conn.cursor()
+            sql = "insert into %s"%(self.table_name)
+            s_columns = "("
+            s_values = "values("
+            _set_columns = set(self.all_columns)
+            for k,v in self.__dict__.items():
+                if k in _set_columns:
+                    if v is not None and str(v)!="":
+                        s_columns += "%s,"%k
+
+                        if isinstance(v,(int,)):
+                            s_values += "%d,"%v
+
+                        elif isinstance(v,(datetime)):
+                            s_values += "to_date('%s','yyyy-MM-dd HH24:mi:ss'),"%v.strftime("%Y-%m-%d %H:%M:%S")
+                        else:
+                            s_values += "'%s',"%str(v).replace("'","''")
+            s_columns = "%s)"%s_columns[:-1]
+            s_values = "%s)"%s_values[:-1]
+            sql = "%s%s%s"%(sql,s_columns,s_values)
+            print("sql",sql)
+            cursor.execute(sql)
+            conn.commit()
+        except Exception as e:
+            traceback.print_exc()
+
+
+    def update_row(self,conn,conditions=[]):
+        cursor = conn.cursor()
+        sql = "update %s set "%(self.table_name)
+        s_columns = ""
+        s_where = " where 1=1 "
+        _set_columns = set(self.all_columns)
+        _set_keys = set(self.getPrimary_keys())
+        for k,v in self.__dict__.items():
+            if k in _set_columns and k not in _set_keys:
+                if v is not None and str(v)!="":
+                    s_columns += "%s="%k
+                    if isinstance(v,str):
+                        s_columns += "'%s',"%v
+                    else:
+                        s_columns += "%d,"%v
+            elif k in _set_keys:
+                if v is None or str(v)=="":
+                    raise RuntimeError("主键%s为空"%k)
+                s_where += " and %s="%k
+                if isinstance(v,str):
+                    s_where += "'%s' "%v
+                else:
+                    s_where += "%d "%v
+        s_columns = "%s"%s_columns[:-1]
+        sql = "%s%s%s"%(sql,s_columns,s_where)
+        update_rows = cursor.execute(sql)
+        conn.commit()
+        return update_rows
+
+
+
+    def exists(self,conn):
+        s_where = " where 1=1 "
+        _set_columns = set(self.all_columns)
+        _set_keys = set(self.getPrimary_keys())
+        for k,v in self.__dict__.items():
+            if k in _set_keys:
+                if v is None or str(v)=="":
+                    raise RuntimeError("主键%s为空"%k)
+                s_where += " and %s="%k
+                if isinstance(v,str):
+                    s_where += "'%s' "%v
+                else:
+                    s_where += "%d "%v
+        cursor = conn.cursor()
+        sql = "select count(1) from %s %s"%(self.table_name,s_where)
+        cursor.execute(sql)
+        rows = cursor.fetchall()
+        if rows[0][0]==0:
+            return False
+        return True
+
+    @staticmethod
+    def select_rows(conn,cls,table_name,conditions,rows_to_get="*",limit=60):
+        list_result = []
+        s_limit = ""
+        if limit is not None:
+            s_limit = " and rownum<=%d"%limit
+        if len(conditions)>0:
+            s_where = " where %s"%(" and ".join(conditions))
+        else:
+            s_where = " where 1=1 "
+
+        cursor = conn.cursor()
+        sql = "select %s from %s %s %s"%(rows_to_get,table_name,s_where,s_limit)
+        log(sql)
+        cursor.execute(sql)
+
+        vol = cursor.description
+        rows = cursor.fetchall()
+        for row in rows:
+            _dict = {}
+            for _vol,_val in zip(vol,row):
+                _name = _vol[0]
+                _dict[_name] = _val
+            list_result.append(cls(_dict))
+        return list_result
+
+
+
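
getProperties_ots above is essentially a rename-and-normalise pass driven by dict_oracle2ots; a simplified standalone sketch of the same idea (the mapping and the publishtime rule come from the class, the helper name is an assumption):

    from datetime import datetime

    def to_ots_properties(row, mapping):
        # Keep only mapped columns, rename them to their OTS field names and
        # normalise values (datetimes become "YYYY-mm-dd HH:MM:SS" strings).
        out = {}
        for k, v in row.items():
            if k in mapping and v is not None:
                if isinstance(v, datetime):
                    v = v.strftime("%Y-%m-%d %H:%M:%S")
                elif not isinstance(v, (str, int, float)):
                    v = str(v)
                out[mapping[k]] = v
        opertime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        out["opertime"] = opertime
        # publishtime = page_time date + the current time of day, as above
        out["publishtime"] = "%s %s" % (out.get("page_time", ""), opertime.split(" ")[1])
        return out

    # usage: new_dict = to_ots_properties({"PAGE_TITLE": "t", "PAGE_TIME": "2024-01-01"}, dict_oracle2ots)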

+ 56 - 0
BaseDataMaintenance/model/oracle/WeiFaJiLuTemp.py

@@ -0,0 +1,56 @@
+
+import traceback
+from BaseDataMaintenance.model.oracle.TouSuTemp import SouSuTemp
+
+dict_replace = {""}
+
+class WeiFaJiLuTemp(SouSuTemp):
+
+    def __init__(self,_dict):
+        SouSuTemp.__init__(self,_dict)
+        self.table_name = "bxkc.t_wei_fa_ji_lu_temp"
+        self.setValue("docchannel",303,True)
+        self.setValue("original_type","wei_fa_ji_lu",True)
+
+    def getPrimary_keys(self):
+        return ["ID"]
+
+    @staticmethod
+    def synchonize():
+        try:
+            print("123")
+            from BaseDataMaintenance.dataSource.source import getConnection_oracle
+            conn = getConnection_oracle()
+            cursor = conn.cursor()
+            has_commit = 0
+            while 1:
+                sql = '''
+                INSERT INTO bxkc.t_wei_fa_ji_lu_temp
+SELECT *
+FROM (
+         SELECT w.*
+         FROM bxkc.t_wei_fa_ji_lu w
+                  LEFT JOIN bxkc.id_wei_fa_ji_lu b ON w.id = b.id
+         WHERE b.id IS not NULL
+     ) res
+WHERE ROWNUM < 10001
+                '''
+                cursor.execute(sql)
+                row_effected = cursor.rowcount
+
+                if row_effected==0:
+                    break
+                print("row_effected",row_effected)
+                sql1 = '''
+                delete bxkc.id_wei_fa_ji_lu where id in (select id from bxkc.t_wei_fa_ji_lu_temp)
+                '''
+                cursor.execute(sql1)
+                conn.commit()
+
+
+        except Exception as e:
+            traceback.print_exc()
+
+
+if __name__ == '__main__':
+    WeiFaJiLuTemp.synchonize()

+ 3 - 0
BaseDataMaintenance/model/ots/BaseModel.py

@@ -20,9 +20,12 @@ class BaseModel():
         raise NotImplementedError
 
     def setValue(self,k,v,isColumn=True):
+        if k=="all_columns":
+            return
         if "all_columns" not in self.__dict__ or not isinstance(self.__dict__["all_columns"],(list)):
             self.all_columns = []
         self.__dict__[k] = v
+
         if isColumn:
             if k not in (set(self.all_columns)):
                 self.all_columns.append(k)
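
The new early return in setValue keeps a stray "all_columns" field in the source data from clobbering the internal column registry; a tiny reproduction of the behaviour (the class name is illustrative, the method body mirrors the one above):

    class Model:
        def setValue(self, k, v, isColumn=True):
            if k == "all_columns":          # protect the internal column registry
                return
            if "all_columns" not in self.__dict__ or not isinstance(self.__dict__["all_columns"], list):
                self.all_columns = []
            self.__dict__[k] = v
            if isColumn and k not in set(self.all_columns):
                self.all_columns.append(k)

    m = Model()
    m.setValue("docid", 1)
    m.setValue("all_columns", "oops")       # ignored instead of overwriting the list
    print(m.all_columns)                    # ['docid']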

+ 59 - 32
BaseDataMaintenance/model/ots/document.py

@@ -307,10 +307,11 @@ def turn_document_status():
 
         bool_query = BoolQuery(
             must_queries=[
-                # MatchPhraseQuery("doctitle","珠海城市职业技术学院2022年05月至2022年06月政府采购意向"),
-                WildcardQuery("web_source_no","03716-*"),
-                RangeQuery("page_time","2024-04-24"),
-                TermQuery("save",1)
+                MatchPhraseQuery("doctitle","破产清算案"),
+                MatchPhraseQuery("project_name","经相关部门批准后方可开展经营活动"),
+                # WildcardQuery("web_source_no","03716-*"),
+                # RangeQuery("product_number",500),
+                # TermQuery("save",1)
                 # RangeQuery("status",0,1),
                 # NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5")),
                 # TermQuery("docid",397656324)
@@ -341,25 +342,25 @@ def turn_document_status():
         #
         # )
 
-        rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
-                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
-                                                                       columns_to_get=ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
-        list_data = getRow_ots(rows)
-        print(total_count)
-        _count = len(list_data)
-        for _data in list_data:
-            _document = Document_tmp(_data)
-            task_queue.put(_document)
-        while next_token:
-            rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
-                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-                                                                           columns_to_get=ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
-            list_data = getRow_ots(rows)
-            _count += len(list_data)
-            print("%d/%d"%(_count,total_count))
-            for _data in list_data:
-                _document = Document_tmp(_data)
-                task_queue.put(_document)
+        # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+        #                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
+        #                                                                columns_to_get=ColumnsToGet(["product","product_number"],return_type=ColumnReturnType.SPECIFIED))
+        # list_data = getRow_ots(rows)
+        # print(total_count)
+        # _count = len(list_data)
+        # for _data in list_data:
+        #     _document = Document(_data)
+        #     task_queue.put(_document)
+        # while next_token:
+        #     rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+        #                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+        #                                                                    columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
+        #     list_data = getRow_ots(rows)
+        #     _count += len(list_data)
+        #     print("%d/%d"%(_count,total_count))
+        #     for _data in list_data:
+        #         _document = Document(_data)
+        #         task_queue.put(_document)
 
         # docids = [223820830,224445409]
         # for docid in docids:
@@ -367,19 +368,39 @@ def turn_document_status():
         #              document_partitionkey:int(docid)%500+1,
         #              }
         #     task_queue.put(Document(_dict))
-        # import pandas as pd
-        # df = pd.read_excel(r"F:\Workspace2016\DataMining\export\abc1.xlsx")
-        # for docid in df["docid1"]:
-        #     _dict = {document_docid:int(docid),
-        #              document_partitionkey:int(docid)%500+1,
-        #              }
-        #     task_queue.put(Document(_dict))
+        import pandas as pd
+        df = pd.read_csv(r"C:\Users\Administrator\Desktop\export_241224_6.csv")
+        list_docid = df["docid"]
+        # list_docid = [519497468]
+
+        # list_docid = []
+        # filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-10\金额缺失的id (1).txt"
+        # with open(filename,"r",encoding="utf8") as f:
+        #     while 1:
+        #         line = f.readline()
+        #         if not line:
+        #             break
+        #         line = line.strip()
+        #         docid = line.split('-')[-1]
+        #         if re.search("^\d+$",docid) is not None:
+        #             list_docid.append(int(docid))
+
+        for docid,construct_company,recall_flag in zip(list_docid,df["construct_company"],df["recall_flag"]):
+            if recall_flag == 1:
+                _dict = {document_docid:int(docid),
+                         document_partitionkey:int(docid)%500+1,
+                         "construct_company":construct_company
+                         }
+                task_queue.put(Document(_dict))
         # for docid in df["docid2"]:
         #     _dict = {document_docid:int(docid),
         #              document_partitionkey:int(docid)%500+1,
         #              }
         #     task_queue.put(Document(_dict))
-        # log("task_queue size:%d"%(task_queue.qsize()))
+        log("task_queue size:%d"%(task_queue.qsize()))
+
+
+
 
     def _handle(item,result_queue,ots_client):
         #change attach value
@@ -405,8 +426,14 @@ def turn_document_status():
         # item.setValue(document_province,"广东",True)
         # item.setValue(document_city,"珠海",True)
         # item.setValue(document_district,"金湾区",True)
-        item.setValue(document_status,66,True)
+        # item.setValue(document_status,66,True)
         # print(item.getProperties())
+        # item.setValue(document_status,1,True)
+        # product = item.getProperties().get(document_product)
+        # l_product = product.split(",")
+        # n_product = ",".join(l_product[:500])
+        # item.setValue(document_product,n_product,True)
+        # item.fix_columns(ots_client,["extract_json","doctitle",""],True)
         item.update_row(ots_client)
         # log("update %d status done"%(item.getProperties().get(document_docid)))
         pass

+ 20 - 18
BaseDataMaintenance/model/ots/document_tmp.py

@@ -254,6 +254,7 @@ def turn_document_tmp_status():
     ots_client = getConnect_ots()
 
     def producer1(task_queue,ots_client):
+        a = ''
         for l_a in a.split("\n"):
             l_a = l_a.strip()
             if l_a !="":
@@ -266,8 +267,8 @@ def turn_document_tmp_status():
         bool_query = BoolQuery(
             must_queries=[
                 # TermQuery("fingerprint","md5=2cc044b81ec13acddcc970b71b780365")
-                TermQuery("save",1),
-                RangeQuery("status",72),
+                # TermQuery("save",66),
+                RangeQuery("status",1,51),
                 # BoolQuery(should_queries=[
                 #                           # TermQuery("tenderee","山西利民工业有限责任公司"),
                 #                           # MatchPhraseQuery("doctitle","中国电信"),
@@ -280,11 +281,11 @@ def turn_document_tmp_status():
                 #                                  ]
                 # )
             ],
-            # must_not_queries=[
-            #     TermQuery("docid",288599518)
-            #     # ExistsQuery("status"),
-            #     # ExistsQuery("page_time"),
-            #                   ]
+            must_not_queries=[
+                # TermQuery("docid",288599518)
+                # ExistsQuery("doctitle"),
+                # ExistsQuery("page_time"),
+                              ]
         )
 
         rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
@@ -297,6 +298,7 @@ def turn_document_tmp_status():
         for _data in list_data:
             _document = Document_tmp(_data)
             task_queue.put(_document)
+        print(list_data)
         while next_token:
             rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
                                                                            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
@@ -348,23 +350,23 @@ def turn_document_tmp_status():
         # _extract_json = _extract_json.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '')
         # item.setValue(document_tmp_extract_json,_extract_json,True)
         # json.loads(_extract_json)
-        # item.setValue(document_tmp_status,71,True)
+        item.setValue(document_tmp_status,0,True)
         # item.setValue(document_tmp_save,1,True)
         # if item.exists_row(ots_client):
         #     item.update_row(ots_client)
         # print(item.getProperties())
-        # item.update_row(ots_client)
+        item.update_row(ots_client)
         # log("update %d status done"%(item.getProperties().get(document_tmp_docid)))
         # item.delete_row(ots_client)
-        from BaseDataMaintenance.model.ots.document import Document
-
-        Doc = Document(item.getProperties())
-        if Doc.fix_columns(ots_client,["status"],True):
-            if Doc.getProperties().get("status",0)>=401:
-                print(Doc.getProperties().get("docid"),"redo")
-                item.setValue("status",66,True)
-                item.update_row(ots_client)
-        pass
+        # from BaseDataMaintenance.model.ots.document import Document
+        #
+        # Doc = Document(item.getProperties())
+        # if Doc.fix_columns(ots_client,["status"],True):
+        #     if Doc.getProperties().get("status",0)>=401:
+        #         print(Doc.getProperties().get("docid"),"redo")
+        #         item.setValue("status",66,True)
+        #         item.update_row(ots_client)
+        # pass
 
     t_producer = Thread(target=producer,kwargs={"task_queue":task_queue,"ots_client":ots_client})
     t_producer.start()

Some files were not shown because too many files changed in this diff