2 Commits f408beeb94 ... d6e339a17c

Auteur SHA1 Message Date
  fangjiasheng d6e339a17c Merge remote-tracking branch 'origin/master' il y a 4 mois
  fangjiasheng bfd0108671 审批项目实时生成 il y a 4 mois

+ 1 - 0
BaseDataMaintenance/dataSource/source.py

@@ -83,6 +83,7 @@ is_internal = False
 if platform.system()=="Windows":
     OTS_URL = "https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com"
     OTS_URL = "https://bxkc-ots.cn-hangzhou.vpc.tablestore.aliyuncs.com"
+    OTS_URL = 'https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com'
 else:
     OTS_URL = "https://bxkc-ots.cn-hangzhou.vpc.tablestore.aliyuncs.com"
     check_url = "oss-cn-hangzhou-internal.aliyuncs.com"

+ 1689 - 0
BaseDataMaintenance/maintenance/approval_project/ap_project_production.py

@@ -0,0 +1,1689 @@
+# encoding: utf-8
+import copy
+import json
+import logging
+import multiprocessing
+import os
+import re
+import sys
+import time
+import traceback
+from concurrent.futures.thread import ThreadPoolExecutor
+from datetime import datetime, timedelta
+from decimal import Decimal
+from pprint import pprint
+from uuid import uuid4
+
+from bs4 import BeautifulSoup
+from tablestore import BoolQuery, RangeQuery, TermQuery, MatchPhraseQuery, Sort, FieldSort, ColumnReturnType, SortOrder, \
+    SearchQuery, ColumnsToGet
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../")
+from BaseDataMaintenance.common.Utils import getRow_ots, getUnifyMoney
+from BaseDataMaintenance.dataSource.source import getConnect_ots, getConnect_ots_capacity
+from BaseDataMaintenance.model.ots.approval_project import approval_project
+from BaseDataMaintenance.model.ots.document import Document
+
+
def log(*args):
    """Log *args* as a single INFO line, prefixed with the module-global docid.

    The prefix reads the (optional) global ``docid`` so worker processes tag
    their log lines with the document they are handling.
    """
    pieces = [str(globals().get('docid')) + " -"]
    for obj in args:
        pieces.append(str(obj))
    logging.info(" ".join(pieces))
+
+
# Mapping from approval_project column names to their human-readable Chinese
# display labels, used when rendering/exporting approval records.  Blank-line
# groups: location, parties, money/scale, document number, misc details, dates.
approval_col_dict = {
    "doctitle": "公告标题",
    "page_time": "公告时间",
    "province": "省份",
    "city": "城市",
    "district": "地区",

    "approval_items": "审批事项",
    "approval_result": "审批结果",
    "declare_company": "申报单位",
    "construct_company": "建设单位",
    "evaluation_agency": "环评机构",
    "approver": "审批部门",
    "compilation_unit": "编制单位",
    "publisher": "发布单位",

    "total_tenderee_money": "总投资",
    "construction_scale": "建设规模",
    "proportion": "建筑面积",
    "use_area": "用地面积",

    "doc_num": "审批文号",

    "legal_person": "项目法人",
    "moneysource": "资金来源",
    "money_use": "资金构成",
    "env_invest": "环保投资",
    "phone": "电话",
    "pro_type": "申报类型",
    "project_addr": "项目地址",
    "project_code": "项目编号",
    "project_name": "项目名称",
    "properties": "建设性质",
    "time_commencement": "开工时间",
    "time_completion": "竣工时间",
    "time_declare": "申报时间",

    "year_limit": "建设年限",

    "time_approval": "审批时间",
    "time_release": "发布日期"
}
+
# Full column list read/written for approval_project rows.  Note this is a
# superset of approval_col_dict's keys (e.g. uuid, docids, merge bookkeeping
# fields at the end have no display label).
approval_cols = [
    'uuid',
    'docids',
    'area',
    'province',
    'city',
    'district',
    'source_stage',
    'source_type',
    'approval_items',
    'approval_result',
    'approver',
    'construct_company',
    'construct_company_code',
    'construction_scale',
    'declare_company',
    'doc_nums',
    'evaluation_agency',
    'legal_person',
    'phone',
    'pro_type',
    'properties',
    'time_declare',
    'year_limit',
    'compilation_unit',
    'publisher',
    'time_approval',
    'moneysource',
    'project_addr',
    'project_name',
    'time_commencement',
    'time_completion',
    'time_release',
    'env_invest',
    'proportion',
    'length',
    'use_area',
    'construct_company_province',
    'construct_company_city',
    'construct_company_district',
    'money_use',
    'declare_type',
    'page_time',
    'doctitle',
    'project_codes',
    'total_tenderee_money',
    'total_tenderee_money_unit',

    # merge bookkeeping (no display label)
    'docids_cnt',
    'merge_uuids',
]
+
+
def get_enterprise_area(ots_client, name):
    """Look up an enterprise by exact name and return its region fields.

    Queries the "enterprise" table's search index for an exact name match and
    returns a dict with "province"/"city"/"district" taken from the first hit
    (empty strings for missing columns).  Returns an empty dict when nothing
    matches.
    """
    query = BoolQuery(must_queries=[TermQuery("name", name)])
    wanted_columns = ColumnsToGet(["province", "city", "district"],
                                  return_type=ColumnReturnType.SPECIFIED)
    rows, next_token, total_count, is_all_succeed = ots_client.search(
        "enterprise", "enterprise_index", SearchQuery(query), wanted_columns)
    records = getRow_ots(rows)
    result = {}
    if records:
        first = records[0]
        for key in ("province", "city", "district"):
            result[key] = first.get(key, "")
    return result
+
+
def area_count(_d):
    """Return how many of province/city/district carry a real value.

    "" (missing), "全国" (nationwide) and "未知" (unknown) do not count.
    """
    placeholder_values = ("", "全国", "未知")
    total = 0
    for key in ("province", "city", "district"):
        if _d.get(key, "") not in placeholder_values:
            total += 1
    return total
+
def fix_area(ots_client, appr):
    """Backfill province/city/district on *appr* from its related companies.

    Skips records that already have a district.  Otherwise, for each related
    organisation (in priority order: declaring company, construction company,
    approver, compilation unit, publisher) it looks the name up in the
    enterprise table and adopts that enterprise's region whenever it is more
    specific (area_count) than what *appr* currently holds.

    Fixes: the original repeated the lookup block five times and left a
    debug ``print`` in the construct_company branch; both are removed by
    folding the lookups into one loop (same order, same update rule).
    """
    if appr.get("district", "") != "":
        return
    for col in ("declare_company", "construct_company", "approver",
                "compilation_unit", "publisher"):
        org_name = appr.get(col, "")
        _d = get_enterprise_area(ots_client, org_name)
        if area_count(_d) > area_count(appr):
            appr.update(_d)
+
+
+class APPredictor:
    def __init__(self):
        # Stateless predictor: every standard_*/extract_* method takes the
        # record dict (or raw text) as an argument; nothing is kept on self.
        pass
+
+    def chinese_to_arabic(self, ch_str):
+        chinese_number_dict = {
+            '一': 1,
+            '二': 2,
+            '两': 2,
+            '三': 3,
+            '四': 4,
+            '五': 5,
+            '六': 6,
+            '七': 7,
+            '八': 8,
+            '九': 9,
+            '十': 10,
+            '拾': 10,
+            '百': 100,
+            '千': 1000,
+        }
+        no_list = []
+        for c in ch_str:
+            if c not in chinese_number_dict.keys():
+                return None
+            no_list.append(chinese_number_dict.get(c))
+        arabic_num = 0
+        mul_no = None
+        for i, no in enumerate(no_list):
+            if no in [10, 100, 1000]:
+                if mul_no is None:
+                    arabic_num += no
+                else:
+                    arabic_num += no * mul_no
+                mul_no = None
+            else:
+                mul_no = no
+        if mul_no:
+            arabic_num += mul_no
+
+        return arabic_num
+
    def standard_properties(self, d):
        """Normalise d['properties'] (建设性质, construction nature) in place.

        Raw values come in several shapes: a clean canonical value, free text
        containing a type word, two types joined by 、, numbered options with a
        tick mark, or checkbox strings where a ticked glyph precedes the chosen
        type.  The result is one or more canonical types joined by ',' (sorted),
        or '未披露' (undisclosed) when nothing can be recognised.
        """
        # Canonical values: 未披露 迁建 新建 技术改造 改建 其他 扩建 拆建 装修装饰
        # Other raw variants seen in data: 未分类 改扩建 技改 技改及其他 迁扩建 其它
        # Raw input examples (kept verbatim):
        # 新建(迁建)□改建□扩建□技术改造
        # 新建建设类项目 改建建设类项目 新建工程 新建,
        # 新建,建设内容...
        # 扩建,...
        # 新建□改扩建□技改☑
        # (1)新建√(2)改扩建(3)技改(4)迁建
        # 新建,占地面积106.4亩项组织组建力度,今年以来,该区走访以业务关系为纽带,以点带面 未披露
        properties_type_dict = {
            '未披露': '未披露',
            '迁建': '迁建',
            '新建': '新建',
            '技术改造': '技术改造',
            '技改': '技术改造',
            '改建': '改建',
            '其他': '其他',
            '其它': '其他',
            '扩建': '扩建',
            '拆建': '拆建',
            '改扩建': '改建,扩建',
            '迁扩建': '迁建,扩建',
            '装修装饰': '装修装饰',
            '新建(迁建)': '新建,迁建',
            '迁改扩建': '迁建,改建,扩建',
            '装修工程': '装修装饰',
            '旧路改造': '改建',
            '新建(迁扩建)': '新建,迁建,扩建',
            '原规模技改': '技术改造',
            '改扩建(产能核增)': '改扩建'
        }
        p_base_type = ['未披露', '迁建', '新建', '技术改造', '改建', '其他', '扩建', '拆建', '装修装饰']
        # Longest-first so alternation prefers the most specific type word.
        p_all_type = list(properties_type_dict.keys())
        p_all_type.sort(key=lambda x: len(x), reverse=True)
        # Glyphs marking a ticked vs. un-ticked checkbox.
        true_char = '[☑■√R◼回]'
        false_char = '[□£☐口]'
        char_reg = '[☑■√R◼回□£☐口]'
        type_reg = '|'.join(p_all_type)
        # p_type_reg_list = [
        #     '(?P<t1>{})(项目|工程|建设类项目|及其他|建设类|(补办))'.format(type_reg),
        #     f'^[\u4e00-\u9fff]{{2,4}}(?P<t1>{type_reg})$',
        #     f'^(?P<t1>{type_reg})[、](?P<t2>{type_reg})$',
        #     '(?P<t1>{})((补办)|工程|)[,,;;](?!{}).*'.format(type_reg, type_reg),
        #     f'^((?P<c1>{char_reg})?)(?P<t1>{type_reg})((?P<c2>{char_reg})?)(?P<t2>{type_reg})((?P<c3>{char_reg})?)(?P<t3>{type_reg})$',
        #     f'^(?P<c1>{char_reg}){{0,2}}(?P<t1>{type_reg})(?P<c2>{char_reg}){{0,2}}(?P<t2>{type_reg})?(?P<c3>{char_reg}){{0,2}}(?P<t3>{type_reg})?(?P<c4>{char_reg}){{0,2}}(?P<t4>{type_reg})?(?P<c5>{char_reg}){{0,2}}(?P<t5>{type_reg})?(?P<c6>{char_reg}){{0,2}}(?P<t6>{type_reg})?',
        # ]

        # Python 3.5 does not support f-strings, hence str.format below.
        p_type_reg_list = [
            '(?P<t1>{})(项目|工程|建设类项目|及其他|建设类|(补办))'.format(type_reg),
            '^[\u4e00-\u9fff]{{2,4}}(?P<t1>{})$'.format(type_reg),
            '^(?P<t1>{})[、](?P<t2>{})$'.format(type_reg, type_reg),
            '(?P<t1>{})((补办)|工程|)[,,;;](?!{}).*'.format(type_reg, type_reg),
            '^((?P<c1>{})?)(?P<t1>{})((?P<c2>{})?)(?P<t2>{})((?P<c3>{})?)(?P<t3>{})$'
                .format(char_reg, type_reg, char_reg, type_reg, char_reg, type_reg),
            '^(?P<c1>{}){{0,2}}(?P<t1>{})(?P<c2>{}){{0,2}}(?P<t2>{})?(?P<c3>{}){{0,2}}(?P<t3>{})?(?P<c4>{}){{0,2}}(?P<t4>{})?(?P<c5>{}){{0,2}}(?P<t5>{})?(?P<c6>{}){{0,2}}(?P<t6>{})?'
                .format(char_reg, type_reg, char_reg, type_reg, char_reg, type_reg, char_reg, type_reg, char_reg, type_reg, char_reg, type_reg),
        ]

        # for d in dict_list:
        properties = str(d.get('properties'))
        properties = re.sub(' ', '', properties)
        match_flag = 0
        if properties not in ['None', None, '']:
            if properties in p_all_type:
                d['properties'] = properties
                match_flag = 1
            else:
                for index, reg in enumerate(p_type_reg_list):
                    match = re.search(reg, properties)
                    if match:
                        match_flag = 1
                        keys = list(match.groupdict().keys())
                        # Plain free-text form: single t1 group.
                        if len(keys) == 1 and keys[0] == 't1':
                            t = match.groupdict().get('t1', '')
                            t = properties_type_dict.get(t, '')
                            d['properties'] = t
                            break
                        if len(keys) == 1 and 't' in keys:
                            t = match.groupdict().get(keys[0], '')
                            t = properties_type_dict.get(t, '')
                            d['properties'] = t
                            break
                        if len(keys) == 2 and 't1' in keys and 't2' in keys:
                            t_list = [match.groupdict().get('t1', ''), match.groupdict().get('t2', '')]
                            t_list = [properties_type_dict.get(x, '') for x in t_list]
                            t_list.sort(key=lambda x: x)
                            d['properties'] = ','.join(t_list)
                            break
                        # Numbered-options form: the ticked option sets c1..c3.
                        if index == 4:
                            if match.groupdict().get('c1'):
                                t = match.groupdict().get('t1', '')
                            elif match.groupdict().get('c2'):
                                t = match.groupdict().get('t2', '')
                            elif match.groupdict().get('c3'):
                                t = match.groupdict().get('t3', '')
                            else:
                                t = '未披露'
                            t = properties_type_dict.get(t, '')
                            d['properties'] = t
                            break

                        # Checkbox form: collect the type word following each
                        # ticked glyph.
                        match = re.finditer(true_char, properties)
                        match = list(match)
                        t_list = []
                        for m in match:
                            for t in p_all_type:
                                if properties[m.end():m.end() + len(t)] == t:
                                    t_list.append(t)
                                    break
                        if t_list:
                            t_list = ','.join([properties_type_dict.get(x) for x in t_list])
                            t_list = t_list.split(',')
                            t_list.sort(key=lambda x: x)
                            d['properties'] = ','.join(t_list)
                        else:
                            d['properties'] = '未披露'
                        break
        if match_flag and ',' not in d.get('properties'):
            # Map a single matched type through the dict so compound aliases
            # (e.g. 改扩建) expand, then sort for a canonical order.
            t_list = properties_type_dict.get(d.get('properties'), '').split(',')
            t_list.sort(key=lambda x: x)
            d['properties'] = ','.join(t_list)

        if not match_flag and d.get('properties') not in ['None', None, '']:
            # No regex matched: still look for ticked checkboxes.
            match = re.finditer(true_char, properties)
            match = list(match)
            t_list = []
            for m in match:
                for t in p_all_type:
                    if properties[m.end():m.end() + len(t)] == t:
                        t_list.append(t)
                        break
            if t_list:
                t_list = ','.join([properties_type_dict.get(x) for x in t_list])
                t_list = t_list.split(',')
                t_list.sort(key=lambda x: x)
                d['properties'] = ','.join(t_list)
            else:
                d['properties'] = ''

        if d.get('properties') in ['None', None, '']:
            d['properties'] = '未披露'
        return d
+
    def standard_approval_result(self, d):
        """Classify d['approval_result'] into one of six canonical buckets.

        Each dict value is used as a regex alternation against the raw status
        text; the first bucket (in insertion order) whose pattern matches wins.
        Unrecognised or empty statuses become "".

        NOTE(review): the patterns contain unescaped parentheses (e.g.
        办结(不通过)), which the regex engine treats as groups — so they match
        办结不通过, not the literal parenthesised text.  Confirm whether raw
        statuses carry half-width parens and whether escaping is intended.
        """
        # approval_result (审批结果) raw -> bucket mapping.
        # approval_result_type_dict = {
        #     '不予许可/未通过': '不予许可|不准予许可|办结(不通过)|不符合产业政策已撤销|办结(受理不通过)|办结(办理不通过)|不予受理|无需受理|受理不通过|不予备案|拒收|退回|驳回|退件|退办|不合格|不同意',
        #     '准予许可/通过': '准予许可|许可/同意|已审[批核]|审[批核]通过|已审[批核]通过|批准|许可|合格|同意|准予备案',
        #     '在办状态': '在办|审批中|办理中|部门审批中|部门办理|待审批|待补全|待预审|待接件|审批|审查|部门开始办理|待确认中介机构|审[核批查]中|尚未审[核批查]|待联审|待互联网预审|待答复',
        #     '已完成审批': '批复|已办结|办结|批复办结|已批复|审批通过|办结成功|办结(通过)|正常办结|已办结(正式出文)|正式出文|正常|办理完成|结案',
        #     '已受理': '已受理|受理|通过|受理通过|进入受理|已接件|补齐补正受理|预审通过|已登记|已备案|确认备案|收件|接件|已立项|已提交|已处理|登记|备案(未抽中)|已发布',
        #     '其他': '补齐补正通知|补正(开始)|补齐补正受理|等等其他|申报|特别程序|撤销|补办|撤件|补正|补件|归档|送达|挂起|补齐补证',
        # }

        approval_result_type_dict = {
            '不予许可/未通过': '不予许可/未通过|不予许可|不准予许可|办结(不通过)|不符合产业政策已撤销|办结(受理不通过)|办结(办理不通过)|不予受理|无需受理|受理不通过|不予备案|拒收|退回|驳回|退件|退办|不合格|不同意|不符合要求|申请材料虚假|不满足条件|未通过审核|项目终止|资质不符|未获批准|无法办理|申请无效|已驳回|撤销申请|备案已撤销',
            '准予许可/通过': '准予许可/通过|准予许可|许可/同意|已审[批核]|审[批核]通过|已审[批核]通过|批准|许可|合格|同意|准予备案|符合规定|申请成功|符合条件|审核通过|项目获批|资质合格|可以办理|申请有效|办结|准予许可|办结(通过)|办结(准予许可)|确认备案',
            '在办状态': '在办状态|在办|审批中|办理中|部门审批中|部门办理|待审批|待补全|待预审|待接件|审批|审查|部门开始办理|待确认中介机构|审[核批查]中|尚未审[核批查]|待联审|待互联网预审|待答复|申报|特别程序申请|特别程序|待补办|规划条件核实(个人住宅)|收件|补正(结束)|申办|已提交|建设单位提交|待承办|尚未审核|拟同意|已立项|补齐补证|正在审核|审批流程中|办理流程中|等待反馈|需补充材料|正在审查|等待审批结果|处理中|审核中',
            '已完成审批': '已完成审批|批复|已办结|办结|批复办结|已批复|审批通过|办结成功|办结(通过)|正常办结|已办结(正式出文)|正式出文|正常|办理完成|结案|审批结束|审批完成|审批通过并办结|审批流程结束|审批成功|审批办结|审批完结|已发布|符合性判定|已处理|送达',
            '已受理': '已受理|受理|通过|受理通过|进入受理|已接件|补齐补正受理|预审通过|已登记|已备案|确认备案|收件|接件|已立项|已提交|已处理|登记|备案(未抽中)|已发布|已接收申请|申请已受理|受理申请|正在受理|已进入受理流程|申请受理中|受理成功|已受理待审核|已撤件',
            '其他': '补齐补正通知|补正(开始)|补齐补正受理|等等其他|申报|特别程序|撤销|补办|撤件|补正|补件|归档|送达|挂起|补齐补证|暂缓审批|暂不受理|需重新申报|暂停办理|暂无结果|需进一步研究|暂存待查|暂不通过|抽中|未抽中|许可|新设',
        }
        # Unclassified raw values observed in data:
        # 申报 审查 办结|准予许可 特别程序申请 特别程序 撤销申请 归档 待补办 已撤件 备案已撤销 新设
        # 已驳回 办结(通过) 办结(准予许可) 部门开始办理 规划条件核实(个人住宅) 收件 退件 挂起 批准 合格 不合格 确认备案
        # 待确认中介机构 退办 归档 接件 补正(结束) 许可 抽中 未抽中 同意 补件 申办 结案 已发布 符合性判定
        # 已处理 送达 尚未审核 拟同意 已立项 待联审 已提交 待互联网预审 建设单位提交 待承办 备案(未抽中)
        # 补齐补证 已发布

        # for d in dict_list:
        approval_result = d.get('approval_result')
        new_approval_result = ""
        if approval_result:
            for t, reg in approval_result_type_dict.items():
                if re.search(reg, str(approval_result)):
                    new_approval_result = t
                    break
        d['approval_result'] = new_approval_result
        return d
+
    def standard_year_limit(self, d):
        """Normalise d['year_limit'] (建设年限, construction period) to an int.

        Accepts years/months/days (Arabic or Chinese numerals) and date
        ranges; ranges may also backfill time_commencement/time_completion.
        Unparseable or zero values end up as "".

        NOTE(review): the original comment said the unit is months, but the
        conversions below produce days (1 year = 12*30, 1 month = 30, ranges
        use timedelta.days) — confirm the intended unit.
        """
        # Patterns, tried in order: 0-1 years, 2-3 months, 4-5 days,
        # 6 year-range, 7 year/month-range, 8 full-date range, 9 datetime range.
        year_limit_reg_list = [
            '^(?P<year_num>\d{1,2})年$',
            '^(?P<year_num_ch>[一二三四五六七八九十]{1,2})年$',
            '^(?P<month_num>\d+)个月$',
            '^(?P<month_num_ch>[一二三四五六七八九十]{1,3})个月$',
            '^(?P<day_num>\d+)([天日]|个?日历天)$',
            '^(?P<day_num_ch>[一二三四五六七八九十])([天日]|个?日历天)$',
            '^(?P<y_range1>\d{4,})年?[-至到—一]{1,2}(?P<y_range2>\d{4,})年?$',
            '^(?P<y_range1>\d{4,})[年-]?(?P<m_range1>\d{1,2})[月-]?[-至到—一]{1,2}(?P<y_range2>\d{4,})[年-]?(?P<m_range2>\d{1,2})月?$',
            '^(?P<y_range1>\d{4,})[年-]?(?P<m_range1>\d{1,2})[月-]?(?P<d_range1>\d{1,2})日?[-至到—一]{1,2}(?P<y_range2>\d{4,})[年-]?(?P<m_range2>\d{1,2})[月-]?(?P<d_range2>\d{1,2})日?$',
            '^(?P<y_range1>\d{4,})[年-]?(?P<m_range1>\d{1,2})[月-]?(?P<d_range1>\d{1,2})日?[ ]?(?P<h_range1>\d{1,2})[时::]?(?P<mi_range1>\d{1,2})[分::]?(?P<s_range1>\d{1,2})(\.0)?秒?[-至到—一]{1,2}(?P<y_range2>\d{4,})[年-]?(?P<m_range2>\d{1,2})[月-]?(?P<d_range2>\d{1,2})日?[ ]?(?P<h_range2>\d{1,2})[时::]?(?P<mi_range2>\d{1,2})[分::]?(?P<s_range2>\d{1,2})(\.0)?秒?$',
        ]

        # for d in dict_list:
        year_limit = d.get('year_limit')
        time_commencement = d.get('time_commencement')
        time_completion = d.get('time_completion')
        if year_limit:
            new_year_limit = 0
            for reg_index, reg in enumerate(year_limit_reg_list):
                match = re.search(reg, str(year_limit))
                if not match:
                    # print('not match!!')
                    continue
                # print('reg_index', reg_index)
                if reg_index == 0:
                    new_year_limit = int(match.group('year_num')) * 12 * 30
                elif reg_index == 1:
                    new_year_limit = self.chinese_to_arabic(match.group('year_num_ch')) * 12 * 30
                elif reg_index == 2:
                    new_year_limit = int(match.group('month_num')) * 30
                elif reg_index == 3:
                    new_year_limit = self.chinese_to_arabic(match.group('month_num_ch')) * 30
                elif reg_index == 4:
                    new_year_limit = int(match.group('day_num'))
                elif reg_index == 5:
                    new_year_limit = self.chinese_to_arabic(match.group('day_num_ch'))
                elif reg_index in [6, 7, 8, 9]:
                    year1, year2 = match.groupdict().get('y_range1'), match.groupdict().get('y_range2')
                    month1, month2 = match.groupdict().get('m_range1'), match.groupdict().get('m_range2')
                    day1, day2 = match.groupdict().get('d_range1'), match.groupdict().get('d_range2')
                    # Zero-pad month/day so strptime format codes line up.
                    if month1:
                        month1, month2 = [x if len(x) == 2 else '0' + x for x in [month1, month2]]
                    if day1:
                        day1, day2 = [x if len(x) == 2 else '0' + x for x in [day1, day2]]
                        date_str1 = '{}-{}-{}'.format(year1, month1, day1)
                        date1 = datetime.strptime(date_str1, '%Y-%m-%d')
                        date_str2 = '{}-{}-{}'.format(year2, month2, day2)
                        date2 = datetime.strptime(date_str2, '%Y-%m-%d')
                        new_year_limit = (date2 - date1).days
                    elif month1:
                        date_str1 = '{}-{}'.format(year1, month1)
                        date1 = datetime.strptime(date_str1, '%Y-%m')
                        date_str2 = '{}-{}'.format(year2, month2)
                        date2 = datetime.strptime(date_str2, '%Y-%m')
                        new_year_limit = (date2 - date1).days
                    else:
                        date1 = datetime.strptime(year1, '%Y')
                        date2 = datetime.strptime(year2, '%Y')
                        new_year_limit = (date2 - date1).days
                    # Backfill commencement/completion from the parsed range.
                    if not time_commencement:
                        d['time_commencement'] = datetime.strftime(date1, '%Y-%m-%d')
                    if not time_completion:
                        d['time_completion'] = datetime.strftime(date2, '%Y-%m-%d')
                break
            if new_year_limit > 0:
                d['year_limit'] = new_year_limit
            else:
                d.pop('year_limit')

        if d.get('year_limit') in [None, 'None', '', 0]:
            d['year_limit'] = ''
        return d
+
+    def standard_time_commencement_completion(self, d):
+        # time_commencement 开工时间 可以从建设年限year_limit里面提取
+        # time_completion 竣工时间 可以从建设年限year_limit里面提取 不能比开工时间早
+        # for d in dict_list:
+        time_commencement = d.get('time_commencement')
+        time_completion = d.get('time_completion')
+        if time_commencement and time_completion:
+            try:
+                date1 = datetime.strptime(time_commencement, '%Y-%m-%d')
+                date2 = datetime.strptime(time_completion, '%Y-%m-%d')
+            except:
+                try:
+                    date1 = datetime.strptime(time_commencement, '%Y-%m')
+                    date2 = datetime.strptime(time_completion, '%Y-%m')
+                    d['time_commencement'] = datetime.strftime(date1, '%Y-%m-%d')
+                    d['time_completion'] = datetime.strftime(date2, '%Y-%m-%d')
+                except:
+                    d['time_commencement'] = ''
+                    d['time_completion'] = ''
+                    date1, date2 = None, None
+            if date1 is not None and date1 > date2:
+                d['time_commencement'] = ''
+                d['time_completion'] = ''
+        return d
+
+    def standard_money_or_area(self, d):
+        # 建筑面积、用地面积、长度 float类型两位小数
+        # 总投资、环保投资 float类型两位小数
+        # for d in dict_list:
+        for col in ['proportion', 'use_area', 'length']:
+            value = d.get(col)
+            if value:
+                try:
+                    d[col] = round(float(value), 2)
+                except:
+                    value = re.sub('[\u4e00-\u9fa5]', '', value)
+                    value = re.sub('m2', '㎡', value)
+                    d[col] = round(float(value[:-1]), 2)
+
+        for col in ['total_tenderee_money', 'env_invest']:
+            value = d.get(col)
+            if value:
+                d[col] = round(float(value), 2)
+        return d
+
+    def standard_pro_type(self, d):
+        # declare_type 申报类型 从原来项目类型pro_type中拆出
+        # 若成功匹配,原来的pro_type置为空
+        # for d in dict_list:
+        declare_type = d.get('declare_type')
+        if declare_type:
+            d['pro_type'] = ''
+        return d
+
+    def extract_env_invest(self, content):
+        """
+        环保投资
+
+        :param content:
+        :return:
+        """
+        pattern = "环保投资[大概约为是::]*(?P<invs>\d+(\.\d+)?万?元)"
+        match = re.search(pattern, content)
+        if match is not None:
+            invest = match.groupdict().get("invs", "")
+            money = getUnifyMoney(invest)
+            if money > 0:
+                return money
+        return ""
+
+    def extract_money_use(self, content):
+        """
+        资金构成
+
+        :param content:
+        :return:
+        """
+        list_sentences = re.split(",|。", content)
+        list_data = []
+        pattern = "^.{,20}[费用|预备费|费][大概约为是::]*\d+(\.\d+)?万?元.{,20}$"
+        for sentence in list_sentences:
+            match = re.search(pattern, sentence)
+            if match is not None:
+                list_data.append(sentence)
+        return ",".join(list_data)
+
    def extract_use_area(self, content, has_preffix=True):
        """
        Extract the land-use area (用地面积) from free text.

        :param content: text to scan; falsy input yields ("", "").
        :param has_preffix: when False, additionally try a bare
            number-with-unit pattern that has no 用地面积-style prefix.
        :return: (matched_text, normalised_value) where normalised_value is a
            number string suffixed with "㎡" (area) or "m" (length);
            ("", "") when nothing matches.
        """
        if not content:
            return "", ""
        # log("content")
        # log(content)
        suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
        # reg_dict = {
        #     0: "(?P<proportion>(总((用地|占地|使用)(面积|规模)|长|长度))" + suffix,
        #     1: "(?P<proportion>((用地|占地|使用)(面积|规模)|全长)" + suffix,
        #     # 2: "(?P<proportion>((用地|占地|使用)?面积)" + suffix
        #     2: "(?P<proportion>((用地|占地|使用)面积)" + suffix
        # }
        # Prefix patterns, most specific first (总用地面积 > 用地面积/规模 > 用地面积).
        reg_dict = {
            0: "(?P<proportion>(总((用地|占地|使用)(面积|规模)))" + suffix,
            1: "(?P<proportion>((用地|占地|使用)(面积|规模))" + suffix,
            2: "(?P<proportion>((用地|占地|使用)面积)" + suffix
        }

        if not has_preffix:
            reg_dict[3] = "(?P<proportion>" + suffix

        _proportion = ""
        for i in range(len(list(reg_dict.keys()))):
            if _proportion:
                break
            _pattern = reg_dict.get(i)
            # logging.info('content ' + str(content))
            match = re.search(_pattern, str(content))
            if match:
                _proportion = match.groupdict().get("proportion", "")

        if not _proportion:
            return "", ""

        # Normalise the matched snippet into a single number + unit.
        multiple_cnt = 1
        digit = ""

        # Pin down the numeric part (thousands-separated int + decimal tail).
        match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
        if match:
            # logging.info(str(_proportion) + '  ' + str(match.group()))
            d1 = match.group('d1')
            d2 = match.group('d2')
            try:
                d1 = int(re.sub(',', '', d1))
            except:
                return "", ""
            if d2:
                # Keep the fraction exact via Decimal before adding it back.
                d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
                # print('d1, d2', d1, d2)
                d1 += d2
            digit = d1
        # print('digit', digit)

        # Apply Chinese multiplier characters (十/百/千/万/亿), if any.
        _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
        match = re.search('[十百千万亿]+', _proportion2)
        _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
        if match:
            for c in match.group():
                multiple_cnt *= _dict.get(c)
            _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
        else:
            _proportion3 = _proportion2
        # print('multiple_cnt2', multiple_cnt)

        # Decide unit kind: area (㎡) vs length (m).
        match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
        if match:
            unit = '㎡'
        else:
            unit = 'm'

        # Apply unit multipliers (km, 里=500m, 亩≈666.67㎡, 顷, k㎡ ...).
        match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
        if match:
            if unit == 'm':
                if re.search('[kK千公]', match.group()):
                    multiple_cnt *= 1000
                elif re.search('[里]', match.group()):
                    multiple_cnt *= Decimal(str(500))
            else:
                if '亩' in match.group():
                    multiple_cnt *= Decimal(str(666.67))
                elif '顷' in match.group():
                    multiple_cnt *= 10000
                elif re.search('千米|公里|k[mM㎡]', match.group()):
                    multiple_cnt *= 1000000
        # print('multiple_cnt1', multiple_cnt)

        # Assemble the final value string.
        digit = str(digit * multiple_cnt) + unit

        return _proportion, digit
+
    def extract_proportion(self, content, has_preffix=True):
        """
        Extract the building area / scale (建筑面积) from free text.

        Same pipeline as extract_use_area but keyed on 建筑/建设 prefixes.

        :param content: text to scan; falsy input yields ("", "").
        :param has_preffix: when False, additionally try a bare
            number-with-unit pattern that has no 建筑面积-style prefix.
        :return: (matched_text, normalised_value) where normalised_value is a
            number string suffixed with "㎡" (area) or "m" (length);
            ("", "") when nothing matches.
        """
        if not content:
            return "", ""
        # log("content")
        # log(content)
        suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
        # Prefix patterns, most specific first.
        reg_dict = {
            0: "(?P<proportion>(总((建筑|建设)(面积|规模)))" + suffix,
            1: "(?P<proportion>((建筑|建设)(面积|规模))" + suffix,
            2: "(?P<proportion>((建筑|建设|区域)?面积|项目规模)" + suffix
        }

        if not has_preffix:
            reg_dict[3] = "(?P<proportion>" + suffix

        _proportion = ""
        for i in range(len(list(reg_dict.keys()))):
            if _proportion:
                break
            _pattern = reg_dict.get(i)
            # logging.info('content ' + str(content))
            match = re.search(_pattern, str(content))
            if match:
                _proportion = match.groupdict().get("proportion", "")

        if not _proportion:
            return "", ""

        # Normalise the matched snippet into a single number + unit.
        multiple_cnt = 1
        digit = ""

        # Pin down the numeric part (thousands-separated int + decimal tail).
        match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
        if match:
            # logging.info(str(_proportion) + '  ' + str(match.group()))
            d1 = match.group('d1')
            d2 = match.group('d2')
            try:
                d1 = int(re.sub(',', '', d1))
            except:
                return "", ""
            if d2:
                # Keep the fraction exact via Decimal before adding it back.
                d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
                # print('d1, d2', d1, d2)
                d1 += d2
            digit = d1
        # print('digit', digit)

        # Apply Chinese multiplier characters (十/百/千/万/亿), if any.
        _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
        match = re.search('[十百千万亿]+', _proportion2)
        _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
        if match:
            for c in match.group():
                multiple_cnt *= _dict.get(c)
            _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
        else:
            _proportion3 = _proportion2
        # print('multiple_cnt2', multiple_cnt)

        # Decide unit kind: area (㎡) vs length (m).
        match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
        if match:
            unit = '㎡'
        else:
            unit = 'm'

        # Apply unit multipliers (km, 里=500m, 亩≈666.67㎡, 顷, k㎡ ...).
        match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
        if match:
            if unit == 'm':
                if re.search('[kK千公]', match.group()):
                    multiple_cnt *= 1000
                elif re.search('[里]', match.group()):
                    multiple_cnt *= Decimal(str(500))
            else:
                if '亩' in match.group():
                    multiple_cnt *= Decimal(str(666.67))
                elif '顷' in match.group():
                    multiple_cnt *= 10000
                elif re.search('千米|公里|k[mM㎡]', match.group()):
                    multiple_cnt *= 1000000
        # print('multiple_cnt1', multiple_cnt)

        # Assemble the final value string.
        digit = str(digit * multiple_cnt) + unit

        return _proportion, digit
+
+    def extract_length(self, content, has_preffix=True):
+        if not content:
+            return "", ""
+        # log("content")
+        # log(content)
+        suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
+        reg_dict = {
+            0: "(?P<proportion>(总((建筑|建设)长|长度))" + suffix,
+            1: "(?P<proportion>((建筑|建设)全长)" + suffix,
+            2: "(?P<proportion>((建筑|建设|区域)?全长)" + suffix
+        }
+
+        if not has_preffix:
+            reg_dict[3] = "(?P<proportion>" + suffix
+
+        _proportion = ""
+        for i in range(len(list(reg_dict.keys()))):
+            if _proportion:
+                break
+            _pattern = reg_dict.get(i)
+            # logging.info('content ' + str(content))
+            match = re.search(_pattern, str(content))
+            if match:
+                _proportion = match.groupdict().get("proportion", "")
+
+        if not _proportion:
+            return "", ""
+
+        # 统一格式
+        multiple_cnt = 1
+        digit = ""
+
+        # 确定具体数字
+        match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
+        if match:
+            # logging.info(str(_proportion) + '  ' + str(match.group()))
+            d1 = match.group('d1')
+            d2 = match.group('d2')
+            try:
+                d1 = int(re.sub(',', '', d1))
+            except:
+                return "", ""
+            if d2:
+                d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
+                # print('d1, d2', d1, d2)
+                d1 += d2
+            digit = d1
+        # print('digit', digit)
+
+        # 确定中文倍数
+        _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
+        match = re.search('[十百千万亿]+', _proportion2)
+        _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
+        if match:
+            for c in match.group():
+                multiple_cnt *= _dict.get(c)
+            _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
+        else:
+            _proportion3 = _proportion2
+        # print('multiple_cnt2', multiple_cnt)
+
+        # 确定面积/长度
+        match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
+        if match:
+            unit = '㎡'
+        else:
+            unit = 'm'
+
+        # 确定单位倍数
+        match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
+        if match:
+            if unit == 'm':
+                if re.search('[kK千公]', match.group()):
+                    multiple_cnt *= 1000
+                elif re.search('[里]', match.group()):
+                    multiple_cnt *= Decimal(str(500))
+            else:
+                if '亩' in match.group():
+                    multiple_cnt *= Decimal(str(666.67))
+                elif '顷' in match.group():
+                    multiple_cnt *= 10000
+                elif re.search('千米|公里|k[mM㎡]', match.group()):
+                    multiple_cnt *= 1000000
+        # print('multiple_cnt1', multiple_cnt)
+
+        # 拼接
+        digit = str(digit * multiple_cnt) + unit
+
+        return _proportion, digit
+
+    def extract_declare_type(self, pro_type):
+        """
+        申报类型从pro_type项目类型中提取
+
+        :param pro_type:
+        :return:
+        """
+        declare_type_list = ['立项', '审批', '备案', '核准', '未知', ]
+        declare_type = ''
+        if pro_type:
+            reg = '(?P<type>{})'.format('|'.join(declare_type_list))
+            match = re.search(reg, str(pro_type))
+            if match:
+                declare_type = match.group('type')
+        return declare_type
+
+    def run_standard(self, rs_dic):
+        # 字段标准化 2025-01-13
+        # rs_dic = self.standard_data([rs_dic])[0]
+        rs_dic = self.standard_properties(rs_dic)
+        rs_dic = self.standard_approval_result(rs_dic)
+        rs_dic = self.standard_year_limit(rs_dic)
+        rs_dic = self.standard_time_commencement_completion(rs_dic)
+        rs_dic = self.standard_money_or_area(rs_dic)
+        rs_dic = self.standard_pro_type(rs_dic)
+        return rs_dic
+
+
+class APProduction:
    def __init__(self, test_mode=0, update=1, show=0):
        """Approval-project production flow.

        :param test_mode: when truthy, document queries sort by page_time
            instead of persistence_time
        :param update: when truthy, merged projects are written back to OTS
        :param show: when truthy, verbose debug printing is enabled
        """
        self.ots_client = getConnect_ots()
        self.ots_capacity = getConnect_ots_capacity()
        self.test_mode = test_mode
        self.update = update
        self.show = show

        # Watermark file: the persistence_time processed so far, stored next
        # to this module and advanced by update_opertime().
        self.opertime_txt = os.path.abspath(os.path.dirname(__file__)) + '/opertime.txt'
        with open(self.opertime_txt, 'r') as f:
            self.opertime_last = f.read()

        # Columns fetched from the document table for approval rows.
        self.document_cols = ["docid", "doctitle", "project_name", "page_time",
                              "project_code", "approval_json", "extract_json",
                              'total_tenderee_money', 'total_tenderee_money_unit',
                              'time_approval',
                              'construct_company', 'construct_company_code',
                              'compilation_unit', 'evaluation_agency', 'declare_company',
                              'approver', 'publisher'
                              ]

        self.ap_predictor = APPredictor()

        # Values treated as "empty/unknown" throughout the flow.
        self.none_list = [None, '', '-', '全国', '未知', "None"]
+
+    def ap_data_flow(self, docid):
+        flow_start_time = time.time()
+        log('process docid ' + str(docid))
+
+        # 获取公告的数据
+        # 单个审批公告可能同时发布多个不同的审批项目
+        start_time = time.time()
+        one_docid_approvals = self.get_approval_data_from_document_by_docid(docid)
+        if self.show:
+            log('len(one_docid_approvals)', len(one_docid_approvals))
+            print('one_docid_approvals', one_docid_approvals)
+        # log('time1 ' + str(round(time.time()-start_time, 2)))
+
+        # 跳过单个审批公告多个审批记录的
+        if len(one_docid_approvals) > 1:
+            log('跳过单个审批公告多个审批记录的')
+            print('len(one_docid_approvals)', len(one_docid_approvals))
+            return []
+
+        start_time = time.time()
+        match_approvals_list = self.get_data_from_document_by_rules(one_docid_approvals)
+        if self.show:
+            log('len(match_approvals_list)', len(match_approvals_list))
+            # print('match_approvals_list', match_approvals_list)
+        log('time2 ' + str(round(time.time()-start_time, 2)))
+        if match_approvals_list and len(match_approvals_list[0]) == 0:
+            log('by_rules cnt too many! return')
+            return []
+
+        start_time = time.time()
+        merge_approvals_list = self.merge_approval(match_approvals_list, one_docid_approvals)
+        if self.show:
+            log('len(merge_approvals_list)', len(merge_approvals_list))
+        # log('time3 ' + str(round(time.time()-start_time, 2)))
+
+        start_time = time.time()
+        approval_project_list = self.generate_project(merge_approvals_list)
+        if self.show:
+            log('len(approval_project_list)', len(approval_project_list))
+            for approval_project in approval_project_list:
+                print(approval_project.get('uuid'), approval_project.get('docids_cnt'))
+                print(approval_project.get('docids'))
+        # log('time4 ' + str(round(time.time()-start_time, 2)))
+
+        start_time = time.time()
+        match_projects_list = self.find_exist_projects_by_docid(approval_project_list)
+        if self.show:
+            log('len(match_projects_list)', len(match_projects_list))
+            for match_projects in match_projects_list:
+                print([x.get('uuid') for x in match_projects], len(match_projects))
+        # log('time5 ' + str(round(time.time()-start_time, 2)))
+
+        start_time = time.time()
+        merge_project_list = self.merge_project(match_projects_list)
+        if self.show:
+            log('len(merge_project_list)', len(merge_project_list))
+        # log('time6 ' + str(round(time.time()-start_time, 2)))
+
+        start_time = time.time()
+        self.update_project(merge_project_list)
+        # log('time7 ' + str(round(time.time()-start_time, 2)))
+
+        log('approval_project_list len:', len(approval_project_list),
+            'cost:', round(time.time() - flow_start_time, 2))
+        return approval_project_list
+
+    def get_approval_docid_from_document(self, max_cnt=20):
+        # 查询这个时间点数量,后续查数据数量需略微大于该时间点数量,否则会死循环
+        bool_query = BoolQuery(must_queries=[
+            RangeQuery("status", 201, 301),
+            TermQuery("persistence_time", self.opertime_last),
+            TermQuery("docchannel", 302),
+        ])
+        total_cnt = self.search_util("document", "document_index",
+                                     bool_query, ['docid'], None,
+                                     only_return_total_cnt=1)
+        if max_cnt <= total_cnt:
+            max_cnt = total_cnt + 1
+            log('reset max_cnt', str(max_cnt))
+
+        bool_query = BoolQuery(must_queries=[
+            RangeQuery("status", 201, 301),
+            RangeQuery("persistence_time", self.opertime_last, None),
+            TermQuery("docchannel", 302),
+        ])
+        # 需时间正序执行
+        asc_flag = 1
+        row_list = self.search_util("document", "document_index", bool_query,
+                                    ["docid", "persistence_time"], "persistence_time",
+                                    limit=max_cnt, asc=asc_flag)
+
+        id_list = []
+        persistence_time_list = []
+        for _data in row_list:
+            id_list.append(_data.get("docid"))
+            persistence_time_list.append(_data.get("persistence_time"))
+        return id_list, persistence_time_list
+
+    def get_approval_data_from_document_by_docid(self, docid):
+        bool_query = BoolQuery(must_queries=[
+            TermQuery("docid", docid),
+            RangeQuery('status', 201, 301)
+        ])
+
+        if self.test_mode:
+            rows = self.search_util("document", "document_index",
+                                    bool_query, self.document_cols, 'page_time')
+        else:
+            self.document_cols += ['persistence_time']
+            rows = self.search_util("document", "document_index",
+                                    bool_query, self.document_cols, 'persistence_time')
+
+        # 处理下数据,后期可能删除;因为按docid搜只有一个或没有,直接返回
+        if len(rows) == 1:
+            # 这里会把json里的都处理,可能会产生多个不同审批项目,都返回
+            approval_list = self.process_approval_json(rows[0])
+        else:
+            approval_list = []
+        return approval_list
+
+    def get_data_from_document_by_rules(self, approvals):
+        def task(approval):
+            doc_num = approval.get("doc_num", "")
+            doctitle = approval.get("doctitle", "")
+            project_name = approval.get("project_name", "")
+            project_code = approval.get("project_code", "")
+            docid = approval.get("docid")
+            should_queries = []
+            if doc_num != "":
+                should_queries.append(MatchPhraseQuery("doctitle", doc_num))
+                should_queries.append(MatchPhraseQuery("doctextcon", doc_num))
+                should_queries.append(MatchPhraseQuery("attachmenttextcon", doc_num))
+            if doctitle != "":
+                should_queries.append(MatchPhraseQuery("doctitle", doctitle))
+                should_queries.append(MatchPhraseQuery("doctextcon", doctitle))
+                should_queries.append(MatchPhraseQuery("attachmenttextcon", doctitle))
+            if project_name != "":
+                should_queries.append(MatchPhraseQuery("doctitle", project_name))
+                should_queries.append(MatchPhraseQuery("doctextcon", project_name))
+                should_queries.append(MatchPhraseQuery("attachmenttextcon", project_name))
+            if project_code != "":
+                should_queries.append(MatchPhraseQuery("doctitle", project_code))
+                should_queries.append(MatchPhraseQuery("doctextcon", project_code))
+                should_queries.append(MatchPhraseQuery("attachmenttextcon", project_code))
+            _query = BoolQuery(should_queries=should_queries, must_not_queries=[TermQuery("docid", docid)])
+
+            # # 两两组合
+            # should_doc_num = []
+            # should_doctitle = []
+            # should_project_name = []
+            # should_project_code = []
+            # if doc_num != "":
+            #     should_doc_num.append(MatchPhraseQuery("doctitle", doc_num))
+            #     should_doc_num.append(MatchPhraseQuery("doctextcon", doc_num))
+            #     should_doc_num.append(MatchPhraseQuery("attachmenttextcon", doc_num))
+            # if doctitle != "":
+            #     should_doctitle.append(MatchPhraseQuery("doctitle", doctitle))
+            #     should_doctitle.append(MatchPhraseQuery("doctextcon", doctitle))
+            #     should_doctitle.append(MatchPhraseQuery("attachmenttextcon", doctitle))
+            # if project_name != "":
+            #     should_project_name.append(MatchPhraseQuery("doctitle", project_name))
+            #     should_project_name.append(MatchPhraseQuery("doctextcon", project_name))
+            #     should_project_name.append(MatchPhraseQuery("attachmenttextcon", project_name))
+            # if project_code != "":
+            #     should_project_code.append(MatchPhraseQuery("doctitle", project_code))
+            #     should_project_code.append(MatchPhraseQuery("doctextcon", project_code))
+            #     should_project_code.append(MatchPhraseQuery("attachmenttextcon", project_code))
+            #
+            # all_should_list = []
+            # if should_doc_num and should_doctitle:
+            #     q = BoolQuery(must_queries=[
+            #             BoolQuery(should_queries=should_doc_num),
+            #             BoolQuery(should_queries=should_doctitle)
+            #         ])
+            #     all_should_list.append(q)
+            #
+            # if should_doc_num and should_project_name:
+            #     q = BoolQuery(must_queries=[
+            #             BoolQuery(should_queries=should_doc_num),
+            #             BoolQuery(should_queries=should_project_name)
+            #         ])
+            #     all_should_list.append(q)
+            #
+            # if should_doc_num and should_project_code:
+            #     q = BoolQuery(must_queries=[
+            #             BoolQuery(should_queries=should_doc_num),
+            #             BoolQuery(should_queries=should_project_code)
+            #         ])
+            #     all_should_list.append(q)
+            #
+            # if should_project_code and should_doctitle:
+            #     q = BoolQuery(must_queries=[
+            #             BoolQuery(should_queries=should_project_code),
+            #             BoolQuery(should_queries=should_doctitle)
+            #         ])
+            #     all_should_list.append(q)
+            #
+            # if should_project_code and should_project_name:
+            #     q = BoolQuery(must_queries=[
+            #             BoolQuery(should_queries=should_project_code),
+            #             BoolQuery(should_queries=should_project_name)
+            #         ])
+            #     all_should_list.append(q)
+            #
+            # if len(all_should_list) == 0:
+            #     return [approval]
+            #
+            # _query = BoolQuery(should_queries=all_should_list)
+
+            bool_query = BoolQuery(must_queries=[
+                RangeQuery("status", 201, 301),
+                _query
+            ])
+
+            # 先查数量
+            total_cnt = self.search_util("document", "document_index",
+                                         bool_query, ['docid'], None,
+                                         only_return_total_cnt=1)
+            if total_cnt >= 50:
+                log('by_rules total_cnt > 50 ' + str(total_cnt))
+                return []
+
+            rows = self.search_util("document", "document_index",
+                                    bool_query, self.document_cols, 'page_time')
+            return [approval] + rows
+
+        approvals_list = self.multi_thread_util(approvals, task, 3)
+        return approvals_list
+
+    def merge_approval(self, match_approvals_list, origin_approvals):
+        merge_approvals_list = []
+        for index, match_approvals in enumerate(match_approvals_list):
+            # 这里match_approvals是从origin_approval通过规则搜出来的一批符合的
+            # match_approvals origin_approval 一一对应
+            origin_approval = origin_approvals[index]
+            merge_approvals = []
+            for _d in match_approvals:
+                # 这里每个符合的审批都要把json里的都处理一遍,再判断是否跟原审批匹配
+                approval_json_list = self.process_approval_json(_d)
+                # approval_json_list = [y for x in approval_json_list for y in x]
+                merge_approvals += self.check_approval(approval_json_list, origin_approval)
+            merge_approvals = [json.loads(x) for x in list(set([json.dumps(x) for x in merge_approvals]))]
+            merge_approvals_list.append(merge_approvals)
+
+        return merge_approvals_list
+
+    def generate_project(self, merge_approvals_list):
+        approval_project_list = []
+        for merge_approvals in merge_approvals_list:
+            if not merge_approvals:
+                continue
+
+            project_id = str(uuid4())
+            merge_approvals.sort(key=lambda x: x.get("page_time", ""), reverse=False)
+
+            merge_approvals[0]['uuid'] = project_id
+            _dict = self.get_proper_col_value(merge_approvals)
+
+            _dict["id"] = project_id
+            _dict["uuid"] = project_id
+
+            project_dict = {}
+            for col in approval_cols + ['docids', 'docids_cnt']:
+                project_dict[col] = _dict.get(col, '')
+            current_time = datetime.now()
+            current_time = current_time.strftime('%Y-%m-%d %H:%M:%S')
+            project_dict['update_time'] = current_time
+            project_dict['status'] = '1'
+
+            approval_project_list.append(project_dict)
+        return approval_project_list
+
+    def find_exist_projects_by_docid(self, project_list):
+        match_projects_list = []
+        for project in project_list:
+            docids = project.get('docids').split(',')
+            bool_query = BoolQuery(must_queries=[
+                BoolQuery(should_queries=[TermQuery("docids", x) for x in docids]),
+                # BoolQuery(must_not_queries=[TermQuery('status', '404')]),
+            ])
+            columns = ["docid", "doctitle", "project_name", "page_time", "project_code", "approval_json",
+                       "extract_json"]
+            rows = self.search_util("approval_project", "approval_project_index",
+                                    bool_query, columns, None)
+            match_projects = [project] + rows
+            match_projects_list.append(match_projects)
+        return match_projects_list
+
+    def merge_project(self, match_projects_list):
+        merge_project_list = []
+        for match_projects in match_projects_list:
+            merge_uuids = [x.get('uuid') for x in match_projects]
+            if len(merge_uuids) > 1:
+                # 排除当前项目
+                merge_uuids = merge_uuids[1:]
+            else:
+                merge_uuids = []
+
+            merge_project = self.get_proper_col_value(match_projects)
+            merge_project['merge_uuids'] = merge_uuids
+            merge_project_list.append(merge_project)
+        return merge_project_list
+
+    def update_project(self, project_list):
+        if self.update:
+            for project in project_list:
+                if self.show:
+                    print('update_project dict: ')
+                    pprint(project)
+                _approval_project = approval_project(project)
+                _approval_project.update_project(self.ots_client)
+            log('update to approval_project success!', len(project_list))
+        else:
+            if self.show:
+                for project in project_list:
+                    print('update_project dict: ')
+                    pprint(project)
+            log('not update to approval_project!')
+
+    def update_opertime(self, p_time_list):
+        data_format = "%Y-%m-%d %H:%M:%S"
+        opertime_last = datetime.strptime(self.opertime_last, data_format)
+        for persistence_time in p_time_list:
+            if persistence_time is None:
+                continue
+            opertime = datetime.strptime(persistence_time, data_format)
+            if opertime > opertime_last:
+                opertime_last = opertime
+        opertime_last = opertime_last.strftime(data_format)
+        with open(self.opertime_txt, 'w') as f:
+            f.write(opertime_last)
+        return opertime_last
+
    def process_approval_json(self, approval_dict):
        """Expand one document row's approval_json into a list of approval dicts.

        Fetches the document HTML, backfills empty row fields from
        extract_json, runs the extractors (use_area / env_invest /
        money_use / declare_type / length) unless the row is already
        standardised, then copies the row-level fields into every entry of
        approval_json and re-extracts from each entry's own
        construction-scale text.

        :param approval_dict: the document row as a dict
        :return: list of approval dicts ([] when approval_json is empty)
        """

        _d = approval_dict
        approval_json = _d.get("approval_json")
        partitionkey = _d.get("partitionkey")
        docid = _d.get("docid")
        doctitle = _d.get("doctitle")
        project_name = _d.get("project_name")
        page_time = _d.get("page_time")
        has_standard = _d.get("standard")

        extract_json = _d.get("extract_json")
        # pro_type = extract_json.get("pro_type")
        # approval_result = extract_json.get("approval_result")
        # Fetch the full HTML body from the capacity table for text extraction.
        _d_html = {"partitionkey": partitionkey, "docid": docid}
        _html = Document(_d_html)
        _html.fix_columns(self.ots_capacity, ["dochtmlcon"], True)
        dochtml = _html.getProperties().get("dochtmlcon", "")
        doctextcon = BeautifulSoup(dochtml, "lxml").get_text()
        attachmenttextcon = ""
        # Backfill empty row fields from the extraction result.
        try:
            _extract = json.loads(extract_json)
            for key in _extract.keys():
                if _d.get(key) in self.none_list and _extract.get(key) not in self.none_list:
                    _d[key] = _extract.get(key)
        except Exception  as e:
            _extract = {}

        # If the extraction was already standardised, reuse the stored values
        # instead of re-running the extractors.
        if has_standard:
            proportion = _d.get('proportion')
            use_area = _d.get('use_area')
            env_invest = _d.get('env_invest')
            money_use = _d.get('money_use')
            declare_type = _d.get('declare_type')
            length = _d.get('length')
        else:
            proportion = _extract.get("pb", {}).get("proportion")
            _, use_area = self.ap_predictor.extract_use_area(doctextcon + attachmenttextcon)
            env_invest = self.ap_predictor.extract_env_invest(doctextcon + attachmenttextcon)
            money_use = self.ap_predictor.extract_money_use(doctextcon + attachmenttextcon)
            declare_type = self.ap_predictor.extract_declare_type(_d.get('pro_type'))
            _, length = self.ap_predictor.extract_length(doctextcon + attachmenttextcon)
            _d["proportion"] = proportion
            _d["use_area"] = use_area
            _d["env_invest"] = env_invest
            _d["money_use"] = money_use
            _d["declare_type"] = declare_type
            _d["length"] = length

            # Standardise the row fields.
            _d = self.ap_predictor.run_standard(_d)

        approval_list = []
        if approval_json:
            approval_list = json.loads(approval_json)
            for _appr in approval_list:
                # Copy row-level fields into each approval entry when absent.
                for key in _d.keys():
                    if key not in _appr:
                        _appr[key] = _d.get(key)

                fix_area(self.ots_client, _appr)

                # Construction company credit code and region lookup.
                if _d.get("construct_company"):
                    new_d = self.get_construct_code_area(_d.get("construct_company"))
                    _appr.update(new_d)

                # Re-extract from each entry's own project-overview text.
                construction_scale = _d.get("construction_scale", "")
                proportion, _ = self.ap_predictor.extract_proportion(construction_scale)
                if proportion != "":
                    _appr["proportion"] = proportion
                _, use_area = self.ap_predictor.extract_use_area(construction_scale)
                if use_area != "":
                    _appr["use_area"] = use_area
                env_invest = self.ap_predictor.extract_env_invest(construction_scale)
                if env_invest != "":
                    _appr["env_invest"] = env_invest
                money_use = self.ap_predictor.extract_money_use(construction_scale)
                if money_use != "":
                    _appr["money_use"] = money_use
                # Standardise the entry.
                _appr = self.ap_predictor.run_standard(_appr)
        return approval_list
+
    def get_proper_col_value(self, dict_list):
        """Merge a list of approval/project dicts into one, choosing the most
        appropriate value per column.

        Strategy per column family:
        - multi-value columns: union of comma-joined values (capped at 200);
        - numeric columns: the maximum value;
        - date columns: the latest parseable date;
        - region columns: the most frequently matched combination;
        - address columns: the longest value;
        - name columns: the most frequent value (ties broken by length);
        - everything else: first non-empty value, newest page_time first.

        :param dict_list: dicts to merge; the first element's uuid wins
        :return: the merged dict
        """
        merge_dict = {}
        if dict_list:
            merge_dict['uuid'] = dict_list[0].get('uuid')

        # Build multi-value columns from their singular counterparts when absent.
        many_value_cols = ['docids', 'doc_nums', 'project_codes']
        for col in many_value_cols:
            for d in dict_list:
                if d.get(col) is not None:
                    continue
                col_values = [str(x.get(col[:-1])) for x in dict_list]
                d[col] = ','.join(col_values)
                if col == 'docids':
                    d['docids_cnt'] = len(col_values)

        # Merge multi-value columns: union, de-duplicated, sorted, capped.
        many_value_cols = ['docids', 'doc_nums', 'project_codes', 'properties']
        for col in many_value_cols:
            value_list = []
            for d in dict_list:
                value = d.get(col)
                if value in self.none_list:
                    continue
                value_list += value.split(',')
            value_list = list(set(value_list))
            for v in self.none_list:
                if v in value_list:
                    value_list.remove(v)
            value_list.sort(key=lambda x: x)
            if col == 'properties' and len(value_list) > 1 and '未披露' in value_list:
                value_list.remove('未披露')
            max_cnt = 200
            if col == 'docids' and len(value_list) < max_cnt:
                merge_dict['docids_cnt'] = len(value_list)
            # Cap the number of values kept.
            value_list = value_list[:max_cnt]
            merge_dict[col] = ','.join(value_list)

        # Numeric columns: keep the maximum value.
        # NOTE(review): assumes values are numeric once the empty markers are
        # filtered — a stray string value would raise on the '>' comparison;
        # confirm upstream standardisation guarantees this.
        num_cols = ['year_limit', 'env_invest', 'proportion', 'length', 'use_area', 'total_tenderee_money']
        for col in num_cols:
            max_value = 0
            for d in dict_list:
                value = d.get(col, 0)
                if value in [None, '', 'None']:
                    value = 0
                if value > max_value:
                    max_value = value
            if max_value != 0:
                merge_dict[col] = max_value
            else:
                merge_dict[col] = ''

        # Date columns: keep the latest parseable date.
        time_cols = ['time_commencement', 'time_completion', 'time_release', 'page_time',
                     'time_declare', 'time_approval']
        for col in time_cols:
            # if col in ['update_time']:
            #     format_string = "%Y-%m-%d %H:%M:%S"
            # else:
            format_string = "%Y-%m-%d"

            max_time = None
            for d in dict_list:
                value = d.get(col)
                try:
                    value = datetime.strptime(value, format_string)
                except:
                    continue
                if max_time is None or value > max_time:
                    max_time = value
            if max_time is not None:
                merge_dict[col] = datetime.strftime(max_time, format_string)
            else:
                merge_dict[col] = ''

        # Region columns.
        area_cols = ['area', 'province', 'city', 'district']
        area_list, province_list, city_list, district_list = [[] for _ in range(4)]
        a_p_c_d_dict = {}
        # Some 'district' values arrive as a dict; collect them separately.
        extra_dict_list = []
        for d in dict_list:
            district = d.get('district')
            if type(district) == dict:
                extra_dict_list.append(district)
                d['district'] = ''

        for d in dict_list + extra_dict_list:
            area = d.get('area')
            province = d.get('province')
            city = d.get('city')
            district = d.get('district')
            district_list.append(district)
            area_list.append(area)
            province_list.append(province)
            city_list.append(city)

            # Count each distinct area-province-city-district combination.
            a_p_c_d = []
            for v in [area, province, city, district]:
                if v not in self.none_list:
                    a_p_c_d.append(v)
            a_p_c_d = '-'.join(a_p_c_d)
            if a_p_c_d in a_p_c_d_dict.keys():
                a_p_c_d_dict[a_p_c_d] += 1
            else:
                a_p_c_d_dict[a_p_c_d] = 1

        # A combination containing another one also absorbs its count.
        keys = list(a_p_c_d_dict.keys())
        for i in range(len(keys)):
            a_p_c_d_1 = keys[i]
            for j in range(i + 1, len(keys)):
                a_p_c_d_2 = keys[j]
                if len(a_p_c_d_1) >= len(a_p_c_d_2) and a_p_c_d_2 in a_p_c_d_1:
                    a_p_c_d_dict[a_p_c_d_1] += a_p_c_d_dict.get(a_p_c_d_2)
                elif len(a_p_c_d_1) < len(a_p_c_d_2) and a_p_c_d_1 in a_p_c_d_2:
                    a_p_c_d_dict[a_p_c_d_2] += a_p_c_d_dict.get(a_p_c_d_1)

        if self.show:
            print('a_p_c_d_dict', a_p_c_d_dict)
        if '' in a_p_c_d_dict.keys():
            a_p_c_d_dict.pop('')

        # Pick the region combination matched the most times.
        # NOTE(review): raises IndexError when no dict carries any region
        # info at all — confirm upstream always supplies at least one.
        a_p_c_d_list = [[key, value] for key, value in a_p_c_d_dict.items()]
        a_p_c_d_list.sort(key=lambda x: x[1])
        a_p_c_d = a_p_c_d_list[-1][0].split('-')
        for value in a_p_c_d:
            if value in area_list:
                merge_dict['area'] = value
            elif value in province_list:
                merge_dict['province'] = value
            elif value in city_list:
                merge_dict['city'] = value
            elif value in district_list:
                merge_dict['district'] = value

        # Address-like columns: keep the longest value.
        address_cols = ['project_addr', 'construction_scale']
        for col in address_cols:
            longer_value = ''
            for d in dict_list:
                value = d.get(col)
                if value in self.none_list:
                    continue
                if len(value) > len(longer_value):
                    longer_value = value
            merge_dict[col] = longer_value

        # Name-like columns: keep the most frequent value (ties -> longest).
        name_cols = ['project_name', 'doctitle']
        for col in name_cols:
            value_dict = {}
            for d in dict_list:
                value = d.get(col)
                if value in self.none_list:
                    continue
                if value in value_dict:
                    value_dict[value] += 1
                else:
                    value_dict[value] = 1
            value_list = [[k, v] for k, v in value_dict.items()]
            if value_list:
                value_list.sort(key=lambda x: (x[1], len(x[0])))
                merge_dict[col] = value_list[-1][0]

        # Remaining columns: first non-empty value, newest page_time first.
        for col in approval_cols:
            if col in many_value_cols + num_cols + time_cols + area_cols + address_cols + name_cols:
                continue
            # Sort by page_time, newest first.
            dict_list.sort(key=lambda x: x.get('page_time'), reverse=True)
            for d in dict_list:
                if merge_dict.get(col) in [None, '']:
                    merge_dict[col] = d.get(col)

        # Columns renamed historically (kept for reference).
        # for col in ['moneyuse']:
        #     for d in dict_list:
        #         merge_dict['money_use'] = d.get(col)

        # Normalise empty markers to '' (region columns keep their values).
        for col in approval_cols:
            if col in area_cols:
                continue
            if merge_dict.get(col) in self.none_list:
                merge_dict[col] = ''
        return merge_dict
+
+    def get_construct_code_area(self, name):
+        bool_query = BoolQuery(must_queries=[
+            TermQuery("name", name)
+        ])
+
+        cols = ["credit_code", "province", "city", "district"]
+        data_list = self.search_util("enterprise", "enterprise_index", bool_query,
+                                     cols, None, )
+        _d = {}
+        if len(data_list) > 0:
+            _d["construct_company_code"] = data_list[0].get("credit_code", "")
+            _d["construct_company_province"] = data_list[0].get("province", "")
+            _d["construct_company_city"] = data_list[0].get("city", "")
+            _d["construct_company_district"] = data_list[0].get("district", "")
+        return _d
+
+    def check_approval(self, approval_list, origin_approval):
+        appr1 = origin_approval
+        same_approval_list = [origin_approval]
+        for appr2 in approval_list:
+            check_keys = ["declare_company", "construct_company", "total_tenderee_money", "proportion", "use_area",
+                          "doc_num",
+                          "project_code"]
+            same_count = 0
+            for k in check_keys:
+                if k in appr1 and k in appr2:
+                    if appr1[k] == appr2[k] and appr1[k] is not None and appr1[k] != "":
+                        same_count += 1
+
+            if same_count >= 1:
+                same_approval_list.append(appr2)
+        return same_approval_list
+
+    def search_util(self, table, table_index, bool_query, columns, sort_col, limit=999999, show_total=0, while_next=1,
+                    asc=0, only_return_total_cnt=0):
+        if sort_col:
+            if asc:
+                sort = Sort(sorters=[FieldSort(sort_col, SortOrder.ASC)])
+            else:
+                sort = Sort(sorters=[FieldSort(sort_col, SortOrder.DESC)])
+        else:
+            sort = None
+        return_type = ColumnReturnType.SPECIFIED
+        rows, next_token, total_count, is_all_succeed = self.ots_client.search(table, table_index,
+                                                                               SearchQuery(bool_query, sort=sort,
+                                                                                           limit=100,
+                                                                                           get_total_count=True),
+                                                                               ColumnsToGet(columns, return_type))
+        if only_return_total_cnt:
+            return total_count
+
+        list_data = getRow_ots(rows)
+        if show_total:
+            print('search total_count', total_count)
+        if len(list_data) >= limit:
+            print('limit ', limit, len(list_data))
+            return list_data[:limit]
+
+        if while_next:
+            while next_token and len(list_data) < limit:
+                rows, next_token, total_count, is_all_succeed = self.ots_client.search(table, table_index,
+                                                                                       SearchQuery(bool_query,
+                                                                                                   next_token=next_token,
+                                                                                                   limit=100,
+                                                                                                   get_total_count=True),
+                                                                                       ColumnsToGet(columns,
+                                                                                                    return_type))
+                list_data += getRow_ots(rows)
+        return list_data
+
+    def multi_thread_util(self, _list, task, threads):
+        result_list = []
+        # 创建一个线程池,参数为线程数量
+        with ThreadPoolExecutor(max_workers=threads) as executor:
+            # 使用线程池映射任务函数和参数
+            futures = [executor.submit(task, _list[i]) for i in range(len(_list))]
+
+            # 等待所有任务完成,并获取返回值
+            for future in futures:
+                result = future.result()  # 获取任务的返回值
+                # print(f"Task returned: {result}")
+                result_list.append(result)
+        return result_list
+
+
def run_flow(data, test_mode=0, update=1):
    """Worker entry point: build approval projects for one document.

    Args:
        data: a ``[docid, test_mode, update]`` triple as produced by the main
            loop.  Its values overwrite the keyword parameters below, which
            are kept only for signature compatibility with existing callers.
        test_mode: overridden by ``data``; effectively unused.
        update: overridden by ``data``; effectively unused.

    Returns:
        The list of approval projects produced for the docid, or ``[]`` when
        processing raised an exception (the error is logged, not re-raised,
        so one bad document cannot kill the pool worker).
    """
    docid, test_mode, update = data
    ds = APProduction(test_mode, update)
    print('run_flow, test_mode, update', test_mode, update)
    try:
        aps = ds.ap_data_flow(docid)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit can
        # still propagate out of the worker process.
        traceback.print_exc()
        log('error ' + str(docid))
        aps = []
    return aps
+
+
if __name__ == "__main__":
    # Entry point: continuously poll for newly-crawled approval announcements
    # and merge them into approval_project rows using a process pool.
    test_mode = 0  # 1 = process only test_docid_list and run a single pass
    test_docid_list = [400010986401]
    update = 1  # forwarded to APProduction; presumably 1 = write results back — confirm

    process_num = 7  # worker processes for pool.map
    sleep_sec = 100  # idle wait (seconds) between polling rounds
    while True:
        sleep_flag = 1
        ds = APProduction()

        if test_mode:
            tmp_data_list = test_docid_list
            persistence_time_list = []
        else:
            # Fetch docids of newly added approval announcements.
            tmp_data_list, persistence_time_list = ds.get_approval_docid_from_document(max_cnt=50)
        log('len(tmp_data_list)', len(tmp_data_list))

        # Pack each docid with the flags run_flow expects to unpack.
        tmp_data_list = [[x, test_mode, update] for x in tmp_data_list]

        # Nothing fetched: wait and poll again.
        if len(tmp_data_list) <= 0:
            print('sleep for', sleep_sec)
            time.sleep(sleep_sec)
            continue

        # Merge documents in parallel across processes.
        mp_start_time = time.time()
        with multiprocessing.Pool(processes=process_num) as pool:
            results = pool.map(run_flow, tmp_data_list)
            # NOTE(review): len(results) is the number of processed docids,
            # not the total number of approval projects returned by run_flow —
            # confirm which metric the 'update ap num' log line should report.
            all_ap_num = len(results)
            pool.close()
            pool.join()

        # Persist the high-water-mark timestamp of the processed data.
        if not test_mode:
            last_time_str = ds.update_opertime(persistence_time_list)
            # Compare that timestamp against now to decide whether to sleep.
            data_format = "%Y-%m-%d %H:%M:%S"
            last_time = datetime.strptime(last_time_str, data_format)
            now = datetime.now()
            print('*' * 30, last_time_str)
            # Data lags by at least sleep_sec: there is a backlog, so skip the
            # idle sleep and start the next round immediately.
            if now - last_time >= timedelta(seconds=sleep_sec):
                sleep_flag = 0

        log('mp process data num:', len(tmp_data_list), 'update ap num:', all_ap_num,
            'cost:', round(time.time() - mp_start_time, 2),
            'avg:', round((time.time() - mp_start_time) / len(tmp_data_list), 2))

        # Test mode runs a single pass only.
        if test_mode:
            break

        if sleep_flag:
            print('sleep for', sleep_sec)
            time.sleep(sleep_sec)

+ 1 - 0
BaseDataMaintenance/maintenance/approval_project/opertime.txt

@@ -0,0 +1 @@
+2024-12-04 17:46:28

+ 1 - 0
BaseDataMaintenance/maintenance/proposedBuilding/opertime.txt

@@ -0,0 +1 @@
+2024-07-30 04:00:00

+ 145 - 0
BaseDataMaintenance/model/ots/approval_project.py

@@ -0,0 +1,145 @@
+# encoding:utf-8
+import json
+import traceback
+from tablestore import TermQuery, BoolQuery, Condition, Row, OTSServiceError, OTSClientError, ColumnsToGet, SearchQuery, \
+    ColumnReturnType
+
+from BaseDataMaintenance.common.Utils import getRow_ots, log
+from BaseDataMaintenance.model.ots.BaseModel import BaseModel
+from datetime import datetime
+
# Column names of the approval_project OTS table, declared once so callers
# can reference columns by constant instead of by string literal.
approval_project_uuid = 'uuid'
approval_project_docids = 'docids'
approval_project_area = 'area'
approval_project_province = 'province'
approval_project_city = 'city'
approval_project_district = 'district'
approval_project_source_stage = 'source_stage'
approval_project_source_type = 'source_type'
approval_project_approval_items = 'approval_items'
approval_project_approval_result = 'approval_result'
approval_project_approver = 'approver'
approval_project_construct_company = 'construct_company'
approval_project_construct_company_code = 'construct_company_code'
approval_project_construction_scale = 'construction_scale'
approval_project_declare_company = 'declare_company'
approval_project_doc_nums = 'doc_nums'
approval_project_evaluation_agency = 'evaluation_agency'
approval_project_legal_person = 'legal_person'
approval_project_phone = 'phone'
approval_project_pro_type = 'pro_type'
approval_project_properties = 'properties'
approval_project_time_declare = 'time_declare'
approval_project_year_limit = 'year_limit'
approval_project_compilation_unit = 'compilation_unit'
approval_project_publisher = 'publisher'
approval_project_time_approval = 'time_approval'
approval_project_moneysource = 'moneysource'
approval_project_project_addr = 'project_addr'
approval_project_project_name = 'project_name'
approval_project_time_commencement = 'time_commencement'
approval_project_time_completion = 'time_completion'
approval_project_time_release = 'time_release'
approval_project_env_invest = 'env_invest'
approval_project_proportion = 'proportion'
approval_project_length = 'length'
approval_project_use_area = 'use_area'
approval_project_construct_company_province = 'construct_company_province'
approval_project_construct_company_city = 'construct_company_city'
approval_project_construct_company_district = 'construct_company_district'
approval_project_money_use = 'money_use'
approval_project_declare_type = 'declare_type'
approval_project_page_time = 'page_time'
approval_project_doctitle = 'doctitle'
approval_project_project_codes = 'project_codes'
approval_project_total_tenderee_money = 'total_tenderee_money'
approval_project_total_tenderee_money_unit = 'total_tenderee_money_unit'
approval_project_status = 'status'
approval_project_update_time = 'update_time'
+
+
class approval_project(BaseModel):
    """OTS row model for the approval_project table (primary key: uuid)."""

    def __init__(self, _dict):
        # Copy every key of _dict onto the model as a column value.
        for k, v in _dict.items():
            self.setValue(k, v, True)
        self.table_name = "approval_project"

    def getPrimary_keys(self):
        """Return the primary-key column names of the table."""
        return ["uuid"]

    def getAttribute_turple(self):
        """Return non-primary attributes as (name, value) pairs for an OTS write.

        None values are skipped, and list values are serialized to JSON
        strings because an OTS column cannot hold a list directly.
        """
        _list = []
        for _key in self.getAttribute_keys():
            if _key == "all_columns":
                continue
            _v = self.getProperties().get(_key)
            if _v is None:
                continue
            if isinstance(_v, list):
                _v = json.dumps(_v)
            _list.append((_key, _v))
        return _list

    def update_row(self, ots_client, retrytimes=3, delete=0):
        """Write this row to OTS, retrying up to ``retrytimes`` times.

        When ``delete`` is truthy and the row already exists, the old row is
        removed first so stale columns do not survive the PUT.  The existence
        check is only performed when ``delete`` is set, avoiding a needless
        OTS read on the common path.

        Returns True on success, False when every attempt failed.
        """
        primary_key = self.getPrimaryKey_turple()
        update_of_attribute_columns = {
            'PUT': self.getAttribute_turple()
        }
        row = Row(primary_key, update_of_attribute_columns)
        condition = Condition('IGNORE')
        for _ in range(retrytimes):
            try:
                if delete and self.exists_row(ots_client):
                    self.delete_row(ots_client)
                ots_client.update_row(self.table_name, row, condition)
                return True
            # Client-side error: usually bad parameters or a network problem.
            except OTSClientError as e:
                traceback.print_exc()
                log("update row failed, http_status:%s, error_message:%s" % (
                    str(e.get_http_status()), e.get_error_message()))
            # Server-side error: usually bad parameters or throttling.
            except OTSServiceError as e:
                traceback.print_exc()
                log("update row failed, http_status:%s, error_code:%s, error_message:%s, request_id:%s" % (
                    str(e.get_http_status()), e.get_error_code(), e.get_error_message(), e.get_request_id()))
            except Exception:
                traceback.print_exc()
        return False

    def update_project(self, ots_client):
        """Persist this project: retire merged duplicate rows, then write this one.

        Every uuid in self.merge_uuids (projects that were merged into this
        one) is marked status=404 with a fresh update_time; the first merged
        uuid is then reused as this row's uuid so the surviving project keeps
        a stable identifier.
        """
        current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # Robustness: tolerate a missing or None merge_uuids attribute
        # instead of crashing on len(None) / KeyError.
        merge_uuids = self.__dict__.pop('merge_uuids', None) or []
        old_uuid = None
        if merge_uuids:
            old_uuid = merge_uuids[0]
            for uuid in merge_uuids:
                retired = approval_project({
                    'uuid': uuid,
                    'update_time': current_time,
                    'status': 404,
                })
                retired.update_row(ots_client)

        # Reuse the first merged uuid for this row.
        if old_uuid is not None:
            print('old_uuid', old_uuid)
            # NOTE(review): setProperties is used here while setValue is used
            # just below — confirm BaseModel treats them equivalently for a
            # single (key, value) pair.
            self.setProperties('uuid', old_uuid)
        self.setValue('update_time', current_time)
        self.setValue('status', 1)
        self.update_row(ots_client)
+
+
if __name__ == "__main__":
    # Quick sanity check of the merge tie-break ordering: sort by occurrence
    # count first, then by string length (shortest first on equal counts).
    value_list = sorted([['abc', 1], ['ab', 1]], key=lambda item: (item[1], len(item[0])))
    print(value_list)