@@ -0,0 +1,1689 @@
+# encoding: utf-8
|
|
|
|
+import copy
|
|
|
|
+import json
|
|
|
|
+import logging
|
|
|
|
+import multiprocessing
|
|
|
|
+import os
|
|
|
|
+import re
|
|
|
|
+import sys
|
|
|
|
+import time
|
|
|
|
+import traceback
|
|
|
|
+from concurrent.futures.thread import ThreadPoolExecutor
|
|
|
|
+from datetime import datetime, timedelta
|
|
|
|
+from decimal import Decimal
|
|
|
|
+from pprint import pprint
|
|
|
|
+from uuid import uuid4
|
|
|
|
+
|
|
|
|
+from bs4 import BeautifulSoup
|
|
|
|
+from tablestore import BoolQuery, RangeQuery, TermQuery, MatchPhraseQuery, Sort, FieldSort, ColumnReturnType, SortOrder, \
|
|
|
|
+ SearchQuery, ColumnsToGet
|
|
|
|
+
|
|
|
|
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../")
|
|
|
|
+from BaseDataMaintenance.common.Utils import getRow_ots, getUnifyMoney
|
|
|
|
+from BaseDataMaintenance.dataSource.source import getConnect_ots, getConnect_ots_capacity
|
|
|
|
+from BaseDataMaintenance.model.ots.approval_project import approval_project
|
|
|
|
+from BaseDataMaintenance.model.ots.document import Document
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def log(*args):
|
|
|
|
+ print_str = ""
|
|
|
|
+    # prefix the module-level docid when one has been set (avoids a bare "None - " prefix)
+    if globals().get('docid') is not None:
+        print_str += str(globals().get('docid')) + " - "
|
|
|
|
+ for obj in args:
|
|
|
|
+ print_str += str(obj) + ' '
|
|
|
|
+ logging.info(print_str[:-1])
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+approval_col_dict = {
|
|
|
|
+ "doctitle": "公告标题",
|
|
|
|
+ "page_time": "公告时间",
|
|
|
|
+ "province": "省份",
|
|
|
|
+ "city": "城市",
|
|
|
|
+ "district": "地区",
|
|
|
|
+
|
|
|
|
+ "approval_items": "审批事项",
|
|
|
|
+ "approval_result": "审批结果",
|
|
|
|
+ "declare_company": "申报单位",
|
|
|
|
+ "construct_company": "建设单位",
|
|
|
|
+ "evaluation_agency": "环评机构",
|
|
|
|
+ "approver": "审批部门",
|
|
|
|
+ "compilation_unit": "编制单位",
|
|
|
|
+ "publisher": "发布单位",
|
|
|
|
+
|
|
|
|
+ "total_tenderee_money": "总投资",
|
|
|
|
+ "construction_scale": "建设规模",
|
|
|
|
+ "proportion": "建筑面积",
|
|
|
|
+ "use_area": "用地面积",
|
|
|
|
+
|
|
|
|
+ "doc_num": "审批文号",
|
|
|
|
+
|
|
|
|
+ "legal_person": "项目法人",
|
|
|
|
+ "moneysource": "资金来源",
|
|
|
|
+ "money_use": "资金构成",
|
|
|
|
+ "env_invest": "环保投资",
|
|
|
|
+ "phone": "电话",
|
|
|
|
+ "pro_type": "申报类型",
|
|
|
|
+ "project_addr": "项目地址",
|
|
|
|
+ "project_code": "项目编号",
|
|
|
|
+ "project_name": "项目名称",
|
|
|
|
+ "properties": "建设性质",
|
|
|
|
+ "time_commencement": "开工时间",
|
|
|
|
+ "time_completion": "竣工时间",
|
|
|
|
+ "time_declare": "申报时间",
|
|
|
|
+
|
|
|
|
+ "year_limit": "建设年限",
|
|
|
|
+
|
|
|
|
+ "time_approval": "审批时间",
|
|
|
|
+ "time_release": "发布日期"
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+approval_cols = [
|
|
|
|
+ 'uuid',
|
|
|
|
+ 'docids',
|
|
|
|
+ 'area',
|
|
|
|
+ 'province',
|
|
|
|
+ 'city',
|
|
|
|
+ 'district',
|
|
|
|
+ 'source_stage',
|
|
|
|
+ 'source_type',
|
|
|
|
+ 'approval_items',
|
|
|
|
+ 'approval_result',
|
|
|
|
+ 'approver',
|
|
|
|
+ 'construct_company',
|
|
|
|
+ 'construct_company_code',
|
|
|
|
+ 'construction_scale',
|
|
|
|
+ 'declare_company',
|
|
|
|
+ 'doc_nums',
|
|
|
|
+ 'evaluation_agency',
|
|
|
|
+ 'legal_person',
|
|
|
|
+ 'phone',
|
|
|
|
+ 'pro_type',
|
|
|
|
+ 'properties',
|
|
|
|
+ 'time_declare',
|
|
|
|
+ 'year_limit',
|
|
|
|
+ 'compilation_unit',
|
|
|
|
+ 'publisher',
|
|
|
|
+ 'time_approval',
|
|
|
|
+ 'moneysource',
|
|
|
|
+ 'project_addr',
|
|
|
|
+ 'project_name',
|
|
|
|
+ 'time_commencement',
|
|
|
|
+ 'time_completion',
|
|
|
|
+ 'time_release',
|
|
|
|
+ 'env_invest',
|
|
|
|
+ 'proportion',
|
|
|
|
+ 'length',
|
|
|
|
+ 'use_area',
|
|
|
|
+ 'construct_company_province',
|
|
|
|
+ 'construct_company_city',
|
|
|
|
+ 'construct_company_district',
|
|
|
|
+ 'money_use',
|
|
|
|
+ 'declare_type',
|
|
|
|
+ 'page_time',
|
|
|
|
+ 'doctitle',
|
|
|
|
+ 'project_codes',
|
|
|
|
+ 'total_tenderee_money',
|
|
|
|
+ 'total_tenderee_money_unit',
|
|
|
|
+
|
|
|
|
+ 'docids_cnt',
|
|
|
|
+ 'merge_uuids',
|
|
|
|
+]
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def get_enterprise_area(ots_client, name):
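+    # Look up an enterprise by exact name in enterprise/enterprise_index and return its
+    # province/city/district; an empty dict is returned when the name is not found.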
|
|
|
|
+ bool_query = BoolQuery(must_queries=[
|
|
|
|
+ TermQuery("name", name)
|
|
|
|
+ ])
|
|
|
|
+ rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
|
|
|
|
+ SearchQuery(bool_query),
|
|
|
|
+ ColumnsToGet(["province", "city", "district"],
|
|
|
|
+ return_type=ColumnReturnType.SPECIFIED))
|
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
|
+ _d = {}
|
|
|
|
+ if len(list_data) > 0:
|
|
|
|
+ _d["province"] = list_data[0].get("province", "")
|
|
|
|
+ _d["city"] = list_data[0].get("city", "")
|
|
|
|
+ _d["district"] = list_data[0].get("district", "")
|
|
|
|
+ return _d
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def area_count(_d):
|
|
|
|
+ keys = ["province", "city", "district"]
|
|
|
|
+ return sum([1 if _d.get(k, "") not in ("", "全国", "未知") else 0 for k in keys])
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def fix_area(ots_client, appr):
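+    # If the approval record has no district yet, try to borrow the region from the
+    # declare/construct/approver/compilation/publisher companies; a candidate is only
+    # adopted when it fills in more of province/city/district than the current record.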
|
|
|
|
+ if appr.get("district", "") != "":
|
|
|
|
+ return
|
|
|
|
+ declare_company = appr.get("declare_company", "")
|
|
|
|
+ _d = get_enterprise_area(ots_client, declare_company)
|
|
|
|
+ if area_count(_d) > area_count(appr):
|
|
|
|
+ appr.update(_d)
|
|
|
|
+
|
|
|
|
+ construct_company = appr.get("construct_company", "")
|
|
|
|
+ _d = get_enterprise_area(ots_client, construct_company)
|
|
|
|
+ print('get_enterprise_area _d', _d)
|
|
|
|
+ if area_count(_d) > area_count(appr):
|
|
|
|
+
|
|
|
|
+ appr.update(_d)
|
|
|
|
+
|
|
|
|
+ approver = appr.get("approver", "")
|
|
|
|
+ _d = get_enterprise_area(ots_client, approver)
|
|
|
|
+ if area_count(_d) > area_count(appr):
|
|
|
|
+ appr.update(_d)
|
|
|
|
+
|
|
|
|
+ compilation_unit = appr.get("compilation_unit", "")
|
|
|
|
+ _d = get_enterprise_area(ots_client, compilation_unit)
|
|
|
|
+ if area_count(_d) > area_count(appr):
|
|
|
|
+ appr.update(_d)
|
|
|
|
+
|
|
|
|
+ publisher = appr.get("publisher", "")
|
|
|
|
+ _d = get_enterprise_area(ots_client, publisher)
|
|
|
|
+ if area_count(_d) > area_count(appr):
|
|
|
|
+ appr.update(_d)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+class APPredictor:
|
|
|
|
+ def __init__(self):
|
|
|
|
+ pass
|
|
|
|
+
|
|
|
|
+ def chinese_to_arabic(self, ch_str):
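+        # Convert simple Chinese numerals by accumulating digit*unit pairs,
+        # e.g. "三百二十" -> 320 and "十五" -> 15; returns None if any character is unknown.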
|
|
|
|
+ chinese_number_dict = {
|
|
|
|
+ '一': 1,
|
|
|
|
+ '二': 2,
|
|
|
|
+ '两': 2,
|
|
|
|
+ '三': 3,
|
|
|
|
+ '四': 4,
|
|
|
|
+ '五': 5,
|
|
|
|
+ '六': 6,
|
|
|
|
+ '七': 7,
|
|
|
|
+ '八': 8,
|
|
|
|
+ '九': 9,
|
|
|
|
+ '十': 10,
|
|
|
|
+ '拾': 10,
|
|
|
|
+ '百': 100,
|
|
|
|
+ '千': 1000,
|
|
|
|
+ }
|
|
|
|
+ no_list = []
|
|
|
|
+ for c in ch_str:
|
|
|
|
+ if c not in chinese_number_dict.keys():
|
|
|
|
+ return None
|
|
|
|
+ no_list.append(chinese_number_dict.get(c))
|
|
|
|
+ arabic_num = 0
|
|
|
|
+ mul_no = None
|
|
|
|
+ for i, no in enumerate(no_list):
|
|
|
|
+ if no in [10, 100, 1000]:
|
|
|
|
+ if mul_no is None:
|
|
|
|
+ arabic_num += no
|
|
|
|
+ else:
|
|
|
|
+ arabic_num += no * mul_no
|
|
|
|
+ mul_no = None
|
|
|
|
+ else:
|
|
|
|
+ mul_no = no
|
|
|
|
+ if mul_no:
|
|
|
|
+ arabic_num += mul_no
|
|
|
|
+
|
|
|
|
+ return arabic_num
|
|
|
|
+
|
|
|
|
+ def standard_properties(self, d):
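+        # Normalize the raw 建设性质 text into canonical comma-joined types,
+        # e.g. '技改' -> '技术改造'; values that cannot be matched fall back to '未披露'.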
|
|
|
|
+ # properties 建设性质 未披露 迁建 新建 技术改造 改建 其他 扩建
|
|
|
|
+ # 未分类 改扩建 技改 技改及其他 拆建 装修装饰 迁扩建 其他 其它
|
|
|
|
+ # 新建(迁建)□改建□扩建□技术改造
|
|
|
|
+ # 新建建设类项目 改建建设类项目 新建工程 新建,
|
|
|
|
+ # 新建,建设内容...
|
|
|
|
+ # 扩建,...
|
|
|
|
+ # 新建□改扩建□技改☑
|
|
|
|
+ # (1)新建√(2)改扩建(3)技改(4)迁建
|
|
|
|
+ # 新建,占地面积106.4亩项组织组建力度,今年以来,该区走访以业务关系为纽带,以点带面 未披露
|
|
|
|
+ properties_type_dict = {
|
|
|
|
+ '未披露': '未披露',
|
|
|
|
+ '迁建': '迁建',
|
|
|
|
+ '新建': '新建',
|
|
|
|
+ '技术改造': '技术改造',
|
|
|
|
+ '技改': '技术改造',
|
|
|
|
+ '改建': '改建',
|
|
|
|
+ '其他': '其他',
|
|
|
|
+ '其它': '其他',
|
|
|
|
+ '扩建': '扩建',
|
|
|
|
+ '拆建': '拆建',
|
|
|
|
+ '改扩建': '改建,扩建',
|
|
|
|
+ '迁扩建': '迁建,扩建',
|
|
|
|
+ '装修装饰': '装修装饰',
|
|
|
|
+ '新建(迁建)': '新建,迁建',
|
|
|
|
+ '迁改扩建': '迁建,改建,扩建',
|
|
|
|
+ '装修工程': '装修装饰',
|
|
|
|
+ '旧路改造': '改建',
|
|
|
|
+ '新建(迁扩建)': '新建,迁建,扩建',
|
|
|
|
+ '原规模技改': '技术改造',
|
|
|
|
+ '改扩建(产能核增)': '改扩建'
|
|
|
|
+ }
|
|
|
|
+ p_base_type = ['未披露', '迁建', '新建', '技术改造', '改建', '其他', '扩建', '拆建', '装修装饰']
|
|
|
|
+ p_all_type = list(properties_type_dict.keys())
|
|
|
|
+ p_all_type.sort(key=lambda x: len(x), reverse=True)
|
|
|
|
+ true_char = '[☑■√R◼回]'
|
|
|
|
+ false_char = '[□£☐口]'
|
|
|
|
+ char_reg = '[☑■√R◼回□£☐口]'
|
|
|
|
+ type_reg = '|'.join(p_all_type)
|
|
|
|
+ # p_type_reg_list = [
|
|
|
|
+ # '(?P<t1>{})(项目|工程|建设类项目|及其他|建设类|(补办))'.format(type_reg),
|
|
|
|
+ # f'^[\u4e00-\u9fff]{{2,4}}(?P<t1>{type_reg})$',
|
|
|
|
+ # f'^(?P<t1>{type_reg})[、](?P<t2>{type_reg})$',
|
|
|
|
+ # '(?P<t1>{})((补办)|工程|)[,,;;](?!{}).*'.format(type_reg, type_reg),
|
|
|
|
+ # f'^((?P<c1>{char_reg})?)(?P<t1>{type_reg})((?P<c2>{char_reg})?)(?P<t2>{type_reg})((?P<c3>{char_reg})?)(?P<t3>{type_reg})$',
|
|
|
|
+ # f'^(?P<c1>{char_reg}){{0,2}}(?P<t1>{type_reg})(?P<c2>{char_reg}){{0,2}}(?P<t2>{type_reg})?(?P<c3>{char_reg}){{0,2}}(?P<t3>{type_reg})?(?P<c4>{char_reg}){{0,2}}(?P<t4>{type_reg})?(?P<c5>{char_reg}){{0,2}}(?P<t5>{type_reg})?(?P<c6>{char_reg}){{0,2}}(?P<t6>{type_reg})?',
|
|
|
|
+ # ]
|
|
|
|
+
|
|
|
|
+        # Python 3.5 does not support f-strings, so the patterns below use str.format() instead
|
|
|
|
+ p_type_reg_list = [
|
|
|
|
+ '(?P<t1>{})(项目|工程|建设类项目|及其他|建设类|(补办))'.format(type_reg),
|
|
|
|
+ '^[\u4e00-\u9fff]{{2,4}}(?P<t1>{})$'.format(type_reg),
|
|
|
|
+ '^(?P<t1>{})[、](?P<t2>{})$'.format(type_reg, type_reg),
|
|
|
|
+ '(?P<t1>{})((补办)|工程|)[,,;;](?!{}).*'.format(type_reg, type_reg),
|
|
|
|
+ '^((?P<c1>{})?)(?P<t1>{})((?P<c2>{})?)(?P<t2>{})((?P<c3>{})?)(?P<t3>{})$'
|
|
|
|
+ .format(char_reg, type_reg, char_reg, type_reg, char_reg, type_reg),
|
|
|
|
+ '^(?P<c1>{}){{0,2}}(?P<t1>{})(?P<c2>{}){{0,2}}(?P<t2>{})?(?P<c3>{}){{0,2}}(?P<t3>{})?(?P<c4>{}){{0,2}}(?P<t4>{})?(?P<c5>{}){{0,2}}(?P<t5>{})?(?P<c6>{}){{0,2}}(?P<t6>{})?'
|
|
|
|
+ .format(char_reg, type_reg, char_reg, type_reg, char_reg, type_reg, char_reg, type_reg, char_reg, type_reg, char_reg, type_reg),
|
|
|
|
+ ]
|
|
|
|
+
|
|
|
|
+ # for d in dict_list:
|
|
|
|
+ properties = str(d.get('properties'))
|
|
|
|
+ properties = re.sub(' ', '', properties)
|
|
|
|
+ match_flag = 0
|
|
|
|
+ if properties not in ['None', None, '']:
|
|
|
|
+ if properties in p_all_type:
|
|
|
|
+ d['properties'] = properties
|
|
|
|
+ match_flag = 1
|
|
|
|
+ else:
|
|
|
|
+ for index, reg in enumerate(p_type_reg_list):
|
|
|
|
+ match = re.search(reg, properties)
|
|
|
|
+ if match:
|
|
|
|
+ match_flag = 1
|
|
|
|
+ keys = list(match.groupdict().keys())
|
|
|
|
+ # 普通文本类型
|
|
|
|
+ if len(keys) == 1 and keys[0] == 't1':
|
|
|
|
+ t = match.groupdict().get('t1', '')
|
|
|
|
+ t = properties_type_dict.get(t, '')
|
|
|
|
+ d['properties'] = t
|
|
|
|
+ break
|
|
|
|
+ if len(keys) == 1 and 't' in keys:
|
|
|
|
+ t = match.groupdict().get(keys[0], '')
|
|
|
|
+ t = properties_type_dict.get(t, '')
|
|
|
|
+ d['properties'] = t
|
|
|
|
+ break
|
|
|
|
+ if len(keys) == 2 and 't1' in keys and 't2' in keys:
|
|
|
|
+ t_list = [match.groupdict().get('t1', ''), match.groupdict().get('t2', '')]
|
|
|
|
+ t_list = [properties_type_dict.get(x, '') for x in t_list]
|
|
|
|
+ t_list.sort(key=lambda x: x)
|
|
|
|
+ d['properties'] = ','.join(t_list)
|
|
|
|
+ break
|
|
|
|
+ # 括号打勾类型
|
|
|
|
+ if index == 4:
|
|
|
|
+ if match.groupdict().get('c1'):
|
|
|
|
+ t = match.groupdict().get('t1', '')
|
|
|
|
+ elif match.groupdict().get('c2'):
|
|
|
|
+ t = match.groupdict().get('t2', '')
|
|
|
|
+ elif match.groupdict().get('c3'):
|
|
|
|
+ t = match.groupdict().get('t3', '')
|
|
|
|
+ else:
|
|
|
|
+ t = '未披露'
|
|
|
|
+ t = properties_type_dict.get(t, '')
|
|
|
|
+ d['properties'] = t
|
|
|
|
+ break
|
|
|
|
+
|
|
|
|
+ # 选项框类型
|
|
|
|
+ match = re.finditer(true_char, properties)
|
|
|
|
+ match = list(match)
|
|
|
|
+ t_list = []
|
|
|
|
+ for m in match:
|
|
|
|
+ for t in p_all_type:
|
|
|
|
+ if properties[m.end():m.end() + len(t)] == t:
|
|
|
|
+ t_list.append(t)
|
|
|
|
+ break
|
|
|
|
+ if t_list:
|
|
|
|
+ t_list = ','.join([properties_type_dict.get(x) for x in t_list])
|
|
|
|
+ t_list = t_list.split(',')
|
|
|
|
+ t_list.sort(key=lambda x: x)
|
|
|
|
+ d['properties'] = ','.join(t_list)
|
|
|
|
+ else:
|
|
|
|
+ d['properties'] = '未披露'
|
|
|
|
+ break
|
|
|
|
+ if match_flag and ',' not in d.get('properties'):
|
|
|
|
+ t_list = properties_type_dict.get(d.get('properties'), '').split(',')
|
|
|
|
+ t_list.sort(key=lambda x: x)
|
|
|
|
+ d['properties'] = ','.join(t_list)
|
|
|
|
+
|
|
|
|
+ if not match_flag and d.get('properties') not in ['None', None, '']:
|
|
|
|
+ # 规则匹配不到,判断是否有选项框
|
|
|
|
+ match = re.finditer(true_char, properties)
|
|
|
|
+ match = list(match)
|
|
|
|
+ t_list = []
|
|
|
|
+ for m in match:
|
|
|
|
+ for t in p_all_type:
|
|
|
|
+ if properties[m.end():m.end() + len(t)] == t:
|
|
|
|
+ t_list.append(t)
|
|
|
|
+ break
|
|
|
|
+ if t_list:
|
|
|
|
+ t_list = ','.join([properties_type_dict.get(x) for x in t_list])
|
|
|
|
+ t_list = t_list.split(',')
|
|
|
|
+ t_list.sort(key=lambda x: x)
|
|
|
|
+ d['properties'] = ','.join(t_list)
|
|
|
|
+ else:
|
|
|
|
+ d['properties'] = ''
|
|
|
|
+
|
|
|
|
+ if d.get('properties') in ['None', None, '']:
|
|
|
|
+ d['properties'] = '未披露'
|
|
|
|
+ return d
|
|
|
|
+
|
|
|
|
+ def standard_approval_result(self, d):
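+        # Map the raw 审批结果 text onto one of six buckets (不予许可/准予许可/在办/已完成/已受理/其他)
+        # using the first matching keyword group; unmatched or empty values are cleared to ''.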
|
|
|
|
+ # approval_result 审批结果
|
|
|
|
+ # approval_result_type_dict = {
|
|
|
|
+ # '不予许可/未通过': '不予许可|不准予许可|办结(不通过)|不符合产业政策已撤销|办结(受理不通过)|办结(办理不通过)|不予受理|无需受理|受理不通过|不予备案|拒收|退回|驳回|退件|退办|不合格|不同意',
|
|
|
|
+ # '准予许可/通过': '准予许可|许可/同意|已审[批核]|审[批核]通过|已审[批核]通过|批准|许可|合格|同意|准予备案',
|
|
|
|
+ # '在办状态': '在办|审批中|办理中|部门审批中|部门办理|待审批|待补全|待预审|待接件|审批|审查|部门开始办理|待确认中介机构|审[核批查]中|尚未审[核批查]|待联审|待互联网预审|待答复',
|
|
|
|
+ # '已完成审批': '批复|已办结|办结|批复办结|已批复|审批通过|办结成功|办结(通过)|正常办结|已办结(正式出文)|正式出文|正常|办理完成|结案',
|
|
|
|
+ # '已受理': '已受理|受理|通过|受理通过|进入受理|已接件|补齐补正受理|预审通过|已登记|已备案|确认备案|收件|接件|已立项|已提交|已处理|登记|备案(未抽中)|已发布',
|
|
|
|
+ # '其他': '补齐补正通知|补正(开始)|补齐补正受理|等等其他|申报|特别程序|撤销|补办|撤件|补正|补件|归档|送达|挂起|补齐补证',
|
|
|
|
+ # }
|
|
|
|
+
|
|
|
|
+ approval_result_type_dict = {
|
|
|
|
+ '不予许可/未通过': '不予许可/未通过|不予许可|不准予许可|办结(不通过)|不符合产业政策已撤销|办结(受理不通过)|办结(办理不通过)|不予受理|无需受理|受理不通过|不予备案|拒收|退回|驳回|退件|退办|不合格|不同意|不符合要求|申请材料虚假|不满足条件|未通过审核|项目终止|资质不符|未获批准|无法办理|申请无效|已驳回|撤销申请|备案已撤销',
|
|
|
|
+ '准予许可/通过': '准予许可/通过|准予许可|许可/同意|已审[批核]|审[批核]通过|已审[批核]通过|批准|许可|合格|同意|准予备案|符合规定|申请成功|符合条件|审核通过|项目获批|资质合格|可以办理|申请有效|办结|准予许可|办结(通过)|办结(准予许可)|确认备案',
|
|
|
|
+ '在办状态': '在办状态|在办|审批中|办理中|部门审批中|部门办理|待审批|待补全|待预审|待接件|审批|审查|部门开始办理|待确认中介机构|审[核批查]中|尚未审[核批查]|待联审|待互联网预审|待答复|申报|特别程序申请|特别程序|待补办|规划条件核实(个人住宅)|收件|补正(结束)|申办|已提交|建设单位提交|待承办|尚未审核|拟同意|已立项|补齐补证|正在审核|审批流程中|办理流程中|等待反馈|需补充材料|正在审查|等待审批结果|处理中|审核中',
|
|
|
|
+ '已完成审批': '已完成审批|批复|已办结|办结|批复办结|已批复|审批通过|办结成功|办结(通过)|正常办结|已办结(正式出文)|正式出文|正常|办理完成|结案|审批结束|审批完成|审批通过并办结|审批流程结束|审批成功|审批办结|审批完结|已发布|符合性判定|已处理|送达',
|
|
|
|
+ '已受理': '已受理|受理|通过|受理通过|进入受理|已接件|补齐补正受理|预审通过|已登记|已备案|确认备案|收件|接件|已立项|已提交|已处理|登记|备案(未抽中)|已发布|已接收申请|申请已受理|受理申请|正在受理|已进入受理流程|申请受理中|受理成功|已受理待审核|已撤件',
|
|
|
|
+ '其他': '补齐补正通知|补正(开始)|补齐补正受理|等等其他|申报|特别程序|撤销|补办|撤件|补正|补件|归档|送达|挂起|补齐补证|暂缓审批|暂不受理|需重新申报|暂停办理|暂无结果|需进一步研究|暂存待查|暂不通过|抽中|未抽中|许可|新设',
|
|
|
|
+ }
|
|
|
|
+ # 未分类: 申报 审查 办结|准予许可 特别程序申请 特别程序 撤销申请 归档 待补办 已撤件 备案已撤销 新设
|
|
|
|
+ # 已驳回 办结(通过) 办结(准予许可) 部门开始办理 规划条件核实(个人住宅) 收件 退件 挂起 批准 合格 不合格 确认备案
|
|
|
|
+ # 待确认中介机构 退办 归档 接件 补正(结束) 许可 抽中 未抽中 同意 补件 申办 结案 已发布 符合性判定
|
|
|
|
+ # 已处理 送达 尚未审核 拟同意 已立项 待联审 已提交 待互联网预审 建设单位提交 待承办 备案(未抽中)
|
|
|
|
+ # 补齐补证 已发布
|
|
|
|
+
|
|
|
|
+ # for d in dict_list:
|
|
|
|
+ approval_result = d.get('approval_result')
|
|
|
|
+ new_approval_result = ""
|
|
|
|
+ if approval_result:
|
|
|
|
+ for t, reg in approval_result_type_dict.items():
|
|
|
|
+ if re.search(reg, str(approval_result)):
|
|
|
|
+ new_approval_result = t
|
|
|
|
+ break
|
|
|
|
+ d['approval_result'] = new_approval_result
|
|
|
|
+ return d
|
|
|
|
+
|
|
|
|
+ def standard_year_limit(self, d):
|
|
|
|
+        # year_limit 建设年限: int, stored in days as computed below (1 year ≈ 360, 1 month ≈ 30);
+        # date ranges also back-fill time_commencement / time_completion when they are empty
|
|
|
|
+ year_limit_reg_list = [
|
|
|
|
+ '^(?P<year_num>\d{1,2})年$',
|
|
|
|
+ '^(?P<year_num_ch>[一二三四五六七八九十]{1,2})年$',
|
|
|
|
+ '^(?P<month_num>\d+)个月$',
|
|
|
|
+ '^(?P<month_num_ch>[一二三四五六七八九十]{1,3})个月$',
|
|
|
|
+ '^(?P<day_num>\d+)([天日]|个?日历天)$',
|
|
|
|
+ '^(?P<day_num_ch>[一二三四五六七八九十])([天日]|个?日历天)$',
|
|
|
|
+ '^(?P<y_range1>\d{4,})年?[-至到—一]{1,2}(?P<y_range2>\d{4,})年?$',
|
|
|
|
+ '^(?P<y_range1>\d{4,})[年-]?(?P<m_range1>\d{1,2})[月-]?[-至到—一]{1,2}(?P<y_range2>\d{4,})[年-]?(?P<m_range2>\d{1,2})月?$',
|
|
|
|
+ '^(?P<y_range1>\d{4,})[年-]?(?P<m_range1>\d{1,2})[月-]?(?P<d_range1>\d{1,2})日?[-至到—一]{1,2}(?P<y_range2>\d{4,})[年-]?(?P<m_range2>\d{1,2})[月-]?(?P<d_range2>\d{1,2})日?$',
|
|
|
|
+ '^(?P<y_range1>\d{4,})[年-]?(?P<m_range1>\d{1,2})[月-]?(?P<d_range1>\d{1,2})日?[ ]?(?P<h_range1>\d{1,2})[时::]?(?P<mi_range1>\d{1,2})[分::]?(?P<s_range1>\d{1,2})(\.0)?秒?[-至到—一]{1,2}(?P<y_range2>\d{4,})[年-]?(?P<m_range2>\d{1,2})[月-]?(?P<d_range2>\d{1,2})日?[ ]?(?P<h_range2>\d{1,2})[时::]?(?P<mi_range2>\d{1,2})[分::]?(?P<s_range2>\d{1,2})(\.0)?秒?$',
|
|
|
|
+ ]
|
|
|
|
+
|
|
|
|
+ # for d in dict_list:
|
|
|
|
+ year_limit = d.get('year_limit')
|
|
|
|
+ time_commencement = d.get('time_commencement')
|
|
|
|
+ time_completion = d.get('time_completion')
|
|
|
|
+ if year_limit:
|
|
|
|
+ new_year_limit = 0
|
|
|
|
+ for reg_index, reg in enumerate(year_limit_reg_list):
|
|
|
|
+ match = re.search(reg, str(year_limit))
|
|
|
|
+ if not match:
|
|
|
|
+ # print('not match!!')
|
|
|
|
+ continue
|
|
|
|
+ # print('reg_index', reg_index)
|
|
|
|
+ if reg_index == 0:
|
|
|
|
+ new_year_limit = int(match.group('year_num')) * 12 * 30
|
|
|
|
+ elif reg_index == 1:
|
|
|
|
+ new_year_limit = self.chinese_to_arabic(match.group('year_num_ch')) * 12 * 30
|
|
|
|
+ elif reg_index == 2:
|
|
|
|
+ new_year_limit = int(match.group('month_num')) * 30
|
|
|
|
+ elif reg_index == 3:
|
|
|
|
+ new_year_limit = self.chinese_to_arabic(match.group('month_num_ch')) * 30
|
|
|
|
+ elif reg_index == 4:
|
|
|
|
+ new_year_limit = int(match.group('day_num'))
|
|
|
|
+ elif reg_index == 5:
|
|
|
|
+ new_year_limit = self.chinese_to_arabic(match.group('day_num_ch'))
|
|
|
|
+ elif reg_index in [6, 7, 8, 9]:
|
|
|
|
+ year1, year2 = match.groupdict().get('y_range1'), match.groupdict().get('y_range2')
|
|
|
|
+ month1, month2 = match.groupdict().get('m_range1'), match.groupdict().get('m_range2')
|
|
|
|
+ day1, day2 = match.groupdict().get('d_range1'), match.groupdict().get('d_range2')
|
|
|
|
+ if month1:
|
|
|
|
+ month1, month2 = [x if len(x) == 2 else '0' + x for x in [month1, month2]]
|
|
|
|
+ if day1:
|
|
|
|
+ day1, day2 = [x if len(x) == 2 else '0' + x for x in [day1, day2]]
|
|
|
|
+ date_str1 = '{}-{}-{}'.format(year1, month1, day1)
|
|
|
|
+ date1 = datetime.strptime(date_str1, '%Y-%m-%d')
|
|
|
|
+ date_str2 = '{}-{}-{}'.format(year2, month2, day2)
|
|
|
|
+ date2 = datetime.strptime(date_str2, '%Y-%m-%d')
|
|
|
|
+ new_year_limit = (date2 - date1).days
|
|
|
|
+ elif month1:
|
|
|
|
+ date_str1 = '{}-{}'.format(year1, month1)
|
|
|
|
+ date1 = datetime.strptime(date_str1, '%Y-%m')
|
|
|
|
+ date_str2 = '{}-{}'.format(year2, month2)
|
|
|
|
+ date2 = datetime.strptime(date_str2, '%Y-%m')
|
|
|
|
+ new_year_limit = (date2 - date1).days
|
|
|
|
+ else:
|
|
|
|
+ date1 = datetime.strptime(year1, '%Y')
|
|
|
|
+ date2 = datetime.strptime(year2, '%Y')
|
|
|
|
+ new_year_limit = (date2 - date1).days
|
|
|
|
+ # 从建设年限补充开工竣工时间
|
|
|
|
+ if not time_commencement:
|
|
|
|
+ d['time_commencement'] = datetime.strftime(date1, '%Y-%m-%d')
|
|
|
|
+ if not time_completion:
|
|
|
|
+ d['time_completion'] = datetime.strftime(date2, '%Y-%m-%d')
|
|
|
|
+ break
|
|
|
|
+ if new_year_limit > 0:
|
|
|
|
+ d['year_limit'] = new_year_limit
|
|
|
|
+ else:
|
|
|
|
+ d.pop('year_limit')
|
|
|
|
+
|
|
|
|
+ if d.get('year_limit') in [None, 'None', '', 0]:
|
|
|
|
+ d['year_limit'] = ''
|
|
|
|
+ return d
|
|
|
|
+
|
|
|
|
+ def standard_time_commencement_completion(self, d):
|
|
|
|
+        # time_commencement 开工时间 / time_completion 竣工时间: make sure both parse as
+        # %Y-%m-%d (or %Y-%m), and clear both when commencement is later than completion
|
|
|
|
+ # for d in dict_list:
|
|
|
|
+ time_commencement = d.get('time_commencement')
|
|
|
|
+ time_completion = d.get('time_completion')
|
|
|
|
+ if time_commencement and time_completion:
|
|
|
|
+ try:
|
|
|
|
+ date1 = datetime.strptime(time_commencement, '%Y-%m-%d')
|
|
|
|
+ date2 = datetime.strptime(time_completion, '%Y-%m-%d')
|
|
|
|
+ except:
|
|
|
|
+ try:
|
|
|
|
+ date1 = datetime.strptime(time_commencement, '%Y-%m')
|
|
|
|
+ date2 = datetime.strptime(time_completion, '%Y-%m')
|
|
|
|
+ d['time_commencement'] = datetime.strftime(date1, '%Y-%m-%d')
|
|
|
|
+ d['time_completion'] = datetime.strftime(date2, '%Y-%m-%d')
|
|
|
|
+ except:
|
|
|
|
+ d['time_commencement'] = ''
|
|
|
|
+ d['time_completion'] = ''
|
|
|
|
+ date1, date2 = None, None
|
|
|
|
+ if date1 is not None and date1 > date2:
|
|
|
|
+ d['time_commencement'] = ''
|
|
|
|
+ d['time_completion'] = ''
|
|
|
|
+ return d
|
|
|
|
+
|
|
|
|
+ def standard_money_or_area(self, d):
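+        # Coerce the area/length and money fields to floats rounded to two decimals; for
+        # non-numeric strings the Chinese characters are stripped and the trailing unit
+        # character dropped before conversion.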
|
|
|
|
+ # 建筑面积、用地面积、长度 float类型两位小数
|
|
|
|
+ # 总投资、环保投资 float类型两位小数
|
|
|
|
+ # for d in dict_list:
|
|
|
|
+ for col in ['proportion', 'use_area', 'length']:
|
|
|
|
+ value = d.get(col)
|
|
|
|
+ if value:
|
|
|
|
+ try:
|
|
|
|
+ d[col] = round(float(value), 2)
|
|
|
|
+ except:
|
|
|
|
+ value = re.sub('[\u4e00-\u9fa5]', '', value)
|
|
|
|
+ value = re.sub('m2', '㎡', value)
|
|
|
|
+ d[col] = round(float(value[:-1]), 2)
|
|
|
|
+
|
|
|
|
+ for col in ['total_tenderee_money', 'env_invest']:
|
|
|
|
+ value = d.get(col)
|
|
|
|
+ if value:
|
|
|
|
+ d[col] = round(float(value), 2)
|
|
|
|
+ return d
|
|
|
|
+
|
|
|
|
+ def standard_pro_type(self, d):
|
|
|
|
+ # declare_type 申报类型 从原来项目类型pro_type中拆出
|
|
|
|
+ # 若成功匹配,原来的pro_type置为空
|
|
|
|
+ # for d in dict_list:
|
|
|
|
+ declare_type = d.get('declare_type')
|
|
|
|
+ if declare_type:
|
|
|
|
+ d['pro_type'] = ''
|
|
|
|
+ return d
|
|
|
|
+
|
|
|
|
+ def extract_env_invest(self, content):
|
|
|
|
+ """
|
|
|
|
+ 环保投资
|
|
|
|
+
|
|
|
|
+ :param content:
|
|
|
|
+ :return:
|
|
|
|
+ """
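+        # Returns the amount parsed by getUnifyMoney when it is positive, otherwise "".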
|
|
|
|
+ pattern = "环保投资[大概约为是::]*(?P<invs>\d+(\.\d+)?万?元)"
|
|
|
|
+ match = re.search(pattern, content)
|
|
|
|
+ if match is not None:
|
|
|
|
+ invest = match.groupdict().get("invs", "")
|
|
|
|
+ money = getUnifyMoney(invest)
|
|
|
|
+ if money > 0:
|
|
|
|
+ return money
|
|
|
|
+ return ""
|
|
|
|
+
|
|
|
|
+ def extract_money_use(self, content):
|
|
|
|
+ """
|
|
|
|
+ 资金构成
|
|
|
|
+
|
|
|
|
+ :param content:
|
|
|
|
+ :return:
|
|
|
|
+ """
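+        # Collect the short clauses that quote a 费/费用/预备费 amount and join them with commas.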
|
|
|
|
+ list_sentences = re.split(",|。", content)
|
|
|
|
+ list_data = []
|
|
|
|
+        # use a non-capturing group: a character class like [费用|预备费|费] only matches a single character
+        pattern = "^.{,20}(?:费用|预备费|费)[大概约为是::]*\d+(\.\d+)?万?元.{,20}$"
|
|
|
|
+ for sentence in list_sentences:
|
|
|
|
+ match = re.search(pattern, sentence)
|
|
|
|
+ if match is not None:
|
|
|
|
+ list_data.append(sentence)
|
|
|
|
+ return ",".join(list_data)
|
|
|
|
+
|
|
|
|
+ def extract_use_area(self, content, has_preffix=True):
|
|
|
|
+ """
|
|
|
|
+ 用地面积
|
|
|
|
+
|
|
|
|
+ :param content:
|
|
|
|
+ :param has_preffix:
|
|
|
|
+ :return:
|
|
|
|
+ """
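+        # Returns (matched phrase, value normalized to ㎡ or m); 万/亩/顷 etc. are converted
+        # via fixed multipliers (亩 ≈ 666.67㎡, 顷 = 10000㎡).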
|
|
|
|
+ if not content:
|
|
|
|
+ return "", ""
|
|
|
|
+ # log("content")
|
|
|
|
+ # log(content)
|
|
|
|
+ suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
|
|
|
|
+ # reg_dict = {
|
|
|
|
+ # 0: "(?P<proportion>(总((用地|占地|使用)(面积|规模)|长|长度))" + suffix,
|
|
|
|
+ # 1: "(?P<proportion>((用地|占地|使用)(面积|规模)|全长)" + suffix,
|
|
|
|
+ # # 2: "(?P<proportion>((用地|占地|使用)?面积)" + suffix
|
|
|
|
+ # 2: "(?P<proportion>((用地|占地|使用)面积)" + suffix
|
|
|
|
+ # }
|
|
|
|
+ reg_dict = {
|
|
|
|
+ 0: "(?P<proportion>(总((用地|占地|使用)(面积|规模)))" + suffix,
|
|
|
|
+ 1: "(?P<proportion>((用地|占地|使用)(面积|规模))" + suffix,
|
|
|
|
+ 2: "(?P<proportion>((用地|占地|使用)面积)" + suffix
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if not has_preffix:
|
|
|
|
+ reg_dict[3] = "(?P<proportion>" + suffix
|
|
|
|
+
|
|
|
|
+ _proportion = ""
|
|
|
|
+ for i in range(len(list(reg_dict.keys()))):
|
|
|
|
+ if _proportion:
|
|
|
|
+ break
|
|
|
|
+ _pattern = reg_dict.get(i)
|
|
|
|
+ # logging.info('content ' + str(content))
|
|
|
|
+ match = re.search(_pattern, str(content))
|
|
|
|
+ if match:
|
|
|
|
+ _proportion = match.groupdict().get("proportion", "")
|
|
|
|
+
|
|
|
|
+ if not _proportion:
|
|
|
|
+ return "", ""
|
|
|
|
+
|
|
|
|
+ # 统一格式
|
|
|
|
+ multiple_cnt = 1
|
|
|
|
+ digit = ""
|
|
|
|
+
|
|
|
|
+ # 确定具体数字
|
|
|
|
+ match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
|
|
|
|
+ if match:
|
|
|
|
+ # logging.info(str(_proportion) + ' ' + str(match.group()))
|
|
|
|
+ d1 = match.group('d1')
|
|
|
|
+ d2 = match.group('d2')
|
|
|
|
+ try:
|
|
|
|
+ d1 = int(re.sub(',', '', d1))
|
|
|
|
+ except:
|
|
|
|
+ return "", ""
|
|
|
|
+ if d2:
|
|
|
|
+ d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
|
|
|
|
+ # print('d1, d2', d1, d2)
|
|
|
|
+ d1 += d2
|
|
|
|
+ digit = d1
|
|
|
|
+ # print('digit', digit)
|
|
|
|
+
|
|
|
|
+ # 确定中文倍数
|
|
|
|
+ _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
|
|
|
|
+ match = re.search('[十百千万亿]+', _proportion2)
|
|
|
|
+ _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
|
|
|
|
+ if match:
|
|
|
|
+ for c in match.group():
|
|
|
|
+ multiple_cnt *= _dict.get(c)
|
|
|
|
+ _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
|
|
|
|
+ else:
|
|
|
|
+ _proportion3 = _proportion2
|
|
|
|
+ # print('multiple_cnt2', multiple_cnt)
|
|
|
|
+
|
|
|
|
+ # 确定面积/长度
|
|
|
|
+ match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
|
|
|
|
+ if match:
|
|
|
|
+ unit = '㎡'
|
|
|
|
+ else:
|
|
|
|
+ unit = 'm'
|
|
|
|
+
|
|
|
|
+ # 确定单位倍数
|
|
|
|
+ match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
|
|
|
|
+ if match:
|
|
|
|
+ if unit == 'm':
|
|
|
|
+ if re.search('[kK千公]', match.group()):
|
|
|
|
+ multiple_cnt *= 1000
|
|
|
|
+ elif re.search('[里]', match.group()):
|
|
|
|
+ multiple_cnt *= Decimal(str(500))
|
|
|
|
+ else:
|
|
|
|
+ if '亩' in match.group():
|
|
|
|
+ multiple_cnt *= Decimal(str(666.67))
|
|
|
|
+ elif '顷' in match.group():
|
|
|
|
+ multiple_cnt *= 10000
|
|
|
|
+ elif re.search('千米|公里|k[mM㎡]', match.group()):
|
|
|
|
+ multiple_cnt *= 1000000
|
|
|
|
+ # print('multiple_cnt1', multiple_cnt)
|
|
|
|
+
|
|
|
|
+ # 拼接
|
|
|
|
+ digit = str(digit * multiple_cnt) + unit
|
|
|
|
+
|
|
|
|
+ return _proportion, digit
|
|
|
|
+
|
|
|
|
+ def extract_proportion(self, content, has_preffix=True):
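+        # Same normalization pipeline as extract_use_area above, but keyed on
+        # 建筑/建设(面积|规模) and 项目规模 phrases.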
|
|
|
|
+ if not content:
|
|
|
|
+ return "", ""
|
|
|
|
+ # log("content")
|
|
|
|
+ # log(content)
|
|
|
|
+ suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
|
|
|
|
+ reg_dict = {
|
|
|
|
+ 0: "(?P<proportion>(总((建筑|建设)(面积|规模)))" + suffix,
|
|
|
|
+ 1: "(?P<proportion>((建筑|建设)(面积|规模))" + suffix,
|
|
|
|
+ 2: "(?P<proportion>((建筑|建设|区域)?面积|项目规模)" + suffix
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if not has_preffix:
|
|
|
|
+ reg_dict[3] = "(?P<proportion>" + suffix
|
|
|
|
+
|
|
|
|
+ _proportion = ""
|
|
|
|
+ for i in range(len(list(reg_dict.keys()))):
|
|
|
|
+ if _proportion:
|
|
|
|
+ break
|
|
|
|
+ _pattern = reg_dict.get(i)
|
|
|
|
+ # logging.info('content ' + str(content))
|
|
|
|
+ match = re.search(_pattern, str(content))
|
|
|
|
+ if match:
|
|
|
|
+ _proportion = match.groupdict().get("proportion", "")
|
|
|
|
+
|
|
|
|
+ if not _proportion:
|
|
|
|
+ return "", ""
|
|
|
|
+
|
|
|
|
+ # 统一格式
|
|
|
|
+ multiple_cnt = 1
|
|
|
|
+ digit = ""
|
|
|
|
+
|
|
|
|
+ # 确定具体数字
|
|
|
|
+ match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
|
|
|
|
+ if match:
|
|
|
|
+ # logging.info(str(_proportion) + ' ' + str(match.group()))
|
|
|
|
+ d1 = match.group('d1')
|
|
|
|
+ d2 = match.group('d2')
|
|
|
|
+ try:
|
|
|
|
+ d1 = int(re.sub(',', '', d1))
|
|
|
|
+ except:
|
|
|
|
+ return "", ""
|
|
|
|
+ if d2:
|
|
|
|
+ d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
|
|
|
|
+ # print('d1, d2', d1, d2)
|
|
|
|
+ d1 += d2
|
|
|
|
+ digit = d1
|
|
|
|
+ # print('digit', digit)
|
|
|
|
+
|
|
|
|
+ # 确定中文倍数
|
|
|
|
+ _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
|
|
|
|
+ match = re.search('[十百千万亿]+', _proportion2)
|
|
|
|
+ _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
|
|
|
|
+ if match:
|
|
|
|
+ for c in match.group():
|
|
|
|
+ multiple_cnt *= _dict.get(c)
|
|
|
|
+ _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
|
|
|
|
+ else:
|
|
|
|
+ _proportion3 = _proportion2
|
|
|
|
+ # print('multiple_cnt2', multiple_cnt)
|
|
|
|
+
|
|
|
|
+ # 确定面积/长度
|
|
|
|
+ match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
|
|
|
|
+ if match:
|
|
|
|
+ unit = '㎡'
|
|
|
|
+ else:
|
|
|
|
+ unit = 'm'
|
|
|
|
+
|
|
|
|
+ # 确定单位倍数
|
|
|
|
+ match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
|
|
|
|
+ if match:
|
|
|
|
+ if unit == 'm':
|
|
|
|
+ if re.search('[kK千公]', match.group()):
|
|
|
|
+ multiple_cnt *= 1000
|
|
|
|
+ elif re.search('[里]', match.group()):
|
|
|
|
+ multiple_cnt *= Decimal(str(500))
|
|
|
|
+ else:
|
|
|
|
+ if '亩' in match.group():
|
|
|
|
+ multiple_cnt *= Decimal(str(666.67))
|
|
|
|
+ elif '顷' in match.group():
|
|
|
|
+ multiple_cnt *= 10000
|
|
|
|
+ elif re.search('千米|公里|k[mM㎡]', match.group()):
|
|
|
|
+ multiple_cnt *= 1000000
|
|
|
|
+ # print('multiple_cnt1', multiple_cnt)
|
|
|
|
+
|
|
|
|
+ # 拼接
|
|
|
|
+ digit = str(digit * multiple_cnt) + unit
|
|
|
|
+
|
|
|
|
+ return _proportion, digit
|
|
|
|
+
|
|
|
|
+ def extract_length(self, content, has_preffix=True):
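+        # Same pipeline again, keyed on 全长/长度 phrases; plain lengths are normalized to metres.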
|
|
|
|
+ if not content:
|
|
|
|
+ return "", ""
|
|
|
|
+ # log("content")
|
|
|
|
+ # log(content)
|
|
|
|
+ suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
|
|
|
|
+ reg_dict = {
|
|
|
|
+ 0: "(?P<proportion>(总((建筑|建设)长|长度))" + suffix,
|
|
|
|
+ 1: "(?P<proportion>((建筑|建设)全长)" + suffix,
|
|
|
|
+ 2: "(?P<proportion>((建筑|建设|区域)?全长)" + suffix
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if not has_preffix:
|
|
|
|
+ reg_dict[3] = "(?P<proportion>" + suffix
|
|
|
|
+
|
|
|
|
+ _proportion = ""
|
|
|
|
+ for i in range(len(list(reg_dict.keys()))):
|
|
|
|
+ if _proportion:
|
|
|
|
+ break
|
|
|
|
+ _pattern = reg_dict.get(i)
|
|
|
|
+ # logging.info('content ' + str(content))
|
|
|
|
+ match = re.search(_pattern, str(content))
|
|
|
|
+ if match:
|
|
|
|
+ _proportion = match.groupdict().get("proportion", "")
|
|
|
|
+
|
|
|
|
+ if not _proportion:
|
|
|
|
+ return "", ""
|
|
|
|
+
|
|
|
|
+ # 统一格式
|
|
|
|
+ multiple_cnt = 1
|
|
|
|
+ digit = ""
|
|
|
|
+
|
|
|
|
+ # 确定具体数字
|
|
|
|
+ match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
|
|
|
|
+ if match:
|
|
|
|
+ # logging.info(str(_proportion) + ' ' + str(match.group()))
|
|
|
|
+ d1 = match.group('d1')
|
|
|
|
+ d2 = match.group('d2')
|
|
|
|
+ try:
|
|
|
|
+ d1 = int(re.sub(',', '', d1))
|
|
|
|
+ except:
|
|
|
|
+ return "", ""
|
|
|
|
+ if d2:
|
|
|
|
+ d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
|
|
|
|
+ # print('d1, d2', d1, d2)
|
|
|
|
+ d1 += d2
|
|
|
|
+ digit = d1
|
|
|
|
+ # print('digit', digit)
|
|
|
|
+
|
|
|
|
+ # 确定中文倍数
|
|
|
|
+ _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
|
|
|
|
+ match = re.search('[十百千万亿]+', _proportion2)
|
|
|
|
+ _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
|
|
|
|
+ if match:
|
|
|
|
+ for c in match.group():
|
|
|
|
+ multiple_cnt *= _dict.get(c)
|
|
|
|
+ _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
|
|
|
|
+ else:
|
|
|
|
+ _proportion3 = _proportion2
|
|
|
|
+ # print('multiple_cnt2', multiple_cnt)
|
|
|
|
+
|
|
|
|
+ # 确定面积/长度
|
|
|
|
+ match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
|
|
|
|
+ if match:
|
|
|
|
+ unit = '㎡'
|
|
|
|
+ else:
|
|
|
|
+ unit = 'm'
|
|
|
|
+
|
|
|
|
+ # 确定单位倍数
|
|
|
|
+ match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
|
|
|
|
+ if match:
|
|
|
|
+ if unit == 'm':
|
|
|
|
+ if re.search('[kK千公]', match.group()):
|
|
|
|
+ multiple_cnt *= 1000
|
|
|
|
+ elif re.search('[里]', match.group()):
|
|
|
|
+ multiple_cnt *= Decimal(str(500))
|
|
|
|
+ else:
|
|
|
|
+ if '亩' in match.group():
|
|
|
|
+ multiple_cnt *= Decimal(str(666.67))
|
|
|
|
+ elif '顷' in match.group():
|
|
|
|
+ multiple_cnt *= 10000
|
|
|
|
+ elif re.search('千米|公里|k[mM㎡]', match.group()):
|
|
|
|
+ multiple_cnt *= 1000000
|
|
|
|
+ # print('multiple_cnt1', multiple_cnt)
|
|
|
|
+
|
|
|
|
+ # 拼接
|
|
|
|
+ digit = str(digit * multiple_cnt) + unit
|
|
|
|
+
|
|
|
|
+ return _proportion, digit
|
|
|
|
+
|
|
|
|
+ def extract_declare_type(self, pro_type):
|
|
|
|
+ """
|
|
|
|
+ 申报类型从pro_type项目类型中提取
|
|
|
|
+
|
|
|
|
+ :param pro_type:
|
|
|
|
+ :return:
|
|
|
|
+ """
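+        # e.g. pro_type '备案类项目' -> '备案'; returns '' when none of the keywords appear.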
|
|
|
|
+ declare_type_list = ['立项', '审批', '备案', '核准', '未知', ]
|
|
|
|
+ declare_type = ''
|
|
|
|
+ if pro_type:
|
|
|
|
+ reg = '(?P<type>{})'.format('|'.join(declare_type_list))
|
|
|
|
+ match = re.search(reg, str(pro_type))
|
|
|
|
+ if match:
|
|
|
|
+ declare_type = match.group('type')
|
|
|
|
+ return declare_type
|
|
|
|
+
|
|
|
|
+ def run_standard(self, rs_dic):
|
|
|
|
+ # 字段标准化 2025-01-13
|
|
|
|
+ # rs_dic = self.standard_data([rs_dic])[0]
|
|
|
|
+ rs_dic = self.standard_properties(rs_dic)
|
|
|
|
+ rs_dic = self.standard_approval_result(rs_dic)
|
|
|
|
+ rs_dic = self.standard_year_limit(rs_dic)
|
|
|
|
+ rs_dic = self.standard_time_commencement_completion(rs_dic)
|
|
|
|
+ rs_dic = self.standard_money_or_area(rs_dic)
|
|
|
|
+ rs_dic = self.standard_pro_type(rs_dic)
|
|
|
|
+ return rs_dic
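+    # Usage sketch (hypothetical record, for illustration only):
+    #   predictor = APPredictor()
+    #   predictor.run_standard({'properties': '技改', 'approval_result': '审批中', 'year_limit': '2年'})
+    #   -> properties '技术改造', approval_result '在办状态', year_limit 720 (days)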
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+class APProduction:
|
|
|
|
+ def __init__(self, test_mode=0, update=1, show=0):
|
|
|
|
+ self.ots_client = getConnect_ots()
|
|
|
|
+ self.ots_capacity = getConnect_ots_capacity()
|
|
|
|
+ self.test_mode = test_mode
|
|
|
|
+ self.update = update
|
|
|
|
+ self.show = show
|
|
|
|
+
|
|
|
|
+ # 处理到的时间
|
|
|
|
+ self.opertime_txt = os.path.abspath(os.path.dirname(__file__)) + '/opertime.txt'
|
|
|
|
+ with open(self.opertime_txt, 'r') as f:
|
|
|
|
+ self.opertime_last = f.read()
|
|
|
|
+
|
|
|
|
+ self.document_cols = ["docid", "doctitle", "project_name", "page_time",
|
|
|
|
+ "project_code", "approval_json", "extract_json",
|
|
|
|
+ 'total_tenderee_money', 'total_tenderee_money_unit',
|
|
|
|
+ 'time_approval',
|
|
|
|
+ 'construct_company', 'construct_company_code',
|
|
|
|
+ 'compilation_unit', 'evaluation_agency', 'declare_company',
|
|
|
|
+ 'approver', 'publisher'
|
|
|
|
+ ]
|
|
|
|
+
|
|
|
|
+ self.ap_predictor = APPredictor()
|
|
|
|
+
|
|
|
|
+ self.none_list = [None, '', '-', '全国', '未知', "None"]
|
|
|
|
+
|
|
|
|
+ def ap_data_flow(self, docid):
|
|
|
|
+ flow_start_time = time.time()
|
|
|
|
+ log('process docid ' + str(docid))
|
|
|
|
+
|
|
|
|
+ # 获取公告的数据
|
|
|
|
+ # 单个审批公告可能同时发布多个不同的审批项目
|
|
|
|
+ start_time = time.time()
|
|
|
|
+ one_docid_approvals = self.get_approval_data_from_document_by_docid(docid)
|
|
|
|
+ if self.show:
|
|
|
|
+ log('len(one_docid_approvals)', len(one_docid_approvals))
|
|
|
|
+ print('one_docid_approvals', one_docid_approvals)
|
|
|
|
+ # log('time1 ' + str(round(time.time()-start_time, 2)))
|
|
|
|
+
|
|
|
|
+ # 跳过单个审批公告多个审批记录的
|
|
|
|
+ if len(one_docid_approvals) > 1:
|
|
|
|
+ log('跳过单个审批公告多个审批记录的')
|
|
|
|
+ print('len(one_docid_approvals)', len(one_docid_approvals))
|
|
|
|
+ return []
|
|
|
|
+
|
|
|
|
+ start_time = time.time()
|
|
|
|
+ match_approvals_list = self.get_data_from_document_by_rules(one_docid_approvals)
|
|
|
|
+ if self.show:
|
|
|
|
+ log('len(match_approvals_list)', len(match_approvals_list))
|
|
|
|
+ # print('match_approvals_list', match_approvals_list)
|
|
|
|
+ log('time2 ' + str(round(time.time()-start_time, 2)))
|
|
|
|
+ if match_approvals_list and len(match_approvals_list[0]) == 0:
|
|
|
|
+ log('by_rules cnt too many! return')
|
|
|
|
+ return []
|
|
|
|
+
|
|
|
|
+ start_time = time.time()
|
|
|
|
+ merge_approvals_list = self.merge_approval(match_approvals_list, one_docid_approvals)
|
|
|
|
+ if self.show:
|
|
|
|
+ log('len(merge_approvals_list)', len(merge_approvals_list))
|
|
|
|
+ # log('time3 ' + str(round(time.time()-start_time, 2)))
|
|
|
|
+
|
|
|
|
+ start_time = time.time()
|
|
|
|
+ approval_project_list = self.generate_project(merge_approvals_list)
|
|
|
|
+ if self.show:
|
|
|
|
+ log('len(approval_project_list)', len(approval_project_list))
|
|
|
|
+ for approval_project in approval_project_list:
|
|
|
|
+ print(approval_project.get('uuid'), approval_project.get('docids_cnt'))
|
|
|
|
+ print(approval_project.get('docids'))
|
|
|
|
+ # log('time4 ' + str(round(time.time()-start_time, 2)))
|
|
|
|
+
|
|
|
|
+ start_time = time.time()
|
|
|
|
+ match_projects_list = self.find_exist_projects_by_docid(approval_project_list)
|
|
|
|
+ if self.show:
|
|
|
|
+ log('len(match_projects_list)', len(match_projects_list))
|
|
|
|
+ for match_projects in match_projects_list:
|
|
|
|
+ print([x.get('uuid') for x in match_projects], len(match_projects))
|
|
|
|
+ # log('time5 ' + str(round(time.time()-start_time, 2)))
|
|
|
|
+
|
|
|
|
+ start_time = time.time()
|
|
|
|
+ merge_project_list = self.merge_project(match_projects_list)
|
|
|
|
+ if self.show:
|
|
|
|
+ log('len(merge_project_list)', len(merge_project_list))
|
|
|
|
+ # log('time6 ' + str(round(time.time()-start_time, 2)))
|
|
|
|
+
|
|
|
|
+ start_time = time.time()
|
|
|
|
+ self.update_project(merge_project_list)
|
|
|
|
+ # log('time7 ' + str(round(time.time()-start_time, 2)))
|
|
|
|
+
|
|
|
|
+ log('approval_project_list len:', len(approval_project_list),
|
|
|
|
+ 'cost:', round(time.time() - flow_start_time, 2))
|
|
|
|
+ return approval_project_list
|
|
|
|
+
|
|
|
|
+ def get_approval_docid_from_document(self, max_cnt=20):
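+        # Return the docids (and persistence_time values) of channel-302 documents whose
+        # persistence_time is at or after the checkpoint in opertime.txt, oldest first.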
|
|
|
|
+        # first count how many rows share exactly this checkpoint time; the follow-up query must
+        # fetch slightly more than that count, otherwise the batch could never move past it
|
|
|
|
+ bool_query = BoolQuery(must_queries=[
|
|
|
|
+ RangeQuery("status", 201, 301),
|
|
|
|
+ TermQuery("persistence_time", self.opertime_last),
|
|
|
|
+ TermQuery("docchannel", 302),
|
|
|
|
+ ])
|
|
|
|
+ total_cnt = self.search_util("document", "document_index",
|
|
|
|
+ bool_query, ['docid'], None,
|
|
|
|
+ only_return_total_cnt=1)
|
|
|
|
+ if max_cnt <= total_cnt:
|
|
|
|
+ max_cnt = total_cnt + 1
|
|
|
|
+ log('reset max_cnt', str(max_cnt))
|
|
|
|
+
|
|
|
|
+ bool_query = BoolQuery(must_queries=[
|
|
|
|
+ RangeQuery("status", 201, 301),
|
|
|
|
+ RangeQuery("persistence_time", self.opertime_last, None),
|
|
|
|
+ TermQuery("docchannel", 302),
|
|
|
|
+ ])
|
|
|
|
+ # 需时间正序执行
|
|
|
|
+ asc_flag = 1
|
|
|
|
+ row_list = self.search_util("document", "document_index", bool_query,
|
|
|
|
+ ["docid", "persistence_time"], "persistence_time",
|
|
|
|
+ limit=max_cnt, asc=asc_flag)
|
|
|
|
+
|
|
|
|
+ id_list = []
|
|
|
|
+ persistence_time_list = []
|
|
|
|
+ for _data in row_list:
|
|
|
|
+ id_list.append(_data.get("docid"))
|
|
|
|
+ persistence_time_list.append(_data.get("persistence_time"))
|
|
|
|
+ return id_list, persistence_time_list
|
|
|
|
+
|
|
|
|
+ def get_approval_data_from_document_by_docid(self, docid):
|
|
|
|
+ bool_query = BoolQuery(must_queries=[
|
|
|
|
+ TermQuery("docid", docid),
|
|
|
|
+ RangeQuery('status', 201, 301)
|
|
|
|
+ ])
|
|
|
|
+
|
|
|
|
+ if self.test_mode:
|
|
|
|
+ rows = self.search_util("document", "document_index",
|
|
|
|
+ bool_query, self.document_cols, 'page_time')
|
|
|
|
+        else:
+            # avoid mutating self.document_cols on every call (it would keep appending duplicates)
+            document_cols = self.document_cols + ['persistence_time']
+            rows = self.search_util("document", "document_index",
+                                    bool_query, document_cols, 'persistence_time')
|
|
|
|
+
|
|
|
|
+        # a docid lookup returns at most one row, so handle it directly (kept for now, may be removed later)
+        if len(rows) == 1:
+            # process_approval_json expands every entry in approval_json, which can yield several
+            # distinct approval items; return all of them
+            approval_list = self.process_approval_json(rows[0])
|
|
|
|
+ else:
|
|
|
|
+ approval_list = []
|
|
|
|
+ return approval_list
|
|
|
|
+
|
|
|
|
+ def get_data_from_document_by_rules(self, approvals):
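+        # For each approval, search document_index for other announcements whose title/body/attachment
+        # text mentions its doc_num, doctitle, project_name or project_code; each result is
+        # [approval] + matches, or [] for that approval once the match count reaches 50.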
|
|
|
|
+ def task(approval):
|
|
|
|
+ doc_num = approval.get("doc_num", "")
|
|
|
|
+ doctitle = approval.get("doctitle", "")
|
|
|
|
+ project_name = approval.get("project_name", "")
|
|
|
|
+ project_code = approval.get("project_code", "")
|
|
|
|
+ docid = approval.get("docid")
|
|
|
|
+ should_queries = []
|
|
|
|
+ if doc_num != "":
|
|
|
|
+ should_queries.append(MatchPhraseQuery("doctitle", doc_num))
|
|
|
|
+ should_queries.append(MatchPhraseQuery("doctextcon", doc_num))
|
|
|
|
+ should_queries.append(MatchPhraseQuery("attachmenttextcon", doc_num))
|
|
|
|
+ if doctitle != "":
|
|
|
|
+ should_queries.append(MatchPhraseQuery("doctitle", doctitle))
|
|
|
|
+ should_queries.append(MatchPhraseQuery("doctextcon", doctitle))
|
|
|
|
+ should_queries.append(MatchPhraseQuery("attachmenttextcon", doctitle))
|
|
|
|
+ if project_name != "":
|
|
|
|
+ should_queries.append(MatchPhraseQuery("doctitle", project_name))
|
|
|
|
+ should_queries.append(MatchPhraseQuery("doctextcon", project_name))
|
|
|
|
+ should_queries.append(MatchPhraseQuery("attachmenttextcon", project_name))
|
|
|
|
+ if project_code != "":
|
|
|
|
+ should_queries.append(MatchPhraseQuery("doctitle", project_code))
|
|
|
|
+ should_queries.append(MatchPhraseQuery("doctextcon", project_code))
|
|
|
|
+ should_queries.append(MatchPhraseQuery("attachmenttextcon", project_code))
|
|
|
|
+ _query = BoolQuery(should_queries=should_queries, must_not_queries=[TermQuery("docid", docid)])
|
|
|
|
+
|
|
|
|
+ # # 两两组合
|
|
|
|
+ # should_doc_num = []
|
|
|
|
+ # should_doctitle = []
|
|
|
|
+ # should_project_name = []
|
|
|
|
+ # should_project_code = []
|
|
|
|
+ # if doc_num != "":
|
|
|
|
+ # should_doc_num.append(MatchPhraseQuery("doctitle", doc_num))
|
|
|
|
+ # should_doc_num.append(MatchPhraseQuery("doctextcon", doc_num))
|
|
|
|
+ # should_doc_num.append(MatchPhraseQuery("attachmenttextcon", doc_num))
|
|
|
|
+ # if doctitle != "":
|
|
|
|
+ # should_doctitle.append(MatchPhraseQuery("doctitle", doctitle))
|
|
|
|
+ # should_doctitle.append(MatchPhraseQuery("doctextcon", doctitle))
|
|
|
|
+ # should_doctitle.append(MatchPhraseQuery("attachmenttextcon", doctitle))
|
|
|
|
+ # if project_name != "":
|
|
|
|
+ # should_project_name.append(MatchPhraseQuery("doctitle", project_name))
|
|
|
|
+ # should_project_name.append(MatchPhraseQuery("doctextcon", project_name))
|
|
|
|
+ # should_project_name.append(MatchPhraseQuery("attachmenttextcon", project_name))
|
|
|
|
+ # if project_code != "":
|
|
|
|
+ # should_project_code.append(MatchPhraseQuery("doctitle", project_code))
|
|
|
|
+ # should_project_code.append(MatchPhraseQuery("doctextcon", project_code))
|
|
|
|
+ # should_project_code.append(MatchPhraseQuery("attachmenttextcon", project_code))
|
|
|
|
+ #
|
|
|
|
+ # all_should_list = []
|
|
|
|
+ # if should_doc_num and should_doctitle:
|
|
|
|
+ # q = BoolQuery(must_queries=[
|
|
|
|
+ # BoolQuery(should_queries=should_doc_num),
|
|
|
|
+ # BoolQuery(should_queries=should_doctitle)
|
|
|
|
+ # ])
|
|
|
|
+ # all_should_list.append(q)
|
|
|
|
+ #
|
|
|
|
+ # if should_doc_num and should_project_name:
|
|
|
|
+ # q = BoolQuery(must_queries=[
|
|
|
|
+ # BoolQuery(should_queries=should_doc_num),
|
|
|
|
+ # BoolQuery(should_queries=should_project_name)
|
|
|
|
+ # ])
|
|
|
|
+ # all_should_list.append(q)
|
|
|
|
+ #
|
|
|
|
+ # if should_doc_num and should_project_code:
|
|
|
|
+ # q = BoolQuery(must_queries=[
|
|
|
|
+ # BoolQuery(should_queries=should_doc_num),
|
|
|
|
+ # BoolQuery(should_queries=should_project_code)
|
|
|
|
+ # ])
|
|
|
|
+ # all_should_list.append(q)
|
|
|
|
+ #
|
|
|
|
+ # if should_project_code and should_doctitle:
|
|
|
|
+ # q = BoolQuery(must_queries=[
|
|
|
|
+ # BoolQuery(should_queries=should_project_code),
|
|
|
|
+ # BoolQuery(should_queries=should_doctitle)
|
|
|
|
+ # ])
|
|
|
|
+ # all_should_list.append(q)
|
|
|
|
+ #
|
|
|
|
+ # if should_project_code and should_project_name:
|
|
|
|
+ # q = BoolQuery(must_queries=[
|
|
|
|
+ # BoolQuery(should_queries=should_project_code),
|
|
|
|
+ # BoolQuery(should_queries=should_project_name)
|
|
|
|
+ # ])
|
|
|
|
+ # all_should_list.append(q)
|
|
|
|
+ #
|
|
|
|
+ # if len(all_should_list) == 0:
|
|
|
|
+ # return [approval]
|
|
|
|
+ #
|
|
|
|
+ # _query = BoolQuery(should_queries=all_should_list)
|
|
|
|
+
|
|
|
|
+ bool_query = BoolQuery(must_queries=[
|
|
|
|
+ RangeQuery("status", 201, 301),
|
|
|
|
+ _query
|
|
|
|
+ ])
|
|
|
|
+
|
|
|
|
+ # 先查数量
|
|
|
|
+ total_cnt = self.search_util("document", "document_index",
|
|
|
|
+ bool_query, ['docid'], None,
|
|
|
|
+ only_return_total_cnt=1)
|
|
|
|
+ if total_cnt >= 50:
|
|
|
|
+ log('by_rules total_cnt > 50 ' + str(total_cnt))
|
|
|
|
+ return []
|
|
|
|
+
|
|
|
|
+ rows = self.search_util("document", "document_index",
|
|
|
|
+ bool_query, self.document_cols, 'page_time')
|
|
|
|
+ return [approval] + rows
|
|
|
|
+
|
|
|
|
+ approvals_list = self.multi_thread_util(approvals, task, 3)
|
|
|
|
+ return approvals_list
|
|
|
|
+
|
|
|
|
+ def merge_approval(self, match_approvals_list, origin_approvals):
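+        # For every matched document, expand its approval_json and keep only the entries that
+        # check_approval judges to belong to the same project as the original approval,
+        # de-duplicated via their JSON representation.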
|
|
|
|
+ merge_approvals_list = []
|
|
|
|
+ for index, match_approvals in enumerate(match_approvals_list):
|
|
|
|
+ # 这里match_approvals是从origin_approval通过规则搜出来的一批符合的
|
|
|
|
+ # match_approvals origin_approval 一一对应
|
|
|
|
+ origin_approval = origin_approvals[index]
|
|
|
|
+ merge_approvals = []
|
|
|
|
+ for _d in match_approvals:
|
|
|
|
+ # 这里每个符合的审批都要把json里的都处理一遍,再判断是否跟原审批匹配
|
|
|
|
+ approval_json_list = self.process_approval_json(_d)
|
|
|
|
+ # approval_json_list = [y for x in approval_json_list for y in x]
|
|
|
|
+ merge_approvals += self.check_approval(approval_json_list, origin_approval)
|
|
|
|
+ merge_approvals = [json.loads(x) for x in list(set([json.dumps(x) for x in merge_approvals]))]
|
|
|
|
+ merge_approvals_list.append(merge_approvals)
|
|
|
|
+
|
|
|
|
+ return merge_approvals_list
|
|
|
|
+
|
|
|
|
+ def generate_project(self, merge_approvals_list):
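+        # Build one approval_project record per merged group: assign a fresh uuid, merge the
+        # field values via get_proper_col_value, and stamp update_time / status='1'.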
|
|
|
|
+ approval_project_list = []
|
|
|
|
+ for merge_approvals in merge_approvals_list:
|
|
|
|
+ if not merge_approvals:
|
|
|
|
+ continue
|
|
|
|
+
|
|
|
|
+ project_id = str(uuid4())
|
|
|
|
+ merge_approvals.sort(key=lambda x: x.get("page_time", ""), reverse=False)
|
|
|
|
+
|
|
|
|
+ merge_approvals[0]['uuid'] = project_id
|
|
|
|
+ _dict = self.get_proper_col_value(merge_approvals)
|
|
|
|
+
|
|
|
|
+ _dict["id"] = project_id
|
|
|
|
+ _dict["uuid"] = project_id
|
|
|
|
+
|
|
|
|
+ project_dict = {}
|
|
|
|
+ for col in approval_cols + ['docids', 'docids_cnt']:
|
|
|
|
+ project_dict[col] = _dict.get(col, '')
|
|
|
|
+ current_time = datetime.now()
|
|
|
|
+ current_time = current_time.strftime('%Y-%m-%d %H:%M:%S')
|
|
|
|
+ project_dict['update_time'] = current_time
|
|
|
|
+ project_dict['status'] = '1'
|
|
|
|
+
|
|
|
|
+ approval_project_list.append(project_dict)
|
|
|
|
+ return approval_project_list
|
|
|
|
+
|
|
|
|
+ def find_exist_projects_by_docid(self, project_list):
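+        # Find existing approval_project rows that already contain any of this project's docids;
+        # each result list starts with the new project followed by the matches.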
|
|
|
|
+ match_projects_list = []
|
|
|
|
+ for project in project_list:
|
|
|
|
+ docids = project.get('docids').split(',')
|
|
|
|
+ bool_query = BoolQuery(must_queries=[
|
|
|
|
+ BoolQuery(should_queries=[TermQuery("docids", x) for x in docids]),
|
|
|
|
+ # BoolQuery(must_not_queries=[TermQuery('status', '404')]),
|
|
|
|
+ ])
|
|
|
|
+ columns = ["docid", "doctitle", "project_name", "page_time", "project_code", "approval_json",
|
|
|
|
+ "extract_json"]
|
|
|
|
+ rows = self.search_util("approval_project", "approval_project_index",
|
|
|
|
+ bool_query, columns, None)
|
|
|
|
+ match_projects = [project] + rows
|
|
|
|
+ match_projects_list.append(match_projects)
|
|
|
|
+ return match_projects_list
|
|
|
|
+
|
|
|
|
+ def merge_project(self, match_projects_list):
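+        # Merge the new project with the matched existing ones; merge_uuids records the uuids
+        # of the existing projects that get absorbed.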
|
|
|
|
+ merge_project_list = []
|
|
|
|
+ for match_projects in match_projects_list:
|
|
|
|
+ merge_uuids = [x.get('uuid') for x in match_projects]
|
|
|
|
+ if len(merge_uuids) > 1:
|
|
|
|
+ # 排除当前项目
|
|
|
|
+ merge_uuids = merge_uuids[1:]
|
|
|
|
+ else:
|
|
|
|
+ merge_uuids = []
|
|
|
|
+
|
|
|
|
+ merge_project = self.get_proper_col_value(match_projects)
|
|
|
|
+ merge_project['merge_uuids'] = merge_uuids
|
|
|
|
+ merge_project_list.append(merge_project)
|
|
|
|
+ return merge_project_list
|
|
|
|
+
|
|
|
|
+ def update_project(self, project_list):
|
|
|
|
+ if self.update:
|
|
|
|
+ for project in project_list:
|
|
|
|
+ if self.show:
|
|
|
|
+ print('update_project dict: ')
|
|
|
|
+ pprint(project)
|
|
|
|
+ _approval_project = approval_project(project)
|
|
|
|
+ _approval_project.update_project(self.ots_client)
|
|
|
|
+ log('update to approval_project success!', len(project_list))
|
|
|
|
+ else:
|
|
|
|
+ if self.show:
|
|
|
|
+ for project in project_list:
|
|
|
|
+ print('update_project dict: ')
|
|
|
|
+ pprint(project)
|
|
|
|
+ log('not update to approval_project!')
|
|
|
|
+
|
|
|
|
+ def update_opertime(self, p_time_list):
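+        # Advance the checkpoint in opertime.txt to the latest persistence_time seen in this batch.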
|
|
|
|
+ data_format = "%Y-%m-%d %H:%M:%S"
|
|
|
|
+ opertime_last = datetime.strptime(self.opertime_last, data_format)
|
|
|
|
+ for persistence_time in p_time_list:
|
|
|
|
+ if persistence_time is None:
|
|
|
|
+ continue
|
|
|
|
+ opertime = datetime.strptime(persistence_time, data_format)
|
|
|
|
+ if opertime > opertime_last:
|
|
|
|
+ opertime_last = opertime
|
|
|
|
+ opertime_last = opertime_last.strftime(data_format)
|
|
|
|
+ with open(self.opertime_txt, 'w') as f:
|
|
|
|
+ f.write(opertime_last)
|
|
|
|
+ return opertime_last
|
|
|
|
+
|
|
|
|
+ def process_approval_json(self, approval_dict):
|
|
|
|
+ """
|
|
|
|
+
|
|
|
|
+        :param approval_dict: document record dict (one row from the document table)
+        :return: list of approval dicts parsed from approval_json, enriched with the
+            extract_json fields and run through run_standard
|
|
|
|
+ """
|
|
|
|
+
|
|
|
|
+ _d = approval_dict
|
|
|
|
+ approval_json = _d.get("approval_json")
|
|
|
|
+ partitionkey = _d.get("partitionkey")
|
|
|
|
+ docid = _d.get("docid")
|
|
|
|
+ doctitle = _d.get("doctitle")
|
|
|
|
+ project_name = _d.get("project_name")
|
|
|
|
+ page_time = _d.get("page_time")
|
|
|
|
+ has_standard = _d.get("standard")
|
|
|
|
+
|
|
|
|
+ extract_json = _d.get("extract_json")
|
|
|
|
+ # pro_type = extract_json.get("pro_type")
|
|
|
|
+ # approval_result = extract_json.get("approval_result")
|
|
|
|
+ _d_html = {"partitionkey": partitionkey, "docid": docid}
|
|
|
|
+ _html = Document(_d_html)
|
|
|
|
+ _html.fix_columns(self.ots_capacity, ["dochtmlcon"], True)
|
|
|
|
+ dochtml = _html.getProperties().get("dochtmlcon", "")
|
|
|
|
+ doctextcon = BeautifulSoup(dochtml, "lxml").get_text()
|
|
|
|
+ attachmenttextcon = ""
|
|
|
|
+ try:
|
|
|
|
+ _extract = json.loads(extract_json)
|
|
|
|
+ for key in _extract.keys():
|
|
|
|
+ if _d.get(key) in self.none_list and _extract.get(key) not in self.none_list:
|
|
|
|
+ _d[key] = _extract.get(key)
|
|
|
|
+ except Exception as e:
|
|
|
|
+ _extract = {}
|
|
|
|
+
|
|
|
|
+ # 若要素提取已经标准化过了,不用重新跑
|
|
|
|
+ if has_standard:
|
|
|
|
+ proportion = _d.get('proportion')
|
|
|
|
+ use_area = _d.get('use_area')
|
|
|
|
+ env_invest = _d.get('env_invest')
|
|
|
|
+ money_use = _d.get('money_use')
|
|
|
|
+ declare_type = _d.get('declare_type')
|
|
|
|
+ length = _d.get('length')
|
|
|
|
+ else:
|
|
|
|
+ proportion = _extract.get("pb", {}).get("proportion")
|
|
|
|
+ _, use_area = self.ap_predictor.extract_use_area(doctextcon + attachmenttextcon)
|
|
|
|
+ env_invest = self.ap_predictor.extract_env_invest(doctextcon + attachmenttextcon)
|
|
|
|
+ money_use = self.ap_predictor.extract_money_use(doctextcon + attachmenttextcon)
|
|
|
|
+ declare_type = self.ap_predictor.extract_declare_type(_d.get('pro_type'))
|
|
|
|
+ _, length = self.ap_predictor.extract_length(doctextcon + attachmenttextcon)
|
|
|
|
+ _d["proportion"] = proportion
|
|
|
|
+ _d["use_area"] = use_area
|
|
|
|
+ _d["env_invest"] = env_invest
|
|
|
|
+ _d["money_use"] = money_use
|
|
|
|
+ _d["declare_type"] = declare_type
|
|
|
|
+ _d["length"] = length
|
|
|
|
+
|
|
|
|
+ # 标准化
|
|
|
|
+ _d = self.ap_predictor.run_standard(_d)
|
|
|
|
+
|
|
|
|
+ approval_list = []
|
|
|
|
+ if approval_json:
|
|
|
|
+ approval_list = json.loads(approval_json)
|
|
|
|
+ for _appr in approval_list:
|
|
|
|
+ for key in _d.keys():
|
|
|
|
+ if key not in _appr:
|
|
|
|
+ _appr[key] = _d.get(key)
|
|
|
|
+
|
|
|
|
+ fix_area(self.ots_client, _appr)
|
|
|
|
+
|
|
|
|
+ # 建设单位信用代码、地区
|
|
|
|
+ if _d.get("construct_company"):
|
|
|
|
+ new_d = self.get_construct_code_area(_d.get("construct_company"))
|
|
|
|
+ _appr.update(new_d)
|
|
|
|
+
|
|
|
|
+ # 再从各自的项目概况中提取
|
|
|
|
+ construction_scale = _d.get("construction_scale", "")
|
|
|
|
+ proportion, _ = self.ap_predictor.extract_proportion(construction_scale)
|
|
|
|
+ if proportion != "":
|
|
|
|
+ _appr["proportion"] = proportion
|
|
|
|
+ _, use_area = self.ap_predictor.extract_use_area(construction_scale)
|
|
|
|
+ if use_area != "":
|
|
|
|
+ _appr["use_area"] = use_area
|
|
|
|
+ env_invest = self.ap_predictor.extract_env_invest(construction_scale)
|
|
|
|
+ if env_invest != "":
|
|
|
|
+ _appr["env_invest"] = env_invest
|
|
|
|
+ money_use = self.ap_predictor.extract_money_use(construction_scale)
|
|
|
|
+ if money_use != "":
|
|
|
|
+ _appr["money_use"] = money_use
|
|
|
|
+ # 标准化
|
|
|
|
+ _appr = self.ap_predictor.run_standard(_appr)
|
|
|
|
+ return approval_list
|
|
|
|
+
|
|
|
|
+ def get_proper_col_value(self, dict_list):
|
|
|
|
+ """
|
|
|
|
+        Merge a list of approval dicts into a single record: multi-value fields are unioned,
+        numeric and date fields take the maximum, the region keeps the most frequent
+        (containment-aware) combination, address fields keep the longest value, name fields
+        the most frequent, and the remaining columns take the first non-empty value ordered
+        by newest page_time.
+
+        :return: merged dict keyed by the approval_project columns
|
|
|
|
+ """
|
|
|
|
+ merge_dict = {}
|
|
|
|
+ if dict_list:
|
|
|
|
+ merge_dict['uuid'] = dict_list[0].get('uuid')
|
|
|
|
+
|
|
|
|
+ # 处理多值字段
|
|
|
|
+ many_value_cols = ['docids', 'doc_nums', 'project_codes']
|
|
|
|
+ for col in many_value_cols:
|
|
|
|
+ for d in dict_list:
|
|
|
|
+ if d.get(col) is not None:
|
|
|
|
+ continue
|
|
|
|
+ col_values = [str(x.get(col[:-1])) for x in dict_list]
|
|
|
|
+ d[col] = ','.join(col_values)
|
|
|
|
+ if col == 'docids':
|
|
|
|
+ d['docids_cnt'] = len(col_values)
|
|
|
|
+
|
|
|
|
+ # 合并多值字段
|
|
|
|
+ many_value_cols = ['docids', 'doc_nums', 'project_codes', 'properties']
|
|
|
|
+ for col in many_value_cols:
|
|
|
|
+ value_list = []
|
|
|
|
+ for d in dict_list:
|
|
|
|
+ value = d.get(col)
|
|
|
|
+ if value in self.none_list:
|
|
|
|
+ continue
|
|
|
|
+ value_list += value.split(',')
|
|
|
|
+ value_list = list(set(value_list))
|
|
|
|
+ for v in self.none_list:
|
|
|
|
+ if v in value_list:
|
|
|
|
+ value_list.remove(v)
|
|
|
|
+ value_list.sort(key=lambda x: x)
|
|
|
|
+ if col == 'properties' and len(value_list) > 1 and '未披露' in value_list:
|
|
|
|
+ value_list.remove('未披露')
|
|
|
|
+ max_cnt = 200
|
|
|
|
+ if col == 'docids' and len(value_list) < max_cnt:
|
|
|
|
+ merge_dict['docids_cnt'] = len(value_list)
|
|
|
|
+ # 限制个数
|
|
|
|
+ value_list = value_list[:max_cnt]
|
|
|
|
+ merge_dict[col] = ','.join(value_list)
|
|
|
|
+
|
|
|
|
+ # 数字字段
|
|
|
|
+ num_cols = ['year_limit', 'env_invest', 'proportion', 'length', 'use_area', 'total_tenderee_money']
|
|
|
|
+ for col in num_cols:
|
|
|
|
+ max_value = 0
|
|
|
|
+ for d in dict_list:
|
|
|
|
+ value = d.get(col, 0)
|
|
|
|
+ if value in [None, '', 'None']:
|
|
|
|
+ value = 0
|
|
|
|
+                    # value may still be a numeric string (e.g. straight from extract_json);
+                    # skip anything that cannot be compared numerically
+                    try:
+                        if value > max_value:
+                            max_value = value
+                    except TypeError:
+                        continue
|
|
|
|
+ if max_value != 0:
|
|
|
|
+ merge_dict[col] = max_value
|
|
|
|
+ else:
|
|
|
|
+ merge_dict[col] = ''
|
|
|
|
+
|
|
|
|
+ # 日期字段
|
|
|
|
+ time_cols = ['time_commencement', 'time_completion', 'time_release', 'page_time',
|
|
|
|
+ 'time_declare', 'time_approval']
|
|
|
|
+ for col in time_cols:
|
|
|
|
+ # if col in ['update_time']:
|
|
|
|
+ # format_string = "%Y-%m-%d %H:%M:%S"
|
|
|
|
+ # else:
|
|
|
|
+ format_string = "%Y-%m-%d"
|
|
|
|
+
|
|
|
|
+ max_time = None
|
|
|
|
+ for d in dict_list:
|
|
|
|
+ value = d.get(col)
|
|
|
|
+ try:
|
|
|
|
+ value = datetime.strptime(value, format_string)
|
|
|
|
+ except:
|
|
|
|
+ continue
|
|
|
|
+ if max_time is None or value > max_time:
|
|
|
|
+ max_time = value
|
|
|
|
+ if max_time is not None:
|
|
|
|
+ merge_dict[col] = datetime.strftime(max_time, format_string)
|
|
|
|
+ else:
|
|
|
|
+ merge_dict[col] = ''
|
|
|
|
+
|
|
|
|
+        # Region fields
+        area_cols = ['area', 'province', 'city', 'district']
+        area_list, province_list, city_list, district_list = [[] for _ in range(4)]
+        a_p_c_d_dict = {}
+        # Some records carry the district as a nested dict; flatten those first
+        extra_dict_list = []
+        for d in dict_list:
+            district = d.get('district')
+            if isinstance(district, dict):
+                extra_dict_list.append(district)
+                d['district'] = ''
+
+        for d in dict_list + extra_dict_list:
+            area = d.get('area')
+            province = d.get('province')
+            city = d.get('city')
+            district = d.get('district')
+            district_list.append(district)
+            area_list.append(area)
+            province_list.append(province)
+            city_list.append(city)
+
+            # Count each area-province-city-district combination
+            a_p_c_d = []
+            for v in [area, province, city, district]:
+                if v not in self.none_list:
+                    a_p_c_d.append(v)
+            a_p_c_d = '-'.join(a_p_c_d)
+            if a_p_c_d in a_p_c_d_dict:
+                a_p_c_d_dict[a_p_c_d] += 1
+            else:
+                a_p_c_d_dict[a_p_c_d] = 1
+
+        # A combination also inherits the count of any combination it contains
+        keys = list(a_p_c_d_dict.keys())
+        for i in range(len(keys)):
+            a_p_c_d_1 = keys[i]
+            for j in range(i + 1, len(keys)):
+                a_p_c_d_2 = keys[j]
+                if len(a_p_c_d_1) >= len(a_p_c_d_2) and a_p_c_d_2 in a_p_c_d_1:
+                    a_p_c_d_dict[a_p_c_d_1] += a_p_c_d_dict.get(a_p_c_d_2)
+                elif len(a_p_c_d_1) < len(a_p_c_d_2) and a_p_c_d_1 in a_p_c_d_2:
+                    a_p_c_d_dict[a_p_c_d_2] += a_p_c_d_dict.get(a_p_c_d_1)
+
+        if self.show:
+            print('a_p_c_d_dict', a_p_c_d_dict)
+        if '' in a_p_c_d_dict:
+            a_p_c_d_dict.pop('')
+
+        # Keep the region combination matched most often
+        a_p_c_d_list = [[key, value] for key, value in a_p_c_d_dict.items()]
+        if a_p_c_d_list:
+            a_p_c_d_list.sort(key=lambda x: x[1])
+            a_p_c_d = a_p_c_d_list[-1][0].split('-')
+            for value in a_p_c_d:
+                if value in area_list:
+                    merge_dict['area'] = value
+                elif value in province_list:
+                    merge_dict['province'] = value
+                elif value in city_list:
+                    merge_dict['city'] = value
+                elif value in district_list:
+                    merge_dict['district'] = value
+
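+        # Worked example (hypothetical values) of the region vote above: with
+        # records ('华南', '广东', '广州', '') and ('华南', '广东', '广州', '天河'),
+        # a_p_c_d_dict starts as {'华南-广东-广州': 1, '华南-广东-广州-天河': 1};
+        # the containment pass bumps the longer key to 2, so the most specific
+        # combination that is still consistent with the other records wins.
+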
+        # Address-like fields: keep the longest value
+        address_cols = ['project_addr', 'construction_scale']
+        for col in address_cols:
+            longer_value = ''
+            for d in dict_list:
+                value = d.get(col)
+                if value in self.none_list:
+                    continue
+                if len(value) > len(longer_value):
+                    longer_value = value
+            merge_dict[col] = longer_value
+
+        # Name fields: keep the most frequent value (ties broken by length)
+        name_cols = ['project_name', 'doctitle']
+        for col in name_cols:
+            value_dict = {}
+            for d in dict_list:
+                value = d.get(col)
+                if value in self.none_list:
+                    continue
+                if value in value_dict:
+                    value_dict[value] += 1
+                else:
+                    value_dict[value] = 1
+            value_list = [[k, v] for k, v in value_dict.items()]
+            if value_list:
+                value_list.sort(key=lambda x: (x[1], len(x[0])))
+                merge_dict[col] = value_list[-1][0]
+
+        # Remaining columns: fill empty values from the newest record first
+        # (sort once by page_time, newest first)
+        dict_list.sort(key=lambda x: x.get('page_time') or '', reverse=True)
+        for col in approval_cols:
+            if col in many_value_cols + num_cols + time_cols + area_cols + address_cols + name_cols:
+                continue
+            for d in dict_list:
+                if merge_dict.get(col) in [None, '']:
+                    merge_dict[col] = d.get(col)
+
+        # Fields that were renamed at some point
+        # for col in ['moneyuse']:
+        #     for d in dict_list:
+        #         merge_dict['money_use'] = d.get(col)
+
+        # Normalize placeholder values to empty strings (region fields excluded)
+        for col in approval_cols:
+            if col in area_cols:
+                continue
+            if merge_dict.get(col) in self.none_list:
+                merge_dict[col] = ''
+        return merge_dict
+
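+    # Illustrative sketch (hypothetical input, not part of the production flow):
+    # given two partial records, the merge above keeps the union of docids, the
+    # largest numbers, the latest dates and the most frequent names, e.g.
+    #   dict_list = [{'docids': '1,2', 'total_tenderee_money': 100, 'page_time': '2023-01-01'},
+    #                {'docids': '2,3', 'total_tenderee_money': 300, 'page_time': '2023-02-01'}]
+    #   -> merge_dict gets docids '1,2,3', docids_cnt 3,
+    #      total_tenderee_money 300 and page_time '2023-02-01'
+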
+    def get_construct_code_area(self, name):
+        bool_query = BoolQuery(must_queries=[
+            TermQuery("name", name)
+        ])
+
+        cols = ["credit_code", "province", "city", "district"]
+        data_list = self.search_util("enterprise", "enterprise_index", bool_query,
+                                     cols, None)
+        _d = {}
+        if len(data_list) > 0:
+            _d["construct_company_code"] = data_list[0].get("credit_code", "")
+            _d["construct_company_province"] = data_list[0].get("province", "")
+            _d["construct_company_city"] = data_list[0].get("city", "")
+            _d["construct_company_district"] = data_list[0].get("district", "")
+        return _d
+
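+    # Usage sketch (hypothetical company name; assumes the "enterprise" search
+    # index is populated):
+    #   info = self.get_construct_code_area("某某建设有限公司")
+    #   # -> {'construct_company_code': '91...', 'construct_company_province': '广东', ...}
+    # An empty dict is returned when no enterprise record matches the name.
+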
+    def check_approval(self, approval_list, origin_approval):
+        appr1 = origin_approval
+        same_approval_list = [origin_approval]
+        check_keys = ["declare_company", "construct_company", "total_tenderee_money", "proportion", "use_area",
+                      "doc_num", "project_code"]
+        for appr2 in approval_list:
+            same_count = 0
+            for k in check_keys:
+                if k in appr1 and k in appr2:
+                    if appr1[k] == appr2[k] and appr1[k] is not None and appr1[k] != "":
+                        same_count += 1
+
+            if same_count >= 1:
+                same_approval_list.append(appr2)
+        return same_approval_list
+
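+    # Sketch of the matching rule above: two records are grouped as soon as any
+    # one of the checked keys is non-empty and identical (hypothetical values):
+    #   origin = {'doc_num': 'X发改[2023]1号', 'construct_company': 'A公司'}
+    #   other = {'doc_num': 'X发改[2023]1号', 'construct_company': 'B公司'}
+    #   check_approval([other], origin) -> [origin, other]   # doc_num matches
+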
+    def search_util(self, table, table_index, bool_query, columns, sort_col, limit=999999, show_total=0,
+                    while_next=1, asc=0, only_return_total_cnt=0):
+        # Generic OTS search helper: optional sort, pagination via next_token, optional row limit
+        if sort_col:
+            if asc:
+                sort = Sort(sorters=[FieldSort(sort_col, SortOrder.ASC)])
+            else:
+                sort = Sort(sorters=[FieldSort(sort_col, SortOrder.DESC)])
+        else:
+            sort = None
+        return_type = ColumnReturnType.SPECIFIED
+        rows, next_token, total_count, is_all_succeed = self.ots_client.search(
+            table, table_index,
+            SearchQuery(bool_query, sort=sort, limit=100, get_total_count=True),
+            ColumnsToGet(columns, return_type))
+        if only_return_total_cnt:
+            return total_count
+
+        list_data = getRow_ots(rows)
+        if show_total:
+            print('search total_count', total_count)
+        if len(list_data) >= limit:
+            print('limit ', limit, len(list_data))
+            return list_data[:limit]
+
+        if while_next:
+            # Keep paging until the token is exhausted or the limit is reached
+            while next_token and len(list_data) < limit:
+                rows, next_token, total_count, is_all_succeed = self.ots_client.search(
+                    table, table_index,
+                    SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
+                    ColumnsToGet(columns, return_type))
+                list_data += getRow_ots(rows)
+        return list_data
+
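+    # Usage sketch (table and field names are illustrative, not a schema guarantee):
+    #   query = BoolQuery(must_queries=[TermQuery("province", "广东"),
+    #                                   RangeQuery("page_time", "2023-01-01", "2023-12-31")])
+    #   rows = self.search_util("approval_project", "approval_project_index", query,
+    #                           ["uuid", "docids"], "page_time", limit=500)
+    # With only_return_total_cnt=1 the call returns an int instead of a list of rows.
+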
+    def multi_thread_util(self, _list, task, threads):
+        result_list = []
+        # Create a thread pool with the requested number of workers
+        with ThreadPoolExecutor(max_workers=threads) as executor:
+            # Submit one task per list item
+            futures = [executor.submit(task, item) for item in _list]
+
+            # Wait for all tasks to finish and collect their return values
+            for future in futures:
+                result = future.result()
+                # print(f"Task returned: {result}")
+                result_list.append(result)
+        return result_list
+
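+    # Usage sketch: run a one-argument task over a list with a small thread pool, e.g.
+    #   companies = ['A公司', 'B公司']
+    #   results = self.multi_thread_util(companies, self.get_construct_code_area, threads=4)
+    # Results come back in submission order; an exception raised inside a task
+    # re-raises from future.result(), so tasks should handle their own errors.
+
+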
+def run_flow(data, test_mode=0, update=1):
+    # data is a [docid, test_mode, update] triple handed over by pool.map
+    docid, test_mode, update = data
+    ds = APProduction(test_mode, update)
+    print('run_flow, test_mode, update', test_mode, update)
+    # globals()['docid'] = data
+    try:
+        aps = ds.ap_data_flow(docid)
+    except Exception:
+        traceback.print_exc()
+        log('error ' + str(docid))
+        aps = []
+    return aps
+
+
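+# Sketch of how run_flow is driven below: each work item is a
+# [docid, test_mode, update] triple, so pool.map(run_flow, items) unpacks it
+# inside the function, e.g.
+#   run_flow([400010986401, 1, 0])   # docid from test_docid_list, test_mode=1, update=0
+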
+if __name__ == "__main__":
+    test_mode = 0
+    test_docid_list = [400010986401]
+    update = 1
+
+    process_num = 7
+    sleep_sec = 100
+    while True:
+        sleep_flag = 1
+        ds = APProduction()
+
+        if test_mode:
+            tmp_data_list = test_docid_list
+            persistence_time_list = []
+        else:
+            # Fetch newly added approval announcements
+            tmp_data_list, persistence_time_list = ds.get_approval_docid_from_document(max_cnt=50)
+            log('len(tmp_data_list)', len(tmp_data_list))
+
+        tmp_data_list = [[x, test_mode, update] for x in tmp_data_list]
+
+        # Nothing to process: wait and loop again
+        if len(tmp_data_list) <= 0:
+            print('sleep for', sleep_sec)
+            time.sleep(sleep_sec)
+            continue
+
+        # Merge with multiple processes
+        mp_start_time = time.time()
+        with multiprocessing.Pool(processes=process_num) as pool:
+            results = pool.map(run_flow, tmp_data_list)
+            all_ap_num = len(results)
+            pool.close()
+            pool.join()
+
+        # Update the data watermark time
+        if not test_mode:
+            last_time_str = ds.update_opertime(persistence_time_list)
+            # Compare the watermark against now
+            data_format = "%Y-%m-%d %H:%M:%S"
+            last_time = datetime.strptime(last_time_str, data_format)
+            now = datetime.now()
+            print('*' * 30, last_time_str)
+            # Only skip the sleep when processing is lagging behind the data
+            if now - last_time >= timedelta(seconds=sleep_sec):
+                sleep_flag = 0
+
+        log('mp process data num:', len(tmp_data_list), 'update ap num:', all_ap_num,
+            'cost:', round(time.time() - mp_start_time, 2),
+            'avg:', round((time.time() - mp_start_time) / len(tmp_data_list), 2))
+
+        # Test mode runs a single pass
+        if test_mode:
+            break
+
+        if sleep_flag:
+            print('sleep for', sleep_sec)
+            time.sleep(sleep_sec)