from BaseDataMaintenance.common.Utils import *
from BaseDataMaintenance.dataSource.source import getConnect_ots,getConnect_ots_capacity
from tablestore import *
import pandas as pd
from queue import Queue
from BaseDataMaintenance.common.multiThread import MultiThreadHandler
from BaseDataMaintenance.model.ots.document import Document

import json
# re and Decimal are used throughout the extraction helpers below; they are imported
# explicitly here in case the wildcard import from Utils does not already provide them.
import re
from decimal import Decimal
from uuid import uuid4
from bs4 import BeautifulSoup

'''
Layout of one approval record as stored in a document's approval_json field:

"approval": [
    {
        "approval_items": "",               # approval items
        "approval_result": "",              # approval result
        "approver": "",                     # approving department
        "city": "深圳",
        "construct_company": "深圳市赛孚电子科技有限公司",   # construction company
        "construction_scale": "",           # construction scale
        "declare_company": "",              # declaring company
        "district": "光明",
        "doc_num": "",                      # approval document number
        "evaluation_agency": "",            # environmental impact assessment agency
        "legal_person": "陈雷",             # project legal person
        "moneysource": "",                  # source of funds
        "phone": "",
        "pro_type": "",                     # declaration type
        "project_addr": "广东省深圳市光明区玉塘街道田寮社区第七工业区26栋301",
        "project_code": "",
        "project_name": "深圳市赛孚电子科技有限公司销售医用射线装置项目",
        "properties": "新建",               # construction nature
        "province": "广东",
        "time_commencement": "",            # commencement date
        "time_completion": "",              # completion date
        "time_declare": "",                 # declaration date
        "total_tendereeMoney": "200000",    # total investment
        "year_limit": "",                   # construction period
        "compilation_unit": "编制单位",
        "publisher": "发布单位",
        "time_approval": "审批时间",
        "time_release": "发布日期"
    }
]
'''


key_trans = {
    "doctitle": "公告标题",
    "page_time": "公告时间",
    "province": "省份",
    "city": "城市",
    "district": "地区",

    "approval_items": "审批事项",
    "approval_result": "审批结果",
    "declare_company": "申报单位",
    "construct_company": "建设单位",
    "evaluation_agency": "环评机构",
    "approver": "审批部门",
    "compilation_unit": "编制单位",
    "publisher": "发布单位",

    "total_tendereeMoney": "总投资",
    "construction_scale": "建设规模",
    "proportion": "建筑面积",
    "usearea": "用地面积",

    "doc_num": "审批文号",

    "legal_person": "项目法人",
    "moneysource": "资金来源",
    "moneyuse": "资金构成",
    "env_invest": "环保投资",
    "phone": "电话",
    "pro_type": "申报类型",
    "project_addr": "项目地址",
    "project_code": "项目编号",
    "project_name": "项目名称",
    "properties": "建设性质",
    "time_commencement": "开工时间",
    "time_completion": "竣工时间",
    "time_declare": "申报时间",

    "year_limit": "建设年限",

    "time_approval": "审批时间",
    "time_release": "发布日期"
}

key_trans_d = {"docid": "公告id"}
key_trans_d.update(key_trans)
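
# Illustrative use of the mapping (values made up): key_trans_d renames one extracted
# record into the Chinese column headers written to the exported Excel sheets, e.g.
#   {"docid": 1, "doctitle": "某审批公示"} -> {"公告id": 1, "公告标题": "某审批公示"}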


def extract_proportion(content, has_preffix=True):
    # Extract a building-area / length expression (建筑面积, 建设规模, 全长 ...) from text
    # and normalise it into "<number><unit>".
    if not content:
        return "", ""
    suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
    reg_dict = {
        0: "(?P<proportion>(总((建筑|建设)(面积|规模)|长|长度))" + suffix,
        1: "(?P<proportion>((建筑|建设)(面积|规模)|全长)" + suffix,
        2: "(?P<proportion>((建筑|建设|区域)?面积|全长|项目规模)" + suffix
    }

    if not has_preffix:
        reg_dict[3] = "(?P<proportion>" + suffix

    _proportion = ""
    for i in range(len(reg_dict)):
        if _proportion:
            break
        _pattern = reg_dict.get(i)
        match = re.search(_pattern, str(content))
        if match:
            _proportion = match.groupdict().get("proportion", "")

    if not _proportion:
        return "", ""

    # Normalise the matched expression
    multiple_cnt = 1
    digit = ""

    # Numeric part
    match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
    if match:
        d1 = match.group('d1')
        d2 = match.group('d2')
        try:
            d1 = int(re.sub(',', '', d1))
        except:
            return "", ""
        if d2:
            d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
            d1 += d2
        digit = d1

        # Chinese multiplier characters (十/百/千/万/亿)
        _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
        match = re.search('[十百千万亿]+', _proportion2)
        _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
        if match:
            for c in match.group():
                multiple_cnt *= _dict.get(c)
            _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
        else:
            _proportion3 = _proportion2

        # Area or length unit
        match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
        if match:
            unit = '㎡'
        else:
            unit = 'm'

        # Unit multiplier (千米/公里/亩/顷 ...)
        match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
        if match:
            if unit == 'm':
                if re.search('[kK千公]', match.group()):
                    multiple_cnt *= 1000
                elif re.search('[里]', match.group()):
                    multiple_cnt *= Decimal(str(500))
            else:
                if '亩' in match.group():
                    multiple_cnt *= Decimal(str(666.67))
                elif '顷' in match.group():
                    multiple_cnt *= 10000
                elif re.search('千米|公里|k[mM㎡]', match.group()):
                    multiple_cnt *= 1000000

        # Assemble "<number><unit>"
        digit = str(digit * multiple_cnt) + unit

    return _proportion, digit
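
# Illustrative example (input made up):
#   extract_proportion("总建筑面积约1.2万平方米")
#   should yield ("总建筑面积约1.2万平方米", "12000.0㎡")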


def extract_usearea(content, has_preffix=True):
    # Extract a land-use area expression (用地面积, 占地面积, 使用面积 ...) from text
    # and normalise it into "<number><unit>".
    if not content:
        return "", ""
    suffix = "[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
    reg_dict = {
        0: "(?P<proportion>(总((用地|占地|使用)(面积|规模)|长|长度))" + suffix,
        1: "(?P<proportion>((用地|占地|使用)(面积|规模)|全长)" + suffix,
        2: "(?P<proportion>((用地|占地|使用)?面积)" + suffix
    }

    if not has_preffix:
        reg_dict[3] = "(?P<proportion>" + suffix

    _proportion = ""
    for i in range(len(reg_dict)):
        if _proportion:
            break
        _pattern = reg_dict.get(i)
        match = re.search(_pattern, str(content))
        if match:
            _proportion = match.groupdict().get("proportion", "")

    if not _proportion:
        return "", ""

    # Normalise the matched expression
    multiple_cnt = 1
    digit = ""

    # Numeric part
    match = re.search('(?P<d1>[\d,]+)(?P<d2>(\.\d+)?)', _proportion)
    if match:
        d1 = match.group('d1')
        d2 = match.group('d2')
        try:
            d1 = int(re.sub(',', '', d1))
        except:
            return "", ""
        if d2:
            d2 = Decimal(d2[1:]) / Decimal(str(int(10 ** len(d2[1:]))))
            d1 += d2
        digit = d1

        # Chinese multiplier characters (十/百/千/万/亿)
        _proportion2 = re.sub(re.escape(match.group()), '', _proportion)
        match = re.search('[十百千万亿]+', _proportion2)
        _dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}
        if match:
            for c in match.group():
                multiple_cnt *= _dict.get(c)
            _proportion3 = re.sub(re.escape(match.group()), '', _proportion2)
        else:
            _proportion3 = _proportion2

        # Area or length unit
        match = re.search('[平方㎡顷亩]+|[mM]2', _proportion3)
        if match:
            unit = '㎡'
        else:
            unit = 'm'

        # Unit multiplier (千米/公里/亩/顷 ...)
        match = re.search('[平方kK千万公㎡mM米里顷亩]+2?', _proportion3)
        if match:
            if unit == 'm':
                if re.search('[kK千公]', match.group()):
                    multiple_cnt *= 1000
                elif re.search('[里]', match.group()):
                    multiple_cnt *= Decimal(str(500))
            else:
                if '亩' in match.group():
                    multiple_cnt *= Decimal(str(666.67))
                elif '顷' in match.group():
                    multiple_cnt *= 10000
                elif re.search('千米|公里|k[mM㎡]', match.group()):
                    multiple_cnt *= 1000000

        # Assemble "<number><unit>"
        digit = str(digit * multiple_cnt) + unit

    return _proportion, digit
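
# Illustrative example (input made up):
#   extract_usearea("总用地面积50亩")
#   should yield ("总用地面积50亩", "33333.50㎡"), using 666.67㎡ per 亩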


def extract_env_invest(content):
    pattern = "环保投资[大概约为是::]*(?P<invs>\d+(\.\d+)?万?元)"

    match = re.search(pattern,content)
    if match is not None:
        invest = match.groupdict().get("invs","")
        money = getUnifyMoney(invest)
        if money>0:
            return money
    return ""


def extract_moneyuse(content):
    # Collect short sentences that state a cost item (费用/预备费/费) together with an amount.
    list_sentences = re.split(",|。",content)
    list_data = []
    pattern = "^.{,20}(?:费用|预备费|费)[大概约为是::]*\d+(\.\d+)?万?元.{,20}$"
    for sentence in list_sentences:
        match = re.search(pattern,sentence)
        if match is not None:
            list_data.append(sentence)
    return ",".join(list_data)


def get_approval_data(ots_client,ots_capacity,docid):

    bool_query = BoolQuery(must_queries=[
        TermQuery("docid",docid)
    ])
    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                   SearchQuery(bool_query),
                                                                   ColumnsToGet(["doctitle","project_name","page_time","project_code","approval_json","extract_json"],return_type=ColumnReturnType.SPECIFIED))
    list_data = getRow_ots(rows)
    list_approval = []
    for _d in list_data:
        approval_json = _d.get("approval_json")
        partitionkey = _d.get("partitionkey")
        docid = _d.get("docid")
        doctitle = _d.get("doctitle")
        project_name = _d.get("project_name")
        page_time = _d.get("page_time")
        extract_json = _d.get("extract_json")

        _d_html = {"partitionkey":partitionkey,"docid":docid}
        _html = Document(_d_html)
        _html.fix_columns(ots_capacity,["dochtmlcon"],True)
        dochtml = _html.getProperties().get("dochtmlcon","")
        doctextcon = BeautifulSoup(dochtml,"lxml").get_text()
        attachmenttextcon = ""
        try:
            _extract = json.loads(extract_json)
        except Exception as e:
            _extract = {}
        proportion = _extract.get("pb",{}).get("proportion")
        _,usearea = extract_usearea(doctextcon+attachmenttextcon)
        env_invest = extract_env_invest(doctextcon+attachmenttextcon)
        moneyuse = extract_moneyuse(doctextcon+attachmenttextcon)

        if approval_json:
            list_approval = json.loads(approval_json)
            for _appr in list_approval:
                _appr["partitionkey"] = partitionkey
                _appr["docid"] = docid
                _appr["doctitle"] = doctitle
                _appr["page_time"] = page_time
                _appr["proportion"] = proportion
                _appr["usearea"] = usearea
                _appr["env_invest"] = env_invest
                _appr["moneyuse"] = moneyuse

                fix_area(ots_client,_appr)

                # values taken from this approval's own construction_scale take precedence
                construction_scale = _appr.get("construction_scale","")
                cs_proportion,_ = extract_proportion(construction_scale)
                if cs_proportion!="":
                    _appr["proportion"] = cs_proportion
                _,cs_usearea = extract_usearea(construction_scale)
                if cs_usearea!="":
                    _appr["usearea"] = cs_usearea
                cs_env_invest = extract_env_invest(construction_scale)
                if cs_env_invest!="":
                    _appr["env_invest"] = cs_env_invest
                cs_moneyuse = extract_moneyuse(construction_scale)
                if cs_moneyuse!="":
                    _appr["moneyuse"] = cs_moneyuse

    return list_approval
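
# Illustrative call (docid taken from the commented-out sample in merge_approval below):
#   approvals = get_approval_data(ots_client, ots_capacity, 400066972170)
# Each element is one approval record enriched with docid/doctitle/page_time plus the
# normalised proportion/usearea/env_invest/moneyuse fields.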


def check_approval(appr1,appr2):
    check_keys = ["declare_company","construct_company","total_tendereeMoney","proportion","usearea","doc_num","project_code"]
    same_count = 0
    for k in check_keys:
        if k in appr1 and k in appr2:
            if appr1[k]==appr2[k] and appr1[k] is not None and appr1[k]!="":
                same_count += 1

    if same_count>=1:
        return True
    return False
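
# Two approval records are treated as belonging to the same project as soon as one of
# the checked keys (e.g. doc_num or construct_company) carries the same non-empty value.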


def merge_approval_real(ots_client,ots_capacity,approval):
    # Search for other announcements of the same approval and return them together
    # with the given approval.
    doc_num = approval.get("doc_num","")
    doctitle = approval.get("doctitle","")
    project_name = approval.get("project_name","")
    project_code = approval.get("project_code","")

    docid = approval.get("docid")
    should_queries = []

    if doc_num!="":
        should_queries.append(MatchPhraseQuery("doctitle",doc_num))
        should_queries.append(MatchPhraseQuery("doctextcon",doc_num))
        should_queries.append(MatchPhraseQuery("attachmenttextcon",doc_num))
    if doctitle!="":
        should_queries.append(MatchPhraseQuery("doctitle",doctitle))
        should_queries.append(MatchPhraseQuery("doctextcon",doctitle))
        should_queries.append(MatchPhraseQuery("attachmenttextcon",doctitle))
    if project_name!="":
        should_queries.append(MatchPhraseQuery("doctitle",project_name))
        should_queries.append(MatchPhraseQuery("doctextcon",project_name))
        should_queries.append(MatchPhraseQuery("attachmenttextcon",project_name))
    if project_code!="":
        should_queries.append(MatchPhraseQuery("doctitle",project_code))
        should_queries.append(MatchPhraseQuery("doctextcon",project_code))
        should_queries.append(MatchPhraseQuery("attachmenttextcon",project_code))

    _query = BoolQuery(should_queries=should_queries,must_not_queries=[TermQuery("docid",docid)])
    bool_query = BoolQuery(must_queries=[
        RangeQuery("status",201,301),
        _query
    ])
    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                   SearchQuery(bool_query),
                                                                   ColumnsToGet(["doctitle","page_time","project_name","project_code","approval_json","extract_json"],return_type=ColumnReturnType.SPECIFIED))
    list_data = getRow_ots(rows)
    approvals = [approval]
    for _d in list_data:
        approval_json = _d.get("approval_json")
        partitionkey = _d.get("partitionkey")
        docid = _d.get("docid")
        doctitle = _d.get("doctitle")
        project_name = _d.get("project_name")
        page_time = _d.get("page_time")
        extract_json = _d.get("extract_json")

        _d_html = {"partitionkey":partitionkey,"docid":docid}
        _html = Document(_d_html)
        _html.fix_columns(ots_capacity,["dochtmlcon"],True)
        dochtml = _html.getProperties().get("dochtmlcon","")
        doctextcon = BeautifulSoup(dochtml,"lxml").get_text()
        attachmenttextcon = ""

        try:
            _extract = json.loads(extract_json)
        except Exception as e:
            _extract = {}
        proportion = _extract.get("pb",{}).get("proportion")
        _,usearea = extract_usearea(doctextcon+attachmenttextcon)
        env_invest = extract_env_invest(doctextcon+attachmenttextcon)
        moneyuse = extract_moneyuse(doctextcon+attachmenttextcon)
        if approval_json:
            list_approval = json.loads(approval_json)
            for _appr in list_approval:
                _appr["partitionkey"] = partitionkey
                _appr["docid"] = docid
                _appr["doctitle"] = doctitle
                _appr["page_time"] = page_time
                _appr["proportion"] = proportion
                _appr["usearea"] = usearea
                _appr["env_invest"] = env_invest
                _appr["moneyuse"] = moneyuse

                fix_area(ots_client,_appr)

                # values taken from this approval's own construction_scale take precedence
                construction_scale = _appr.get("construction_scale","")
                cs_proportion,_ = extract_proportion(construction_scale)
                if cs_proportion!="":
                    _appr["proportion"] = cs_proportion
                _,cs_usearea = extract_usearea(construction_scale)
                if cs_usearea!="":
                    _appr["usearea"] = cs_usearea
                cs_env_invest = extract_env_invest(construction_scale)
                if cs_env_invest!="":
                    _appr["env_invest"] = cs_env_invest
                cs_moneyuse = extract_moneyuse(construction_scale)
                if cs_moneyuse!="":
                    _appr["moneyuse"] = cs_moneyuse
                if check_approval(approval,_appr):
                    approvals.append(_appr)
    return approvals
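
# Candidate announcements are retrieved by phrase-matching the approval's doc_num,
# doctitle, project_name and project_code against doctitle/doctextcon/attachmenttextcon,
# and only kept when check_approval() confirms a shared non-empty key field.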


def get_enterprise_area(ots_client,name):
    # Look up an enterprise's province/city/district from the enterprise table.
    if not name:
        # skip the lookup for empty names
        return {}
    bool_query = BoolQuery(must_queries=[
        TermQuery("name",name)
    ])
    rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
                                                                   SearchQuery(bool_query),
                                                                   ColumnsToGet(["province","city","district"],return_type=ColumnReturnType.SPECIFIED))
    list_data = getRow_ots(rows)
    _d = {}
    if len(list_data)>0:
        _d["province"] = list_data[0].get("province","")
        _d["city"] = list_data[0].get("city","")
        _d["district"] = list_data[0].get("district","")
    return _d


def area_count(_d):
    keys = ["province","city","district"]
    return sum([1 if _d.get(k,"") not in ("","全国","未知") else 0 for k in keys])
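
# e.g. area_count({"province": "广东", "city": "深圳", "district": ""}) == 2;
# empty strings, "全国" and "未知" do not count as a resolved area.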


def fix_area(ots_client,appr):
    # Fill in missing province/city/district from the registered areas of the companies involved.
    if appr.get("district","")!="":
        return
    declare_company = appr.get("declare_company","")
    _d = get_enterprise_area(ots_client,declare_company)
    if area_count(_d)>area_count(appr):
        appr.update(_d)

    construct_company = appr.get("construct_company","")
    _d = get_enterprise_area(ots_client,construct_company)
    if area_count(_d)>area_count(appr):
        appr.update(_d)

    approver = appr.get("approver","")
    _d = get_enterprise_area(ots_client,approver)
    if area_count(_d)>area_count(appr):
        appr.update(_d)

    compilation_unit = appr.get("compilation_unit","")
    _d = get_enterprise_area(ots_client,compilation_unit)
    if area_count(_d)>area_count(appr):
        appr.update(_d)

    publisher = appr.get("publisher","")
    _d = get_enterprise_area(ots_client,publisher)
    if area_count(_d)>area_count(appr):
        appr.update(_d)


def generate_projects(approvals):
    project_id = str(uuid4())
    approvals.sort(key=lambda x:x.get("page_time",""),reverse=False)
    _dict = {}
    for appr in approvals:
        _d = {}
        _d_area = {}
        for k,v in appr.items():
            if v is not None and v!="":
                if k in ("province","city","district"):
                    _d_area[k] = v
                else:
                    _d[k] = v
        if _dict.get("province","")=="" and _d_area.get("province","")!="":
            _dict.update(_d_area)
        if _dict.get("city","")=="" and _d_area.get("city","")!="":
            _dict.update(_d_area)
        if _dict.get("district","")=="" and _d_area.get("district","")!="":
            _dict.update(_d_area)
        _dict.update(_d)
    _dict["id"] = project_id
    return _dict
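
# generate_projects collapses one group of matched approvals into a single project dict:
# records are applied in page_time order, so later non-empty values win, while the
# province/city/district block is only refreshed as long as part of it is still missing.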


def merge_approval():
    ots_client = getConnect_ots()
    ots_capacity = getConnect_ots_capacity()

    list_data = []

    # Alternative input: read the docids from an exported Excel sheet instead of querying OTS.
    # filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-11\20241104审批项目公告_审批要素.xlsx"
    # df = pd.read_excel(filename)
    # _count = 0
    # for docid in df["公告id"]:
    #     docid = int(docid)
    #     _count += 1
    #     # if _count>3000:
    #     #     break
    #     # if docid!=400066972170 and docid!=400066972181:
    #     #     continue
    #     # list_approval = get_approval_data(ots_client,docid)
    #     # if list_approval:
    #     #     list_data.extend(list_approval)
    #     list_data.append(docid)

    bool_query = BoolQuery(must_queries=[
        RangeQuery("status",201,301),
        TermQuery("page_time","2024-11-04"),
        TermQuery("docchannel",302),
    ])
    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time")]),limit=100,get_total_count=True),
                                                                   ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
    list_row = getRow_ots(rows)
    for _data in list_row:
        list_data.append(_data.get("docid"))

    while next_token:
        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                       ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
        list_row = getRow_ots(rows)
        for _data in list_row:
            list_data.append(_data.get("docid"))
        print("%d/%d"%(len(list_data),total_count))
        # if len(list_data)>=2000:
        #     break

    task_queue = Queue()
    for _data in list_data:
        task_queue.put(_data)

    result_queue = Queue()

    def merge_approval_handle(docid,result_queue):
        print("docid",docid)
        list_approval = get_approval_data(ots_client,ots_capacity,docid)
        if list_approval:
            for appr in list_approval:
                approvals = merge_approval_real(ots_client,ots_capacity,appr)
                result_queue.put(approvals)

    # process the docids with 30 worker threads
    mt = MultiThreadHandler(task_queue,merge_approval_handle,result_queue,30)
    mt.run()

    list_approvals = []
    try:
        while 1:
            item = result_queue.get(timeout=1)
            list_approvals.append(item)
    except:
        pass

    data_approval = []
    data_approvals_p = []
    for approvals in list_approvals:
        _project = generate_projects(approvals)
        _project_id = _project.get("id")

        for _approval in approvals:
            _d = {"项目id":_project_id}
            for k,v in key_trans_d.items():
                if k in _approval:
                    _d[v] = _approval[k]
                else:
                    _d[v] = ""
            data_approval.append(_d)

        _d = {"项目id":_project_id}
        for k,v in key_trans.items():
            if k in _project:
                _d[v] = _project[k]
            else:
                _d[v] = ""
        data_approvals_p.append(_d)

    df_approval = pd.DataFrame(data_approval)
    df_approvals_p = pd.DataFrame(data_approvals_p)
    df_approval.to_excel("a.xlsx")
    df_approvals_p.to_excel("b.xlsx")


if __name__ == '__main__':
    merge_approval()