@@ -0,0 +1,136 @@
+import time
+import pandas as pd
+from sqlalchemy import create_engine
+from tablestore import INF_MIN, INF_MAX, CompositeColumnCondition, LogicalOperator, SingleColumnCondition, \
+    ComparatorType, Direction, OTSClientError, OTSServiceError
+from BaseDataMaintenance.dataSource.source import getConnect_ots
+
+ots_client = getConnect_ots()
+
+
+def get_one_col_all_data():
+    table_name = 'designed_project'
+
+    # Start primary key of the range read (scan from the first row).
+    inclusive_start_primary_key = [('partitionkey', INF_MIN), ('id', INF_MIN)]
+
+    # End primary key of the range read (scan to the last row).
+    exclusive_end_primary_key = [('partitionkey', INF_MAX), ('id', INF_MAX)]
+
+    # Read only the project_investment attribute column.
+    columns_to_get = ['project_investment']
+
+    # Return at most 90 rows per call. If there are 100 results in total and limit=90, the first call returns
+    # at most 90 rows (possibly as few as 0), but next_start_primary_key will not be None.
+    limit = 90
+
+    # Set up a filter with column conditions. The sample condition is: address equals 'China' AND age is less than 50.
+    cond = CompositeColumnCondition(LogicalOperator.AND)
+
+    # When a row does not contain the filtered column, pass_if_missing decides whether the row passes the filter.
+    # If pass_if_missing is unset or True, a row that lacks the column passes the filter.
+    # If pass_if_missing is False, a row that lacks the column does not pass the filter.
+    # cond.add_sub_condition(SingleColumnCondition("address", 'China', ComparatorType.EQUAL, pass_if_missing=False))
+    # cond.add_sub_condition(SingleColumnCondition("age", 50, ComparatorType.LESS_THAN, pass_if_missing=False))
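+    # For this table, an illustrative filter on the project_investment column might look like the line below
+    # (the comparison value is an assumption, not something defined elsewhere in this script):
+    # cond.add_sub_condition(SingleColumnCondition('project_investment', '', ComparatorType.NOT_EQUAL, pass_if_missing=False))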
+
+    all_rows = []
+    start_time1 = time.time()
+    try:
+        # Call get_range for the first page of results.
+        consumed, next_start_primary_key, row_list, next_token = ots_client.get_range(
+            table_name,
+            Direction.FORWARD,
+            inclusive_start_primary_key,
+            exclusive_end_primary_key,
+            columns_to_get,
+            limit,
+            # column_filter=cond,
+            max_version=1,
+            # time_range=(1557125059000, 1557129059000)  # start_time >= 1557125059000 and end_time < 1557129059000.
+        )
+        all_rows.extend(row_list)
+
+        # Keep reading while next_start_primary_key is not None, i.e. there are more pages.
+        index = 0
+        start_time = time.time()
+        while next_start_primary_key is not None:
+            if index % 1000 == 0:
+                print('Loop', (index+2)*limit, time.time()-start_time)
+                start_time = time.time()
+            inclusive_start_primary_key = next_start_primary_key
+            consumed, next_start_primary_key, row_list, next_token = ots_client.get_range(
+                table_name,
+                Direction.FORWARD,
+                inclusive_start_primary_key,
+                exclusive_end_primary_key,
+                columns_to_get,
+                limit,
+                # column_filter=cond,
+                max_version=1
+            )
+            all_rows.extend(row_list)
+            index += 1
+
+    # Client-side exceptions, usually caused by invalid parameters or network problems.
+    except OTSClientError as e:
+        print('get row failed, http_status:%d, error_message:%s' % (e.get_http_status(), e.get_error_message()))
+    # Server-side exceptions, usually caused by invalid parameters or throttling.
+    except OTSServiceError as e:
+        print('get row failed, http_status:%d, error_code:%s, error_message:%s, request_id:%s' % (e.get_http_status(), e.get_error_code(), e.get_error_message(), e.get_request_id()))
+
+    # Collect each row's primary key values and the attribute column value.
+    result_rows = []
+    for row in all_rows:
+        # print(row.primary_key, row.attribute_columns)
+        result_rows.append([row.primary_key[0][1], row.primary_key[1][1], row.attribute_columns[0][1]])
+    print('Total rows: ', len(all_rows), 'Total time: ', time.time() - start_time1)
+    print(result_rows[0])
+    print(result_rows[1])
+
+    return result_rows
+
+
+def list_to_xlsx(data_list):
+    # Despite the function name, the rows are written out as a CSV file.
+    df = pd.DataFrame(data_list)
+    df.columns = ['partitionkey', 'id', 'project_investment']
+    df.to_csv('D:\\BIDI_DOC\\比地_文档\\统一格式_project_investment.csv', index=False)
+
+
+def csv_to_mysql():
+    mysql_host = '192.168.2.170:3306'
+    mysql_db = 'exportdb'
+    mysql_user = 'root'
+    mysql_pwd = 'pwdformysql0922'
+    mysql_table = 'bdm_one_col_format_unify'
+    csv_path = r'D:\BIDI_DOC\比地_文档\统一格式_project_investment.csv'
+
+    engine = create_engine('mysql+pymysql://{}:{}@{}/{}?charset=utf8'.format(mysql_user, mysql_pwd, mysql_host, mysql_db))
+    # df = pd.read_excel(csv_path)
+    df = pd.read_csv(csv_path)
+    # Append to the target table; do not insert the DataFrame index as a column.
+    df.to_sql(mysql_table, con=engine, if_exists='append', index=False)
+ """
|
|
|
|
+ to_sql参数:(比较重要)
|
|
|
|
+ if_exists:表如果存在怎么处理
|
|
|
|
+ append:追加
|
|
|
|
+ replace:删除原表,建立新表再添加
|
|
|
|
+ fail:什么都不干
|
|
|
|
+ chunksize: 默认的话是一次性导入, 给值的话是批量导入,一批次导入多少
|
|
|
|
+ index=False:不插入索引index
|
|
|
|
+ dtype 创建表结构
|
|
|
|
+ 需要导入 import sqlalchemy
|
|
|
|
+ dtype = {'id': sqlalchemy.types.BigInteger(),
|
|
|
|
+ 'name': sqlalchemy.types.String(length=20),
|
|
|
|
+ 'sex': sqlalchemy.types.String(length=20),
|
|
|
|
+ 'age': sqlalchemy.types.BigInteger(),
|
|
|
|
+ })
|
|
|
|
+
|
|
|
|
+ """
+
+
+if __name__ == '__main__':
+    # _list = get_one_col_all_data()
+    # list_to_xlsx(_list)
+    csv_to_mysql()
+
+    # Fetch every value of the single column, upload it to MaxCompute to normalize it into a unified format, then write the results back.
+    # The data is pulled locally because MaxCompute needs a fixed data type per column, and a column containing mixed types raises an error.