Clone Strategy
In [21]:
import matplotlib
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Input, Layer, MaxPool2D, AveragePooling2D, Dropout, Lambda, Concatenate, BatchNormalization, Flatten, Dense, Conv2D
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.utils import plot_model
import pickle
import numpy as np
import tensorflow.keras as keras
import tensorflow as tf
from tensorflow.keras.losses import CategoricalCrossentropy
import time
# import tensorflow_probability as tfp
from tqdm import tqdm
from datetime import date, timedelta
# from tensorflow.image import extract_patches
import matplotlib.pyplot as plt

import os
# os.environ["CUDA_VISIBLE_DEVICES"]="-1"    
# import tensorflow as tf

# Enumerate physical devices; enable memory growth so TensorFlow allocates
# GPU memory on demand instead of reserving it all at startup.
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
cpus = tf.config.experimental.list_physical_devices(device_type='CPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
gpus, cpus
Out[21]:
([PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
  PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'),
  PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'),
  PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')],
 [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')])
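
With four GPUs visible, it can be handy to pin the notebook to a single device rather than disabling CUDA outright. A minimal sketch (not from the original notebook; must run before any tensors are created):

import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Make only the first GPU visible to this process, and grow its memory on demand.
    tf.config.set_visible_devices(gpus[0], 'GPU')
    tf.config.experimental.set_memory_growth(gpus[0], True)
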
In [3]:
m7.data.read().shape
Out[3]:
(3605843, 18)

    {"Description":"实验创建于2017/8/26","Summary":"","Graph":{"EdgesInternal":[{"DestinationInputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-15:instruments","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-8:data"},{"DestinationInputPortId":"-585:features_ds","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24:data"},{"DestinationInputPortId":"-22014:input_3","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24:data"},{"DestinationInputPortId":"-760:features","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24:data"},{"DestinationInputPortId":"-222:input_data","SourceOutputPortId":"-215:data"},{"DestinationInputPortId":"-760:train_ds","SourceOutputPortId":"-222:data"},{"DestinationInputPortId":"-760:test_ds","SourceOutputPortId":"-222:data"},{"DestinationInputPortId":"-215:features","SourceOutputPortId":"-585:data"},{"DestinationInputPortId":"-222:features","SourceOutputPortId":"-585:data"},{"DestinationInputPortId":"-215:instruments","SourceOutputPortId":"-8309:data"},{"DestinationInputPortId":"-22014:input_1","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-15:data"},{"DestinationInputPortId":"-22014:input_2","SourceOutputPortId":"-760:train_data"}],"ModuleNodes":[{"Id":"287d2cb0-f53c-4101-bdf8-104b137c8601-8","ModuleId":"BigQuantSpace.instruments.instruments-v2","ModuleParameters":[{"Name":"start_date","Value":"2020-01-01","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"2021-05-01","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"market","Value":"CN_STOCK_A","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"instrument_list","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"max_count","Value":"0","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"rolling_conf","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-8"}],"OutputPortsInternal":[{"Name":"data","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-8","OutputType":null}],"UsePreviousResults":false,"moduleIdForCode":1,"IsPartOfPartialRun":null,"Comment":"训练集标签","CommentCollapsed":false},{"Id":"287d2cb0-f53c-4101-bdf8-104b137c8601-24","ModuleId":"BigQuantSpace.input_features.input_features-v1","ModuleParameters":[{"Name":"features","Value":"correlation(open_0,high_0,10)\ncorrelation(open_0,low_0,10)\ncorrelation(open_0,close_0,10)\ncorrelation(open_0,amount_0/volume_0,10)\ncorrelation(open_0,volume_0,10)\ncorrelation(open_0,return_1,10)\ncorrelation(open_0,turn_0,10)\ncorrelation(open_0,open_0/turn_0,10)\ncorrelation(open_0,volume_0/low_0,10)\ncorrelation(open_0,amount_0/volume_0/high_0,10)\ncorrelation(open_0,low_0/high_0,10)\ncorrelation(open_0,amount_0/volume_0/close_0,10)\ncorrelation(high_0,low_0,10)\ncorrelation(high_0,close_0,10)\ncorrelation(high_0,amount_0/volume_0,10)\ncorrelation(high_0,volume_0,10)\ncorrelation(high_0,return_1,10)\ncorrelation(high_0,turn_0,10)\ncorrelation(high_0,open_0/turn_0,10)\ncorrelation(high_0,volume_0/low_0,10)\ncorrelation(high_0,amount_0/volume_0/high_0,10)\ncorrelation(high_0,low_0/high_0,10)\ncorrelation(high_0,amount_0/volume_0/close_0,10)\ncorrelation(low_0,close_0,10)\ncorrelation(low_0,amount_0/volume_0,10)\ncorrelation(low_0,volume_0,10)\ncorrelation(low_0,return_1,10)\ncorrelation(low_0,turn_0,10)\ncorrelation(low_0,open_0/turn_0,10)\ncorrelation(low_0,volume_0/low_0,10)\ncorrelation(low_0,amount_0/volume_0/high_0,10)\ncorrelation(low_0,low_0/high_0,10)\ncorrelation(low_0,amoun
t_0/volume_0/close_0,10)\ncorrelation(close_0,amount_0/volume_0,10)\ncorrelation(close_0,volume_0,10)\ncorrelation(close_0,return_1,10)\ncorrelation(close_0,turn_0,10)\ncorrelation(close_0,open_0/turn_0,10)\ncorrelation(close_0,volume_0/low_0,10)\ncorrelation(close_0,amount_0/volume_0/high_0,10)\ncorrelation(close_0,low_0/high_0,10)\ncorrelation(close_0,amount_0/volume_0/close_0,10)\ncorrelation(amount_0/volume_0,volume_0,10)\ncorrelation(amount_0/volume_0,return_1,10)\ncorrelation(amount_0/volume_0,turn_0,10)\ncorrelation(amount_0/volume_0,open_0/turn_0,10)\ncorrelation(amount_0/volume_0,volume_0/low_0,10)\ncorrelation(amount_0/volume_0,amount_0/volume_0/high_0,10)\ncorrelation(amount_0/volume_0,low_0/high_0,10)\ncorrelation(amount_0/volume_0,amount_0/volume_0/close_0,10)\ncorrelation(volume_0,return_1,10)\ncorrelation(volume_0,turn_0,10)\ncorrelation(volume_0,open_0/turn_0,10)\ncorrelation(volume_0,volume_0/low_0,10)\ncorrelation(volume_0,amount_0/volume_0/high_0,10)\ncorrelation(volume_0,low_0/high_0,10)\ncorrelation(volume_0,amount_0/volume_0/close_0,10)\ncorrelation(return_1,turn_0,10)\ncorrelation(return_1,open_0/turn_0,10)\ncorrelation(return_1,volume_0/low_0,10)\ncorrelation(return_1,amount_0/volume_0/high_0,10)\ncorrelation(return_1,low_0/high_0,10)\ncorrelation(return_1,amount_0/volume_0/close_0,10)\ncorrelation(turn_0,open_0/turn_0,10)\ncorrelation(turn_0,volume_0/low_0,10)\ncorrelation(turn_0,amount_0/volume_0/high_0,10)\ncorrelation(turn_0,low_0/high_0,10)\ncorrelation(turn_0,amount_0/volume_0/close_0,10)\ncorrelation(open_0/turn_0,volume_0/low_0,10)\ncorrelation(open_0/turn_0,amount_0/volume_0/high_0,10)\ncorrelation(open_0/turn_0,low_0/high_0,10)\ncorrelation(open_0/turn_0,amount_0/volume_0/close_0,10)\ncorrelation(volume_0/low_0,amount_0/volume_0/high_0,10)\ncorrelation(volume_0/low_0,low_0/high_0,10)\ncorrelation(volume_0/low_0,amount_0/volume_0/close_0,10)\ncorrelation(amount_0/volume_0/high_0,low_0/high_0,10)\ncorrelation(amount_0/volume_0/high_0,amount_0/volume_0/close_0,10)\ncorrelation(low_0/high_0,amount_0/volume_0/close_0,10)\ncovariance(open_0,high_0,10)\ncovariance(open_0,low_0,10)\ncovariance(open_0,close_0,10)\ncovariance(open_0,amount_0/volume_0,10)\ncovariance(open_0,volume_0,10)\ncovariance(open_0,return_1,10)\ncovariance(open_0,turn_0,10)\ncovariance(open_0,open_0/turn_0,10)\ncovariance(open_0,volume_0/low_0,10)\ncovariance(open_0,amount_0/volume_0/high_0,10)\ncovariance(open_0,low_0/high_0,10)\ncovariance(open_0,amount_0/volume_0/close_0,10)\ncovariance(high_0,low_0,10)\ncovariance(high_0,close_0,10)\ncovariance(high_0,amount_0/volume_0,10)\ncovariance(high_0,volume_0,10)\ncovariance(high_0,return_1,10)\ncovariance(high_0,turn_0,10)\ncovariance(high_0,open_0/turn_0,10)\ncovariance(high_0,volume_0/low_0,10)\ncovariance(high_0,amount_0/volume_0/high_0,10)\ncovariance(high_0,low_0/high_0,10)\ncovariance(high_0,amount_0/volume_0/close_0,10)\ncovariance(low_0,close_0,10)\ncovariance(low_0,amount_0/volume_0,10)\ncovariance(low_0,volume_0,10)\ncovariance(low_0,return_1,10)\ncovariance(low_0,turn_0,10)\ncovariance(low_0,open_0/turn_0,10)\ncovariance(low_0,volume_0/low_0,10)\ncovariance(low_0,amount_0/volume_0/high_0,10)\ncovariance(low_0,low_0/high_0,10)\ncovariance(low_0,amount_0/volume_0/close_0,10)\ncovariance(close_0,amount_0/volume_0,10)\ncovariance(close_0,volume_0,10)\ncovariance(close_0,return_1,10)\ncovariance(close_0,turn_0,10)\ncovariance(close_0,open_0/turn_0,10)\ncovariance(close_0,volume_0/low_0,10)\ncovariance(close_0,amount_0/volume_0/high_0,10)\ncov
ariance(close_0,low_0/high_0,10)\ncovariance(close_0,amount_0/volume_0/close_0,10)\ncovariance(amount_0/volume_0,volume_0,10)\ncovariance(amount_0/volume_0,return_1,10)\ncovariance(amount_0/volume_0,turn_0,10)\ncovariance(amount_0/volume_0,open_0/turn_0,10)\ncovariance(amount_0/volume_0,volume_0/low_0,10)\ncovariance(amount_0/volume_0,amount_0/volume_0/high_0,10)\ncovariance(amount_0/volume_0,low_0/high_0,10)\ncovariance(amount_0/volume_0,amount_0/volume_0/close_0,10)\ncovariance(volume_0,return_1,10)\ncovariance(volume_0,turn_0,10)\ncovariance(volume_0,open_0/turn_0,10)\ncovariance(volume_0,volume_0/low_0,10)\ncovariance(volume_0,amount_0/volume_0/high_0,10)\ncovariance(volume_0,low_0/high_0,10)\ncovariance(volume_0,amount_0/volume_0/close_0,10)\ncovariance(return_1,turn_0,10)\ncovariance(return_1,open_0/turn_0,10)\ncovariance(return_1,volume_0/low_0,10)\ncovariance(return_1,amount_0/volume_0/high_0,10)\ncovariance(return_1,low_0/high_0,10)\ncovariance(return_1,amount_0/volume_0/close_0,10)\ncovariance(turn_0,open_0/turn_0,10)\ncovariance(turn_0,volume_0/low_0,10)\ncovariance(turn_0,amount_0/volume_0/high_0,10)\ncovariance(turn_0,low_0/high_0,10)\ncovariance(turn_0,amount_0/volume_0/close_0,10)\ncovariance(open_0/turn_0,volume_0/low_0,10)\ncovariance(open_0/turn_0,amount_0/volume_0/high_0,10)\ncovariance(open_0/turn_0,low_0/high_0,10)\ncovariance(open_0/turn_0,amount_0/volume_0/close_0,10)\ncovariance(volume_0/low_0,amount_0/volume_0/high_0,10)\ncovariance(volume_0/low_0,low_0/high_0,10)\ncovariance(volume_0/low_0,amount_0/volume_0/close_0,10)\ncovariance(amount_0/volume_0/high_0,low_0/high_0,10)\ncovariance(amount_0/volume_0/high_0,amount_0/volume_0/close_0,10)\ncovariance(low_0/high_0,amount_0/volume_0/close_0,10)\nstd(open_0,10)\nstd(high_0,10)\nstd(low_0,10)\nstd(close_0,10)\nstd(amount_0/volume_0,10)\nstd(volume_0,10)\nstd(return_1,10)\nstd(turn_0,10)\nstd(open_0/turn_0,10)\nstd(volume_0/low_0,10)\nstd(amount_0/volume_0/high_0,10)\nstd(low_0/high_0,10)\nstd(amount_0/volume_0/close_0,10)\nmean(open_0,10)/std(open_0,10)\nmean(high_0,10)/std(high_0,10)\nmean(low_0,10)/std(low_0,10)\nmean(close_0,10)/std(close_0,10)\nmean(amount_0/volume_0,10)/std(amount_0/volume_0,10)\nmean(volume_0,10)/std(volume_0,10)\nmean(return_1,10)/std(return_1,10)\nmean(turn_0,10)/std(turn_0,10)\nmean(open_0/turn_0,10)/std(open_0/turn_0,10)\nmean(volume_0/low_0,10)/std(volume_0/low_0,10)\nmean(amount_0/volume_0/high_0,10)/std(amount_0/volume_0/high_0,10)\nmean(low_0/high_0,10)/std(low_0/high_0,10)\nmean(amount_0/volume_0/close_0,10)/std(amount_0/volume_0/close_0,10)\n(open_0/turn_0-shift(open_0/turn_0,10))/shift(open_0/turn_0,10)-1\n(high_0-shift(high_0,10))/shift(high_0,10)-1\n(low_0/high_0-shift(low_0/high_0,10))/shift(low_0/high_0,10)-1\n(close_0-shift(close_0,10))/shift(close_0,10)-1\n(amount_0/volume_0/close_0-shift(amount_0/volume_0/close_0,10))/shift(amount_0/volume_0/close_0,10)-1\n(volume_0/low_0-shift(volume_0/low_0,10))/shift(volume_0/low_0,10)-1\n(return_1-shift(return_1,10))/shift(return_1,10)-1\n(turn_0-shift(turn_0,10))/shift(turn_0,10)-1\n(open_0-shift(open_0,10))/shift(open_0,10)-1\n(volume_0-shift(volume_0,10))/shift(volume_0,10)-1\n(amount_0/volume_0-shift(amount_0/volume_0,10))/shift(amount_0/volume_0,10)-1\n(low_0-shift(low_0,10))/shift(low_0,10)-1\n(amount_0/volume_0/high_0-shift(amount_0/volume_0/high_0,10))/shift(amount_0/volume_0/high_0,10)-1\ndecay_linear(open_0,10)\ndecay_linear(high_0,10)\ndecay_linear(low_0,10)\ndecay_linear(close_0,10)\ndecay_linear(amount_0/volume_0,10)\ndecay_linea
r(volume_0,10)\ndecay_linear(return_1,10)\ndecay_linear(turn_0,10)\ndecay_linear(open_0/turn_0,10)\ndecay_linear(volume_0/low_0,10)\ndecay_linear(amount_0/volume_0/high_0,10)\ndecay_linear(low_0/high_0,10)\ndecay_linear(amount_0/volume_0/close_0,10)","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features_ds","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24"}],"OutputPortsInternal":[{"Name":"data","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":3,"IsPartOfPartialRun":null,"Comment":"特征","CommentCollapsed":false},{"Id":"-215","ModuleId":"BigQuantSpace.general_feature_extractor.general_feature_extractor-v7","ModuleParameters":[{"Name":"start_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"before_start_days","Value":"300","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"instruments","NodeId":"-215"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-215"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-215","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":7,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-222","ModuleId":"BigQuantSpace.derived_feature_extractor.derived_feature_extractor-v3","ModuleParameters":[{"Name":"date_col","Value":"date","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"instrument_col","Value":"instrument","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"drop_na","Value":"False","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"remove_extra_columns","Value":"True","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"user_functions","Value":"","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_data","NodeId":"-222"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-222"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-222","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":8,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-585","ModuleId":"BigQuantSpace.input_features.input_features-v1","ModuleParameters":[{"Name":"features","Value":"# #号开始的表示注释\n# 多个特征,每行一个,可以包含基础特征和衍生特征\n\nm_amount_x = mean(amount_0, 5)\nmarket_cap_float_x = market_cap_float_0\nmarket_cap_x = market_cap_0\n\nin_csi800_x = in_csi800_0\nin_csi500_x = in_csi500_0\nin_csi300_x = in_csi300_0\n\nlist_days_x = list_days_0\nindustry_sw_level1_x = industry_sw_level1_0\nst_flag_x = 
st_CN_STOCK_A__st_type","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features_ds","NodeId":"-585"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-585","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":11,"IsPartOfPartialRun":null,"Comment":"辅助特征","CommentCollapsed":true},{"Id":"-8309","ModuleId":"BigQuantSpace.instruments.instruments-v2","ModuleParameters":[{"Name":"start_date","Value":"2020-01-01","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"2021-05-01","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"market","Value":"CN_STOCK_A","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"instrument_list","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"max_count","Value":"0","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"rolling_conf","NodeId":"-8309"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-8309","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":12,"IsPartOfPartialRun":null,"Comment":"公共数据集","CommentCollapsed":false},{"Id":"287d2cb0-f53c-4101-bdf8-104b137c8601-15","ModuleId":"BigQuantSpace.advanced_auto_labeler.advanced_auto_labeler-v2","ModuleParameters":[{"Name":"label_expr","Value":"# #号开始的表示注释\n# 0. 每行一个,顺序执行,从第二个开始,可以使用label字段\n# 1. 可用数据字段见 https://bigquant.com/docs/develop/datasource/deprecated/history_data.html\n# 添加benchmark_前缀,可使用对应的benchmark数据\n# 2. 可用操作符和函数见 `表达式引擎 <https://bigquant.com/docs/develop/bigexpr/usage.html>`_\n\n# 计算收益:5日收盘价(作为卖出价格)除以明日开盘价(作为买入价格)\n\n( shift(close, -20) / shift(open, -1) -1)\n\n# 极值处理:用1%和99%分位的值做clip\nclip(label, all_quantile(label, 0.01), all_quantile(label, 0.99))\n\n# where( (label > all_quantile(label,0.8))|(label < all_quantile(label,0.2)) , label, NaN)\n\n# 将分数映射到分类,这里使用20个分类\n# all_wbins(label, 20)\n\n# 过滤掉一字涨停的情况 (设置label为NaN,在后续处理和训练中会忽略NaN的label)\nwhere( abs(shift(high, -1)-shift(low, -1)) < 1e-3, NaN, label)\n","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"start_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"benchmark","Value":"000905.SHA","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"drop_na_label","Value":"True","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"cast_label_int","Value":"False","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"user_functions","Value":"def cal_max_ret(df, close, open, benchmark_close, benchmark_open, M, N ): \n df['ret'] = df.groupby('instrument', as_index=False, sort=False, group_keys=False).apply( \\\n lambda x: (pd.concat([x['close'].shift(-i) / x['open'].shift(-1) - x['benchmark_close'].shift(-i) / x['benchmark_open'].shift(-1) for i in range(M,N+1)], axis=1)).max(axis=1,skipna=False))\n last_date = df.date.sort_values().unique()[-N]\n return df.query('date < @last_date')['ret']\n\ndef cal_max_ret_v2(df, close, open, benchmark_close, benchmark_open, M, N ):\n df['ret'] = df.groupby('instrument', as_index=False, sort=False, group_keys=False).apply( \\\n lambda x: (pd.concat([x['close'].shift(-i) / x['open'].shift(-1) for i in range(M,N+1)], axis=1)).max(axis=1,skipna=False))\n last_date = df.date.sort_values().unique()[-N]\n return df.query('date < @last_date')['ret']\n\n\ndef last_max_ret(df, close, open, benchmark_close, 
benchmark_open, M=1, N=20):\n return df.groupby('instrument', as_index=False, sort=False, group_keys=False).apply(lambda x:(pd.concat([ x['close'].shift(-N) / x['open'].shift(-i) for i in range(M,N-1) ], axis=1) ).mean(axis=1,skipna=False) )\n\n\nbigquant_run={'cal_max_ret':cal_max_ret, 'cal_max_ret_v2': cal_max_ret_v2, 'last_max_ret': last_max_ret}","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"instruments","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-15"}],"OutputPortsInternal":[{"Name":"data","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-15","OutputType":null}],"UsePreviousResults":false,"moduleIdForCode":2,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-22014","ModuleId":"BigQuantSpace.cached.cached-v3","ModuleParameters":[{"Name":"run","Value":"def bigquant_run(input_1, input_2, input_3, cap_n,vol_n,ret_n):\n import time\n import multiprocessing as mp\n train_label = input_1.read() # 标签数据\n data_set = input_2.read() # 全数据集\n feature_list = input_3.read() # 特征列表 \n start_t, end_t = train_label.date.min(), train_label.date.max()\n args_to_test = None\n\n # 训练集处理\n ########################################################################################################################################### \n # 获取数据,按日期、股票代码排序\n data_set = data_set.query('date>=@start_t and date<=@end_t').sort_values(['date','instrument']).reset_index(drop=True) \n # 缺失值检测\n data_set_checkNaN = data_set[feature_list].isna().sum(axis=0) / data_set.shape[0]\n print('-'*100,'\\n训练集:', data_set.shape, '开始日期:', data_set.date.min(), '结束日期:', data_set.date.max(),\"\\n 列缺失值检测(超过5%):\\n\",data_set_checkNaN[data_set_checkNaN > 0.05].sort_values(ascending=False))\n \n # 标记股票池\n data_set['select_pool'] = 1\n #data_set['select_pool'][data_set.eval('in_csi800_x !=1')] = 0 # 股票池\n data_set['select_pool'][data_set.eval('list_days_x <= 100')] = 0 # 上市天数 \n ####data_set['select_pool'][data_set.eval('st_flag_x != 0')] = 0 # ST状态:0:正常股票,1:ST,2:*ST,11:暂停上市\n data_set['m_amount_x_rank'] = data_set.groupby(['date'])['m_amount_x'].rank(pct=True,ascending=False) \n # data_set['select_pool'][data_set.eval('m_amount_x_rank >= 0.70')] = 0 # 流动性控制 \n data_set['select_pool'][data_set[feature_list].isna().sum(axis=1) > 5] = 0 # 缺失严重 \n data_set = data_set.query('industry_sw_level1_x > 1.0 ') # 去除异常行业\n \n \n data_set = data_set.query('select_pool == 1')\n data_set[feature_list] = data_set[feature_list].replace([np.inf, -np.inf, np.nan], 0) # 数据异常值、缺失值处理\n\n # 标签处理 \n train_label['label'] = train_label[['date','label']].groupby(['date'])['label'].rank(pct=True,ascending=True) # rank 归一化\n \n ###########################################################################################################################################\n # 训练集标签合并\n label_data = pd.merge( data_set, train_label, on=['date','instrument'], how='inner') \n data_set = label_data[['date','instrument','label','select_pool','market_cap_x','industry_sw_level1_x']+feature_list].dropna(subset=['label']) \n factor_train_data = data_set.reset_index(drop=True)\n return Outputs(data_1= None, data_2= args_to_test, data_3= DataSource.write_df(factor_train_data) )\n","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"post_run","Value":"# 后处理函数,可选。输入是主函数的输出,可以在这里对数据做处理,或者返回更友好的outputs数据格式。此函数输出不会被缓存。\ndef bigquant_run(outputs):\n return 
outputs\n","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"input_ports","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"params","Value":"{'cap_n':4,'vol_n':4, 'ret_n':50}","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"output_ports","Value":"","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_1","NodeId":"-22014"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_2","NodeId":"-22014"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_3","NodeId":"-22014"}],"OutputPortsInternal":[{"Name":"data_1","NodeId":"-22014","OutputType":null},{"Name":"data_2","NodeId":"-22014","OutputType":null},{"Name":"data_3","NodeId":"-22014","OutputType":null}],"UsePreviousResults":false,"moduleIdForCode":9,"IsPartOfPartialRun":null,"Comment":"数据集","CommentCollapsed":false},{"Id":"-760","ModuleId":"BigQuantSpace.RobustScaler.RobustScaler-v13","ModuleParameters":[{"Name":"scale_type","Value":"standard","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"quantile_range_min","Value":0.01,"ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"quantile_range_max","Value":"0.99","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"global_scale","Value":"False","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"train_ds","NodeId":"-760"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-760"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"test_ds","NodeId":"-760"}],"OutputPortsInternal":[{"Name":"train_data","NodeId":"-760","OutputType":null},{"Name":"test_data","NodeId":"-760","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":4,"Comment":"","CommentCollapsed":true}],"SerializedClientData":"<?xml version='1.0' encoding='utf-16'?><DataV1 xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><Meta /><NodePositions><NodePosition Node='287d2cb0-f53c-4101-bdf8-104b137c8601-8' Position='93,-460,200,200'/><NodePosition Node='287d2cb0-f53c-4101-bdf8-104b137c8601-24' Position='721,-597,200,200'/><NodePosition Node='-215' Position='688.3325805664062,-385.71685791015625,200,200'/><NodePosition Node='-222' Position='662.8134765625,-312.9056396484375,200,200'/><NodePosition Node='-585' Position='713,-475,200,200'/><NodePosition Node='-8309' Position='366.880615234375,-567,200,200'/><NodePosition Node='287d2cb0-f53c-4101-bdf8-104b137c8601-15' Position='90,-362,200,200'/><NodePosition Node='-22014' Position='370,-118,200,200'/><NodePosition Node='-760' Position='462,-215,200,200'/></NodePositions><NodeGroups /></DataV1>"},"IsDraft":true,"ParentExperimentId":null,"WebService":{"IsWebServiceExperiment":false,"Inputs":[],"Outputs":[],"Parameters":[{"Name":"交易日期","Value":"","ParameterDefinition":{"Name":"交易日期","FriendlyName":"交易日期","DefaultValue":"","ParameterType":"String","HasDefaultValue":true,"IsOptional":true,"ParameterRules":[],"HasRules":false,"MarkupType":0,"CredentialDescriptor":null}}],"WebServiceGroupId":null,"SerializedClientData":"<?xml version='1.0' encoding='utf-16'?><DataV1 xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><Meta /><NodePositions></NodePositions><NodeGroups 
/></DataV1>"},"DisableNodesUpdate":false,"Category":"user","Tags":[],"IsPartialRun":true}
    In [3]:
    # This code was auto-generated by the visual strategy environment on 2021-06-23 18:27.
    # This cell can only be edited in visual mode; to modify it by hand, copy the code into
    # a new code cell or strategy first.
    
    
    # Max forward excess return: best close(t+i)/open(t+1) over i in [M, N],
    # net of the benchmark's ratio over the same horizon.
    def cal_max_ret(df, close, open, benchmark_close, benchmark_open, M, N):
        df['ret'] = df.groupby('instrument', as_index=False, sort=False, group_keys=False).apply(
            lambda x: pd.concat([x['close'].shift(-i) / x['open'].shift(-1)
                                 - x['benchmark_close'].shift(-i) / x['benchmark_open'].shift(-1)
                                 for i in range(M, N+1)], axis=1).max(axis=1, skipna=False))
        last_date = df.date.sort_values().unique()[-N]
        return df.query('date < @last_date')['ret']
    
    # Same as cal_max_ret, but without the benchmark adjustment.
    def cal_max_ret_v2(df, close, open, benchmark_close, benchmark_open, M, N):
        df['ret'] = df.groupby('instrument', as_index=False, sort=False, group_keys=False).apply(
            lambda x: pd.concat([x['close'].shift(-i) / x['open'].shift(-1)
                                 for i in range(M, N+1)], axis=1).max(axis=1, skipna=False))
        last_date = df.date.sort_values().unique()[-N]
        return df.query('date < @last_date')['ret']
    
    # Mean of close(t+N)/open(t+i) over i in [M, N-1).
    def last_max_ret(df, close, open, benchmark_close, benchmark_open, M=1, N=20):
        return df.groupby('instrument', as_index=False, sort=False, group_keys=False).apply(
            lambda x: pd.concat([x['close'].shift(-N) / x['open'].shift(-i)
                                 for i in range(M, N-1)], axis=1).mean(axis=1, skipna=False))
    
    
    m2_user_functions_bigquant_run={'cal_max_ret':cal_max_ret, 'cal_max_ret_v2': cal_max_ret_v2, 'last_max_ret': last_max_ret}
    def m9_run_bigquant_run(input_1, input_2, input_3, cap_n, vol_n, ret_n):
        import time
        import multiprocessing as mp
        train_label        = input_1.read()          # label data
        data_set           = input_2.read()          # full dataset
        feature_list       = input_3.read()          # feature list
        start_t, end_t     = train_label.date.min(), train_label.date.max()
        args_to_test       = None
    
        # Training-set preparation
        ###########################################################################################################################################    
        # Restrict to the label date range; sort by date, then instrument
        data_set           = data_set.query('date>=@start_t and date<=@end_t').sort_values(['date','instrument']).reset_index(drop=True)       
        # Missing-value check
        data_set_checkNaN = data_set[feature_list].isna().sum(axis=0) / data_set.shape[0]
        print('-'*100, '\nTraining set:', data_set.shape, 'start date:', data_set.date.min(), 'end date:', data_set.date.max(),
              "\n Columns with >5% missing values:\n", data_set_checkNaN[data_set_checkNaN > 0.05].sort_values(ascending=False))
        
        # Flag the stock pool (use .loc to avoid chained-assignment pitfalls)
        data_set['select_pool'] = 1
        # data_set.loc[data_set.eval('in_csi800_x != 1'), 'select_pool'] = 0             # index membership
        data_set.loc[data_set.eval('list_days_x <= 100'), 'select_pool'] = 0             # listed fewer than 100 days
        # data_set.loc[data_set.eval('st_flag_x != 0'), 'select_pool'] = 0               # ST status: 0 normal, 1 ST, 2 *ST, 11 listing suspended
        data_set['m_amount_x_rank'] = data_set.groupby(['date'])['m_amount_x'].rank(pct=True, ascending=False) 
        # data_set.loc[data_set.eval('m_amount_x_rank >= 0.70'), 'select_pool'] = 0      # liquidity control
        data_set.loc[data_set[feature_list].isna().sum(axis=1) > 5, 'select_pool'] = 0   # too many missing features
        data_set             = data_set.query('industry_sw_level1_x > 1.0 ')             # drop abnormal industries
      
      
        data_set               = data_set.query('select_pool == 1')
        data_set[feature_list] = data_set[feature_list].replace([np.inf, -np.inf, np.nan], 0)  # handle inf / missing values
    
        # Label processing: per-date rank normalization to [0, 1]
        train_label['label']   = train_label[['date','label']].groupby(['date'])['label'].rank(pct=True, ascending=True)
        
        ###########################################################################################################################################
        # Join features with labels
        label_data          = pd.merge(data_set, train_label, on=['date','instrument'], how='inner')    
        data_set            = label_data[['date','instrument','label','select_pool','market_cap_x','industry_sw_level1_x']+feature_list].dropna(subset=['label'])         
        factor_train_data   = data_set.reset_index(drop=True)
        return Outputs(data_1=None, data_2=args_to_test, data_3=DataSource.write_df(factor_train_data))
    
    # Optional post-processing hook: receives the main function's outputs; transform the data
    # here or return a friendlier output format. Its output is not cached.
    def m9_post_run_bigquant_run(outputs):
        return outputs
    
    
    m1 = M.instruments.v2(
        start_date='2020-01-01',
        end_date='2021-05-01',
        market='CN_STOCK_A',
        instrument_list='',
        max_count=0,
        m_cached=False
    )
    
    m2 = M.advanced_auto_labeler.v2(
        instruments=m1.data,
        label_expr="""# #号开始的表示注释
    # 0. 每行一个,顺序执行,从第二个开始,可以使用label字段
    # 1. 可用数据字段见 https://bigquant.com/docs/develop/datasource/deprecated/history_data.html
    #   添加benchmark_前缀,可使用对应的benchmark数据
    # 2. 可用操作符和函数见 `表达式引擎 <https://bigquant.com/docs/develop/bigexpr/usage.html>`_
    
    # 计算收益:5日收盘价(作为卖出价格)除以明日开盘价(作为买入价格)
    
    ( shift(close, -20) / shift(open, -1) -1)
    
    # 极值处理:用1%和99%分位的值做clip
    clip(label, all_quantile(label, 0.01), all_quantile(label, 0.99))
    
    # where( (label > all_quantile(label,0.8))|(label < all_quantile(label,0.2)) , label, NaN)
    
    # 将分数映射到分类,这里使用20个分类
    # all_wbins(label, 20)
    
    # 过滤掉一字涨停的情况 (设置label为NaN,在后续处理和训练中会忽略NaN的label)
    where( abs(shift(high, -1)-shift(low, -1)) < 1e-3, NaN, label)
    """,
        start_date='',
        end_date='',
        benchmark='000905.SHA',
        drop_na_label=True,
        cast_label_int=False,
        user_functions=m2_user_functions_bigquant_run,
        m_cached=False
    )
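    
    # For reference: a minimal pandas sketch of the label logic above, outside the
    # expression engine. Illustrative only; a df with date/instrument/open/high/low/close
    # columns is an assumption, not a platform API.
    def make_label_sketch(df, horizon=20):
        g = df.groupby('instrument', group_keys=False)
        # Forward return: close `horizon` days ahead over tomorrow's open, minus 1
        label = g['close'].shift(-horizon) / g['open'].shift(-1) - 1
        # Winsorize at the global 1% / 99% quantiles, as all_quantile does
        label = label.clip(label.quantile(0.01), label.quantile(0.99))
        # Drop one-tick days: tomorrow's high equals tomorrow's low
        one_tick = (g['high'].shift(-1) - g['low'].shift(-1)).abs() < 1e-3
        return label.mask(one_tick)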
    
    m3 = M.input_features.v1(
        features="""correlation(open_0,high_0,10)
    correlation(open_0,low_0,10)
    correlation(open_0,close_0,10)
    correlation(open_0,amount_0/volume_0,10)
    correlation(open_0,volume_0,10)
    correlation(open_0,return_1,10)
    correlation(open_0,turn_0,10)
    correlation(open_0,open_0/turn_0,10)
    correlation(open_0,volume_0/low_0,10)
    correlation(open_0,amount_0/volume_0/high_0,10)
    correlation(open_0,low_0/high_0,10)
    correlation(open_0,amount_0/volume_0/close_0,10)
    correlation(high_0,low_0,10)
    correlation(high_0,close_0,10)
    correlation(high_0,amount_0/volume_0,10)
    correlation(high_0,volume_0,10)
    correlation(high_0,return_1,10)
    correlation(high_0,turn_0,10)
    correlation(high_0,open_0/turn_0,10)
    correlation(high_0,volume_0/low_0,10)
    correlation(high_0,amount_0/volume_0/high_0,10)
    correlation(high_0,low_0/high_0,10)
    correlation(high_0,amount_0/volume_0/close_0,10)
    correlation(low_0,close_0,10)
    correlation(low_0,amount_0/volume_0,10)
    correlation(low_0,volume_0,10)
    correlation(low_0,return_1,10)
    correlation(low_0,turn_0,10)
    correlation(low_0,open_0/turn_0,10)
    correlation(low_0,volume_0/low_0,10)
    correlation(low_0,amount_0/volume_0/high_0,10)
    correlation(low_0,low_0/high_0,10)
    correlation(low_0,amount_0/volume_0/close_0,10)
    correlation(close_0,amount_0/volume_0,10)
    correlation(close_0,volume_0,10)
    correlation(close_0,return_1,10)
    correlation(close_0,turn_0,10)
    correlation(close_0,open_0/turn_0,10)
    correlation(close_0,volume_0/low_0,10)
    correlation(close_0,amount_0/volume_0/high_0,10)
    correlation(close_0,low_0/high_0,10)
    correlation(close_0,amount_0/volume_0/close_0,10)
    correlation(amount_0/volume_0,volume_0,10)
    correlation(amount_0/volume_0,return_1,10)
    correlation(amount_0/volume_0,turn_0,10)
    correlation(amount_0/volume_0,open_0/turn_0,10)
    correlation(amount_0/volume_0,volume_0/low_0,10)
    correlation(amount_0/volume_0,amount_0/volume_0/high_0,10)
    correlation(amount_0/volume_0,low_0/high_0,10)
    correlation(amount_0/volume_0,amount_0/volume_0/close_0,10)
    correlation(volume_0,return_1,10)
    correlation(volume_0,turn_0,10)
    correlation(volume_0,open_0/turn_0,10)
    correlation(volume_0,volume_0/low_0,10)
    correlation(volume_0,amount_0/volume_0/high_0,10)
    correlation(volume_0,low_0/high_0,10)
    correlation(volume_0,amount_0/volume_0/close_0,10)
    correlation(return_1,turn_0,10)
    correlation(return_1,open_0/turn_0,10)
    correlation(return_1,volume_0/low_0,10)
    correlation(return_1,amount_0/volume_0/high_0,10)
    correlation(return_1,low_0/high_0,10)
    correlation(return_1,amount_0/volume_0/close_0,10)
    correlation(turn_0,open_0/turn_0,10)
    correlation(turn_0,volume_0/low_0,10)
    correlation(turn_0,amount_0/volume_0/high_0,10)
    correlation(turn_0,low_0/high_0,10)
    correlation(turn_0,amount_0/volume_0/close_0,10)
    correlation(open_0/turn_0,volume_0/low_0,10)
    correlation(open_0/turn_0,amount_0/volume_0/high_0,10)
    correlation(open_0/turn_0,low_0/high_0,10)
    correlation(open_0/turn_0,amount_0/volume_0/close_0,10)
    correlation(volume_0/low_0,amount_0/volume_0/high_0,10)
    correlation(volume_0/low_0,low_0/high_0,10)
    correlation(volume_0/low_0,amount_0/volume_0/close_0,10)
    correlation(amount_0/volume_0/high_0,low_0/high_0,10)
    correlation(amount_0/volume_0/high_0,amount_0/volume_0/close_0,10)
    correlation(low_0/high_0,amount_0/volume_0/close_0,10)
    covariance(open_0,high_0,10)
    covariance(open_0,low_0,10)
    covariance(open_0,close_0,10)
    covariance(open_0,amount_0/volume_0,10)
    covariance(open_0,volume_0,10)
    covariance(open_0,return_1,10)
    covariance(open_0,turn_0,10)
    covariance(open_0,open_0/turn_0,10)
    covariance(open_0,volume_0/low_0,10)
    covariance(open_0,amount_0/volume_0/high_0,10)
    covariance(open_0,low_0/high_0,10)
    covariance(open_0,amount_0/volume_0/close_0,10)
    covariance(high_0,low_0,10)
    covariance(high_0,close_0,10)
    covariance(high_0,amount_0/volume_0,10)
    covariance(high_0,volume_0,10)
    covariance(high_0,return_1,10)
    covariance(high_0,turn_0,10)
    covariance(high_0,open_0/turn_0,10)
    covariance(high_0,volume_0/low_0,10)
    covariance(high_0,amount_0/volume_0/high_0,10)
    covariance(high_0,low_0/high_0,10)
    covariance(high_0,amount_0/volume_0/close_0,10)
    covariance(low_0,close_0,10)
    covariance(low_0,amount_0/volume_0,10)
    covariance(low_0,volume_0,10)
    covariance(low_0,return_1,10)
    covariance(low_0,turn_0,10)
    covariance(low_0,open_0/turn_0,10)
    covariance(low_0,volume_0/low_0,10)
    covariance(low_0,amount_0/volume_0/high_0,10)
    covariance(low_0,low_0/high_0,10)
    covariance(low_0,amount_0/volume_0/close_0,10)
    covariance(close_0,amount_0/volume_0,10)
    covariance(close_0,volume_0,10)
    covariance(close_0,return_1,10)
    covariance(close_0,turn_0,10)
    covariance(close_0,open_0/turn_0,10)
    covariance(close_0,volume_0/low_0,10)
    covariance(close_0,amount_0/volume_0/high_0,10)
    covariance(close_0,low_0/high_0,10)
    covariance(close_0,amount_0/volume_0/close_0,10)
    covariance(amount_0/volume_0,volume_0,10)
    covariance(amount_0/volume_0,return_1,10)
    covariance(amount_0/volume_0,turn_0,10)
    covariance(amount_0/volume_0,open_0/turn_0,10)
    covariance(amount_0/volume_0,volume_0/low_0,10)
    covariance(amount_0/volume_0,amount_0/volume_0/high_0,10)
    covariance(amount_0/volume_0,low_0/high_0,10)
    covariance(amount_0/volume_0,amount_0/volume_0/close_0,10)
    covariance(volume_0,return_1,10)
    covariance(volume_0,turn_0,10)
    covariance(volume_0,open_0/turn_0,10)
    covariance(volume_0,volume_0/low_0,10)
    covariance(volume_0,amount_0/volume_0/high_0,10)
    covariance(volume_0,low_0/high_0,10)
    covariance(volume_0,amount_0/volume_0/close_0,10)
    covariance(return_1,turn_0,10)
    covariance(return_1,open_0/turn_0,10)
    covariance(return_1,volume_0/low_0,10)
    covariance(return_1,amount_0/volume_0/high_0,10)
    covariance(return_1,low_0/high_0,10)
    covariance(return_1,amount_0/volume_0/close_0,10)
    covariance(turn_0,open_0/turn_0,10)
    covariance(turn_0,volume_0/low_0,10)
    covariance(turn_0,amount_0/volume_0/high_0,10)
    covariance(turn_0,low_0/high_0,10)
    covariance(turn_0,amount_0/volume_0/close_0,10)
    covariance(open_0/turn_0,volume_0/low_0,10)
    covariance(open_0/turn_0,amount_0/volume_0/high_0,10)
    covariance(open_0/turn_0,low_0/high_0,10)
    covariance(open_0/turn_0,amount_0/volume_0/close_0,10)
    covariance(volume_0/low_0,amount_0/volume_0/high_0,10)
    covariance(volume_0/low_0,low_0/high_0,10)
    covariance(volume_0/low_0,amount_0/volume_0/close_0,10)
    covariance(amount_0/volume_0/high_0,low_0/high_0,10)
    covariance(amount_0/volume_0/high_0,amount_0/volume_0/close_0,10)
    covariance(low_0/high_0,amount_0/volume_0/close_0,10)
    std(open_0,10)
    std(high_0,10)
    std(low_0,10)
    std(close_0,10)
    std(amount_0/volume_0,10)
    std(volume_0,10)
    std(return_1,10)
    std(turn_0,10)
    std(open_0/turn_0,10)
    std(volume_0/low_0,10)
    std(amount_0/volume_0/high_0,10)
    std(low_0/high_0,10)
    std(amount_0/volume_0/close_0,10)
    mean(open_0,10)/std(open_0,10)
    mean(high_0,10)/std(high_0,10)
    mean(low_0,10)/std(low_0,10)
    mean(close_0,10)/std(close_0,10)
    mean(amount_0/volume_0,10)/std(amount_0/volume_0,10)
    mean(volume_0,10)/std(volume_0,10)
    mean(return_1,10)/std(return_1,10)
    mean(turn_0,10)/std(turn_0,10)
    mean(open_0/turn_0,10)/std(open_0/turn_0,10)
    mean(volume_0/low_0,10)/std(volume_0/low_0,10)
    mean(amount_0/volume_0/high_0,10)/std(amount_0/volume_0/high_0,10)
    mean(low_0/high_0,10)/std(low_0/high_0,10)
    mean(amount_0/volume_0/close_0,10)/std(amount_0/volume_0/close_0,10)
    (open_0/turn_0-shift(open_0/turn_0,10))/shift(open_0/turn_0,10)-1
    (high_0-shift(high_0,10))/shift(high_0,10)-1
    (low_0/high_0-shift(low_0/high_0,10))/shift(low_0/high_0,10)-1
    (close_0-shift(close_0,10))/shift(close_0,10)-1
    (amount_0/volume_0/close_0-shift(amount_0/volume_0/close_0,10))/shift(amount_0/volume_0/close_0,10)-1
    (volume_0/low_0-shift(volume_0/low_0,10))/shift(volume_0/low_0,10)-1
    (return_1-shift(return_1,10))/shift(return_1,10)-1
    (turn_0-shift(turn_0,10))/shift(turn_0,10)-1
    (open_0-shift(open_0,10))/shift(open_0,10)-1
    (volume_0-shift(volume_0,10))/shift(volume_0,10)-1
    (amount_0/volume_0-shift(amount_0/volume_0,10))/shift(amount_0/volume_0,10)-1
    (low_0-shift(low_0,10))/shift(low_0,10)-1
    (amount_0/volume_0/high_0-shift(amount_0/volume_0/high_0,10))/shift(amount_0/volume_0/high_0,10)-1
    decay_linear(open_0,10)
    decay_linear(high_0,10)
    decay_linear(low_0,10)
    decay_linear(close_0,10)
    decay_linear(amount_0/volume_0,10)
    decay_linear(volume_0,10)
    decay_linear(return_1,10)
    decay_linear(turn_0,10)
    decay_linear(open_0/turn_0,10)
    decay_linear(volume_0/low_0,10)
    decay_linear(amount_0/volume_0/high_0,10)
    decay_linear(low_0/high_0,10)
    decay_linear(amount_0/volume_0/close_0,10)"""
    )
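    
    # Rough pandas equivalents of the 10-day operators above, for one instrument's
    # date-sorted bars. Illustrative sketch only; column names and the decay_linear
    # weighting (linear weights 1..w, most recent heaviest) are assumptions, not the
    # platform's implementation.
    def rolling_factor_examples(df, w=10):
        import numpy as np
        out = {}
        out['correlation(open,high,10)'] = df['open'].rolling(w).corr(df['high'])
        out['covariance(open,high,10)']  = df['open'].rolling(w).cov(df['high'])
        out['std(open,10)']              = df['open'].rolling(w).std()
        out['mean/std(open,10)']         = df['open'].rolling(w).mean() / df['open'].rolling(w).std()
        out['shift momentum(open,10)']   = (df['open'] - df['open'].shift(w)) / df['open'].shift(w) - 1
        wts = np.arange(1, w + 1, dtype=float)
        out['decay_linear(open,10)']     = df['open'].rolling(w).apply(lambda a: np.dot(a, wts) / wts.sum(), raw=True)
        return out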
    
    m11 = M.input_features.v1(
        features_ds=m3.data,
        features="""# #号开始的表示注释
    # 多个特征,每行一个,可以包含基础特征和衍生特征
    
    m_amount_x             = mean(amount_0, 5)
    market_cap_float_x     = market_cap_float_0
    market_cap_x           = market_cap_0
    
    in_csi800_x            = in_csi800_0
    in_csi500_x            = in_csi500_0
    in_csi300_x            = in_csi300_0
    
    list_days_x            = list_days_0
    industry_sw_level1_x   = industry_sw_level1_0
    st_flag_x              = st_CN_STOCK_A__st_type"""
    )
    
    m12 = M.instruments.v2(
        start_date='2020-01-01',
        end_date='2021-05-01',
        market='CN_STOCK_A',
        instrument_list='',
        max_count=0
    )
    
    m7 = M.general_feature_extractor.v7(
        instruments=m12.data,
        features=m11.data,
        start_date='',
        end_date='',
        before_start_days=300
    )
    
    m8 = M.derived_feature_extractor.v3(
        input_data=m7.data,
        features=m11.data,
        date_col='date',
        instrument_col='instrument',
        drop_na=False,
        remove_extra_columns=True
    )
    
    m4 = M.RobustScaler.v13(
        train_ds=m8.data,
        features=m3.data,
        test_ds=m8.data,
        scale_type='standard',
        quantile_range_min=0.01,
        quantile_range_max=0.99,
        global_scale=False
    )
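    
    # M.RobustScaler.v13 is a platform module; as a hypothetical sketch of what
    # standard scaling with global_scale=False might look like (per-date z-scoring
    # is an assumption here, not the module's documented behavior):
    def zscore_by_date_sketch(df, feature_cols):
        g = df.groupby('date')[feature_cols]
        return (df[feature_cols] - g.transform('mean')) / g.transform('std')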
    
    m9 = M.cached.v3(
        input_1=m2.data,
        input_2=m4.train_data,
        input_3=m3.data,
        run=m9_run_bigquant_run,
        post_run=m9_post_run_bigquant_run,
        input_ports='',
        params="{'cap_n':4,'vol_n':4, 'ret_n':50}",
        output_ports='',
        m_cached=False
    )
    
    ---------------------------------------------------------------------------------------------------- 
    Training set: (1191717, 218) start date: 2020-01-02 00:00:00 end date: 2021-04-01 00:00:00 
     Columns with >5% missing values:
     mean(low_0/high_0,10)/std(low_0/high_0,10)                              0.623716
    correlation(open_0/turn_0,amount_0/volume_0/close_0,10)                 0.472042
    correlation(open_0/turn_0,amount_0/volume_0/high_0,10)                  0.472042
    correlation(open_0,amount_0/volume_0/high_0,10)                         0.472038
    correlation(volume_0,amount_0/volume_0/close_0,10)                      0.472038
    mean(amount_0/volume_0/high_0,10)/std(amount_0/volume_0/high_0,10)      0.472038
    correlation(volume_0/low_0,amount_0/volume_0/close_0,10)                0.472038
    correlation(volume_0/low_0,amount_0/volume_0/high_0,10)                 0.472038
    correlation(turn_0,amount_0/volume_0/close_0,10)                        0.472038
    correlation(turn_0,amount_0/volume_0/high_0,10)                         0.472038
    correlation(open_0,amount_0/volume_0/close_0,10)                        0.472038
    mean(amount_0/volume_0/close_0,10)/std(amount_0/volume_0/close_0,10)    0.472038
    correlation(volume_0,amount_0/volume_0/high_0,10)                       0.472038
    correlation(amount_0/volume_0,amount_0/volume_0/close_0,10)             0.472038
    correlation(amount_0/volume_0,amount_0/volume_0/high_0,10)              0.472038
    correlation(close_0,amount_0/volume_0/close_0,10)                       0.472038
    correlation(close_0,amount_0/volume_0/high_0,10)                        0.472038
    correlation(low_0,amount_0/volume_0/close_0,10)                         0.472038
    correlation(low_0,amount_0/volume_0/high_0,10)                          0.472038
    correlation(high_0,amount_0/volume_0/close_0,10)                        0.472038
    correlation(high_0,amount_0/volume_0/high_0,10)                         0.472038
    correlation(return_1,amount_0/volume_0/close_0,10)                      0.469194
    correlation(return_1,amount_0/volume_0/high_0,10)                       0.469194
    correlation(amount_0/volume_0/high_0,amount_0/volume_0/close_0,10)      0.468626
    correlation(low_0/high_0,amount_0/volume_0/close_0,10)                  0.269440
    correlation(amount_0/volume_0/high_0,low_0/high_0,10)                   0.213077
    correlation(amount_0/volume_0,low_0/high_0,10)                          0.204724
    dtype: float64
    
    In [4]:
    dataset = m9.data_3.read().sort_values(['instrument','date']).set_index('instrument')  # already sorted by date, ascending
    factors = m3.data.read()
    stocks  = dataset.index.unique().tolist()
    
    In [8]:
    dataset.shape  # evaluated but not echoed; only a cell's last expression is displayed
    len(factors)
    
    Out[8]:
    208
    In [12]:
    dw  = 5  # AlphaNet rolling window: 5 days
    X, Y = [], []
    idx = pd.DataFrame()
    for m, stock in enumerate(stocks):
        
        if m % 50 == 49:  # progress report every 50 stocks (the original m % 50 == 99 could never be true)
            print(m, '/', len(stocks))
        df  = dataset[stock:stock][factors+['label', 'date']]
        x_  = [df.iloc[i-dw:i,:-2].T.values for i in range(dw,len(df)) ]
        y_  = [df.iloc[i,-2]                for i in range(dw,len(df)) ]
        temp_df = df.iloc[dw:len(df)].reset_index()[['date', 'instrument']]
        idx = pd.concat([idx, temp_df])  # DataFrame.append was removed in pandas 2.0
        X.extend(x_)
        Y.extend(y_)
    X, y  = np.array(X), np.array(Y)
     
    # pd.to_pickle([X, y, idx],'dw30Xy2018-2021_3y_pkl.csv')
    # print("save done")
    
    In [ ]:
    # def save(dataset, factors, stocks):
    #     store = pd.HDFStore("train_data.h5")
    #     store['dataset'] = dataset
    #     store['factors'] = pd.Series(factors)
    #     store['stocks'] = pd.Series(stocks)
    #     store.close()
    
    # def load():
    #     store = pd.HDFStore("train_data.h5")
    #     dataset = store['dataset']
    #     factors = store['factors']
    #     stocks = store['stocks']
    #     store.close()
    #     return dataset, factors, stocks
    
    In [ ]:
    # save(dataset, factors, stocks)
    dataset, factors, stocks = load()  # requires the load() definition in the cell above to be uncommented and run first
    
    In [ ]:
    dates = dataset.date.sort_values().unique()
    dates
    
    In [ ]:
    label_mean = dataset[['date', 'label']].groupby(by='date').mean()
    label_std = dataset[['date', 'label']].groupby(by='date').std()
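    
    # Optional: eyeball label drift over time by plotting the per-date mean with a
    # ±1 std band. Illustrative sketch, not part of the original run.
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(label_mean.index, label_mean['label'], label='per-date label mean')
    ax.fill_between(label_std.index,
                    label_mean['label'] - label_std['label'],
                    label_mean['label'] + label_std['label'],
                    alpha=0.3, label='±1 std')
    ax.set_xlabel('date')
    ax.set_ylabel('label')
    ax.legend()
    plt.show()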
    
    In [ ]:
    dataset[(dataset.label > 0.1) | (dataset.label < -0.1)]
    
    In [ ]:
    try:
        X, y = pd.read_pickle('Xy0520_pkl.csv')
    except FileNotFoundError:  # rebuild and cache if the pickle is missing
        dw  = 30
        X, Y = [], []
        for stock in stocks:
            df  = dataset[stock:stock][factors+['label']]
            x_  = [df.iloc[i-dw:i,:-1].T.values for i in range(dw,len(df)) ]
            y_  = [df.iloc[i,-1]                for i in range(dw,len(df)) ]
            X.extend(x_)
            Y.extend(y_)
        X, y  = np.array(X), np.array(Y)
        pd.to_pickle([X, y], 'Xy0520_pkl.csv')
    print(X.shape,y.shape)
    