Clone Strategy
In [21]:
import matplotlib
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Input, Layer, MaxPool2D, AveragePooling2D, Dropout, Lambda, Concatenate, BatchNormalization, Flatten, Dense, Conv2D
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.utils import plot_model
import pickle
import numpy as np
import tensorflow.keras as keras
import tensorflow as tf
from tensorflow.keras.losses import CategoricalCrossentropy
import time
# import tensorflow_probability as tfp
from tqdm import tqdm
from datetime import date, timedelta
# from tensorflow.image import extract_patches
import matplotlib.pyplot as plt

import os
# os.environ["CUDA_VISIBLE_DEVICES"]="-1"    
# import tensorflow as tf

# Enumerate physical devices; enable memory growth so TensorFlow allocates
# GPU memory on demand instead of reserving it all at startup.
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
cpus = tf.config.experimental.list_physical_devices(device_type='CPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
gpus, cpus
Out[21]:
([PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
  PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'),
  PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'),
  PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')],
 [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')])
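
With four GPUs visible, it can be handy to pin the notebook to a single device rather than disabling CUDA outright. A minimal sketch (not from the original notebook; must run before any tensors are created):

import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Make only the first GPU visible to this process, and grow its memory on demand.
    tf.config.set_visible_devices(gpus[0], 'GPU')
    tf.config.experimental.set_memory_growth(gpus[0], True)
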
In [3]:
m7.data.read().shape
Out[3]:
(3605843, 18)

    {"Description":"实验创建于2017/8/26","Summary":"","Graph":{"EdgesInternal":[{"DestinationInputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-15:instruments","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-8:data"},{"DestinationInputPortId":"-585:features_ds","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24:data"},{"DestinationInputPortId":"-22014:input_3","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24:data"},{"DestinationInputPortId":"-760:features","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24:data"},{"DestinationInputPortId":"-222:input_data","SourceOutputPortId":"-215:data"},{"DestinationInputPortId":"-760:train_ds","SourceOutputPortId":"-222:data"},{"DestinationInputPortId":"-760:test_ds","SourceOutputPortId":"-222:data"},{"DestinationInputPortId":"-215:features","SourceOutputPortId":"-585:data"},{"DestinationInputPortId":"-222:features","SourceOutputPortId":"-585:data"},{"DestinationInputPortId":"-215:instruments","SourceOutputPortId":"-8309:data"},{"DestinationInputPortId":"-22014:input_1","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-15:data"},{"DestinationInputPortId":"-22014:input_2","SourceOutputPortId":"-760:train_data"}],"ModuleNodes":[{"Id":"287d2cb0-f53c-4101-bdf8-104b137c8601-8","ModuleId":"BigQuantSpace.instruments.instruments-v2","ModuleParameters":[{"Name":"start_date","Value":"2020-01-01","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"2021-05-01","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"market","Value":"CN_STOCK_A","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"instrument_list","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"max_count","Value":"0","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"rolling_conf","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-8"}],"OutputPortsInternal":[{"Name":"data","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-8","OutputType":null}],"UsePreviousResults":false,"moduleIdForCode":1,"IsPartOfPartialRun":null,"Comment":"训练集标签","CommentCollapsed":false},{"Id":"287d2cb0-f53c-4101-bdf8-104b137c8601-24","ModuleId":"BigQuantSpace.input_features.input_features-v1","ModuleParameters":[{"Name":"features","Value":"correlation(open_0,high_0,10)\ncorrelation(open_0,low_0,10)\ncorrelation(open_0,close_0,10)\ncorrelation(open_0,amount_0/volume_0,10)\ncorrelation(open_0,volume_0,10)\ncorrelation(open_0,return_1,10)\ncorrelation(open_0,turn_0,10)\ncorrelation(open_0,open_0/turn_0,10)\ncorrelation(open_0,volume_0/low_0,10)\ncorrelation(open_0,amount_0/volume_0/high_0,10)\ncorrelation(open_0,low_0/high_0,10)\ncorrelation(open_0,amount_0/volume_0/close_0,10)\ncorrelation(high_0,low_0,10)\ncorrelation(high_0,close_0,10)\ncorrelation(high_0,amount_0/volume_0,10)\ncorrelation(high_0,volume_0,10)\ncorrelation(high_0,return_1,10)\ncorrelation(high_0,turn_0,10)\ncorrelation(high_0,open_0/turn_0,10)\ncorrelation(high_0,volume_0/low_0,10)\ncorrelation(high_0,amount_0/volume_0/high_0,10)\ncorrelation(high_0,low_0/high_0,10)\ncorrelation(high_0,amount_0/volume_0/close_0,10)\ncorrelation(low_0,close_0,10)\ncorrelation(low_0,amount_0/volume_0,10)\ncorrelation(low_0,volume_0,10)\ncorrelation(low_0,return_1,10)\ncorrelation(low_0,turn_0,10)\ncorrelation(low_0,open_0/turn_0,10)\ncorrelation(low_0,volume_0/low_0,10)\ncorrelation(low_0,amount_0/volume_0/high_0,10)\ncorrelation(low_0,low_0/high_0,10)\ncorrelation(low_0,amoun
t_0/volume_0/close_0,10)\ncorrelation(close_0,amount_0/volume_0,10)\ncorrelation(close_0,volume_0,10)\ncorrelation(close_0,return_1,10)\ncorrelation(close_0,turn_0,10)\ncorrelation(close_0,open_0/turn_0,10)\ncorrelation(close_0,volume_0/low_0,10)\ncorrelation(close_0,amount_0/volume_0/high_0,10)\ncorrelation(close_0,low_0/high_0,10)\ncorrelation(close_0,amount_0/volume_0/close_0,10)\ncorrelation(amount_0/volume_0,volume_0,10)\ncorrelation(amount_0/volume_0,return_1,10)\ncorrelation(amount_0/volume_0,turn_0,10)\ncorrelation(amount_0/volume_0,open_0/turn_0,10)\ncorrelation(amount_0/volume_0,volume_0/low_0,10)\ncorrelation(amount_0/volume_0,amount_0/volume_0/high_0,10)\ncorrelation(amount_0/volume_0,low_0/high_0,10)\ncorrelation(amount_0/volume_0,amount_0/volume_0/close_0,10)\ncorrelation(volume_0,return_1,10)\ncorrelation(volume_0,turn_0,10)\ncorrelation(volume_0,open_0/turn_0,10)\ncorrelation(volume_0,volume_0/low_0,10)\ncorrelation(volume_0,amount_0/volume_0/high_0,10)\ncorrelation(volume_0,low_0/high_0,10)\ncorrelation(volume_0,amount_0/volume_0/close_0,10)\ncorrelation(return_1,turn_0,10)\ncorrelation(return_1,open_0/turn_0,10)\ncorrelation(return_1,volume_0/low_0,10)\ncorrelation(return_1,amount_0/volume_0/high_0,10)\ncorrelation(return_1,low_0/high_0,10)\ncorrelation(return_1,amount_0/volume_0/close_0,10)\ncorrelation(turn_0,open_0/turn_0,10)\ncorrelation(turn_0,volume_0/low_0,10)\ncorrelation(turn_0,amount_0/volume_0/high_0,10)\ncorrelation(turn_0,low_0/high_0,10)\ncorrelation(turn_0,amount_0/volume_0/close_0,10)\ncorrelation(open_0/turn_0,volume_0/low_0,10)\ncorrelation(open_0/turn_0,amount_0/volume_0/high_0,10)\ncorrelation(open_0/turn_0,low_0/high_0,10)\ncorrelation(open_0/turn_0,amount_0/volume_0/close_0,10)\ncorrelation(volume_0/low_0,amount_0/volume_0/high_0,10)\ncorrelation(volume_0/low_0,low_0/high_0,10)\ncorrelation(volume_0/low_0,amount_0/volume_0/close_0,10)\ncorrelation(amount_0/volume_0/high_0,low_0/high_0,10)\ncorrelation(amount_0/volume_0/high_0,amount_0/volume_0/close_0,10)\ncorrelation(low_0/high_0,amount_0/volume_0/close_0,10)\ncovariance(open_0,high_0,10)\ncovariance(open_0,low_0,10)\ncovariance(open_0,close_0,10)\ncovariance(open_0,amount_0/volume_0,10)\ncovariance(open_0,volume_0,10)\ncovariance(open_0,return_1,10)\ncovariance(open_0,turn_0,10)\ncovariance(open_0,open_0/turn_0,10)\ncovariance(open_0,volume_0/low_0,10)\ncovariance(open_0,amount_0/volume_0/high_0,10)\ncovariance(open_0,low_0/high_0,10)\ncovariance(open_0,amount_0/volume_0/close_0,10)\ncovariance(high_0,low_0,10)\ncovariance(high_0,close_0,10)\ncovariance(high_0,amount_0/volume_0,10)\ncovariance(high_0,volume_0,10)\ncovariance(high_0,return_1,10)\ncovariance(high_0,turn_0,10)\ncovariance(high_0,open_0/turn_0,10)\ncovariance(high_0,volume_0/low_0,10)\ncovariance(high_0,amount_0/volume_0/high_0,10)\ncovariance(high_0,low_0/high_0,10)\ncovariance(high_0,amount_0/volume_0/close_0,10)\ncovariance(low_0,close_0,10)\ncovariance(low_0,amount_0/volume_0,10)\ncovariance(low_0,volume_0,10)\ncovariance(low_0,return_1,10)\ncovariance(low_0,turn_0,10)\ncovariance(low_0,open_0/turn_0,10)\ncovariance(low_0,volume_0/low_0,10)\ncovariance(low_0,amount_0/volume_0/high_0,10)\ncovariance(low_0,low_0/high_0,10)\ncovariance(low_0,amount_0/volume_0/close_0,10)\ncovariance(close_0,amount_0/volume_0,10)\ncovariance(close_0,volume_0,10)\ncovariance(close_0,return_1,10)\ncovariance(close_0,turn_0,10)\ncovariance(close_0,open_0/turn_0,10)\ncovariance(close_0,volume_0/low_0,10)\ncovariance(close_0,amount_0/volume_0/high_0,10)\ncov
ariance(close_0,low_0/high_0,10)\ncovariance(close_0,amount_0/volume_0/close_0,10)\ncovariance(amount_0/volume_0,volume_0,10)\ncovariance(amount_0/volume_0,return_1,10)\ncovariance(amount_0/volume_0,turn_0,10)\ncovariance(amount_0/volume_0,open_0/turn_0,10)\ncovariance(amount_0/volume_0,volume_0/low_0,10)\ncovariance(amount_0/volume_0,amount_0/volume_0/high_0,10)\ncovariance(amount_0/volume_0,low_0/high_0,10)\ncovariance(amount_0/volume_0,amount_0/volume_0/close_0,10)\ncovariance(volume_0,return_1,10)\ncovariance(volume_0,turn_0,10)\ncovariance(volume_0,open_0/turn_0,10)\ncovariance(volume_0,volume_0/low_0,10)\ncovariance(volume_0,amount_0/volume_0/high_0,10)\ncovariance(volume_0,low_0/high_0,10)\ncovariance(volume_0,amount_0/volume_0/close_0,10)\ncovariance(return_1,turn_0,10)\ncovariance(return_1,open_0/turn_0,10)\ncovariance(return_1,volume_0/low_0,10)\ncovariance(return_1,amount_0/volume_0/high_0,10)\ncovariance(return_1,low_0/high_0,10)\ncovariance(return_1,amount_0/volume_0/close_0,10)\ncovariance(turn_0,open_0/turn_0,10)\ncovariance(turn_0,volume_0/low_0,10)\ncovariance(turn_0,amount_0/volume_0/high_0,10)\ncovariance(turn_0,low_0/high_0,10)\ncovariance(turn_0,amount_0/volume_0/close_0,10)\ncovariance(open_0/turn_0,volume_0/low_0,10)\ncovariance(open_0/turn_0,amount_0/volume_0/high_0,10)\ncovariance(open_0/turn_0,low_0/high_0,10)\ncovariance(open_0/turn_0,amount_0/volume_0/close_0,10)\ncovariance(volume_0/low_0,amount_0/volume_0/high_0,10)\ncovariance(volume_0/low_0,low_0/high_0,10)\ncovariance(volume_0/low_0,amount_0/volume_0/close_0,10)\ncovariance(amount_0/volume_0/high_0,low_0/high_0,10)\ncovariance(amount_0/volume_0/high_0,amount_0/volume_0/close_0,10)\ncovariance(low_0/high_0,amount_0/volume_0/close_0,10)\nstd(open_0,10)\nstd(high_0,10)\nstd(low_0,10)\nstd(close_0,10)\nstd(amount_0/volume_0,10)\nstd(volume_0,10)\nstd(return_1,10)\nstd(turn_0,10)\nstd(open_0/turn_0,10)\nstd(volume_0/low_0,10)\nstd(amount_0/volume_0/high_0,10)\nstd(low_0/high_0,10)\nstd(amount_0/volume_0/close_0,10)\nmean(open_0,10)/std(open_0,10)\nmean(high_0,10)/std(high_0,10)\nmean(low_0,10)/std(low_0,10)\nmean(close_0,10)/std(close_0,10)\nmean(amount_0/volume_0,10)/std(amount_0/volume_0,10)\nmean(volume_0,10)/std(volume_0,10)\nmean(return_1,10)/std(return_1,10)\nmean(turn_0,10)/std(turn_0,10)\nmean(open_0/turn_0,10)/std(open_0/turn_0,10)\nmean(volume_0/low_0,10)/std(volume_0/low_0,10)\nmean(amount_0/volume_0/high_0,10)/std(amount_0/volume_0/high_0,10)\nmean(low_0/high_0,10)/std(low_0/high_0,10)\nmean(amount_0/volume_0/close_0,10)/std(amount_0/volume_0/close_0,10)\n(open_0/turn_0-shift(open_0/turn_0,10))/shift(open_0/turn_0,10)-1\n(high_0-shift(high_0,10))/shift(high_0,10)-1\n(low_0/high_0-shift(low_0/high_0,10))/shift(low_0/high_0,10)-1\n(close_0-shift(close_0,10))/shift(close_0,10)-1\n(amount_0/volume_0/close_0-shift(amount_0/volume_0/close_0,10))/shift(amount_0/volume_0/close_0,10)-1\n(volume_0/low_0-shift(volume_0/low_0,10))/shift(volume_0/low_0,10)-1\n(return_1-shift(return_1,10))/shift(return_1,10)-1\n(turn_0-shift(turn_0,10))/shift(turn_0,10)-1\n(open_0-shift(open_0,10))/shift(open_0,10)-1\n(volume_0-shift(volume_0,10))/shift(volume_0,10)-1\n(amount_0/volume_0-shift(amount_0/volume_0,10))/shift(amount_0/volume_0,10)-1\n(low_0-shift(low_0,10))/shift(low_0,10)-1\n(amount_0/volume_0/high_0-shift(amount_0/volume_0/high_0,10))/shift(amount_0/volume_0/high_0,10)-1\ndecay_linear(open_0,10)\ndecay_linear(high_0,10)\ndecay_linear(low_0,10)\ndecay_linear(close_0,10)\ndecay_linear(amount_0/volume_0,10)\ndecay_linea
r(volume_0,10)\ndecay_linear(return_1,10)\ndecay_linear(turn_0,10)\ndecay_linear(open_0/turn_0,10)\ndecay_linear(volume_0/low_0,10)\ndecay_linear(amount_0/volume_0/high_0,10)\ndecay_linear(low_0/high_0,10)\ndecay_linear(amount_0/volume_0/close_0,10)","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features_ds","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24"}],"OutputPortsInternal":[{"Name":"data","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":3,"IsPartOfPartialRun":null,"Comment":"特征","CommentCollapsed":false},{"Id":"-215","ModuleId":"BigQuantSpace.general_feature_extractor.general_feature_extractor-v7","ModuleParameters":[{"Name":"start_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"before_start_days","Value":"300","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"instruments","NodeId":"-215"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-215"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-215","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":7,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-222","ModuleId":"BigQuantSpace.derived_feature_extractor.derived_feature_extractor-v3","ModuleParameters":[{"Name":"date_col","Value":"date","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"instrument_col","Value":"instrument","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"drop_na","Value":"False","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"remove_extra_columns","Value":"True","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"user_functions","Value":"","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_data","NodeId":"-222"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-222"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-222","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":8,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-585","ModuleId":"BigQuantSpace.input_features.input_features-v1","ModuleParameters":[{"Name":"features","Value":"# #号开始的表示注释\n# 多个特征,每行一个,可以包含基础特征和衍生特征\n\nm_amount_x = mean(amount_0, 5)\nmarket_cap_float_x = market_cap_float_0\nmarket_cap_x = market_cap_0\n\nin_csi800_x = in_csi800_0\nin_csi500_x = in_csi500_0\nin_csi300_x = in_csi300_0\n\nlist_days_x = list_days_0\nindustry_sw_level1_x = industry_sw_level1_0\nst_flag_x = 
st_CN_STOCK_A__st_type","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features_ds","NodeId":"-585"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-585","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":11,"IsPartOfPartialRun":null,"Comment":"辅助特征","CommentCollapsed":true},{"Id":"-8309","ModuleId":"BigQuantSpace.instruments.instruments-v2","ModuleParameters":[{"Name":"start_date","Value":"2020-01-01","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"2021-05-01","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"market","Value":"CN_STOCK_A","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"instrument_list","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"max_count","Value":"0","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"rolling_conf","NodeId":"-8309"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-8309","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":12,"IsPartOfPartialRun":null,"Comment":"公共数据集","CommentCollapsed":false},{"Id":"287d2cb0-f53c-4101-bdf8-104b137c8601-15","ModuleId":"BigQuantSpace.advanced_auto_labeler.advanced_auto_labeler-v2","ModuleParameters":[{"Name":"label_expr","Value":"# #号开始的表示注释\n# 0. 每行一个,顺序执行,从第二个开始,可以使用label字段\n# 1. 可用数据字段见 https://bigquant.com/docs/develop/datasource/deprecated/history_data.html\n# 添加benchmark_前缀,可使用对应的benchmark数据\n# 2. 可用操作符和函数见 `表达式引擎 <https://bigquant.com/docs/develop/bigexpr/usage.html>`_\n\n# 计算收益:5日收盘价(作为卖出价格)除以明日开盘价(作为买入价格)\n\n( shift(close, -20) / shift(open, -1) -1)\n\n# 极值处理:用1%和99%分位的值做clip\nclip(label, all_quantile(label, 0.01), all_quantile(label, 0.99))\n\n# where( (label > all_quantile(label,0.8))|(label < all_quantile(label,0.2)) , label, NaN)\n\n# 将分数映射到分类,这里使用20个分类\n# all_wbins(label, 20)\n\n# 过滤掉一字涨停的情况 (设置label为NaN,在后续处理和训练中会忽略NaN的label)\nwhere( abs(shift(high, -1)-shift(low, -1)) < 1e-3, NaN, label)\n","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"start_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"benchmark","Value":"000905.SHA","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"drop_na_label","Value":"True","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"cast_label_int","Value":"False","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"user_functions","Value":"def cal_max_ret(df, close, open, benchmark_close, benchmark_open, M, N ): \n df['ret'] = df.groupby('instrument', as_index=False, sort=False, group_keys=False).apply( \\\n lambda x: (pd.concat([x['close'].shift(-i) / x['open'].shift(-1) - x['benchmark_close'].shift(-i) / x['benchmark_open'].shift(-1) for i in range(M,N+1)], axis=1)).max(axis=1,skipna=False))\n last_date = df.date.sort_values().unique()[-N]\n return df.query('date < @last_date')['ret']\n\ndef cal_max_ret_v2(df, close, open, benchmark_close, benchmark_open, M, N ):\n df['ret'] = df.groupby('instrument', as_index=False, sort=False, group_keys=False).apply( \\\n lambda x: (pd.concat([x['close'].shift(-i) / x['open'].shift(-1) for i in range(M,N+1)], axis=1)).max(axis=1,skipna=False))\n last_date = df.date.sort_values().unique()[-N]\n return df.query('date < @last_date')['ret']\n\n\ndef last_max_ret(df, close, open, benchmark_close, 
benchmark_open, M=1, N=20):\n return df.groupby('instrument', as_index=False, sort=False, group_keys=False).apply(lambda x:(pd.concat([ x['close'].shift(-N) / x['open'].shift(-i) for i in range(M,N-1) ], axis=1) ).mean(axis=1,skipna=False) )\n\n\nbigquant_run={'cal_max_ret':cal_max_ret, 'cal_max_ret_v2': cal_max_ret_v2, 'last_max_ret': last_max_ret}","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"instruments","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-15"}],"OutputPortsInternal":[{"Name":"data","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-15","OutputType":null}],"UsePreviousResults":false,"moduleIdForCode":2,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-22014","ModuleId":"BigQuantSpace.cached.cached-v3","ModuleParameters":[{"Name":"run","Value":"def bigquant_run(input_1, input_2, input_3, cap_n,vol_n,ret_n):\n import time\n import multiprocessing as mp\n train_label = input_1.read() # 标签数据\n data_set = input_2.read() # 全数据集\n feature_list = input_3.read() # 特征列表 \n start_t, end_t = train_label.date.min(), train_label.date.max()\n args_to_test = None\n\n # 训练集处理\n ########################################################################################################################################### \n # 获取数据,按日期、股票代码排序\n data_set = data_set.query('date>=@start_t and date<=@end_t').sort_values(['date','instrument']).reset_index(drop=True) \n # 缺失值检测\n data_set_checkNaN = data_set[feature_list].isna().sum(axis=0) / data_set.shape[0]\n print('-'*100,'\\n训练集:', data_set.shape, '开始日期:', data_set.date.min(), '结束日期:', data_set.date.max(),\"\\n 列缺失值检测(超过5%):\\n\",data_set_checkNaN[data_set_checkNaN > 0.05].sort_values(ascending=False))\n \n # 标记股票池\n data_set['select_pool'] = 1\n #data_set['select_pool'][data_set.eval('in_csi800_x !=1')] = 0 # 股票池\n data_set['select_pool'][data_set.eval('list_days_x <= 100')] = 0 # 上市天数 \n ####data_set['select_pool'][data_set.eval('st_flag_x != 0')] = 0 # ST状态:0:正常股票,1:ST,2:*ST,11:暂停上市\n data_set['m_amount_x_rank'] = data_set.groupby(['date'])['m_amount_x'].rank(pct=True,ascending=False) \n # data_set['select_pool'][data_set.eval('m_amount_x_rank >= 0.70')] = 0 # 流动性控制 \n data_set['select_pool'][data_set[feature_list].isna().sum(axis=1) > 5] = 0 # 缺失严重 \n data_set = data_set.query('industry_sw_level1_x > 1.0 ') # 去除异常行业\n \n \n data_set = data_set.query('select_pool == 1')\n data_set[feature_list] = data_set[feature_list].replace([np.inf, -np.inf, np.nan], 0) # 数据异常值、缺失值处理\n\n # 标签处理 \n train_label['label'] = train_label[['date','label']].groupby(['date'])['label'].rank(pct=True,ascending=True) # rank 归一化\n \n ###########################################################################################################################################\n # 训练集标签合并\n label_data = pd.merge( data_set, train_label, on=['date','instrument'], how='inner') \n data_set = label_data[['date','instrument','label','select_pool','market_cap_x','industry_sw_level1_x']+feature_list].dropna(subset=['label']) \n factor_train_data = data_set.reset_index(drop=True)\n return Outputs(data_1= None, data_2= args_to_test, data_3= DataSource.write_df(factor_train_data) )\n","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"post_run","Value":"# 后处理函数,可选。输入是主函数的输出,可以在这里对数据做处理,或者返回更友好的outputs数据格式。此函数输出不会被缓存。\ndef bigquant_run(outputs):\n return 
outputs\n","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"input_ports","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"params","Value":"{'cap_n':4,'vol_n':4, 'ret_n':50}","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"output_ports","Value":"","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_1","NodeId":"-22014"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_2","NodeId":"-22014"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_3","NodeId":"-22014"}],"OutputPortsInternal":[{"Name":"data_1","NodeId":"-22014","OutputType":null},{"Name":"data_2","NodeId":"-22014","OutputType":null},{"Name":"data_3","NodeId":"-22014","OutputType":null}],"UsePreviousResults":false,"moduleIdForCode":9,"IsPartOfPartialRun":null,"Comment":"数据集","CommentCollapsed":false},{"Id":"-760","ModuleId":"BigQuantSpace.RobustScaler.RobustScaler-v13","ModuleParameters":[{"Name":"scale_type","Value":"standard","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"quantile_range_min","Value":0.01,"ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"quantile_range_max","Value":"0.99","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"global_scale","Value":"False","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"train_ds","NodeId":"-760"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-760"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"test_ds","NodeId":"-760"}],"OutputPortsInternal":[{"Name":"train_data","NodeId":"-760","OutputType":null},{"Name":"test_data","NodeId":"-760","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":4,"Comment":"","CommentCollapsed":true}],"SerializedClientData":"<?xml version='1.0' encoding='utf-16'?><DataV1 xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><Meta /><NodePositions><NodePosition Node='287d2cb0-f53c-4101-bdf8-104b137c8601-8' Position='93,-460,200,200'/><NodePosition Node='287d2cb0-f53c-4101-bdf8-104b137c8601-24' Position='721,-597,200,200'/><NodePosition Node='-215' Position='688.3325805664062,-385.71685791015625,200,200'/><NodePosition Node='-222' Position='662.8134765625,-312.9056396484375,200,200'/><NodePosition Node='-585' Position='713,-475,200,200'/><NodePosition Node='-8309' Position='366.880615234375,-567,200,200'/><NodePosition Node='287d2cb0-f53c-4101-bdf8-104b137c8601-15' Position='90,-362,200,200'/><NodePosition Node='-22014' Position='370,-118,200,200'/><NodePosition Node='-760' Position='462,-215,200,200'/></NodePositions><NodeGroups /></DataV1>"},"IsDraft":true,"ParentExperimentId":null,"WebService":{"IsWebServiceExperiment":false,"Inputs":[],"Outputs":[],"Parameters":[{"Name":"交易日期","Value":"","ParameterDefinition":{"Name":"交易日期","FriendlyName":"交易日期","DefaultValue":"","ParameterType":"String","HasDefaultValue":true,"IsOptional":true,"ParameterRules":[],"HasRules":false,"MarkupType":0,"CredentialDescriptor":null}}],"WebServiceGroupId":null,"SerializedClientData":"<?xml version='1.0' encoding='utf-16'?><DataV1 xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><Meta /><NodePositions></NodePositions><NodeGroups 
/></DataV1>"},"DisableNodesUpdate":false,"Category":"user","Tags":[],"IsPartialRun":true}
    In [3]:
    # This code was auto-generated by the visual strategy environment on 2021-06-23 18:27.
    # This cell can only be edited in visual mode; to modify it by hand, copy the code into
    # a new code cell or strategy first.
    
    
    # Max forward excess return: best close(t+i)/open(t+1) over i in [M, N],
    # net of the benchmark's ratio over the same horizon.
    def cal_max_ret(df, close, open, benchmark_close, benchmark_open, M, N):
        df['ret'] = df.groupby('instrument', as_index=False, sort=False, group_keys=False).apply(
            lambda x: pd.concat([x['close'].shift(-i) / x['open'].shift(-1)
                                 - x['benchmark_close'].shift(-i) / x['benchmark_open'].shift(-1)
                                 for i in range(M, N+1)], axis=1).max(axis=1, skipna=False))
        last_date = df.date.sort_values().unique()[-N]
        return df.query('date < @last_date')['ret']
    
    # Same as cal_max_ret, but without the benchmark adjustment.
    def cal_max_ret_v2(df, close, open, benchmark_close, benchmark_open, M, N):
        df['ret'] = df.groupby('instrument', as_index=False, sort=False, group_keys=False).apply(
            lambda x: pd.concat([x['close'].shift(-i) / x['open'].shift(-1)
                                 for i in range(M, N+1)], axis=1).max(axis=1, skipna=False))
        last_date = df.date.sort_values().unique()[-N]
        return df.query('date < @last_date')['ret']
    
    # Mean of close(t+N)/open(t+i) over i in [M, N-1).
    def last_max_ret(df, close, open, benchmark_close, benchmark_open, M=1, N=20):
        return df.groupby('instrument', as_index=False, sort=False, group_keys=False).apply(
            lambda x: pd.concat([x['close'].shift(-N) / x['open'].shift(-i)
                                 for i in range(M, N-1)], axis=1).mean(axis=1, skipna=False))
    
    
    m2_user_functions_bigquant_run={'cal_max_ret':cal_max_ret, 'cal_max_ret_v2': cal_max_ret_v2, 'last_max_ret': last_max_ret}
    def m9_run_bigquant_run(input_1, input_2, input_3, cap_n, vol_n, ret_n):
        import time
        import multiprocessing as mp
        train_label        = input_1.read()          # label data
        data_set           = input_2.read()          # full dataset
        feature_list       = input_3.read()          # feature list
        start_t, end_t     = train_label.date.min(), train_label.date.max()
        args_to_test       = None
    
        # Training-set preparation
        ###########################################################################################################################################    
        # Restrict to the label date range; sort by date, then instrument
        data_set           = data_set.query('date>=@start_t and date<=@end_t').sort_values(['date','instrument']).reset_index(drop=True)       
        # Missing-value check
        data_set_checkNaN = data_set[feature_list].isna().sum(axis=0) / data_set.shape[0]
        print('-'*100, '\nTraining set:', data_set.shape, 'start date:', data_set.date.min(), 'end date:', data_set.date.max(),
              "\n Columns with >5% missing values:\n", data_set_checkNaN[data_set_checkNaN > 0.05].sort_values(ascending=False))
        
        # Flag the stock pool (use .loc to avoid chained-assignment pitfalls)
        data_set['select_pool'] = 1
        # data_set.loc[data_set.eval('in_csi800_x != 1'), 'select_pool'] = 0             # index membership
        data_set.loc[data_set.eval('list_days_x <= 100'), 'select_pool'] = 0             # listed fewer than 100 days
        # data_set.loc[data_set.eval('st_flag_x != 0'), 'select_pool'] = 0               # ST status: 0 normal, 1 ST, 2 *ST, 11 listing suspended
        data_set['m_amount_x_rank'] = data_set.groupby(['date'])['m_amount_x'].rank(pct=True, ascending=False) 
        # data_set.loc[data_set.eval('m_amount_x_rank >= 0.70'), 'select_pool'] = 0      # liquidity control
        data_set.loc[data_set[feature_list].isna().sum(axis=1) > 5, 'select_pool'] = 0   # too many missing features
        data_set             = data_set.query('industry_sw_level1_x > 1.0 ')             # drop abnormal industries
      
      
        data_set               = data_set.query('select_pool == 1')
        data_set[feature_list] = data_set[feature_list].replace([np.inf, -np.inf, np.nan], 0)  # handle inf / missing values
    
        # Label processing: per-date rank normalization to [0, 1]
        train_label['label']   = train_label[['date','label']].groupby(['date'])['label'].rank(pct=True, ascending=True)
        
        ###########################################################################################################################################
        # Join features with labels
        label_data          = pd.merge(data_set, train_label, on=['date','instrument'], how='inner')    
        data_set            = label_data[['date','instrument','label','select_pool','market_cap_x','industry_sw_level1_x']+feature_list].dropna(subset=['label'])         
        factor_train_data   = data_set.reset_index(drop=True)
        return Outputs(data_1=None, data_2=args_to_test, data_3=DataSource.write_df(factor_train_data))
    
    # Optional post-processing hook: receives the main function's outputs; transform the data
    # here or return a friendlier output format. Its output is not cached.
    def m9_post_run_bigquant_run(outputs):
        return outputs
    
    
    m1 = M.instruments.v2(
        start_date='2020-01-01',
        end_date='2021-05-01',
        market='CN_STOCK_A',
        instrument_list='',
        max_count=0,
        m_cached=False
    )
    
    m2 = M.advanced_auto_labeler.v2(
        instruments=m1.data,
        label_expr="""# #号开始的表示注释
    # 0. 每行一个,顺序执行,从第二个开始,可以使用label字段
    # 1. 可用数据字段见 https://bigquant.com/docs/develop/datasource/deprecated/history_data.html
    #   添加benchmark_前缀,可使用对应的benchmark数据
    # 2. 可用操作符和函数见 `表达式引擎 <https://bigquant.com/docs/develop/bigexpr/usage.html>`_
    
    # 计算收益:5日收盘价(作为卖出价格)除以明日开盘价(作为买入价格)
    
    ( shift(close, -20) / shift(open, -1) -1)
    
    # 极值处理:用1%和99%分位的值做clip
    clip(label, all_quantile(label, 0.01), all_quantile(label, 0.99))
    
    # where( (label > all_quantile(label,0.8))|(label < all_quantile(label,0.2)) , label, NaN)
    
    # 将分数映射到分类,这里使用20个分类
    # all_wbins(label, 20)
    
    # 过滤掉一字涨停的情况 (设置label为NaN,在后续处理和训练中会忽略NaN的label)
    where( abs(shift(high, -1)-shift(low, -1)) < 1e-3, NaN, label)
    """,
        start_date='',
        end_date='',
        benchmark='000905.SHA',
        drop_na_label=True,
        cast_label_int=False,
        user_functions=m2_user_functions_bigquant_run,
        m_cached=False
    )
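    
    # For reference: a minimal pandas sketch of the label logic above, outside the
    # expression engine. Illustrative only; a df with date/instrument/open/high/low/close
    # columns is an assumption, not a platform API.
    def make_label_sketch(df, horizon=20):
        g = df.groupby('instrument', group_keys=False)
        # Forward return: close `horizon` days ahead over tomorrow's open, minus 1
        label = g['close'].shift(-horizon) / g['open'].shift(-1) - 1
        # Winsorize at the global 1% / 99% quantiles, as all_quantile does
        label = label.clip(label.quantile(0.01), label.quantile(0.99))
        # Drop one-tick days: tomorrow's high equals tomorrow's low
        one_tick = (g['high'].shift(-1) - g['low'].shift(-1)).abs() < 1e-3
        return label.mask(one_tick)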
    
    m3 = M.input_features.v1(
        features="""correlation(open_0,high_0,10)
    correlation(open_0,low_0,10)
    correlation(open_0,close_0,10)
    correlation(open_0,amount_0/volume_0,10)
    correlation(open_0,volume_0,10)
    correlation(open_0,return_1,10)
    correlation(open_0,turn_0,10)
    correlation(open_0,open_0/turn_0,10)
    correlation(open_0,volume_0/low_0,10)
    correlation(open_0,amount_0/volume_0/high_0,10)
    correlation(open_0,low_0/high_0,10)
    correlation(open_0,amount_0/volume_0/close_0,10)
    correlation(high_0,low_0,10)
    correlation(high_0,close_0,10)
    correlation(high_0,amount_0/volume_0,10)
    correlation(high_0,volume_0,10)
    correlation(high_0,return_1,10)
    correlation(high_0,turn_0,10)
    correlation(high_0,open_0/turn_0,10)
    correlation(high_0,volume_0/low_0,10)
    correlation(high_0,amount_0/volume_0/high_0,10)
    correlation(high_0,low_0/high_0,10)
    correlation(high_0,amount_0/volume_0/close_0,10)
    correlation(low_0,close_0,10)
    correlation(low_0,amount_0/volume_0,10)
    correlation(low_0,volume_0,10)
    correlation(low_0,return_1,10)
    correlation(low_0,turn_0,10)
    correlation(low_0,open_0/turn_0,10)
    correlation(low_0,volume_0/low_0,10)
    correlation(low_0,amount_0/volume_0/high_0,10)
    correlation(low_0,low_0/high_0,10)
    correlation(low_0,amount_0/volume_0/close_0,10)
    correlation(close_0,amount_0/volume_0,10)
    correlation(close_0,volume_0,10)
    correlation(close_0,return_1,10)
    correlation(close_0,turn_0,10)
    correlation(close_0,open_0/turn_0,10)
    correlation(close_0,volume_0/low_0,10)
    correlation(close_0,amount_0/volume_0/high_0,10)
    correlation(close_0,low_0/high_0,10)
    correlation(close_0,amount_0/volume_0/close_0,10)
    correlation(amount_0/volume_0,volume_0,10)
    correlation(amount_0/volume_0,return_1,10)
    correlation(amount_0/volume_0,turn_0,10)
    correlation(amount_0/volume_0,open_0/turn_0,10)
    correlation(amount_0/volume_0,volume_0/low_0,10)
    correlation(amount_0/volume_0,amount_0/volume_0/high_0,10)
    correlation(amount_0/volume_0,low_0/high_0,10)
    correlation(amount_0/volume_0,amount_0/volume_0/close_0,10)
    correlation(volume_0,return_1,10)
    correlation(volume_0,turn_0,10)
    correlation(volume_0,open_0/turn_0,10)
    correlation(volume_0,volume_0/low_0,10)
    correlation(volume_0,amount_0/volume_0/high_0,10)
    correlation(volume_0,low_0/high_0,10)
    correlation(volume_0,amount_0/volume_0/close_0,10)
    correlation(return_1,turn_0,10)
    correlation(return_1,open_0/turn_0,10)
    correlation(return_1,volume_0/low_0,10)
    correlation(return_1,amount_0/volume_0/high_0,10)
    correlation(return_1,low_0/high_0,10)
    correlation(return_1,amount_0/volume_0/close_0,10)
    correlation(turn_0,open_0/turn_0,10)
    correlation(turn_0,volume_0/low_0,10)
    correlation(turn_0,amount_0/volume_0/high_0,10)
    correlation(turn_0,low_0/high_0,10)
    correlation(turn_0,amount_0/volume_0/close_0,10)
    correlation(open_0/turn_0,volume_0/low_0,10)
    correlation(open_0/turn_0,amount_0/volume_0/high_0,10)
    correlation(open_0/turn_0,low_0/high_0,10)
    correlation(open_0/turn_0,amount_0/volume_0/close_0,10)
    correlation(volume_0/low_0,amount_0/volume_0/high_0,10)
    correlation(volume_0/low_0,low_0/high_0,10)
    correlation(volume_0/low_0,amount_0/volume_0/close_0,10)
    correlation(amount_0/volume_0/high_0,low_0/high_0,10)
    correlation(amount_0/volume_0/high_0,amount_0/volume_0/close_0,10)
    correlation(low_0/high_0,amount_0/volume_0/close_0,10)
    covariance(open_0,high_0,10)
    covariance(open_0,low_0,10)
    covariance(open_0,close_0,10)
    covariance(open_0,amount_0/volume_0,10)
    covariance(open_0,volume_0,10)
    covariance(open_0,return_1,10)
    covariance(open_0,turn_0,10)
    covariance(open_0,open_0/turn_0,10)
    covariance(open_0,volume_0/low_0,10)
    covariance(open_0,amount_0/volume_0/high_0,10)
    covariance(open_0,low_0/high_0,10)
    covariance(open_0,amount_0/volume_0/close_0,10)
    covariance(high_0,low_0,10)
    covariance(high_0,close_0,10)
    covariance(high_0,amount_0/volume_0,10)
    covariance(high_0,volume_0,10)
    covariance(high_0,return_1,10)
    covariance(high_0,turn_0,10)
    covariance(high_0,open_0/turn_0,10)
    covariance(high_0,volume_0/low_0,10)
    covariance(high_0,amount_0/volume_0/high_0,10)
    covariance(high_0,low_0/high_0,10)
    covariance(high_0,amount_0/volume_0/close_0,10)
    covariance(low_0,close_0,10)
    covariance(low_0,amount_0/volume_0,10)
    covariance(low_0,volume_0,10)
    covariance(low_0,return_1,10)
    covariance(low_0,turn_0,10)
    covariance(low_0,open_0/turn_0,10)
    covariance(low_0,volume_0/low_0,10)
    covariance(low_0,amount_0/volume_0/high_0,10)
    covariance(low_0,low_0/high_0,10)
    covariance(low_0,amount_0/volume_0/close_0,10)
    covariance(close_0,amount_0/volume_0,10)
    covariance(close_0,volume_0,10)
    covariance(close_0,return_1,10)
    covariance(close_0,turn_0,10)
    covariance(close_0,open_0/turn_0,10)
    covariance(close_0,volume_0/low_0,10)
    covariance(close_0,amount_0/volume_0/high_0,10)
    covariance(close_0,low_0/high_0,10)
    covariance(close_0,amount_0/volume_0/close_0,10)
    covariance(amount_0/volume_0,volume_0,10)
    covariance(amount_0/volume_0,return_1,10)
    covariance(amount_0/volume_0,turn_0,10)
    covariance(amount_0/volume_0,open_0/turn_0,10)
    covariance(amount_0/volume_0,volume_0/low_0,10)
    covariance(amount_0/volume_0,amount_0/volume_0/high_0,10)
    covariance(amount_0/volume_0,low_0/high_0,10)
    covariance(amount_0/volume_0,amount_0/volume_0/close_0,10)
    covariance(volume_0,return_1,10)
    covariance(volume_0,turn_0,10)
    covariance(volume_0,open_0/turn_0,10)
    covariance(volume_0,volume_0/low_0,10)
    covariance(volume_0,amount_0/volume_0/high_0,10)
    covariance(volume_0,low_0/high_0,10)
    covariance(volume_0,amount_0/volume_0/close_0,10)
    covariance(return_1,turn_0,10)
    covariance(return_1,open_0/turn_0,10)
    covariance(return_1,volume_0/low_0,10)
    covariance(return_1,amount_0/volume_0/high_0,10)
    covariance(return_1,low_0/high_0,10)
    covariance(return_1,amount_0/volume_0/close_0,10)
    covariance(turn_0,open_0/turn_0,10)
    covariance(turn_0,volume_0/low_0,10)
    covariance(turn_0,amount_0/volume_0/high_0,10)
    covariance(turn_0,low_0/high_0,10)
    covariance(turn_0,amount_0/volume_0/close_0,10)
    covariance(open_0/turn_0,volume_0/low_0,10)
    covariance(open_0/turn_0,amount_0/volume_0/high_0,10)
    covariance(open_0/turn_0,low_0/high_0,10)
    covariance(open_0/turn_0,amount_0/volume_0/close_0,10)
    covariance(volume_0/low_0,amount_0/volume_0/high_0,10)
    covariance(volume_0/low_0,low_0/high_0,10)
    covariance(volume_0/low_0,amount_0/volume_0/close_0,10)
    covariance(amount_0/volume_0/high_0,low_0/high_0,10)
    covariance(amount_0/volume_0/high_0,amount_0/volume_0/close_0,10)
    covariance(low_0/high_0,amount_0/volume_0/close_0,10)
    std(open_0,10)
    std(high_0,10)
    std(low_0,10)
    std(close_0,10)
    std(amount_0/volume_0,10)
    std(volume_0,10)
    std(return_1,10)
    std(turn_0,10)
    std(open_0/turn_0,10)
    std(volume_0/low_0,10)
    std(amount_0/volume_0/high_0,10)
    std(low_0/high_0,10)
    std(amount_0/volume_0/close_0,10)
    mean(open_0,10)/std(open_0,10)
    mean(high_0,10)/std(high_0,10)
    mean(low_0,10)/std(low_0,10)
    mean(close_0,10)/std(close_0,10)
    mean(amount_0/volume_0,10)/std(amount_0/volume_0,10)
    mean(volume_0,10)/std(volume_0,10)
    mean(return_1,10)/std(return_1,10)
    mean(turn_0,10)/std(turn_0,10)
    mean(open_0/turn_0,10)/std(open_0/turn_0,10)
    mean(volume_0/low_0,10)/std(volume_0/low_0,10)
    mean(amount_0/volume_0/high_0,10)/std(amount_0/volume_0/high_0,10)
    mean(low_0/high_0,10)/std(low_0/high_0,10)
    mean(amount_0/volume_0/close_0,10)/std(amount_0/volume_0/close_0,10)
    (open_0/turn_0-shift(open_0/turn_0,10))/shift(open_0/turn_0,10)-1
    (high_0-shift(high_0,10))/shift(high_0,10)-1
    (low_0/high_0-shift(low_0/high_0,10))/shift(low_0/high_0,10)-1
    (close_0-shift(close_0,10))/shift(close_0,10)-1
    (amount_0/volume_0/close_0-shift(amount_0/volume_0/close_0,10))/shift(amount_0/volume_0/close_0,10)-1
    (volume_0/low_0-shift(volume_0/low_0,10))/shift(volume_0/low_0,10)-1
    (return_1-shift(return_1,10))/shift(return_1,10)-1
    (turn_0-shift(turn_0,10))/shift(turn_0,10)-1
    (open_0-shift(open_0,10))/shift(open_0,10)-1
    (volume_0-shift(volume_0,10))/shift(volume_0,10)-1
    (amount_0/volume_0-shift(amount_0/volume_0,10))/shift(amount_0/volume_0,10)-1
    (low_0-shift(low_0,10))/shift(low_0,10)-1
    (amount_0/volume_0/high_0-shift(amount_0/volume_0/high_0,10))/shift(amount_0/volume_0/high_0,10)-1
    decay_linear(open_0,10)
    decay_linear(high_0,10)
    decay_linear(low_0,10)
    decay_linear(close_0,10)
    decay_linear(amount_0/volume_0,10)
    decay_linear(volume_0,10)
    decay_linear(return_1,10)
    decay_linear(turn_0,10)
    decay_linear(open_0/turn_0,10)
    decay_linear(volume_0/low_0,10)
    decay_linear(amount_0/volume_0/high_0,10)
    decay_linear(low_0/high_0,10)
    decay_linear(amount_0/volume_0/close_0,10)"""
    )
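    
    # Rough pandas equivalents of the 10-day operators above, for one instrument's
    # date-sorted bars. Illustrative sketch only; column names and the decay_linear
    # weighting (linear weights 1..w, most recent heaviest) are assumptions, not the
    # platform's implementation.
    def rolling_factor_examples(df, w=10):
        import numpy as np
        out = {}
        out['correlation(open,high,10)'] = df['open'].rolling(w).corr(df['high'])
        out['covariance(open,high,10)']  = df['open'].rolling(w).cov(df['high'])
        out['std(open,10)']              = df['open'].rolling(w).std()
        out['mean/std(open,10)']         = df['open'].rolling(w).mean() / df['open'].rolling(w).std()
        out['shift momentum(open,10)']   = (df['open'] - df['open'].shift(w)) / df['open'].shift(w) - 1
        wts = np.arange(1, w + 1, dtype=float)
        out['decay_linear(open,10)']     = df['open'].rolling(w).apply(lambda a: np.dot(a, wts) / wts.sum(), raw=True)
        return out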
    
    m11 = M.input_features.v1(
        features_ds=m3.data,
        features="""# #号开始的表示注释
    # 多个特征,每行一个,可以包含基础特征和衍生特征
    
    m_amount_x             = mean(amount_0, 5)
    market_cap_float_x     = market_cap_float_0
    market_cap_x           = market_cap_0
    
    in_csi800_x            = in_csi800_0
    in_csi500_x            = in_csi500_0
    in_csi300_x            = in_csi300_0
    
    list_days_x            = list_days_0
    industry_sw_level1_x   = industry_sw_level1_0
    st_flag_x              = st_CN_STOCK_A__st_type"""
    )
    
    m12 = M.instruments.v2(
        start_date='2020-01-01',
        end_date='2021-05-01',
        market='CN_STOCK_A',
        instrument_list='',
        max_count=0
    )
    
    m7 = M.general_feature_extractor.v7(
        instruments=m12.data,
        features=m11.data,
        start_date='',
        end_date='',
        before_start_days=300
    )
    
    m8 = M.derived_feature_extractor.v3(
        input_data=m7.data,
        features=m11.data,
        date_col='date',
        instrument_col='instrument',
        drop_na=False,
        remove_extra_columns=True
    )
    
    m4 = M.RobustScaler.v13(
        train_ds=m8.data,
        features=m3.data,
        test_ds=m8.data,
        scale_type='standard',
        quantile_range_min=0.01,
        quantile_range_max=0.99,
        global_scale=False
    )
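    
    # M.RobustScaler.v13 is a platform module; as a hypothetical sketch of what
    # standard scaling with global_scale=False might look like (per-date z-scoring
    # is an assumption here, not the module's documented behavior):
    def zscore_by_date_sketch(df, feature_cols):
        g = df.groupby('date')[feature_cols]
        return (df[feature_cols] - g.transform('mean')) / g.transform('std')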
    
    m9 = M.cached.v3(
        input_1=m2.data,
        input_2=m4.train_data,
        input_3=m3.data,
        run=m9_run_bigquant_run,
        post_run=m9_post_run_bigquant_run,
        input_ports='',
        params="{'cap_n':4,'vol_n':4, 'ret_n':50}",
        output_ports='',
        m_cached=False
    )
    
    ---------------------------------------------------------------------------------------------------- 
    Training set: (1191717, 218) start date: 2020-01-02 00:00:00 end date: 2021-04-01 00:00:00 
     Columns with >5% missing values:
     mean(low_0/high_0,10)/std(low_0/high_0,10)                              0.623716
    correlation(open_0/turn_0,amount_0/volume_0/close_0,10)                 0.472042
    correlation(open_0/turn_0,amount_0/volume_0/high_0,10)                  0.472042
    correlation(open_0,amount_0/volume_0/high_0,10)                         0.472038
    correlation(volume_0,amount_0/volume_0/close_0,10)                      0.472038
    mean(amount_0/volume_0/high_0,10)/std(amount_0/volume_0/high_0,10)      0.472038
    correlation(volume_0/low_0,amount_0/volume_0/close_0,10)                0.472038
    correlation(volume_0/low_0,amount_0/volume_0/high_0,10)                 0.472038
    correlation(turn_0,amount_0/volume_0/close_0,10)                        0.472038
    correlation(turn_0,amount_0/volume_0/high_0,10)                         0.472038
    correlation(open_0,amount_0/volume_0/close_0,10)                        0.472038
    mean(amount_0/volume_0/close_0,10)/std(amount_0/volume_0/close_0,10)    0.472038
    correlation(volume_0,amount_0/volume_0/high_0,10)                       0.472038
    correlation(amount_0/volume_0,amount_0/volume_0/close_0,10)             0.472038
    correlation(amount_0/volume_0,amount_0/volume_0/high_0,10)              0.472038
    correlation(close_0,amount_0/volume_0/close_0,10)                       0.472038
    correlation(close_0,amount_0/volume_0/high_0,10)                        0.472038
    correlation(low_0,amount_0/volume_0/close_0,10)                         0.472038
    correlation(low_0,amount_0/volume_0/high_0,10)                          0.472038
    correlation(high_0,amount_0/volume_0/close_0,10)                        0.472038
    correlation(high_0,amount_0/volume_0/high_0,10)                         0.472038
    correlation(return_1,amount_0/volume_0/close_0,10)                      0.469194
    correlation(return_1,amount_0/volume_0/high_0,10)                       0.469194
    correlation(amount_0/volume_0/high_0,amount_0/volume_0/close_0,10)      0.468626
    correlation(low_0/high_0,amount_0/volume_0/close_0,10)                  0.269440
    correlation(amount_0/volume_0/high_0,low_0/high_0,10)                   0.213077
    correlation(amount_0/volume_0,low_0/high_0,10)                          0.204724
    dtype: float64
    
    In [4]:
    dataset = m9.data_3.read().sort_values(['instrument','date']).set_index('instrument')  # already sorted by date, ascending
    factors = m3.data.read()
    stocks  = dataset.index.unique().tolist()
    
    In [8]:
    dataset.shape  # evaluated but not echoed; only a cell's last expression is displayed
    len(factors)
    
    Out[8]:
    208
    In [12]:
    dw  = 5  # AlphaNet rolling window: 5 days
    X, Y = [], []
    idx = pd.DataFrame()
    for m, stock in enumerate(stocks):
        
        if m % 50 == 49:  # progress report every 50 stocks (the original m % 50 == 99 could never be true)
            print(m, '/', len(stocks))
        df  = dataset[stock:stock][factors+['label', 'date']]
        x_  = [df.iloc[i-dw:i,:-2].T.values for i in range(dw,len(df)) ]
        y_  = [df.iloc[i,-2]                for i in range(dw,len(df)) ]
        temp_df = df.iloc[dw:len(df)].reset_index()[['date', 'instrument']]
        idx = pd.concat([idx, temp_df])  # DataFrame.append was removed in pandas 2.0
        X.extend(x_)
        Y.extend(y_)
    X, y  = np.array(X), np.array(Y)
     
    # pd.to_pickle([X, y, idx],'dw30Xy2018-2021_3y_pkl.csv')
    # print("save done")
    
    In [ ]:
    # def save(dataset, factors, stocks):
    #     store = pd.HDFStore("train_data.h5")
    #     store['dataset'] = dataset
    #     store['factors'] = pd.Series(factors)
    #     store['stocks'] = pd.Series(stocks)
    #     store.close()
    
    # def load():
    #     store = pd.HDFStore("train_data.h5")
    #     dataset = store['dataset']
    #     factors = store['factors']
    #     stocks = store['stocks']
    #     store.close()
    #     return dataset, factors, stocks
    
    In [ ]:
    # save(dataset, factors, stocks)
    dataset, factors, stocks = load()  # requires the load() definition in the cell above to be uncommented and run first
    
    In [ ]:
    dates = dataset.date.sort_values().unique()
    dates
    
    In [ ]:
    label_mean = dataset[['date', 'label']].groupby(by='date').mean()
    label_std = dataset[['date', 'label']].groupby(by='date').std()
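    
    # Optional: eyeball label drift over time by plotting the per-date mean with a
    # ±1 std band. Illustrative sketch, not part of the original run.
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(label_mean.index, label_mean['label'], label='per-date label mean')
    ax.fill_between(label_std.index,
                    label_mean['label'] - label_std['label'],
                    label_mean['label'] + label_std['label'],
                    alpha=0.3, label='±1 std')
    ax.set_xlabel('date')
    ax.set_ylabel('label')
    ax.legend()
    plt.show()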
    
    In [ ]:
    dataset[(dataset.label > 0.1) | (dataset.label < -0.1)]
    
    In [ ]:
    try:
        X, y = pd.read_pickle('Xy0520_pkl.csv')
    except FileNotFoundError:  # rebuild and cache if the pickle is missing
        dw  = 30
        X, Y = [], []
        for stock in stocks:
            df  = dataset[stock:stock][factors+['label']]
            x_  = [df.iloc[i-dw:i,:-1].T.values for i in range(dw,len(df)) ]
            y_  = [df.iloc[i,-1]                for i in range(dw,len(df)) ]
            X.extend(x_)
            Y.extend(y_)
        X, y  = np.array(X), np.array(Y)
        pd.to_pickle([X, y], 'Xy0520_pkl.csv')
    print(X.shape,y.shape)
    