复制链接
克隆策略
In [28]:
import pandas as pd
import numpy as np
from biglearning.module2.common.data import Outputs
from zipline.finance.commission import PerOrder
import os
from bigdatasource.api import DataSource
from biglearning.api import M
from biglearning.api import tools as T
from biglearning.module2.common.data import Outputs

import warnings
warnings.filterwarnings('ignore')


from joblib import Parallel, delayed
In [29]:
sd = '2020-01-01'
ed = '2023-05-09'
In [30]:
df = DataSource('tide_factor_U').read()
In [31]:
df = df[df['date']>='2021-01-01']
In [32]:
df
Out[32]:
instrument date factor
951987 000001.SZA 2021-01-04 0.000011
951988 000002.SZA 2021-01-04 -0.000079
951989 000004.SZA 2021-01-04 0.000065
951990 000005.SZA 2021-01-04 0.000000
951991 000006.SZA 2021-01-04 -0.000029
... ... ... ...
3567777 688799.SHA 2023-05-09 -0.000120
3567778 688800.SHA 2023-05-09 -0.000146
3567779 688819.SHA 2023-05-09 -0.000002
3567780 688981.SHA 2023-05-09 -0.000505
3567781 689009.SHA 2023-05-09 0.000014

2615795 rows × 3 columns

In [33]:
m1 = M.instruments.v2(
    start_date='2021-01-01',
    end_date=ed,
    market='CN_STOCK_A',
    instrument_list='',
    max_count=0
)

m2 = M.input_features.v1(
    features="""
收盘=close_0

市值=market_cap_0

行业=industry_sw_level1_0

"""
)


m3 = M.general_feature_extractor.v7(
    instruments=m1.data,
    features=m2.data,
    start_date='',
    end_date='',
    before_start_days=500
)

m4 = M.derived_feature_extractor.v3(
    input_data=m3.data,
    features=m2.data,
    date_col='date',
    instrument_col='instrument',
    drop_na=False,
    remove_extra_columns=False,
    user_functions={}
)

m5 = M.chinaa_stock_filter.v1(
    input_data=m4.data,
    index_constituent_cond=['全部'],
    board_cond=['上证主板', '深证主板', '创业板'],
    industry_cond=['全部'],
    st_cond=['正常'],
    delist_cond=['非退市'],
    output_left_data=False
)

df_ = m5.data.read()

df = pd.merge(df,df_[['instrument','date','收盘','市值','行业']],left_on=['instrument','date'],right_on=['instrument','date'],how='left')

df.sort_values(by='date',inplace=True)
df.dropna(inplace=True)
df
[2023-05-13 19:45:44.403988] INFO moduleinvoker: instruments.v2 开始运行..
[2023-05-13 19:45:44.411343] INFO moduleinvoker: 命中缓存
[2023-05-13 19:45:44.413304] INFO moduleinvoker: instruments.v2 运行完成[0.009343s].
[2023-05-13 19:45:44.418936] INFO moduleinvoker: input_features.v1 开始运行..
[2023-05-13 19:45:44.427902] INFO moduleinvoker: 命中缓存
[2023-05-13 19:45:44.429541] INFO moduleinvoker: input_features.v1 运行完成[0.010616s].
[2023-05-13 19:45:44.445555] INFO moduleinvoker: general_feature_extractor.v7 开始运行..
[2023-05-13 19:45:44.453459] INFO moduleinvoker: 命中缓存
[2023-05-13 19:45:44.455041] INFO moduleinvoker: general_feature_extractor.v7 运行完成[0.009538s].
[2023-05-13 19:45:44.466804] INFO moduleinvoker: derived_feature_extractor.v3 开始运行..
[2023-05-13 19:45:44.473889] INFO moduleinvoker: 命中缓存
[2023-05-13 19:45:44.475749] INFO moduleinvoker: derived_feature_extractor.v3 运行完成[0.00895s].
[2023-05-13 19:45:44.487952] INFO moduleinvoker: chinaa_stock_filter.v1 开始运行..
[2023-05-13 19:45:44.499798] INFO moduleinvoker: 命中缓存
[2023-05-13 19:45:44.501848] INFO moduleinvoker: chinaa_stock_filter.v1 运行完成[0.013882s].
Out[33]:
instrument date factor 收盘 市值 行业
0 000001.SZA 2021-01-04 0.000011 2065.506104 3.609501e+11 480000.0
2752 600509.SHA 2021-01-04 0.000273 32.931404 6.067957e+09 410000.0
2753 600510.SHA 2021-01-04 0.000011 39.364868 7.151659e+09 430000.0
2754 600511.SHA 2021-01-04 -0.000137 266.836700 3.621614e+10 370000.0
2755 600512.SHA 2021-01-04 -0.000083 18.815666 4.716763e+09 620000.0
... ... ... ... ... ... ...
2612474 300149.SZA 2023-05-09 -0.000100 54.631954 4.163142e+09 370000.0
2612473 300148.SZA 2023-05-09 0.000282 39.781693 5.638541e+09 720000.0
2612472 300147.SZA 2023-05-09 -0.000042 28.898579 3.637035e+09 370000.0
2612471 300146.SZA 2023-05-09 -0.000113 688.990234 3.887269e+10 340000.0
2612478 300153.SZA 2023-05-09 -0.000023 35.402874 2.592000e+09 630000.0

2295238 rows × 6 columns

In [34]:
df
Out[34]:
instrument date factor 收盘 市值 行业
0 000001.SZA 2021-01-04 0.000011 2065.506104 3.609501e+11 480000.0
2752 600509.SHA 2021-01-04 0.000273 32.931404 6.067957e+09 410000.0
2753 600510.SHA 2021-01-04 0.000011 39.364868 7.151659e+09 430000.0
2754 600511.SHA 2021-01-04 -0.000137 266.836700 3.621614e+10 370000.0
2755 600512.SHA 2021-01-04 -0.000083 18.815666 4.716763e+09 620000.0
... ... ... ... ... ... ...
2612474 300149.SZA 2023-05-09 -0.000100 54.631954 4.163142e+09 370000.0
2612473 300148.SZA 2023-05-09 0.000282 39.781693 5.638541e+09 720000.0
2612472 300147.SZA 2023-05-09 -0.000042 28.898579 3.637035e+09 370000.0
2612471 300146.SZA 2023-05-09 -0.000113 688.990234 3.887269e+10 340000.0
2612478 300153.SZA 2023-05-09 -0.000023 35.402874 2.592000e+09 630000.0

2295238 rows × 6 columns

In [35]:
import statsmodels.api as sm

ind_dummies = pd.get_dummies(df['行业'],prefix='行业')
mkcap = df['市值']
train = pd.concat([ind_dummies,mkcap],axis=1)

X = sm.add_constant(train)

y = df['factor']

model = sm.OLS(y, X).fit()
df['neu_factor'] = model.resid
In [36]:
df.describe()
Out[36]:
factor 收盘 市值 行业 neu_factor
count 2.295238e+06 2.295238e+06 2.295238e+06 2.295238e+06 2.295238e+06
mean 1.479799e-05 1.041349e+02 2.093953e+10 4.553680e+05 -7.658405e-11
std 1.902281e-04 5.830368e+02 8.199493e+10 1.832108e+05 1.902038e-04
min -6.251126e-03 1.428713e+00 7.040722e+08 0.000000e+00 -6.268479e-03
25% -7.284362e-05 2.042406e+01 3.324022e+09 2.800000e+05 -8.765822e-05
50% 0.000000e+00 3.640000e+01 5.807902e+09 4.200000e+05 -1.445052e-05
75% 8.267806e-05 6.911484e+01 1.383297e+10 6.400000e+05 6.794659e-05
max 6.664068e-02 3.207749e+04 3.267370e+12 7.700000e+05 6.662188e-02
In [37]:
def rolling(df):
    df['factor'] = df['neu_factor'].rolling(10).mean()
    return df

df = df.groupby('instrument').apply(rolling)

df.dropna(inplace=True)

def calc_return(df):
    df['return'] = df['收盘'].shift(-1)/df['收盘']
    return df

df = df.groupby('instrument').apply(calc_return)


def cut(df):
    df['groups'] = pd.cut(df['factor'],bins=5,labels=False)
    
    df['len_0'] = len(df[df['groups'] == 0])
    df['len_1'] = len(df[df['groups'] == 1])
    df['len_2'] = len(df[df['groups'] == 2])
    df['len_3'] = len(df[df['groups'] == 3])
    df['len_4'] = len(df[df['groups'] == 4])

    return df 


df = df.groupby('date').apply(cut)
df.dropna(inplace=True)


def calc_mr(df):
    df['mr'] = df['return'].mean()
    return df



df = df.groupby(['date','groups']).apply(calc_mr)
df.drop_duplicates(subset=['date','groups'],inplace=True)

def calc_cumprod(df):
    df.sort_values(by='date',inplace=True)
    df['pnl'] = df['mr'].cumprod()
    return df



df = df.groupby('groups').apply(calc_cumprod)


df.sort_values(by='date',ascending=True,inplace=True)
In [38]:
def calc_cumprod(df):
    df.sort_values(by='date',inplace=True)
    df['pnl'] = df['mr'].cumprod()
    return df



df = df.groupby('groups').apply(calc_cumprod)


df.sort_values(by='date',ascending=True,inplace=True)
In [39]:
import plotly.graph_objects as go

# 创建一个折线图对象
fig = go.Figure()

# 添加折线到图表
fig.add_trace(go.Scatter(x=df.date.unique(), y=df[df['groups']==0]['pnl'], mode='lines', name='group_0'))
fig.add_trace(go.Scatter(x=df.date.unique(), y=df[df['groups']==1]['pnl'], mode='lines', name='group_1'))
fig.add_trace(go.Scatter(x=df.date.unique(), y=df[df['groups']==2]['pnl'], mode='lines', name='group_2'))
fig.add_trace(go.Scatter(x=df.date.unique(), y=df[df['groups']==3]['pnl'], mode='lines', name='group_3'))
fig.add_trace(go.Scatter(x=df.date.unique(), y=df[df['groups']==4]['pnl'], mode='lines', name='group_4'))



# 设置图表标题和坐标轴标签
fig.update_layout(title='分层测试', xaxis_title='X Axis Label', yaxis_title='Y Axis Label')

# 显示图表
fig.show()
In [40]:
df
Out[40]:
instrument date factor 收盘 市值 行业 neu_factor return groups len_0 len_1 len_2 len_3 len_4 mr pnl
40086 600521.SHA 2021-01-15 -0.000043 529.903809 4.219818e+10 370000.0 0.000132 0.973802 1 125 2471 1020 69 9 1.014650 1.014650
40085 600520.SHA 2021-01-15 0.000033 14.845694 1.004446e+09 640000.0 0.000334 1.020505 2 125 2471 1020 69 9 1.016416 1.016416
40075 600509.SHA 2021-01-15 -0.000126 27.494909 5.066226e+09 410000.0 0.000012 1.100000 0 125 2471 1020 69 9 1.030148 1.030148
39950 600337.SHA 2021-01-15 0.000140 74.958138 8.792967e+09 360000.0 0.000493 1.009259 3 125 2471 1020 69 9 1.021276 1.021276
39643 300901.SZA 2021-01-15 0.000317 21.490000 5.157600e+09 350000.0 0.000228 0.979991 4 125 2471 1020 69 9 1.023289 1.023289
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2609166 600703.SHA 2023-05-08 -0.000020 378.616760 8.770695e+10 270000.0 -0.000024 1.000000 1 29 2930 1246 54 11 0.986286 1.384756
2609163 600698.SHA 2023-05-08 0.000054 24.647232 5.024015e+09 280000.0 -0.000048 0.989407 2 29 2930 1246 54 11 0.987647 0.715136
2609108 600629.SHA 2023-05-08 0.000350 96.822159 1.136003e+10 620000.0 -0.000293 0.948718 4 29 2930 1246 54 11 0.953268 0.005176
2609434 601100.SHA 2023-05-08 -0.000206 202.721878 7.905481e+10 640000.0 -0.000011 0.995081 0 29 2930 1246 54 11 0.984641 1.541755
2609566 601811.SHA 2023-05-08 0.000291 22.385059 2.302347e+10 720000.0 -0.000149 0.975884 3 29 2930 1246 54 11 0.957885 0.140639

2755 rows × 16 columns

In [41]:
df_ = df.drop_duplicates(subset=['date'])
In [42]:
df_
Out[42]:
instrument date factor 收盘 市值 行业 neu_factor return groups len_0 len_1 len_2 len_3 len_4 mr pnl
40086 600521.SHA 2021-01-15 -0.000043 529.903809 4.219818e+10 370000.0 0.000132 0.973802 1 125 2471 1020 69 9 1.014650 1.014650
44237 600522.SHA 2021-01-18 0.000028 97.885040 3.535274e+10 730000.0 0.000089 0.963573 1 174 2805 665 44 6 1.005737 1.020471
48406 600540.SHA 2021-01-19 -0.000107 15.018432 2.048516e+09 110000.0 0.000031 0.988506 0 108 2512 1004 62 12 1.012125 1.050937
52545 600521.SHA 2021-01-20 -0.000031 526.067871 4.189271e+10 370000.0 0.000148 0.968056 1 70 2352 1164 99 13 1.007328 1.030692
56721 600540.SHA 2021-01-21 -0.000087 14.880331 2.029679e+09 110000.0 -0.000192 0.972158 0 81 2261 1234 111 11 0.995241 1.052675
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2589267 600698.SHA 2023-04-27 0.000097 25.117199 5.119812e+09 280000.0 0.000048 1.002079 2 260 3590 426 26 6 1.019996 0.713243
2594239 600697.SHA 2023-04-28 0.000012 39.269375 2.025191e+09 450000.0 0.000064 1.001571 1 98 3578 569 44 8 1.006365 1.405769
2599158 600629.SHA 2023-05-04 0.000290 88.795021 1.041822e+10 620000.0 0.000423 1.071761 3 78 3757 403 23 8 0.984232 0.144092
2604458 601100.SHA 2023-05-05 -0.000174 202.653107 7.902799e+10 640000.0 0.000014 1.000339 0 28 3221 971 40 11 1.005548 1.565804
2609166 600703.SHA 2023-05-08 -0.000020 378.616760 8.770695e+10 270000.0 -0.000024 1.000000 1 29 2930 1246 54 11 0.986286 1.384756

557 rows × 16 columns

In [43]:
import plotly.graph_objects as go

# 创建一个折线图对象
fig = go.Figure()

# 添加折线到图表
fig.add_trace(go.Scatter(x=df.date.unique(), y=df_['len_0'], mode='lines', name='group_0'))
fig.add_trace(go.Scatter(x=df.date.unique(), y=df_['len_1'], mode='lines', name='group_1'))
fig.add_trace(go.Scatter(x=df.date.unique(), y=df_['len_2'], mode='lines', name='group_2'))
fig.add_trace(go.Scatter(x=df.date.unique(), y=df_['len_3'], mode='lines', name='group_3'))
fig.add_trace(go.Scatter(x=df.date.unique(), y=df_['len_4'], mode='lines', name='group_4'))



# 设置图表标题和坐标轴标签
fig.update_layout(title='每层数量', xaxis_title='X Axis Label', yaxis_title='Y Axis Label')

# 显示图表
fig.show()