import pandas as pd
import numpy as np
from biglearning.module2.common.data import Outputs
from zipline.finance.commission import PerOrder
import os
from bigdatasource.api import DataSource
from biglearning.api import M
from biglearning.api import tools as T
from biglearning.module2.common.data import Outputs
import warnings
warnings.filterwarnings('ignore')
from joblib import Parallel, delayed
# Date bounds for the study. `ed` is passed to the instrument module further
# down; `sd` is not referenced anywhere in the visible part of this file.
sd = '2020-01-01'
ed = '2023-05-09'
# Read the factor table and keep only rows dated 2021-01-01 or later.
df = DataSource('tide_factor_U').read().loc[lambda tbl: tbl['date'] >= '2021-01-01']
df
# Step 1: instrument list for market 'CN_STOCK_A' from 2021-01-01 to `ed`.
# NOTE(review): the start date is a literal here rather than `sd`
# ('2020-01-01'); confirm which one is intended.
m1 = M.instruments.v2(
start_date='2021-01-01',
end_date=ed,
market='CN_STOCK_A',
instrument_list='',
max_count=0
)
# Step 2: feature definitions. Each line of the string maps an output column
# name (left of '=') to a platform expression (right of '='); the three
# resulting columns are the ones selected later when merging into `df`.
m2 = M.input_features.v1(
features="""
收盘=close_0
市值=market_cap_0
行业=industry_sw_level1_0
"""
)
# Step 3: extract base features for the instruments from step 1.
# start_date/end_date are left empty here; presumably the module then falls
# back to the dates carried by `m1.data` -- TODO confirm against platform docs.
# before_start_days=500 presumably requests extra history before the start
# date -- TODO confirm.
m3 = M.general_feature_extractor.v7(
instruments=m1.data,
features=m2.data,
start_date='',
end_date='',
before_start_days=500
)
# Step 4: evaluate the derived-feature expressions from step 2 on the
# extracted data. drop_na=False and remove_extra_columns=False are passed, so
# missing values and additional columns are presumably kept at this stage.
m4 = M.derived_feature_extractor.v3(
input_data=m3.data,
features=m2.data,
date_col='date',
instrument_col='instrument',
drop_na=False,
remove_extra_columns=False,
user_functions={}
)
# Step 5: stock-universe filter. The conditions passed are: all index
# constituents, three boards (Shanghai main, Shenzhen main, ChiNext), all
# industries, st_cond '正常' ("normal") and delist_cond '非退市'
# ("not delisted"). output_left_data=False is passed; presumably the
# filtered-out rows are not emitted -- TODO confirm.
m5 = M.chinaa_stock_filter.v1(
input_data=m4.data,
index_constituent_cond=['全部'],
board_cond=['上证主板', '深证主板', '创业板'],
industry_cond=['全部'],
st_cond=['正常'],
delist_cond=['非退市'],
output_left_data=False
)
# Materialise the filtered platform output.
df_ = m5.data.read()
# Left-join close price, market cap and industry onto the factor rows by
# (instrument, date); factor rows without a match get NaN in those columns.
df = df.merge(
    df_[['instrument', 'date', '收盘', '市值', '行业']],
    on=['instrument', 'date'],
    how='left',
)
# Order chronologically, then drop every row that has a NaN in any column
# (this includes factor rows that found no match in the join above).
df = df.sort_values(by='date')
df = df.dropna()
df
df
import statsmodels.api as sm

# Neutralise the raw factor: regress `factor` on industry dummies plus market
# cap and keep the residual as `neu_factor`.
#
# The dummies are cast to float explicitly. On pandas >= 2.0 `get_dummies`
# returns boolean columns; concatenated with the float market-cap column they
# turn the design matrix into an object-dtype array, which `sm.OLS` rejects.
ind_dummies = pd.get_dummies(df['行业'], prefix='行业').astype(float)
mkcap = df['市值']
train = pd.concat([ind_dummies, mkcap], axis=1)
# NOTE(review): a constant is added on top of a full set of industry dummies,
# so the design matrix is rank-deficient. The residuals are still well
# defined, but the individual coefficients in `model.params` are not unique.
X = sm.add_constant(train)
y = df['factor']
# NOTE(review): this is a single regression fitted on all rows of `df`
# together, not one regression per date -- confirm that this is intended.
model = sm.OLS(y, X).fit()
df['neu_factor'] = model.resid
df.describe()
def rolling(df, window=10):
    """Return a copy of *df* whose 'factor' column is the trailing mean of 'neu_factor'.

    The mean is taken over ``window`` consecutive rows (row-based, not
    calendar-based), so the first ``window - 1`` rows of the result are NaN.
    The frame is expected to hold a single instrument, already in date order.

    The input frame is left untouched: this function is used as a
    ``groupby(...).apply`` callback, and pandas does not support callbacks
    that mutate the group they are handed.

    Args:
        df: frame with a 'neu_factor' column.
        window: number of rows in the trailing window (default 10).

    Returns:
        A new DataFrame with 'factor' added or overwritten.
    """
    smoothed = df.copy()
    smoothed['factor'] = df['neu_factor'].rolling(window).mean()
    return smoothed
# Smooth the neutralised factor per instrument.
# group_keys=False keeps the original row index: on pandas >= 2.0 the default
# would add 'instrument' as an index level while it is also still a column,
# and the next groupby('instrument') would then fail as ambiguous.
df = df.groupby('instrument', group_keys=False).apply(rolling)
# Drops every row with a NaN in any column, which includes the warm-up rows
# of the rolling mean at the start of each instrument.
df.dropna(inplace=True)
def calc_return(df):
    """Return a copy of *df* with a 'return' column: next row's close / this row's close.

    The value is a gross ratio (1.0 means unchanged), not a percentage change.
    The last row has no following row and therefore gets NaN. The frame is
    expected to hold a single instrument, already in date order.

    The input frame is left untouched: this function is used as a
    ``groupby(...).apply`` callback, and pandas does not support callbacks
    that mutate the group they are handed.
    """
    close = df['收盘']
    with_return = df.copy()
    with_return['return'] = close.shift(-1) / close
    return with_return
# Forward return per instrument; grouping keeps shift(-1) from reading the
# next instrument's price. group_keys=False keeps the original row index
# (on pandas >= 2.0 the default would add 'instrument' as an index level and
# make later groupby calls on that column ambiguous).
df = df.groupby('instrument', group_keys=False).apply(calc_return)
def cut(df, n_bins=5):
    """Return a copy of *df* with a layer label and per-layer row counts.

    Adds:
      * 'groups'  -- integer bin index 0..n_bins-1 from ``pd.cut`` on 'factor'.
      * 'len_<k>' -- number of rows of *df* that fell into bin k, repeated on
        every row.

    ``pd.cut`` with an integer ``bins`` splits the value range into bins of
    equal width, so the layers are generally NOT of equal size.
    NOTE(review): if equal-sized layers are wanted, ``pd.qcut`` is the
    quantile-based alternative -- confirm the intended definition.

    The input frame is left untouched: this function is used as a
    ``groupby(...).apply`` callback, and pandas does not support callbacks
    that mutate the group they are handed.

    Args:
        df: frame with a 'factor' column (one cross-section, e.g. one date).
        n_bins: number of layers (default 5).
    """
    labels = pd.cut(df['factor'], bins=n_bins, labels=False)
    layered = df.copy()
    layered['groups'] = labels
    for k in range(n_bins):
        layered['len_{}'.format(k)] = int((labels == k).sum())
    return layered
# Assign layers cross-sectionally, one date at a time.
# group_keys=False keeps the original row index: on pandas >= 2.0 the default
# would add 'date' as an index level while it is also still a column, and the
# following groupby on 'date' would then fail as ambiguous.
df = df.groupby('date', group_keys=False).apply(cut)
# Drops every row with a NaN in any column; at this point that includes rows
# whose forward 'return' is NaN (the last row of each instrument).
df.dropna(inplace=True)
def calc_mr(df):
    """Return a copy of *df* with 'mr' set to the mean of 'return' on every row.

    The mean is an equal-weighted average over all rows of the frame.

    The input frame is left untouched: this function is used as a
    ``groupby(...).apply`` callback, and pandas does not support callbacks
    that mutate the group they are handed.
    """
    averaged = df.copy()
    averaged['mr'] = df['return'].mean()
    return averaged
# Mean forward return of each layer on each date.
# group_keys=False keeps the original row index (on pandas >= 2.0 the default
# would add 'date'/'groups' as index levels while they are still columns,
# which makes the later groupby/sort on those names ambiguous).
df = df.groupby(['date', 'groups'], group_keys=False).apply(calc_mr)
# Collapse to one row per (date, layer). Only 'mr' and the 'len_*' counts are
# meaningful on the surviving row; the per-stock columns belong to whichever
# stock happened to come first in that layer.
df.drop_duplicates(subset=['date', 'groups'], inplace=True)
def calc_cumprod(df):
    """Return a date-ordered copy of *df* with 'pnl' = cumulative product of 'mr'.

    'mr' is treated as a gross per-period return, so 'pnl' is the compounded
    value of one unit invested at the first date of the frame.

    The input frame is left untouched (no in-place sort, no new column on the
    argument): this function is used as a ``groupby(...).apply`` callback,
    and pandas does not support callbacks that mutate the group they are
    handed.
    """
    ordered = df.sort_values(by='date')
    ordered['pnl'] = ordered['mr'].cumprod()
    return ordered
# Compound each layer's mean return through time.
# group_keys=False keeps the original row index (on pandas >= 2.0 the default
# would add 'groups' as an index level while it is still a column, which
# makes any later groupby on 'groups' ambiguous).
df = df.groupby('groups', group_keys=False).apply(calc_cumprod)
df.sort_values(by='date', ascending=True, inplace=True)
def calc_cumprod(df):
    """Return a date-ordered copy of *df* with 'pnl' = cumulative product of 'mr'.

    This rebinds the name ``calc_cumprod`` that is already defined earlier in
    the file; from here on this definition is the one in effect.

    'mr' is treated as a gross per-period return, so 'pnl' is the compounded
    value of one unit invested at the first date of the frame.

    The input frame is left untouched (no in-place sort, no new column on the
    argument): this function is used as a ``groupby(...).apply`` callback,
    and pandas does not support callbacks that mutate the group they are
    handed.
    """
    ordered = df.sort_values(by='date')
    ordered['pnl'] = ordered['mr'].cumprod()
    return ordered
# Recompute 'pnl' per layer. 'pnl' is derived from 'mr' only, so running this
# step again on a frame that already has 'pnl' yields the same values.
# group_keys=False keeps the original row index (on pandas >= 2.0 the default
# would add 'groups' as an index level while it is still a column, which
# makes any later groupby on 'groups' ambiguous).
df = df.groupby('groups', group_keys=False).apply(calc_cumprod)
df.sort_values(by='date', ascending=True, inplace=True)
import plotly.graph_objects as go

# Line chart of the compounded return ('pnl') of each of the five layers.
fig = go.Figure()
# Each trace uses the dates of its own layer. pd.cut bins are equal-width, so
# a layer can be empty on some dates; pairing every layer with the full list
# of unique dates would then give x and y of different lengths and shift the
# curve against the time axis.
for layer_no in range(5):
    layer_rows = df[df['groups'] == layer_no]
    fig.add_trace(go.Scatter(
        x=layer_rows['date'],
        y=layer_rows['pnl'],
        mode='lines',
        name='group_{}'.format(layer_no),
    ))
# Chart title and axis labels.
fig.update_layout(title='分层测试', xaxis_title='X Axis Label', yaxis_title='Y Axis Label')
# Render the figure.
fig.show()
df
# One row per date (the first one encountered); the 'len_*' columns hold the
# per-layer stock counts for that date.
df_ = df.drop_duplicates(subset=['date'])
df_
import plotly.graph_objects as go

# Line chart of how many stocks fall into each layer on each date.
fig = go.Figure()
# One trace per layer, all plotted against the unique dates of `df`.
for layer_no in range(5):
    fig.add_trace(go.Scatter(
        x=df.date.unique(),
        y=df_['len_{}'.format(layer_no)],
        mode='lines',
        name='group_{}'.format(layer_no),
    ))
# Chart title and axis labels.
fig.update_layout(title='每层数量', xaxis_title='X Axis Label', yaxis_title='Y Axis Label')
# Render the figure.
fig.show()