import pandas as pd
import numpy as np
import statsmodels.api as sm
from biglearning.module2.common.data import Outputs
from zipline.finance.commission import PerOrder
import os
import copy
from bigdatasource.api import DataSource
from biglearning.api import M
from biglearning.api import tools as T
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings('ignore')
#import fai
# connect to and initialize fai
#fai.init(cluster="fai-anthonywan-emzbzcpl.fai-anthonywan",token="exEOSXJhSFJGAJYYINsu36YDFwKxGhKW")
sd = '2022-01-04'
ed = '2022-12-31'
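# Factor construction implemented by calc_data below, per instrument and per trading day:
#   regress minute returns on incremental volume and its five lags
#   (ret_t = const + thea0*dV_t + thea1*dV_{t-1} + ... + thea5*dV_{t-5}),
#   then take factor1 = std(thea1..thea5) and
#   factor2 = -|intercept| when the regression F-value is below f_mean, else the intercept.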
def calc_data(instrument, sd, ed):
    """Per-instrument intraday regression factor from 1-minute bars."""
    import warnings
    warnings.filterwarnings('ignore')
    try:
        df = DataSource('bar1m_CN_STOCK_A').read(start_date=sd, end_date=ed, instruments=instrument)

        def data_preprocess(df):
            # minute return, incremental volume, and five lags of the incremental volume
            df['分钟收益率'] = (df['close'] / df['close'].shift()) - 1
            df['增量成交量'] = df['volume'] - df['volume'].shift()
            df['增量成交量t-1'] = df['增量成交量'].shift()
            df['增量成交量t-2'] = df['增量成交量'].shift(2)
            df['增量成交量t-3'] = df['增量成交量'].shift(3)
            df['增量成交量t-4'] = df['增量成交量'].shift(4)
            df['增量成交量t-5'] = df['增量成交量'].shift(5)
            return df

        df['year'] = df.date.dt.year
        df['month'] = df.date.dt.month
        df['day'] = df.date.dt.day
        df = df.groupby(['year', 'month', 'day']).apply(data_preprocess)
        df.dropna(inplace=True)

        def calc_factor(df):
            # daily OLS: minute return ~ const + incremental volume and its five lags
            y = np.array(df['分钟收益率'])
            x = df[['增量成交量', '增量成交量t-1', '增量成交量t-2', '增量成交量t-3', '增量成交量t-4', '增量成交量t-5']]
            X = sm.add_constant(x)
            result = sm.OLS(y, X).fit()
            # label-based access avoids the deprecated positional indexing on the params Series
            inter_ = result.params['const']
            thea0 = result.params['增量成交量']
            thea1 = result.params['增量成交量t-1']
            thea2 = result.params['增量成交量t-2']
            thea3 = result.params['增量成交量t-3']
            thea4 = result.params['增量成交量t-4']
            thea5 = result.params['增量成交量t-5']
            f_value = result.fvalue
            # assign regression coefficients, F-value and intercept back onto the frame
            df['inter_'] = inter_
            df['thea0'] = thea0
            df['thea1'] = thea1
            df['thea2'] = thea2
            df['thea3'] = thea3
            df['thea4'] = thea4
            df['thea5'] = thea5
            df['f_value'] = f_value
            std_ = np.std([thea1, thea2, thea3, thea4, thea5])
            # factor1 = dispersion of the lagged coefficients; also keep the absolute intercept
            df['factor1'] = std_
            df['abs_inter'] = abs(df['inter_'])
            return df

        df = df.groupby(['year', 'month', 'day']).apply(calc_factor)
        df.drop_duplicates(subset=['year', 'month', 'day'], inplace=True)
        df['date'] = df['date'].dt.to_period('D')
        df = df[['instrument', 'date', 'inter_', 'thea0', 'thea1', 'thea2', 'thea3', 'thea4', 'thea5', 'f_value', 'factor1']]

        def f_mean(df):
            df['f_mean'] = df['f_value'].mean()
            return df

        df = df.groupby('date').apply(f_mean)
        # factor2: use -|intercept| when f_value is below f_mean, otherwise the raw intercept
        df['factor2'] = np.where(df['f_value'] < df['f_mean'], abs(df['inter_']) * -1, df['inter_'])
        return df
    except Exception:
        # fail soft so the parallel run can skip instruments with missing or too-short data
        return None
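# Smoke-test the factor calculation on a single instrument before the parallel run.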
df = calc_data('300170.SZA',sd,ed)
df
# Build the instrument universe from daily bars, then run calc_data in parallel.
tmp = DataSource('bar1d_CN_STOCK_A').read(start_date='2022-01-01', end_date='2022-12-31')
instrument_list = tmp.instrument.unique()
results = Parallel(n_jobs=32)(delayed(calc_data)(ins, sd, ed) for ins in instrument_list)
# pd.concat silently drops the None results returned for failed instruments
df_all = pd.concat(results)
df_all.reset_index(inplace=True, drop=True)
# convert the Period dates back to timestamps for the merge with daily data below
df_all['date'] = df_all['date'].dt.to_timestamp()
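# Use the platform's pipeline modules to pull daily close, market cap and SW level-1 industry,
# restrict the universe to SSE/SZSE main boards and ChiNext, non-ST and non-delisted stocks,
# then merge the result onto the factor panel.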
m1 = M.instruments.v2(
start_date=sd,
end_date=ed,
market='CN_STOCK_A',
instrument_list='',
max_count=0
)
m2 = M.input_features.v1(
features="""
收盘=close_0
市值=market_cap_0
行业=industry_sw_level1_0
"""
)
m3 = M.general_feature_extractor.v7(
instruments=m1.data,
features=m2.data,
start_date='',
end_date='',
before_start_days=500
)
m4 = M.derived_feature_extractor.v3(
input_data=m3.data,
features=m2.data,
date_col='date',
instrument_col='instrument',
drop_na=False,
remove_extra_columns=False,
user_functions={}
)
m5 = M.chinaa_stock_filter.v1(
input_data=m4.data,
index_constituent_cond=['全部'],
board_cond=['上证主板', '深证主板', '创业板'],
industry_cond=['全部'],
st_cond=['正常'],
delist_cond=['非退市'],
output_left_data=False
)
df = m5.data.read()
# attach daily close, market cap and industry to the minute-bar factor panel
df = pd.merge(df_all, df[['instrument', 'date', '收盘', '市值', '行业']], on=['instrument', 'date'], how='left')
df.sort_values(by='date', inplace=True)
df_2 = df.copy()  # keep a copy before dropping rows with missing daily data
df.dropna(inplace=True)
df.describe()
import plotly.graph_objs as go
# plot the probability distribution of factor1 with a normalized histogram
fig = go.Figure(data=[go.Histogram(x=df.factor1, histnorm='probability')])
# set the figure title and axis labels
fig.update_layout(title='Probability Distribution', xaxis_title='Value', yaxis_title='Probability')
# show the figure
fig.show()
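# A minimal follow-on sketch (not part of the pipeline above): factor2 is computed but never
# visualized; assuming df from the merge step is still in scope and contains a factor2 column,
# the same normalized histogram can be reused to inspect its distribution.
fig2 = go.Figure(data=[go.Histogram(x=df.factor2, histnorm='probability')])
fig2.update_layout(title='factor2 Probability Distribution', xaxis_title='Value', yaxis_title='Probability')
fig2.show()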