复制链接
克隆策略
In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from biglearning.module2.common.data import Outputs
from zipline.finance.commission import PerOrder
import os
import copy


from bigdatasource.api import DataSource
from biglearning.api import M
from biglearning.api import tools as T
from biglearning.module2.common.data import Outputs
from joblib import Parallel, delayed

import warnings
warnings.filterwarnings('ignore')

#import fai
# 连接和初始化fai
#fai.init(cluster="fai-anthonywan-emzbzcpl.fai-anthonywan",token="exEOSXJhSFJGAJYYINsu36YDFwKxGhKW")
1)取个股 A 每天 1 分钟频率的行情数据。
2)使用分钟收盘价,分别计算 A 每一分钟的收益率(即 t 分钟收盘
价/t-1 分钟收盘价-1);再使用分钟成交量,分别计算 A 每一分钟的
增量成交量(即 t 分钟成交量减去 t-1 分钟成交量)。
3)对每天第 6 分钟至第 240 分钟的上述数据进行带截距项的最小二
乘回归,其中被解释变量 y 为第 t 分钟的收益率,解释变量 x 包括 6
项,分别为第 t、t-1、t-2、t-3、t-4、t-5 分钟的增量成交量。
4)记上述回归得到的截距项的 t 值为 t-intercept,第 t、t-1、t-2、t-3、
t-4、t-5 分钟的增量成交量的回归系数的 t 值分别为 t0、t1、t2、t3、t4、
t5,回归方程的 F 值为 F-all。
5)依据前述逻辑,t0、t1、t2、t3、t4、t5 分别表示了第 t、t-1、t-2、
t-3、t-4、t-5 分钟的增量成交量对第 t 分钟价格变化的推动程度,F-all
衡量了第 t、t-1、t-2、t-3、t-4、t-5 分钟中,是否存在某一分钟对第 t
分钟的价格变化存在显著推动。上述 6 个 t 值 1 个 F 值综合衡量了个
金融工程报告
5 敬请关注文后特别声明与免责条款
股短期内突然到来的信息的多少,t-intercept 则包含了市场层面的信息
和个股中长期的基本面信息,而残差部分则衡量了噪声的强弱。
至此,我们初步刻画了上述四种不同信息的量化指标,接下来我们对
这些指标进行组合、改进和拆分,使它们可以更好地表达前述的三条
逻辑,并分别构造了“朝没晨雾”因子、“午蔽古木”因子和“夜眠
霜路”因子。

 计算因子:

In [2]:
sd = '2022-01-04'
ed = '2022-12-31'
In [3]:
def calc_data(instrument,sd,ed):
    import warnings
    warnings.filterwarnings('ignore')


    try:
        df = DataSource('bar1m_CN_STOCK_A').read(start_date=sd,end_date=ed,instruments = instrument)

        def data_preprocess(df):
            df['分钟收益率'] = (df['close']/df['close'].shift())-1
            df['增量成交量'] = df['volume'] - df['volume'].shift()
            df['增量成交量t-1'] = df['增量成交量'].shift()
            df['增量成交量t-2'] = df['增量成交量'].shift(2)
            df['增量成交量t-3'] = df['增量成交量'].shift(3)
            df['增量成交量t-4'] = df['增量成交量'].shift(4)
            df['增量成交量t-5'] = df['增量成交量'].shift(5)

            return df

        df['year'] = df.date.dt.year
        df['month'] = df.date.dt.month
        df['day'] = df.date.dt.day
        df = df.groupby(['year','month','day']).apply(data_preprocess)

        df.dropna(inplace=True)

        def calc_factor(df):
            y = np.array(df['分钟收益率'])

            x = df[['增量成交量','增量成交量t-1','增量成交量t-2','增量成交量t-3','增量成交量t-4','增量成交量t-5']]
            X = sm.add_constant(x)
            result = sm.OLS(y,X).fit()

            inter_ = result.params.const
            thea0 = result.params[1]
            thea1 = result.params[2]
            thea2 = result.params[3]
            thea3 = result.params[4]
            thea4 = result.params[5]
            thea5 = result.params[6]
            f_value = result.fvalue

            #回归系数 F-Value intercept 赋值
            df['inter_'] = inter_
            df['thea0'] = thea0
            df['thea1'] = thea1
            df['thea2'] = thea2
            df['thea3'] = thea3
            df['thea4'] = thea4
            df['thea5'] = thea5
            df['f_value'] = f_value
            std_ = np.std([thea1,thea2,thea3,thea4,thea5])
            
            #计算因子和截距的绝对值
            df['factor1'] = std_
            df['abs_inter'] = abs(df['inter_'])

            return df
        
        df = df.groupby(['year','month','day']).apply(calc_factor)
        df.drop_duplicates(subset=['year','month','day'],inplace=True)

        df['date'] = df['date'].dt.to_period('D')
        df = df[['instrument','date','inter_','thea0','thea1','thea2','thea3','thea4','thea5','f_value','factor1']]

        def f_mean(df):
            df['f_mean'] = df['f_value'].mean()
            return df

        df = df.groupby('date').apply(f_mean)

        df['factor2'] = np.where(df['f_value']<df['f_mean'], abs(df['inter_']) *-1,df['inter_']  )

        return df
        
    except:
        return
    


    
In [4]:
df = calc_data('300170.SZA',sd,ed)
df
Out[4]:
instrument date inter_ thea0 thea1 thea2 thea3 thea4 thea5 f_value factor1 f_mean factor2
6 300170.SZA 2022-01-04 0.000019 -3.084580e-09 9.275010e-10 -1.645956e-09 -2.507793e-09 -6.309964e-10 -2.240749e-10 1.513273 1.181134e-09 1.513273 0.000019
246 300170.SZA 2022-01-05 -0.000127 -8.611417e-10 -2.107603e-10 2.300896e-09 8.214298e-10 8.853554e-10 1.806673e-09 0.816073 8.696156e-10 0.816073 -0.000127
486 300170.SZA 2022-01-06 0.000027 7.488682e-09 5.604965e-09 2.791516e-09 2.889036e-09 -5.503142e-10 -1.748527e-09 3.857935 2.636772e-09 3.857935 0.000027
726 300170.SZA 2022-01-07 -0.000280 1.424766e-09 -1.253514e-09 -1.308635e-10 -2.617025e-10 1.476637e-09 1.647092e-09 1.247636 1.105766e-09 1.247636 -0.000280
966 300170.SZA 2022-01-10 0.000250 1.297681e-08 6.969745e-09 2.841973e-09 1.476372e-09 -2.078435e-10 -6.484645e-10 19.638824 2.739822e-09 19.638824 0.000250
... ... ... ... ... ... ... ... ... ... ... ... ... ...
56886 300170.SZA 2022-12-26 0.000049 -7.190693e-10 -3.734219e-09 -2.733813e-09 -8.793942e-10 -2.025898e-09 -3.155600e-09 1.159796 9.857733e-10 1.159796 0.000049
57126 300170.SZA 2022-12-27 -0.000026 -3.487629e-09 1.434169e-10 -3.791204e-09 -7.048976e-10 -5.790493e-10 -3.400732e-10 3.187340 1.398802e-09 3.187340 -0.000026
57366 300170.SZA 2022-12-28 -0.000007 -4.919197e-09 -4.452662e-09 -1.144167e-09 -2.318970e-09 -2.787485e-09 -8.529038e-10 3.464348 1.288515e-09 3.464348 -0.000007
57606 300170.SZA 2022-12-29 0.000077 4.642357e-09 2.217049e-09 1.746071e-09 1.741365e-09 7.142220e-10 1.087838e-09 2.393457 5.329696e-10 2.393457 0.000077
57846 300170.SZA 2022-12-30 0.000039 3.937766e-09 1.082626e-09 1.581450e-09 2.507053e-09 1.715423e-09 7.977485e-10 2.175949 5.878370e-10 2.175949 0.000039

242 rows × 13 columns

In [13]:
df
Out[13]:
instrument date inter_ thea0 thea1 thea2 thea3 thea4 thea5 f_value factor1 f_mean factor2
6 300170.SZA 2022-01-04 0.000019 -3.084580e-09 9.275010e-10 -1.645956e-09 -2.507793e-09 -6.309964e-10 -2.240749e-10 1.513273 1.181134e-09 1.513273 0.000019
246 300170.SZA 2022-01-05 -0.000127 -8.611417e-10 -2.107603e-10 2.300896e-09 8.214298e-10 8.853554e-10 1.806673e-09 0.816073 8.696156e-10 0.816073 -0.000127
486 300170.SZA 2022-01-06 0.000027 7.488682e-09 5.604965e-09 2.791516e-09 2.889036e-09 -5.503142e-10 -1.748527e-09 3.857935 2.636772e-09 3.857935 0.000027
726 300170.SZA 2022-01-07 -0.000280 1.424766e-09 -1.253514e-09 -1.308635e-10 -2.617025e-10 1.476637e-09 1.647092e-09 1.247636 1.105766e-09 1.247636 -0.000280
966 300170.SZA 2022-01-10 0.000250 1.297681e-08 6.969745e-09 2.841973e-09 1.476372e-09 -2.078435e-10 -6.484645e-10 19.638824 2.739822e-09 19.638824 0.000250
... ... ... ... ... ... ... ... ... ... ... ... ... ...
56886 300170.SZA 2022-12-26 0.000049 -7.190693e-10 -3.734219e-09 -2.733813e-09 -8.793942e-10 -2.025898e-09 -3.155600e-09 1.159796 9.857733e-10 1.159796 0.000049
57126 300170.SZA 2022-12-27 -0.000026 -3.487629e-09 1.434169e-10 -3.791204e-09 -7.048976e-10 -5.790493e-10 -3.400732e-10 3.187340 1.398802e-09 3.187340 -0.000026
57366 300170.SZA 2022-12-28 -0.000007 -4.919197e-09 -4.452662e-09 -1.144167e-09 -2.318970e-09 -2.787485e-09 -8.529038e-10 3.464348 1.288515e-09 3.464348 -0.000007
57606 300170.SZA 2022-12-29 0.000077 4.642357e-09 2.217049e-09 1.746071e-09 1.741365e-09 7.142220e-10 1.087838e-09 2.393457 5.329696e-10 2.393457 0.000077
57846 300170.SZA 2022-12-30 0.000039 3.937766e-09 1.082626e-09 1.581450e-09 2.507053e-09 1.715423e-09 7.977485e-10 2.175949 5.878370e-10 2.175949 0.000039

242 rows × 13 columns

In [5]:
tmp = DataSource('bar1d_CN_STOCK_A').read(start_date='2022-01-01',end_date='2022-12-31')
instrument_list = tmp.instrument.unique()

results = Parallel(n_jobs=32)(delayed(calc_data)(ins,sd,ed) for ins in instrument_list)
In [32]:
df_all = pd.concat(results)
df_all.reset_index(inplace=True,drop=True)
df_all['date'] = df_all['date'].dt.to_timestamp()
In [33]:
m1 = M.instruments.v2(
    start_date=sd,
    end_date=ed,
    market='CN_STOCK_A',
    instrument_list='',
    max_count=0
)

m2 = M.input_features.v1(
    features="""
收盘=close_0

市值=market_cap_0

行业=industry_sw_level1_0

"""
)


m3 = M.general_feature_extractor.v7(
    instruments=m1.data,
    features=m2.data,
    start_date='',
    end_date='',
    before_start_days=500
)

m4 = M.derived_feature_extractor.v3(
    input_data=m3.data,
    features=m2.data,
    date_col='date',
    instrument_col='instrument',
    drop_na=False,
    remove_extra_columns=False,
    user_functions={}
)

m5 = M.chinaa_stock_filter.v1(
    input_data=m4.data,
    index_constituent_cond=['全部'],
    board_cond=['上证主板', '深证主板', '创业板'],
    industry_cond=['全部'],
    st_cond=['正常'],
    delist_cond=['非退市'],
    output_left_data=False
)

df = m5.data.read()

df = pd.merge(df_all,df[['instrument','date','收盘','市值','行业']],left_on=['instrument','date'],right_on=['instrument','date'],how='left')

df.sort_values(by='date',inplace=True)

df_2 = df.copy()
[2023-05-10 20:29:52.466004] INFO moduleinvoker: instruments.v2 开始运行..
[2023-05-10 20:29:52.639015] INFO moduleinvoker: 命中缓存
[2023-05-10 20:29:52.640847] INFO moduleinvoker: instruments.v2 运行完成[0.174937s].
[2023-05-10 20:29:52.647206] INFO moduleinvoker: input_features.v1 开始运行..
[2023-05-10 20:29:52.657585] INFO moduleinvoker: 命中缓存
[2023-05-10 20:29:52.659489] INFO moduleinvoker: input_features.v1 运行完成[0.012311s].
[2023-05-10 20:29:52.685498] INFO moduleinvoker: general_feature_extractor.v7 开始运行..
[2023-05-10 20:29:52.695823] INFO moduleinvoker: 命中缓存
[2023-05-10 20:29:52.698010] INFO moduleinvoker: general_feature_extractor.v7 运行完成[0.012543s].
[2023-05-10 20:29:52.710107] INFO moduleinvoker: derived_feature_extractor.v3 开始运行..
[2023-05-10 20:29:52.716740] INFO moduleinvoker: 命中缓存
[2023-05-10 20:29:52.724682] INFO moduleinvoker: derived_feature_extractor.v3 运行完成[0.014589s].
[2023-05-10 20:29:52.738739] INFO moduleinvoker: chinaa_stock_filter.v1 开始运行..
[2023-05-10 20:29:52.748927] INFO moduleinvoker: 命中缓存
[2023-05-10 20:29:52.753934] INFO moduleinvoker: chinaa_stock_filter.v1 运行完成[0.015187s].
In [34]:
df.dropna(inplace=True)

标准化

In [9]:
df.describe()
Out[9]:
inter_ thea0 thea1 thea2 thea3 thea4 thea5 f_value factor1 f_mean factor2 收盘 市值 行业
count 9.996720e+05 9.996720e+05 9.996720e+05 9.996720e+05 9.996720e+05 9.996720e+05 9.996720e+05 999672.000000 9.996720e+05 999672.000000 9.996720e+05 999672.000000 9.996720e+05 999672.000000
mean 4.101788e-06 3.125230e-09 3.289330e-09 2.394088e-09 1.730032e-09 1.199013e-09 6.443146e-10 4.556633 6.458281e-09 4.556633 4.101788e-06 98.943718 2.004017e+10 463250.766251
std 1.105551e-04 2.906869e-08 2.364977e-08 2.030825e-08 1.801249e-08 1.585030e-08 1.319680e-08 150.525585 9.828468e-09 150.525585 1.105551e-04 574.330872 7.810005e+10 185415.134228
min -1.143124e-03 -9.648523e-07 -7.033653e-07 -5.414069e-07 -9.727502e-07 -5.768922e-07 -6.524611e-07 0.008946 1.049657e-12 0.008946 -1.143124e-03 1.596797 9.910668e+08 0.000000
25% -5.495091e-05 -4.370984e-09 -2.486953e-09 -2.269581e-09 -2.140300e-09 -2.051704e-09 -1.951170e-09 1.332511 1.363480e-09 1.332511 -5.495091e-05 20.399050 3.291012e+09 280000.000000
50% 6.940487e-07 1.069395e-09 1.153778e-09 8.395449e-10 5.788946e-10 3.957285e-10 2.156619e-10 2.402919 3.104939e-09 2.402919 6.940487e-07 35.703648 5.755491e+09 420000.000000
75% 5.606231e-05 8.690303e-09 7.000051e-09 5.568143e-09 4.501679e-09 3.681109e-09 2.846338e-09 4.507635 7.449821e-09 4.507635 5.606231e-05 67.312326 1.343160e+10 640000.000000
max 2.632662e-03 9.218350e-07 7.213235e-07 7.114079e-07 6.291178e-07 5.330586e-07 5.583636e-07 114175.410392 5.130407e-07 114175.410392 2.632662e-03 32077.492188 2.576751e+12 770000.000000
In [10]:
import plotly.graph_objs as go
import numpy as np


# 使用 Histogram 类创建概率分布图
fig = go.Figure(data=[go.Histogram(x=df.factor1, histnorm='probability')])

# 设置图形的标题和轴标签
fig.update_layout(title='Probability Distribution', xaxis_title='Value', yaxis_title='Probability')

# 显示图形
fig.show()