强有效因子下的线性模型选股策略

与之前三因子模型不同的是, 这里的因子是作为自变量出现的.

In [1]:
from bigdatasource.api import DataSource
from bigdata.api.datareader import D
from biglearning.api import M
from biglearning.api import tools as T
from biglearning.module2.common.data import Outputs

import pandas as pd
import numpy as np
import math
import dai
import warnings
import datetime
from datetime import timedelta

from zipline.finance.commission import PerOrder
from zipline.api import get_open_orders
from zipline.api import symbol

from bigtrader.sdk import *
from bigtrader.utils.my_collections import NumPyDeque
from bigtrader.constant import OrderType
from bigtrader.constant import Direction
In [6]:
sql = """
SELECT * FROM minute_alpha
INNER JOIN weituo_alpha USING (date, instrument)
INNER JOIN (
    SELECT date, instrument, m_lead(close, 1) / close - 1 AS label
    FROM cn_stock_bar1d
)
USING (date, instrument)
QUALIFY COLUMNS(*) IS NOT NULL
ORDER BY date
"""

dai.query(sql, filters={'date': ['2023-01-01', '2023-10-10']}).df()
Out[6]:
date instrument down_vol_perc volume_perc2 volume_perc3 volume_perc4 volume_perc5 volume_perc6 volume_perc7 down_single_amt_perc corr_ret_lastret corr_close_nextopen corr_volume_amplitude late_skew_ret early_corr_volume_ret skew_order_diff kurt_order_diff early_kurt_order corr_buyorder_volume_price label
0 2023-01-03 002738.SZ 0.266167 0.080171 0.123480 0.091333 0.178554 0.127639 0.082233 0.298775 -0.007054 0.999854 0.700767 2.774458 0.361358 1196.221918 14.492204 737.238476 0.012809 -0.036044
1 2023-01-03 002748.SZ 0.451847 0.116601 0.074829 0.069882 0.128844 0.135896 0.095054 0.217405 -0.021527 0.972888 0.308663 0.690576 -0.344760 -74.838301 -3.257406 55.211985 -0.015418 -0.009756
2 2023-01-03 002752.SZ 0.461865 0.101127 0.088390 0.067375 0.096710 0.090263 0.105141 0.293590 -0.028415 0.940570 0.389884 0.672345 0.010790 538.617691 7.904877 156.729122 -0.002300 0.021073
3 2023-01-03 002766.SZ 0.380686 0.186309 0.090969 0.048147 0.061302 0.152971 0.119289 0.212515 -0.043871 0.992600 0.534397 1.681527 0.482658 22.369606 1.297831 105.287647 0.006499 0.005803
4 2023-01-03 002817.SZ 0.430948 0.098881 0.095776 0.043160 0.099586 0.052897 0.187300 0.266625 0.050784 0.982427 0.555177 0.430523 -0.219197 840.748602 10.229307 85.294049 0.006635 -0.022901
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
899673 2023-10-09 688226.SH 0.368160 0.109335 0.090093 0.059702 0.068046 0.095271 0.075894 0.483640 -0.013662 0.985192 0.652367 -0.683109 0.149593 4.285160 0.554714 275.806926 -0.054617 -0.009965
899674 2023-10-09 688228.SH 0.503352 0.187937 0.066540 0.072269 0.120864 0.125047 0.109456 0.397684 0.030238 0.972630 0.415913 -1.845387 0.165375 -381.653595 -12.593961 334.459650 -0.140239 0.008408
899675 2023-10-09 688236.SH 0.612623 0.148503 0.084495 0.092732 0.184392 0.209866 0.093921 0.527868 0.023340 0.973093 0.616043 0.156523 -0.063220 -1309.149245 -22.545072 32.641505 -0.123548 -0.014644
899676 2023-10-09 300031.SZ 0.431712 0.139345 0.091109 0.087869 0.066692 0.055367 0.153521 0.446083 0.043671 0.988358 0.531287 -0.410373 0.251947 46.504903 0.416769 486.824225 -0.047110 -0.004454
899677 2023-10-09 300127.SZ 0.432610 0.156382 0.131676 0.085787 0.067032 0.063313 0.113939 0.350103 0.086167 0.983069 0.617035 0.824281 0.192460 124.594455 4.333408 82.287417 -0.015064 0.003857

899678 rows × 20 columns

In [29]:
def get_data(sd, ed, training=True):
    if training:
        # 训练集是需要标签的
        sql = """
        SELECT * FROM minute_alpha
        INNER JOIN weituo_alpha USING (date, instrument)
        INNER JOIN (
            SELECT date, instrument, m_lead(close, 1) / close - 1 AS label
            FROM cn_stock_bar1d
        )
        USING (date, instrument)

        INNER JOIN (
            -- 必须要保证是在中证1000当中的股票池
            SELECT date, instrument AS _index_code, member_code AS instrument
            FROM cn_stock_index_component
            WHERE _index_code = '000852.SH'
        )
        USING (date, instrument)

        QUALIFY COLUMNS(*) IS NOT NULL
        ORDER BY date
        """
        df = dai.query(sql, filters={'date': [sd, ed]}).df()
    
    else:
        # 测试集是不需要标签的
        sql = """
        SELECT * FROM minute_alpha
        INNER JOIN weituo_alpha USING (date, instrument)
        INNER JOIN (
            -- 必须要保证是在中证1000当中的股票池
            SELECT date, instrument AS _index_code, member_code AS instrument
            FROM cn_stock_index_component
            WHERE _index_code = '000852.SH'
        )
        USING (date, instrument)
        QUALIFY COLUMNS(*) IS NOT NULL
        ORDER BY date
        """
        df = dai.query(sql, filters={'date': [sd, ed]}).df()
    return df
In [59]:
def scroling(train_sd, train_ed, test_sd, test_ed):
    # 加载训练集
    train_df = get_data(train_sd, train_ed)
    label = np.array(train_df['label']).reshape(-1, 1)
    feature = np.array(train_df.drop(['date', 'instrument', 'label'], axis=1))
    beta = np.linalg.inv(feature.T@feature)@feature.T@label
    
    # 加载回测集
    back_df = get_data(test_sd, test_ed, training=False)
    feature = np.array(back_df.drop(['date', 'instrument'], axis=1))
    ypre = (feature@beta).reshape(1, -1)[0]
    pred = back_df[['date', 'instrument']]
    pred['pred_label'] = ypre
    return pred
In [60]:
import pandas as pd

# 设置开始日期和结束日期
start_date = '2020-01-01'
end_date = '2020-12-31'

# 生成每个月第一天的日期范围
dates = pd.date_range(start=start_date, end=end_date, freq='MS')  # 'MS' 表示月初(Month Start)
date = []
for d in dates:
    date.append(d.strftime('%Y-%m-%d'))
date = date[:-1]
In [61]:
date
Out[61]:
['2020-01-01',
 '2020-02-01',
 '2020-03-01',
 '2020-04-01',
 '2020-05-01',
 '2020-06-01',
 '2020-07-01',
 '2020-08-01',
 '2020-09-01',
 '2020-10-01',
 '2020-11-01']
In [68]:
table = []
for i in range(len(date)-2):
    train_sd = date[i]
    train_ed = date[i+1]
    test_sd = date[i+1]
    test_ed = date[i+2]
    print(train_sd, train_ed, test_sd, test_ed)
    try:
        df = scroling(train_sd, train_ed, test_sd, test_ed)
    except:
        continue
    table.append(df)

pre_table = pd.concat(table, axis=0)
pre_table
Out[68]:
date instrument pred_label
0 2020-02-03 000016.SZ 0.006620
1 2020-02-03 000429.SZ -0.004153
2 2020-02-03 000582.SZ -0.005233
3 2020-02-03 000688.SZ -0.010350
4 2020-02-03 000710.SZ -0.000031
... ... ... ...
10257 2020-10-30 002498.SZ -0.002124
10258 2020-10-30 300170.SZ -0.000021
10259 2020-10-30 300307.SZ -0.000955
10260 2020-10-30 300634.SZ -0.000583
10261 2020-10-30 300773.SZ 0.005419

117427 rows × 3 columns

In [69]:
# 开始回测(回测准备工作)
instruments = {'market': 'CN_STOCK_A', 'instruments': list(pre_table.instrument.unique()), 'start_date': '2020-02-03', 'end_date': '2020-10-30'}
instruments = DataSource.write_pickle(instruments)

df = DataSource.write_df(pre_table)
In [70]:
# 交易引擎:初始化函数,只执行一次
def m4_initialize_bigquant_run(context):
    # 加载预测数据
    context.df = context.options['data'].read_df()

# 交易引擎:每个单位时间开盘前调用一次。
def m4_before_trading_start_bigquant_run(context, data):
    # 盘前处理,订阅行情等
    pass

# 交易引擎:tick数据处理函数,每个tick执行一次
def m4_handle_tick_bigquant_run(context, tick):
    pass

# 交易引擎:bar数据处理函数,每个时间单位执行一次
def m4_handle_data_bigquant_run(context, data):
    dt = data.current_dt.strftime('%Y-%m-%d')

    # 获取数据
    df = context.df[context.df['date']==dt].sort_values('pred_label', ascending=False)
    instruments = list(df[df['pred_label']>0].instrument)[:10]
    
    # 获取持仓信息
    holding = context.get_account_positions()
    holding_list = list(holding.keys())

    # 卖出不在买入池中的股票
    for ins in holding_list:
        if ins not in instruments and data.can_trade(ins):
            context.order_target(ins, 0)
            holding_list.remove(ins)
    
    # 买入持仓中没有的票
    for ins in instruments:
        if ins not in holding_list and data.can_trade(ins) and len(holding_list)<10:
            context.order_target_percent(ins, 1/10)
            holding_list.append(ins)


# 交易引擎:成交回报处理函数,每个成交发生时执行一次
def m4_handle_trade_bigquant_run(context, trade):
    pass

# 交易引擎:委托回报处理函数,每个委托变化时执行一次
def m4_handle_order_bigquant_run(context, order):
    pass

# 交易引擎:盘后处理函数,每日盘后执行一次
def m4_after_trading_bigquant_run(context, data):
    pass


m4 = M.hftrade.v2(
    instruments=instruments,
    options_data=df,
    start_date='',
    end_date='',
    initialize=m4_initialize_bigquant_run,
    before_trading_start=m4_before_trading_start_bigquant_run,
    handle_tick=m4_handle_tick_bigquant_run,
    handle_data=m4_handle_data_bigquant_run,
    handle_trade=m4_handle_trade_bigquant_run,
    handle_order=m4_handle_order_bigquant_run,
    after_trading=m4_after_trading_bigquant_run,
    capital_base=1000000,
    frequency='daily',
    price_type='真实价格',
    product_type='股票',
    before_start_days='0',
    volume_limit=1,
    order_price_field_buy='open',
    order_price_field_sell='open',
    benchmark='000300.SH',
    plot_charts=True,
    disable_cache=False,
    replay_bdb=False,
    show_debug_info=False,
    backtest_only=False
)
HFTrade(回测/模拟)
  • 收益率29.07%
  • 年化收益率40.13%
  • 基准收益率27.3%
  • 阿尔法0.14
  • 贝塔0.67
  • 夏普比率1.5
  • 胜率0.49
  • 盈亏比1.27
  • 收益波动率23.23%
  • 信息比率0.01
  • 最大回撤11.81%
日期 时间 证券代码 证券名称 买/卖 数量 成交价 成交金额 平仓盈亏 交易费用
Loading... (need help?)
日期 证券代码 证券名称 数量 持仓均价 收盘价 持仓市值 收益
Loading... (need help?)
时间 级别 内容
Loading... (need help?)

我们来做个总结

1. 三因子线性体系

时序上共用一套因子收益率, 每个标的在不同时点上的因子暴露不一样. $$\hat{\Beta}=(X^TX)^{-1}X^TY$$

2. 因子值作为因子暴露

这里与三因子线性体系是反着来的, 我们估计的$\beta$其实是一个时点上的因子收益率. 但是$\beta$的最终目的还是预测收益率.

3. 衍生到一般模型

更为复杂的模型, 已经失去了估计$\beta$的概念, 即便是没有$\beta$, 但还是能起到预测收益的作用.

In [ ]: