import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris dataset (150 samples, 4 features, 3 classes).
data = load_iris()
X, y = data.data, data.target

# Hold out 20% of the samples for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap the numpy arrays in xgboost's native DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster hyper-parameters: 3-class softmax objective, shallow trees.
params = {'objective': 'multi:softmax', 'num_class': 3, 'eta': 0.1, 'max_depth': 3}

# Fit 100 boosting rounds.
model = xgb.train(params, dtrain, num_boost_round=100)

# Predict hard class labels on the held-out set and report accuracy.
y_pred = model.predict(dtest)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 100.00%
交叉熵描述的是两个概率分布之间的距离, 交叉熵是一个非负的实数, 当值越小, 表示两个概率分布越接近.
以样本$(x_i, label_i)$为例, 假如该样本标签$label_i=1$, 则有概率$P(label_i=1)=100\%$, 现在我使用模型$f(.)$对该数据点建模, 使用sigmoid激活后: $$P(pre\_label_i=1)=\frac{1}{1+\exp\{-f(x_i)\}}$$ $pre\_label$为预测标签, 由于样本实际标签为1, 所以我们只需要不断地去更新模型$f(.)$, 使得$P(pre\_label_i=1)$尽可能逼近1(最好等于1)即可.
所以, 交叉熵出现了:
$$L_i=-P(label_i=1)\times\log\{P(pre\_label_i=1)\}-(1-P(label_i=1))\times\log\{1-P(pre\_label_i=1)\}$$接下来引入一个中间变量$y_i$(通常是连续型标签), 其中$label_i=sgn(y_i)$, $sgn(x)$为符号函数, $x>=0$时取1其余取-1, 把概率和中间变量带入交叉熵, 整理一下一样可以得到华泰研报的形式: $$L_i=\log\{1+\exp\{-sgn(y_i)f(x_i)\}\}$$ 因为$P(label_i=1)$和$1-P(label_i=1)$总有一个是0!
import numpy as np
def sgn(x):
    """Elementwise sign function: 1 where x >= 0, and -1 elsewhere."""
    nonneg = np.greater_equal(x, 0)
    return np.where(nonneg, 1, -1)
# Toy data: 10 samples with 5 features each, plus continuous targets.
x = np.random.randn(10, 5)
y = np.random.randn(10)

def model(x):
    """
    Stand-in linear model. NOTE: the coefficients are re-drawn at random on
    every call (the original comment claims "parameters already known").
    """
    weights = np.random.randn(x.shape[1], 1)
    bias = np.random.randn(1)
    return np.reshape(x @ weights + bias, (1, -1))[0]
# Cross-entropy in the report's form: L_i = log(1 + exp(-sgn(y_i) * f(x_i))).
def cross_entropy(ypre, ytrue):
    """
    Total cross-entropy over all samples.

    :params ypre: model outputs f(x_i)
    :params ytrue: continuous true labels; only their sign is used
    """
    # Per-sample loss terms.
    per_sample = np.log(1 + np.exp(-sgn(ytrue) * ypre))
    # Aggregate across the batch.
    return np.sum(per_sample)

cross_entropy(model(x), y)
对于模型求得的概率, 我们给出一个通式: $$P(pre\_label_i)=\frac{1}{1+\exp\{-sgn(y_i)f(x_i)\}}=\frac{1}{1+\exp\{-label_i\times f(x_i)\}}$$ 当$pre\_label_i=1$时完全和$P(pre\_label_i)$能对应上, 当$pre\_label_i=-1$也能对上.
一阶导: $$\frac{\partial L_i}{\partial f(x_i)}=\frac{-sgn(y_i)\exp\{-sgn(y_i)f(x_i)\}}{1+\exp\{-sgn(y_i)f(x_i)\}}=-sgn(y_i)(1-P(pre\_label_i))=-label_i\times (1-P(pre\_label_i))$$
二阶导: $$\frac{\partial^2 L_i}{\partial f(x_i)^2}=\frac{\exp\{-sgn(y_i)f(x_i)\}}{(1+\exp\{-sgn(y_i)f(x_i)\})^2}=P(pre\_label_i)(1-P(pre\_label_i))$$ 为什么要求二阶导: 因为后续xgboost要用到二阶导.
import numpy as np
# Fresh toy data: 10 samples, 5 features, continuous targets.
x = np.random.randn(10, 5)
y = np.random.randn(10)

def model(x):
    """
    Stand-in linear model whose coefficients are re-drawn at random per call.
    """
    coef = np.random.randn(x.shape[1], 1)
    intercept = np.random.randn(1)
    return np.reshape(x @ coef + intercept, (1, -1))[0]

def grad(x, y):
    """First derivative of the cross-entropy w.r.t. f(x): -sgn(y) * (1 - P)."""
    prob = 1 / (1 + np.exp(-sgn(y) * model(x)))
    return -sgn(y) * (1 - prob)

print('所有样本一阶导向量', grad(x, y))
所有样本一阶导向量 [-0.15436426 -0.99937067 -0.35164215 0.23757923 0.18162665 -0.09411906 0.87817236 -0.39347354 -0.91955165 0.39246827]
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Binary classification data; recode the {0, 1} targets as {-1, +1} to match
# the sign-based logistic loss used below.
data = load_breast_cancer()
x, y = data.data, data.target
y = np.where(y == 1, 1, -1)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)
# Custom objective: logistic loss on {-1, +1} labels.
def logistic_obj(pred, dtrain):
    """
    Gradient and hessian of L_i = log(1 + exp(-label_i * pred_i)).

    :params pred: raw booster scores f(x)
    :params dtrain: DMatrix carrying the {-1, +1} labels
    """
    label = dtrain.get_label()
    # P(predicted label matches) under the sigmoid link.
    prob = 1 / (np.exp(-label * pred) + 1)
    first = -label * (1 - prob)   # first derivative
    second = prob * (1 - prob)    # second derivative
    return first, second
# Pack the train/test splits into xgboost's DMatrix format for xgb.train.
dtrain = xgb.DMatrix(xtrain, ytrain)
dtest = xgb.DMatrix(xtest, ytest)
# Custom evaluation: sign-prediction accuracy (MSE is meaningless for this
# classification setup).
def metric(pred, y):
    """
    Accuracy for {-1, +1} labels; `y` is the DMatrix holding discrete labels.
    """
    # Map raw scores to probabilities, then to hard {-1, +1} predictions.
    prob = 1 / (1 + np.exp(-pred))
    hard = np.where(prob > 0.5, 1, -1)
    labels = y.get_label()
    acc = np.sum(hard == labels) / len(labels)
    return 'acc', acc
# Train with the custom objective/metric. The banner previously read
# "focal loss", but the objective implemented above is plain logistic loss.
print("---------logistic loss-----------")
params = {'tree_method': 'hist'}
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=5, early_stopping_rounds=50,
                  evals=[(dtrain, 'train'), (dtest, 'test')], verbose_eval=1, obj=logistic_obj, feval=metric)
---------focal loss----------- [0] train-rmse:0.537377 test-rmse:0.568361 train-acc:0.964835 test-acc:0.947368 Multiple eval metrics have been passed: 'test-acc' will be used for early stopping. Will train until test-acc hasn't improved in 50 rounds. [1] train-rmse:0.392799 test-rmse:0.52438 train-acc:0.982418 test-acc:0.95614 [2] train-rmse:0.583661 test-rmse:0.726952 train-acc:0.989011 test-acc:0.95614 [3] train-rmse:0.85857 test-rmse:0.971509 train-acc:0.991209 test-acc:0.95614 [4] train-rmse:1.14925 test-rmse:1.24465 train-acc:0.989011 test-acc:0.947368
# Report hold-out accuracy: map raw scores through the sigmoid, threshold at
# 0.5, and compare the resulting {-1, +1} labels with the truth.
pred = model.predict(dtest)
p = 1 / (1 + np.exp(-pred))
acc = np.sum(np.where(p > 0.5, 1, -1) == ytest) / len(ytest)
print('预测正确率为: ', acc)
预测正确率为: 0.9473684210526315
其中$c_j$为阈值点. 我们可以从多任务的角度去理解它. 这里有K个任务, 所以会有K个损失函数, 之后将损失函数等全合成使得所有任务公用一个模型参数.
以阈值点$c_j$为例. 这里的$y_i$为连续型标签, 首先我们需要将标签二值化, 大于$c_j$的数据打上标签1, 反之打上标签-1, 很显然, 该任务是一个二分类任务, 其损失函数为: $$loss_i = \log\{1+\exp\{-sgn(y_i-c_j)(f(x_i)-c_j)\}\}$$ 根据之前的结论, 这是一个交叉熵, 所以所有的性质都一目了然. 用这种方式练出来的模型是为了预测样本是否大于阈值$c_j$.
现在我们将所有任务串起来看, 对于所有给定的阈值点, $c_1 < c_2 < ... < c_K$, 最小化损失函数得到的模型的最理想状态便是所有样本都能被正确的分到对应的区间.
对于${\forall}x_i, i=1, ..., n$, ${\exists}f(x_i)$, s.t $$f(x_i)=\mathop{argmin}\limits_{f(x_i)}\log\{1+\exp\{-sgn(y_i)f(x_i)\}\}$$ 若样本$x_i$的标签$y_i$是大于$c_1$小于$c_2$, 则模型$f(x_i)$的值刚好落在$c_1$与$c_2$之间.
研报上标签是10日收益率, 这里二分类效果太好了, 所以我试试一日收益率
import dai
# Daily stock features computed from 1-minute bars via one dai SQL query:
# downside-volatility share, per-half-hour volume shares (volume_perc2..7),
# turnover share in down minutes, intraday return autocorrelation, prev-close /
# next-open correlation, late-session return skew, and the early-session
# volume/return correlation.
# NOTE(review): the Chinese `--` comments below are inside the SQL string and
# are part of the query text, so they are intentionally left untouched.
sql = """
WITH main_table AS (
SELECT dt AS date, instrument,
MAX(down_vol_perc) AS down_vol_perc,
MAX(volume_perc2) AS volume_perc2,
MAX(volume_perc3) AS volume_perc3,
MAX(volume_perc4) AS volume_perc4,
MAX(volume_perc5) AS volume_perc5,
MAX(volume_perc6) AS volume_perc6,
MAX(volume_perc7) AS volume_perc7,
MAX(down_single_amt_perc) AS down_single_amt_perc,
MAX(corr_ret_lastret) AS corr_ret_lastret,
MAX(corr_close_nextopen) AS corr_close_nextopen
FROM (
SELECT date AS _date, instrument,
-- 只保留年月日
DATETRUNC('DAY', _date) AS dt,
LOG(close / (LAG(close, 1) OVER (PARTITION BY instrument ORDER BY _date))) AS ret,
-- 两分钟前的收益率
LAG(ret, 2) OVER (PARTITION BY instrument ORDER BY _date) AS ret_before,
-- 前一分钟收盘价和后一分钟开盘价
LAG(close, 1) OVER (PARTITION BY instrument ORDER BY _date) AS pre_close,
-- 提取时分, 方便标注第n个半小时
EXTRACT(HOUR FROM _date) AS _hour,
EXTRACT(MINUTE FROM _date) AS _min,
-- 计算分母
SUM(ret*ret) OVER (PARTITION BY instrument, dt) AS _low,
-- 计算分子
IF(ret>0, 0, 1) AS _down,
ret * _down AS _down_ret,
SUM(_down_ret*_down_ret) OVER (PARTITION BY instrument, dt) AS _up,
-- 计算下行波动率占比
_up / _low AS down_vol_perc,
-- 当天成交量总量
SUM(volume) OVER (PARTITION BY instrument, dt) AS total_volume,
-- 第二个半小时成交量(10点到10点半)
IF((_hour=10 AND _min<=30), 1, 0) AS _second_half,
(SUM(volume * _second_half) OVER (PARTITION BY instrument, dt)) / total_volume AS volume_perc2,
-- 第三个半小时成交量(10点半到11点)
IF(((_hour=10 AND _min>=30) OR (_hour=11 AND _min=0)), 1, 0) AS _third_half,
(SUM(volume * _third_half) OVER (PARTITION BY instrument, dt)) / total_volume AS volume_perc3,
-- 第四个半小时成交量(11点到11点半)
IF((_hour=11 AND _min<=30), 1, 0) AS _forth_half,
(SUM(volume * _forth_half) OVER (PARTITION BY instrument, dt)) / total_volume AS volume_perc4,
-- 第五个半小时成交量(1点到1点半)
IF((_hour=13 AND _min<=30), 1, 0) AS _fifth_half,
(SUM(volume * _fifth_half) OVER (PARTITION BY instrument, dt)) / total_volume AS volume_perc5,
-- 第六个半小时成交量(1点半到2点)
IF(((_hour=13 AND _min>=30) OR (_hour=14 AND _min=0)), 1, 0) AS _sixth_half,
(SUM(volume * _sixth_half) OVER (PARTITION BY instrument, dt)) / total_volume AS volume_perc6,
-- 第七个半小时成交量(2点到2点半)
IF((_hour=14 AND _min<=30), 1, 0) AS _seventh_half,
(SUM(volume * _seventh_half) OVER (PARTITION BY instrument, dt)) / total_volume AS volume_perc7,
-- 标注收益率为负的时刻
IF(ret<0, 1, 0) AS neg_ret,
(SUM(amount * neg_ret) OVER (PARTITION BY instrument, dt)) / (SUM(amount) OVER (PARTITION BY instrument, dt)) AS down_single_amt_perc,
-- 前后两分钟收益率的相关系数
CORR(ret, ret_before) OVER (PARTITION BY instrument, dt) AS corr_ret_lastret,
-- 前一分钟收盘价和后一分钟开盘价相关性
CORR(pre_close, open) OVER (PARTITION BY instrument, dt) AS corr_close_nextopen
FROM cn_stock_bar1m
)
GROUP BY date, instrument
),
-- 尾盘收益率偏度
skew_table AS (
SELECT dt AS date, instrument, MAX(late_skew_yet) AS late_skew_yet FROM (
SELECT date AS _date, instrument,
-- 只保留年月日
DATETRUNC('DAY', date) AS dt,
-- 提取时分, 方便提取尾盘数据
EXTRACT(HOUR FROM _date) AS _hour,
EXTRACT(MINUTE FROM _date) AS _min,
RANK() OVER (PARTITION BY instrument ORDER BY _date) AS _rank,
-- 提取收益率
LOG(close / (LAG(close, 1) OVER (PARTITION BY instrument ORDER BY _date))) AS ret,
-- 提取尾盘偏度
SKEWNESS(ret) OVER (PARTITION BY instrument, dt) AS late_skew_yet
FROM cn_stock_bar1m
WHERE _hour >= 14 AND (_min>=30 OR _min=0)
QUALIFY _rank > 1
ORDER BY _date, instrument
)
GROUP BY date, instrument
),
-- 早盘成交量与收益率的相关性
v_r_corr AS (
SELECT dt AS date, instrument, MAX(early_corr_volume_ret) AS early_corr_volume_ret FROM (
SELECT date AS _date, instrument,
-- 只保留年月日
DATETRUNC('DAY', _date) AS dt,
LOG(close / (LAG(close, 1) OVER (PARTITION BY instrument ORDER BY _date))) AS ret,
-- 提取时分, 方便标注第一个半小时
EXTRACT(HOUR FROM _date) AS _hour,
EXTRACT(MINUTE FROM _date) AS _min,
-- 早盘半小时
CORR(volume, ret) OVER (PARTITION BY instrument, dt) AS early_corr_volume_ret
FROM cn_stock_bar1m
WHERE (_hour=9 AND _min>=30) OR (_hour=10 AND _min=0)
)
GROUP BY date, instrument
)
SELECT * FROM main_table
INNER JOIN skew_table USING (date, instrument)
INNER JOIN v_r_corr USING (date, instrument)
"""
# Pull the features for 2019-01-03 .. 2021-12-31.
df = dai.query(sql, filters={'date': ['2019-01-03', '2021-12-31']}).df()
df
# Build the label: next-day return (next close / next open - 1), clipped to
# the 1%/99% cross-sectional quantiles, then normalized. Limit-hit days
# (next-day high == low) are excluded.
# NOTE(review): the Chinese `--` comments are part of the SQL string and are
# intentionally left untouched.
sql = """
SELECT
-- 计算的是未来1日的收益率。这是通过将1天后的收盘价除以第二天的开盘价得到的。这里使用的是一个叫做m_lead的函数,它可以获取某个字段在未来某天的值。
-- _future_return 是一个中间变量名,以 _ 开始的别名列不会在最终结果返回
m_lead(close, 1) / m_lead(open, 1)-1 AS _future_return,
-- 计算未来5日收益率的1%分位数。all_quantile_cont是一个分位数函数,它能够计算出某个字段值的分位数,这里是计算1%的分位数。
c_quantile_cont(_future_return, 0.01) AS _future_return_1pct,
-- 计算未来5日收益率的99%分位数。同样,all_quantile_cont函数用来计算99%的分位数。
c_quantile_cont(_future_return, 0.99) AS _future_return_99pct,
-- 对未来5日收益率进行截断处理,值位于1%和99%分位数之间的数据被保留,超过这个范围的值将被设为边界值。
clip(_future_return, _future_return_1pct, _future_return_99pct) AS _clipped_return,
-- 将离散化后的数据作为标签使用,这是我们预测的目标。
_clipped_return AS _label,
-- 标准化标签
normalize(_label) AS label,
-- 日期,这是每个股票每天的数据
date,
-- 股票代码,代表每一支股票
instrument
-- 从cn_stock_bar1d这个表中选择数据,这个表存储的是股票的日线数据
FROM cn_stock_bar1d
-- 标签值不为空,且非涨跌停(未来一天的最高价不等于最低价)
QUALIFY label is NOT NULL AND m_lead(high, 1) != m_lead(low, 1)
ORDER BY instrument,date
"""
# Same date window as the feature query so the two frames can be joined.
label = dai.query(sql, filters={'date': ['2019-01-03', '2021-12-31']}).df()
label
# Quick look at the label distribution.
label['label'].hist()
import pandas as pd

# Align the date dtype, then join features with labels on (date, instrument).
df['date'] = pd.to_datetime(df['date'])
data = pd.merge(df, label, on=['date', 'instrument'], how='inner')
data
# Use the first 60% of trading days as the training period (time-ordered split,
# no shuffling, to avoid look-ahead).
import numpy as np
date = data['date'].unique()
date.sort()
index = int(len(date)*0.6)
split = pd.to_datetime(date[index]).strftime('%Y-%m-%d')
train_mask = data['date'] < split
train_data = data[train_mask]
test_data = data[~train_mask]
# Materialize feature matrices and label vectors as plain numpy arrays.
xtrain = np.array(train_data.drop(['date', 'instrument', 'label'], axis=1))
ytrain = np.array(train_data['label'])
xtest = np.array(test_data.drop(['date', 'instrument', 'label'], axis=1))
ytest = np.array(test_data['label'])
梯度运算法则 $$\frac{\partial L_i}{\partial f(x_i)}=\frac{\partial}{\partial f(x_i)}\sum_{j=1}^K\log\{1+\exp\{-sgn(y_i-c_j)(f(x_i)-c_j)\}\}=\sum_{j=1}^K\frac{\partial}{\partial f(x_i)}\log\{1+\exp\{-sgn(y_i-c_j)(f(x_i)-c_j)\}\}$$ 二阶导同理
import numpy as np
import xgboost as xgb
def sgn(x):
    """Elementwise sign: 1 where x >= 0, else -1."""
    return np.where(x >= 0, 1, -1)

# Custom objective: sum of two threshold logistic losses at c = -0.09, +0.09,
# i.e. L_i = sum_j log(1 + exp(-sgn(y_i - c_j) * (f(x_i) - c_j))).
def logistic_obj(pred, dtrain):
    """Return (gradient, hessian) of the two-threshold logistic loss."""
    label = dtrain.get_label()
    # Threshold c = -0.09: sgn(y - c) = sgn(label + 0.09), f - c = pred + 0.09.
    prob_lo = 1 / (np.exp(-sgn(label + 0.09) * (pred + 0.09)) + 1)
    grad_lo = -sgn(label + 0.09) * (1 - prob_lo)
    hess_lo = prob_lo * (1 - prob_lo)
    # Threshold c = +0.09.
    prob_hi = 1 / (np.exp(-sgn(label - 0.09) * (pred - 0.09)) + 1)
    grad_hi = -sgn(label - 0.09) * (1 - prob_hi)
    hess_hi = prob_hi * (1 - prob_hi)
    # The gradient (hessian) of a sum is the sum of the gradients (hessians).
    return grad_lo + grad_hi, hess_lo + hess_hi
# Wrap the time-split train/test arrays as DMatrix for xgb.train.
dtrain = xgb.DMatrix(xtrain, ytrain)
dtest = xgb.DMatrix(xtest, ytest)
# Custom evaluation: report the actual two-threshold logistic loss (MSE is not
# meaningful for this objective).
def metric(pred, y):
    """
    Total loss sum_i sum_j log(1 + exp(-sgn(y_i - c_j) * (pred_i - c_j)))
    over thresholds c in {-0.09, 0.09}; `y` is the DMatrix whose labels are
    continuous returns.
    """
    y = np.reshape(y.get_label(), (-1, 1))
    pred = np.reshape(pred, (-1, 1))
    cut_point = np.array([[-0.09, 0.09]])
    # sgn(y - c) * (pred - c), broadcast over the two thresholds.
    temp = np.where(y - cut_point >= 0, 1, -1) * (pred - cut_point)
    # BUG FIX: the loss is log(1 + exp(-sgn(y - c) * (pred - c))); the original
    # dropped the minus sign and reported log(1 + exp(+temp)) instead.
    temp = np.log(1 + np.exp(-temp))
    out = np.sum(temp)
    return 'loss', out
# Train with the two-threshold objective. The banner previously read
# "focal loss", but the objective above is a multi-threshold logistic loss.
print("---------multi-threshold logistic loss-----------")
params = {'tree_method': 'hist'}
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=20, early_stopping_rounds=50,
                  evals=[(dtrain, 'train'), (dtest, 'test')], verbose_eval=1, obj=logistic_obj, feval=metric)
---------focal loss----------- [0] train-rmse:1.03401 test-rmse:1.03406 train-loss:2.23107e+06 test-loss:1.73982e+06 Multiple eval metrics have been passed: 'test-loss' will be used for early stopping. Will train until test-loss hasn't improved in 50 rounds. [1] train-rmse:1.00542 test-rmse:1.00562 train-loss:2.24204e+06 test-loss:1.74341e+06 [2] train-rmse:0.99992 test-rmse:1.00009 train-loss:2.26133e+06 test-loss:1.75467e+06 [3] train-rmse:1.00293 test-rmse:1.00307 train-loss:2.28079e+06 test-loss:1.76704e+06 [4] train-rmse:1.00846 test-rmse:1.00849 train-loss:2.29729e+06 test-loss:1.77776e+06 [5] train-rmse:1.01393 test-rmse:1.01398 train-loss:2.31063e+06 test-loss:1.78662e+06 [6] train-rmse:1.01856 test-rmse:1.01871 train-loss:2.32089e+06 test-loss:1.7935e+06 [7] train-rmse:1.02222 test-rmse:1.02234 train-loss:2.32852e+06 test-loss:1.79846e+06 [8] train-rmse:1.02496 test-rmse:1.02516 train-loss:2.33403e+06 test-loss:1.80209e+06 [9] train-rmse:1.02692 test-rmse:1.02711 train-loss:2.33822e+06 test-loss:1.8046e+06 [10] train-rmse:1.02832 test-rmse:1.02871 train-loss:2.34128e+06 test-loss:1.80657e+06 [11] train-rmse:1.02933 test-rmse:1.02982 train-loss:2.34368e+06 test-loss:1.80789e+06 [12] train-rmse:1.03001 test-rmse:1.03063 train-loss:2.34526e+06 test-loss:1.80883e+06 [13] train-rmse:1.03044 test-rmse:1.03121 train-loss:2.34662e+06 test-loss:1.80958e+06 [14] train-rmse:1.03076 test-rmse:1.03166 train-loss:2.34791e+06 test-loss:1.81019e+06 [15] train-rmse:1.031 test-rmse:1.03207 train-loss:2.34901e+06 test-loss:1.81071e+06 [16] train-rmse:1.03109 test-rmse:1.03233 train-loss:2.34971e+06 test-loss:1.81102e+06 [17] train-rmse:1.03114 test-rmse:1.03252 train-loss:2.35033e+06 test-loss:1.81129e+06 [18] train-rmse:1.03114 test-rmse:1.03266 train-loss:2.35104e+06 test-loss:1.8115e+06 [19] train-rmse:1.03113 test-rmse:1.03284 train-loss:2.3516e+06 test-loss:1.81172e+06
# Attach the model predictions to (date, instrument). Take an explicit copy so
# assigning the new column does not hit pandas' SettingWithCopyWarning on a
# view of test_data (the warning the original code emitted here).
pre_data = test_data[['date', 'instrument']].copy()
pre_data['pre_label'] = model.predict(dtest)
pre_data.sort_values('date')
from bigdatasource.api import DataSource
from bigdata.api.datareader import D
from biglearning.api import M
from biglearning.api import tools as T
from biglearning.module2.common.data import Outputs
import pandas as pd
import numpy as np
import math
import dai
import warnings
import datetime
from datetime import timedelta
from zipline.finance.commission import PerOrder
from zipline.api import get_open_orders
from zipline.api import symbol
from bigtrader.sdk import *
from bigtrader.utils.my_collections import NumPyDeque
from bigtrader.constant import OrderType
from bigtrader.constant import Direction
# Backtest preparation: universe = every instrument with predictions.
# BUG FIX: end_date previously contained a stray trailing space
# ('2021-12-30 '); the second backtest below uses the clean value.
instruments = {'market': 'CN_STOCK_A', 'instruments': list(pre_data.instrument.unique()), 'start_date': '2020-10-23', 'end_date': '2021-12-30'}
instruments = DataSource.write_pickle(instruments)
df = DataSource.write_df(pre_data)
# Trading engine: initialization hook, executed exactly once.
def m4_initialize_bigquant_run(context):
    """Load the prediction DataFrame from the module options into the context."""
    context.df = context.options['data'].read_df()
# Trading engine: called once before the market opens each trading day.
def m4_before_trading_start_bigquant_run(context, data):
    # Pre-market work (e.g. subscribing to quotes); intentionally a no-op here.
    pass
# Trading engine: tick handler, called once per tick (unused in daily mode).
def m4_handle_tick_bigquant_run(context, tick):
    pass
# Trading engine: bar handler, called once per bar (daily frequency here).
def m4_handle_data_bigquant_run(context, data):
    """Rebalance: hold up to 10 instruments with the highest positive pre_label."""
    dt = data.current_dt.strftime('%Y-%m-%d')
    # Today's predictions, best first; keep at most the top 10 positive ones.
    df = context.df[context.df['date'] == dt].sort_values('pre_label', ascending=False)
    instruments = list(df[df['pre_label'] > 0].instrument)[:10]
    # Current positions.
    holding = context.get_account_positions()
    holding_list = list(holding.keys())
    # Sell holdings no longer in the buy list. BUG FIX: iterate over a
    # snapshot — the original removed items from holding_list while iterating
    # it, which silently skips the element after every removal.
    # Also pass context.symbol(ins) to data.can_trade for consistency with the
    # second backtest version of this handler.
    for ins in list(holding_list):
        if ins not in instruments and data.can_trade(context.symbol(ins)):
            context.order_target(ins, 0)
            holding_list.remove(ins)
    # Buy listed instruments not currently held, capped at 10 positions.
    for ins in instruments:
        if ins not in holding_list and data.can_trade(context.symbol(ins)) and len(holding_list) < 10:
            context.order_target_percent(ins, 1/10)
            holding_list.append(ins)
# Trading engine: fill callback, invoked once per executed trade.
def m4_handle_trade_bigquant_run(context, trade):
    pass
# Trading engine: order-status callback, invoked on every order update.
def m4_handle_order_bigquant_run(context, order):
    pass
# Trading engine: post-market hook, invoked once after each session.
def m4_after_trading_bigquant_run(context, data):
    pass
# Run the daily backtest via the platform's hftrade module.
# NOTE(review): price_type/product_type are platform-defined Chinese enum
# strings ('真实价格' = real price, '股票' = stock); do not translate them.
m4 = M.hftrade.v2(
    instruments=instruments,
    options_data=df,
    start_date='',
    end_date='',
    initialize=m4_initialize_bigquant_run,
    before_trading_start=m4_before_trading_start_bigquant_run,
    handle_tick=m4_handle_tick_bigquant_run,
    handle_data=m4_handle_data_bigquant_run,
    handle_trade=m4_handle_trade_bigquant_run,
    handle_order=m4_handle_order_bigquant_run,
    after_trading=m4_after_trading_bigquant_run,
    capital_base=1000000,
    frequency='daily',
    price_type='真实价格',
    product_type='股票',
    before_start_days='0',
    volume_limit=1,
    order_price_field_buy='open',
    order_price_field_sell='open',
    benchmark='000300.SH',
    plot_charts=True,
    disable_cache=False,
    replay_bdb=False,
    show_debug_info=False,
    backtest_only=False
)
[2023-12-11 14:20:59.934439] INFO: moduleinvoker:747401460.py:53:hfbacktest.v1 开始运行..
[2023-12-11 14:20:59.942546] INFO hfbacktest: biglearning V1.5.5 [2023-12-11 14:20:59.944907] INFO hfbacktest: bigtrader v1.10.6 2023-12-02
[2023-12-11 14:20:59.997228] INFO: moduleinvoker:747401460.py:53:cached.v2 开始运行.. [2023-12-11 14:21:00.009734] INFO: moduleinvoker:747401460.py:53: 命中缓存 [2023-12-11 14:21:00.016468] INFO: moduleinvoker:747401460.py:53: cached.v2 运行完成[0.019216s]. [2023-12-11 14:21:00.183187] INFO: moduleinvoker:747401460.py:53: cached.v2 开始运行.. [2023-12-11 14:21:00.197029] INFO: moduleinvoker:747401460.py:53: 命中缓存 [2023-12-11 14:21:00.203060] INFO: moduleinvoker:747401460.py:53: cached.v2 运行完成[0.01991s].
[2023-12-11 14:21:58.457319] INFO hfbacktest: backtest done, raw_perf_ds:DataSource(660db94415a04213bc524da995a8160bT)
[2023-12-11 14:22:09.358658] INFO: bigcharts.impl.render:render.py:408:render_chart Data is None, skip loading it to chart.
[2023-12-11 14:22:18.078437] INFO: moduleinvoker:747401460.py:53:hfbacktest.v1 运行完成[78.143982s]. [2023-12-11 14:22:18.088879] INFO: moduleinvoker:747401460.py:53: hftrade.v2 运行完成[78.219792s].
import numpy as np
import xgboost as xgb
# Custom objective: logistic loss on the sign of the continuous label, i.e.
# the return is binarized to {-1, +1} (strictly positive -> +1) before
# applying L_i = log(1 + exp(-label_i * pred_i)).
def logistic_obj(pred, dtrain):
    """Return (gradient, hessian) of the sign-binarized logistic loss."""
    raw = dtrain.get_label()
    label = np.where(raw > 0, 1, -1)
    prob = 1 / (np.exp(-label * pred) + 1)
    grad = -label * (1 - prob)        # first derivative
    hessian = prob * (1 - prob)       # second derivative
    return grad, hessian
# Reuse the time-split arrays as DMatrix inputs for this variant.
dtrain = xgb.DMatrix(xtrain, ytrain)
dtest = xgb.DMatrix(xtest, ytest)
# Custom evaluation: sign-prediction accuracy (MSE is meaningless here).
def metric(pred, y):
    """Accuracy of hard {-1, +1} predictions against the binarized labels."""
    prob = 1 / (1 + np.exp(-pred))
    hard = np.where(prob > 0.5, 1, -1)
    truth = np.where(y.get_label() > 0, 1, -1)
    acc = np.sum(hard == truth) / len(truth)
    return 'acc', acc
# Train with the sign-binarized objective. The banner previously read
# "focal loss", but the objective implemented above is plain logistic loss.
print("---------logistic loss-----------")
params = {'tree_method': 'hist'}
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=5, early_stopping_rounds=50,
                  evals=[(dtrain, 'train'), (dtest, 'test')], verbose_eval=1, obj=logistic_obj, feval=metric)
---------focal loss----------- [0] train-rmse:1.0334 test-rmse:1.0336 train-acc:0.434363 test-acc:0.444787 Multiple eval metrics have been passed: 'test-acc' will be used for early stopping. Will train until test-acc hasn't improved in 50 rounds. [1] train-rmse:1.00507 test-rmse:1.00541 train-acc:0.435889 test-acc:0.445859 [2] train-rmse:0.999928 test-rmse:1.00017 train-acc:0.529641 test-acc:0.514848 [3] train-rmse:1.00348 test-rmse:1.00352 train-acc:0.564054 test-acc:0.550274 [4] train-rmse:1.00942 test-rmse:1.00938 train-acc:0.567367 test-acc:0.554932
# Out-of-sample probability of a positive next-day return.
prob = 1 / (1 + np.exp(-model.predict(dtest)))
# Directional accuracy against the sign of the realized return.
np.sum(np.where(prob > 0.5, 1, -1) == np.where(ytest > 0, 1, -1)) / (len(ytest))
# Take an explicit copy so adding 'pre_label' does not trigger pandas'
# SettingWithCopyWarning on a slice of test_data (same issue as the first
# backtest's prediction frame).
pre_data_ = test_data[['date', 'instrument']].copy()
pre_data_['pre_label'] = prob
pre_data_.sort_values('date')
from bigdatasource.api import DataSource
from bigdata.api.datareader import D
from biglearning.api import M
from biglearning.api import tools as T
from biglearning.module2.common.data import Outputs
import pandas as pd
import numpy as np
import math
import dai
import warnings
import datetime
from datetime import timedelta
from zipline.finance.commission import PerOrder
from zipline.api import get_open_orders
from zipline.api import symbol
from bigtrader.sdk import *
from bigtrader.utils.my_collections import NumPyDeque
from bigtrader.constant import OrderType
from bigtrader.constant import Direction
# Backtest preparation: universe = every instrument with predictions from the
# sign-binarized model.
instruments = {'market': 'CN_STOCK_A', 'instruments': list(pre_data_.instrument.unique()), 'start_date': '2020-10-23', 'end_date': '2021-12-30'}
instruments = DataSource.write_pickle(instruments)
df = DataSource.write_df(pre_data_)
# Trading engine: initialization hook, executed exactly once.
def m4_initialize_bigquant_run(context):
    """Load the prediction DataFrame from the module options into the context."""
    context.df = context.options['data'].read_df()
# Trading engine: called once before the market opens each trading day.
def m4_before_trading_start_bigquant_run(context, data):
    # Pre-market work (e.g. subscribing to quotes); intentionally a no-op here.
    pass
# Trading engine: tick handler, called once per tick (unused in daily mode).
def m4_handle_tick_bigquant_run(context, tick):
    pass
# Trading engine: bar handler, called once per bar (daily frequency here).
def m4_handle_data_bigquant_run(context, data):
    """Rebalance: hold up to 10 instruments with the highest positive pre_label."""
    dt = data.current_dt.strftime('%Y-%m-%d')
    # Today's predictions, best first; keep at most the top 10 positive ones.
    df = context.df[context.df['date'] == dt].sort_values('pre_label', ascending=False)
    instruments = list(df[df['pre_label'] > 0].instrument)[:10]
    # Current positions.
    holding = context.get_account_positions()
    holding_list = list(holding.keys())
    # Sell holdings no longer in the buy list. BUG FIX: iterate over a
    # snapshot — the original removed items from holding_list while iterating
    # it, which silently skips the element after every removal.
    for ins in list(holding_list):
        if ins not in instruments and data.can_trade(context.symbol(ins)):
            context.order_target(ins, 0)
            holding_list.remove(ins)
    # Buy listed instruments not currently held, capped at 10 positions.
    for ins in instruments:
        if ins not in holding_list and data.can_trade(context.symbol(ins)) and len(holding_list) < 10:
            context.order_target_percent(ins, 1/10)
            holding_list.append(ins)
# Trading engine: fill callback, invoked once per executed trade.
def m4_handle_trade_bigquant_run(context, trade):
    pass
# Trading engine: order-status callback, invoked on every order update.
def m4_handle_order_bigquant_run(context, order):
    pass
# Trading engine: post-market hook, invoked once after each session.
def m4_after_trading_bigquant_run(context, data):
    pass
# Run the daily backtest for the sign-binarized model via hftrade.
# NOTE(review): price_type/product_type are platform-defined Chinese enum
# strings ('真实价格' = real price, '股票' = stock); do not translate them.
m4 = M.hftrade.v2(
    instruments=instruments,
    options_data=df,
    start_date='',
    end_date='',
    initialize=m4_initialize_bigquant_run,
    before_trading_start=m4_before_trading_start_bigquant_run,
    handle_tick=m4_handle_tick_bigquant_run,
    handle_data=m4_handle_data_bigquant_run,
    handle_trade=m4_handle_trade_bigquant_run,
    handle_order=m4_handle_order_bigquant_run,
    after_trading=m4_after_trading_bigquant_run,
    capital_base=1000000,
    frequency='daily',
    price_type='真实价格',
    product_type='股票',
    before_start_days='0',
    volume_limit=1,
    order_price_field_buy='open',
    order_price_field_sell='open',
    benchmark='000300.SH',
    plot_charts=True,
    disable_cache=False,
    replay_bdb=False,
    show_debug_info=False,
    backtest_only=False
)
[2023-12-11 14:39:11.441843] INFO: moduleinvoker:2539210785.py:53:hfbacktest.v1 开始运行..
[2023-12-11 14:39:11.450694] INFO hfbacktest: biglearning V1.5.5
INFO:hfbacktest:biglearning V1.5.5
[2023-12-11 14:39:11.457805] INFO hfbacktest: bigtrader v1.10.6 2023-12-02
INFO:hfbacktest:bigtrader v1.10.6 2023-12-02 [2023-12-11 14:39:11.959239] INFO: moduleinvoker:2539210785.py:53:cached.v2 开始运行.. [2023-12-11 14:39:11.969991] INFO: moduleinvoker:2539210785.py:53: 命中缓存 [2023-12-11 14:39:11.975233] INFO: moduleinvoker:2539210785.py:53: cached.v2 运行完成[0.01601s]. [2023-12-11 14:39:12.154145] INFO: moduleinvoker:2539210785.py:53: cached.v2 开始运行.. [2023-12-11 14:39:17.655767] INFO: moduleinvoker:2539210785.py:53: cached.v2 运行完成[5.501605s].
[2023-12-11 14:40:26.647977] INFO hfbacktest: backtest done, raw_perf_ds:DataSource(69c3b8e1fcde4e7fb909a9b8edc25cd5T)
INFO:hfbacktest:backtest done, raw_perf_ds:DataSource(69c3b8e1fcde4e7fb909a9b8edc25cd5T) [2023-12-11 14:40:34.524302] INFO: bigcharts.impl.render:render.py:408:render_chart Data is None, skip loading it to chart.
[2023-12-11 14:40:43.753260] INFO: moduleinvoker:2539210785.py:53:hfbacktest.v1 运行完成[92.311427s]. [2023-12-11 14:40:43.757860] INFO: moduleinvoker:2539210785.py:53: hftrade.v2 运行完成[92.381875s].