Q&A

In AIStudio, changing the parameters of the platform-provided xgboost has no effect at all on the backtest results. Why?

Created by freestyle996, last edited by small_q · 63 views


In AIStudio, I'm backtesting with the platform-provided xgboost module. I've changed the parameters countless times, but the backtest's equity curve is always exactly the same, with no change at all. The xgboost cache parameter is also turned off: m_cached=False.


[Screenshot: parameter settings of the platform-provided xgboost module]


[Screenshot: no matter which xgboost parameters are changed, the backtest result stays exactly as shown, identical down to the last decimal place]
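A baseline worth keeping in mind: with plain xgboost, outside any platform wrapper, a parameter change of this size virtually always moves the predictions. Below is a minimal sketch on random data (plain xgboost, not the platform's M.xgboost module; all data and values here are illustrative) that can be run in any notebook to confirm that baseline behavior:

```python
import numpy as np
import xgboost as xgb

rng = np.random.RandomState(2022)
X = rng.randn(500, 10)
y = (rng.rand(500) > 0.5).astype(int)
dtrain = xgb.DMatrix(X, label=y)

def train_predict(max_depth, eta):
    params = {'objective': 'binary:logistic', 'max_depth': max_depth,
              'eta': eta, 'seed': 2022}
    booster = xgb.train(params, dtrain, num_boost_round=10)
    return booster.predict(dtrain)

p_shallow = train_predict(max_depth=2, eta=0.1)
p_deep = train_predict(max_depth=6, eta=0.5)
# a parameter change this large should visibly move the predictions;
# if the platform backtest stays byte-identical, the new parameters are
# probably never reaching the training step
print('max abs prediction diff:', np.abs(p_shallow - p_deep).max())
```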

Tags

Backtest results · Backtest · XGBoost · AIStudio · xgboost
Comments
  • Try changing the strategy dates.
  • Tried that; it didn't help.
  • Share the full code of your packages.
  • Pasting the imported code, split by file, as follows:

**1. from v2_4_model import xgb**

```python
# coding: utf-8
# Project: BQ
# File: v2_features
# IDE: PyCharm
# Description:
import pandas as pd
import numpy as np
import math
from bigdatasource.api import DataSource
from biglearning.api import M
from biglearning.api import tools as T
from biglearning.module2.common.data import Outputs
from zipline.finance.commission import PerOrder
import datetime
import talib

from models.decisiontree import decisiontree
from models.xgb import xgb


def model():
    m10 = decisiontree
    return m10
```

**2. from models.xgb import xgb**

```python
import pandas as pd
import numpy as np
import xgboost as xgb
# from typing import Tuple
from sklearn.model_selection import train_test_split
from biglearning.api import M

# xgboost parameters
params = {
    'tree_method': 'hist',
    'max_depth': 3,  # 1
    'eta': 0.5,  # 0.1
    'silent': 1,
    'verbosity': 2,
    'eval_metric': 'auc',
    "objective": "binary:logistic",
    'seed': 2022,
    'disable_default_eval_metric': 1,
    'scale_pos_weight': 1,  # class imbalance: negatives / positives
    # regularization parameters
    'lambda': 1,
    # 'alpha ': 0
}
num_boost_round = 10  # 151

# focal loss parameters
gamma = 0  # difficulty emphasis; the focal loss gamma, in [0, 5]
alpha = params['scale_pos_weight'] / (params['scale_pos_weight'] + 1)  # imbalance weight: how many times a label-1 sample's loss counts vs a label-0 sample's

pd.set_option('display.max_columns', None)  # make pandas show all columns
pd.set_option('display.max_rows', None)     # show all rows

# TODO set manually
feature = None


# evaluation function
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # return a pair metric_name, result
    # since preds are margin(before logistic transformation, cutoff at 0)
    p = preds
    y = labels
    loss = alpha * (-y * np.log(p) * (1 - p) ** gamma) - (1 - alpha) * (1 - y) * np.log(1 - p) * p ** gamma
    return 'error', float(sum(loss)) / len(labels)


# loss functions
def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the gradient squared log error.'''
    y = dtrain.get_label()
    return (np.log1p(predt) - np.log1p(y)) / (predt + 1)


def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the hessian for squared log error.'''
    y = dtrain.get_label()
    return ((-np.log1p(predt) + np.log1p(y) + 1) / np.power(predt + 1, 2))


def squared_log(predt: np.ndarray, dtrain: xgb.DMatrix):
    # -> Tuple[np.ndarray, np.ndarray]:
    '''Squared Log Error objective. A simplified version for RMSLE used as
    objective function.
    '''
    predt[predt < -1] = -1 + 1e-6
    grad = gradient(predt, dtrain)
    hess = hessian(predt, dtrain)
    return grad, hess


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def focal_loss(predt: np.ndarray, dtrain: xgb.DMatrix):
    # STEP 1: get the labels
    label = dtrain.get_label()
    y = label
    # STEP 2: for binary classification, pass the raw prediction through a
    # sigmoid to get a value in (0, 1); for regression, skip the sigmoid
    sigmoid_pred = sigmoid(predt)
    p = sigmoid_pred
    # regression:
    # pred = predt
    # STEP 3: first- and second-order derivatives
    # 1. alpha in (0, 1): for binary classification, similar in spirit to a
    #    positive-class sample_weight; weight by class ratio, e.g. 5 positives
    #    vs 95 negatives suggests alpha = 0.95, and alpha = 0.5 turns it off
    # 2. gamma in [0, +inf): how strongly sample difficulty is emphasized;
    #    gamma = 0 turns it off, larger gamma focuses more on hard samples;
    #    values in (0.5, 10.0) are suggested
    # gamma = 5  # the focal loss gamma, in [0, 5]
    # alpha = 0.9  # imbalance weight: loss multiple of label-1 vs label-0 samples
    grad = p * (1 - p) * (alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p) - alpha * y * (
        1 - p) ** gamma / p - gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(1 - p) / p + p ** gamma * (
        1 - alpha) * (1 - y) / (1 - p))
    hess = p * (1 - p) * (p * (1 - p) * (
        -alpha * gamma ** 2 * y * (1 - p) ** gamma * np.log(p) / (1 - p) ** 2 + alpha * gamma * y * (
        1 - p) ** gamma * np.log(p) / (1 - p) ** 2 + 2 * alpha * gamma * y * (1 - p) ** gamma / (
        p * (1 - p)) + alpha * y * (1 - p) ** gamma / p ** 2 - gamma ** 2 * p ** gamma * (
        1 - alpha) * (1 - y) * np.log(1 - p) / p ** 2 + 2 * gamma * p ** gamma * (1 - alpha) * (
        1 - y) / (p * (1 - p)) + gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(
        1 - p) / p ** 2 + p ** gamma * (1 - alpha) * (1 - y) / (1 - p) ** 2) - p * (
        alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p) - alpha * y * (
        1 - p) ** gamma / p - gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(
        1 - p) / p + p ** gamma * (1 - alpha) * (1 - y) / (1 - p)) + (1 - p) * (
        alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p) - alpha * y * (
        1 - p) ** gamma / p - gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(
        1 - p) / p + p ** gamma * (1 - alpha) * (1 - y) / (1 - p)))
    return grad, hess


# Python entry function; input_1/2/3 are the three input ports, data_1/2/3 the three output ports
def m19_run_bigquant_run(input_1, input_2, input_3):
    # read the data and features
    print(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] [data] reading data")
    train_data = input_1.read()
    feature = input_2.read()
    # feature = list(reversed(feature))
    test_data = input_3.read()
    print(f'[model] feature_model = {feature}')

    # split training and validation sets
    print(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] [model] 1. splitting train/validation sets")
    train, val = train_test_split(train_data[feature + ['label', 'date']], shuffle=False, test_size=0.3)

    # build the xgboost data structures
    print(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] [model] 2. preparing training set")
    dtrain = xgb.DMatrix(train[feature], train["label"])
    dtrain.set_group(list(train.groupby('date').apply(len)))
    dtrain.feature_names = feature
    print(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] [model] 3. preparing validation set")
    dval = xgb.DMatrix(val[feature], val["label"])
    dval.set_group(list(val.groupby('date').apply(len)))
    dval.feature_names = feature
    print(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] [model] 4. preparing test set")
    dtest = xgb.DMatrix(test_data[feature], label=None)
    dtest.set_group(list(test_data.groupby('date').apply(len)))
    dtest.feature_names = feature

    # training and validation data to watch
    watchlist = [(dval, 'eval'), (dtrain, 'train')]

    # train
    print(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] [model] 5. training started")
    print(f"params = {params}")
    print(f'num_boost_round = {num_boost_round}')
    model = xgb.train(params=params,
                      dtrain=dtrain,
                      evals=watchlist,
                      num_boost_round=num_boost_round,
                      # obj=focal_loss,  # squared_log,
                      feval=evalerror,
                      verbose_eval=1)
    print(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] [model] 6. training finished")

    print(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] [model] 7. feature importance")
    features_importance_df = pd.DataFrame(model.get_score(importance_type='weight').items(),
                                          columns=['feature', 'importance']
                                          ).sort_values('importance', ascending=False).reset_index()
    print(features_importance_df)
    print(f"selected_features = ")
    for f in features_importance_df['feature'].to_list():
        print(f"'{f}',")

    # get the predictions
    pred = model.predict(dtest)
    test_data['prediction'] = pred
    # test_data['prediction'] *= test_data['risk_coef']
    data = test_data[['date', 'instrument', 'prediction']].groupby('date').apply(
        lambda x: x.sort_values('prediction', ascending=False)).reset_index(drop=True)

    # print the predictions in a form that is easy to import into TDX
    instruments = data[['date', 'instrument', 'prediction']].groupby('date').apply(
        lambda x: x.iloc[0]['instrument'].split('.')[0] if len(x) > 0 else None)
    print('==========')
    for i in instruments:
        print(i)

    return Outputs(model=None, predictions=DataSource.write_df(data), data_3=DataSource.write_df(test_data))


# post-run function, optional. Its input is the main function's output; you can
# post-process the data here or return a friendlier outputs format. This
# function's output is not cached.
def m19_post_run_bigquant_run(outputs):
    return outputs


### TODO
def xgb(m4, custom=False):
    print(f'params = {params} \n num_boost_round = {num_boost_round}')
    if custom:
        xgb_custom = M.cached.v3(
            input_1=m4.data_1,
            input_2=m4.data_2,
            input_3=m4.data_3,
            run=m19_run_bigquant_run,
            post_run=m19_post_run_bigquant_run,
            input_ports='',
            params='{}',
            output_ports=''
        )
        return xgb_custom
    else:
        xgb_m = M.xgboost.v1(
            training_ds=m4.data_1,
            features=m4.data_2 if feature is None else feature,
            predict_ds=m4.data_3,
            num_boost_round=num_boost_round,
            objective='二分类(逻辑回归)',  # binary classification (logistic regression)
            booster='gbtree',
            max_depth=params['max_depth'],
            key_cols='date,instrument',
            group_col='date',
            nthread=1,
            n_gpus=-1,
            m_cached=False,
            other_train_parameters=params,
            # other_train_parameters={
            #     'tree_method': 'hist',
            #     # 'max_depth': 2,
            #     'eta': 0.4,
            #     # 'silent': 1,
            #     # 'verbosity': 2,
            #     # 'eval_metric': 'auc',
            #     # "objective": "binary:logistic",
            #     # 'seed': 2022,
            #     'disable_default_eval_metric': 1,
            #     'scale_pos_weight': 40,  # class imbalance: negatives / positives
            #     # regularization parameters
            #     # 'lambda': 0.5,
            #     # 'alpha ': 0.3
            # }
            # {'tree_method': 'approx'}  # You can add other parameters as well
        )
        return xgb_m
```

(For a finite-difference sanity check of the focal_loss gradients above, see the sketch at the end of this thread.)

**3. from v2_6_trade import trade**

```python
# coding: utf-8
# Project: BQ
# File: v2_features
# IDE: PyCharm
# Description:
import pandas as pd
import numpy as np
import math
from bigdatasource.api import DataSource
from biglearning.api import M
from biglearning.api import tools as T
from biglearning.module2.common.data import Outputs
from zipline.finance.commission import PerOrder
import datetime
import talib


# backtest engine: initialization function, runs only once
def m19_initialize_bigquant_run(context):
    # load the prediction data
    context.ranker_prediction = context.options['data'].read_df()

    # default commission and slippage are preset; override the commission like this
    context.set_commission(PerOrder(buy_cost=0.0003, sell_cost=0.0013, min_cost=5))
    # the prediction data is passed in via options and loaded into memory (DataFrame) with read_df
    # number of stocks to buy: the top stock_count names of the predicted ranking
    stock_count = 1
    # per-stock weights; the commented scheme gives higher-ranked stocks more
    # capital, e.g. [0.339160, 0.213986, 0.169580, ...]
    # context.stock_weights = T.norm([1 / math.log(i + 2) for i in range(0, stock_count)])
    context.stock_weights = [1 / stock_count for i in range(0, stock_count)]
    # maximum fraction of capital per stock
    context.max_cash_per_instrument = 1
    context.options['hold_days'] = 1


# backtest engine: daily data handler, runs once per day
def m19_handle_data_bigquant_run(context, data):
    # filter today's predictions by date
    ranker_prediction = context.ranker_prediction[
        context.ranker_prediction.date == data.current_dt.strftime('%Y-%m-%d')]
    dt = data.current_dt + datetime.timedelta(days=1)
    score_split = 0.001

    # 1. capital allocation
    # the average holding time is hold_days; we buy every day, expecting to use
    # 1/hold_days of the capital each day. In practice there is some buying
    # error, so spend evenly during the first hold_days days; afterwards, try
    # to use the remaining cash (capped here at 1.5x the even share)
    is_staging = context.trading_day_index < context.options['hold_days']  # still in the build-up phase (first hold_days days)?
    cash_avg = context.portfolio.portfolio_value / context.options['hold_days']
    cash_for_buy = min(context.portfolio.cash, (1 if is_staging else 1.5) * cash_avg)
    cash_for_sell = cash_avg - (context.portfolio.cash - cash_for_buy)
    positions = {e.symbol: p.amount * p.last_sale_price
                 for e, p in context.portfolio.positions.items()}

    # 2. sell orders: start selling only after hold_days; for held stocks, drop
    # the worst-ranked names according to the model's predicted ranking
    if not is_staging and cash_for_sell > 0:
        equities = {e.symbol: e for e, p in context.portfolio.positions.items()}
        instruments = list(reversed(list(ranker_prediction.instrument[ranker_prediction.instrument.apply(
            lambda x: x in equities)])))
        for instrument in instruments:
            context.order_target(context.symbol(instrument), 0)
            cash_for_sell -= positions[instrument]
            if cash_for_sell <= 0:
                break

    # 3. buy orders: buy the top stock_count stocks of the model's predicted ranking
    buy_cash_weights = context.stock_weights
    buy_instruments = list(ranker_prediction.instrument[:len(buy_cash_weights)])
    max_cash_per_instrument = context.portfolio.portfolio_value * context.max_cash_per_instrument
    for i, instrument in enumerate(buy_instruments):
        # skip stocks whose score is too low
        ins_score = ranker_prediction[ranker_prediction["instrument"] == instrument]["prediction"].iloc[0]
        if ins_score < score_split:
            # print(f'skipping {instrument}')
            continue
        # if dt >= (datetime.datetime.now() - datetime.timedelta(days=5)):
        print(f'[buy signal] [{dt}] {instrument}')
        cash = cash_for_buy * buy_cash_weights[i]
        if cash > max_cash_per_instrument - positions.get(instrument, 0):
            # make sure the position never exceeds the per-stock capital cap
            cash = max_cash_per_instrument - positions.get(instrument, 0)
        if cash > 0:
            context.order_value(context.symbol(instrument), cash)


# backtest engine: data preparation, runs only once
def m19_prepare_bigquant_run(context):
    pass


# backtest
def trade(m9, m10):
    print('backtesting...')
    m19 = M.trade.v4(
        instruments=m9.data,
        options_data=m10.predictions,
        start_date='',
        end_date='',
        initialize=m19_initialize_bigquant_run,
        handle_data=m19_handle_data_bigquant_run,
        prepare=m19_prepare_bigquant_run,
        volume_limit=0.025,
        order_price_field_buy='open',
        order_price_field_sell='close',
        capital_base=100000,
        auto_cancel_non_tradable_orders=True,
        data_frequency='daily',
        price_type='真实价格',  # real price
        product_type='股票',  # stock
        plot_charts=True,
        backtest_only=False,
        benchmark='000300.HIX',
        m_cached=False,
    )
    print('backtesting finished')
    return m19
```

The supplementary code is above; take a look and leave a comment if anything else needs to be added. Also, I recently discussed this problem with quite a few peers, and one of them had run into it too, back before AIStudio existed. He was using LightGBM at the time, and no matter how he tuned the parameters, the backtest curve simply would not change; in the end he gave up and never used the platform again.
  • Even after disabling all the custom Python modules, it still doesn't work. The code is shown below; the key parts are marked with the red and green boxes, and the data is read from the dai data platform. (See the prediction-diff sketch after this comment.)

![screenshot 1](/wiki/api/attachments.redirect?id=4544bd5d-114e-4cdf-aea8-68df23551872) ![screenshot 2](/wiki/api/attachments.redirect?id=ba247b45-db6d-4ec2-b13a-5582296c071d)
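One way to narrow this down, building on the code shared above: run the pipeline twice with clearly different parameters and diff the raw predictions before they ever reach the trade module. A minimal sketch, assuming `m10_a` and `m10_b` (hypothetical names) are the outputs of two such runs of the `xgb()` wrapper above:

```python
import pandas as pd

# m10_a / m10_b: hypothetical outputs of two runs of the xgb() wrapper above,
# trained with clearly different params (e.g. max_depth=2 vs max_depth=8)
pred_a = m10_a.predictions.read_df()
pred_b = m10_b.predictions.read_df()

merged = pred_a.merge(pred_b, on=['date', 'instrument'], suffixes=('_a', '_b'))
diff = (merged['prediction_a'] - merged['prediction_b']).abs()
print('rows compared :', len(merged))
print('max abs diff  :', diff.max())
print('identical rows:', (diff == 0).mean())
```

If the predictions are byte-identical, the parameters never reach training (for example, a cached module output is being reused somewhere upstream). If the predictions differ but the equity curve does not, the trade layer is the place to look: with stock_count=1 plus a score_split threshold, the backtest only reacts when the top-ranked stock itself changes, so small ranking shifts can leave the curve untouched.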
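A cheap way to validate hand-derived gradients like the ones in the `focal_loss` shared earlier is a finite-difference check against the scalar loss. A minimal, self-contained sketch in plain numpy (the alpha and gamma values are illustrative, and the scalar loss is restated from the definition used in `evalerror`/`focal_loss` above):

```python
import numpy as np

np.random.seed(0)
alpha, gamma = 0.5, 2.0  # illustrative; the thread uses gamma=0 and alpha from scale_pos_weight

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def loss_value(x, y):
    """Scalar focal loss as a function of the raw margin x (before sigmoid)."""
    p = sigmoid(x)
    return (-alpha * y * (1 - p) ** gamma * np.log(p)
            - (1 - alpha) * (1 - y) * p ** gamma * np.log(1 - p))

def analytic_grad(x, y):
    # the grad expression from focal_loss above, with predt = x
    p = sigmoid(x)
    return p * (1 - p) * (
        alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p)
        - alpha * y * (1 - p) ** gamma / p
        - gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(1 - p) / p
        + p ** gamma * (1 - alpha) * (1 - y) / (1 - p))

x = np.random.randn(5)                       # raw margins
y = (np.random.rand(5) > 0.5).astype(float)  # binary labels
eps = 1e-6
numeric_grad = (loss_value(x + eps, y) - loss_value(x - eps, y)) / (2 * eps)
print('analytic:', analytic_grad(x, y))
print('numeric :', numeric_grad)
print('max abs diff:', np.abs(analytic_grad(x, y) - numeric_grad).max())
```

If the two columns agree to within a few times eps, the gradient is implemented consistently with the loss; the same two-sided trick applied to `analytic_grad` itself would check the hessian.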