import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import as_strided
import bottleneck as bn
class Functions:
""" 设定在因子挖掘中会用到的所有函数
要添加相关函数实现可继续在此类中以静态方法的方式添加具体实现('_'下划线开头的方法会被忽略,不会加入到遗传算法原语集中)
注意一定要加上对应的输入输出类型注解,否则 deap 包无法正常 compile 进行因子值的计算
"""
@staticmethod
def _rolling_window(data, window_size):
shape = (data.shape[0] - window_size + 1, window_size) + data.shape[1:]
strides = (data.strides[0],) + data.strides
return as_strided(data, shape=shape, strides=strides)
@staticmethod
def prod(x : np.ndarray, n : int) -> np.ndarray:
res = np.full(x.shape, np.nan)
rolling_data = Functions._rolling_window(x, window_size=n)
rolling_res = np.prod(rolling_data, axis=1)
res[n - 1:] = rolling_res
return res
@staticmethod
def rank(x: np.ndarray) -> np.ndarray:
res = bn.nanrankdata(x, axis=1)
return res
@staticmethod
def max(x: np.ndarray, y: np.ndarray) -> np.ndarray:
res = np.full(x.shape, np.nan)
bool_ = x >= y
res[bool_] = x[bool_]
res[~bool_] = y[~bool_]
return res
@staticmethod
def min(x: np.ndarray, y: np.ndarray) -> np.ndarray:
res = np.full(x.shape, np.nan)
bool_ = x >= y
res[~bool_] = x[~bool_]
res[bool_] = y[bool_]
return res
@staticmethod
def delay(x: np.ndarray, n: int) -> np.ndarray:
res = np.full(x.shape, np.nan)
res[n:] = x[:-n]
return res
@staticmethod
def ts_std(x: np.ndarray, n: int) -> np.ndarray:
res = bn.move_std(x, n, min_count=max(1, n // 2), axis=0)
return res
@staticmethod
def corr(x: np.ndarray,y: np.ndarray, n: int) -> np.ndarray:
min_count = max(1, n // 2)
c = x * y
d_count = np.ones(c.shape)
d_count[np.isnan(c)] = np.nan
d_count = bn.move_sum(d_count, window=n, min_count=1, axis=0)
ab_sum = bn.move_sum((c), window=n, min_count=min_count, axis=0)
a_sum = bn.move_sum((x), window=n, min_count=min_count, axis=0)
b_sum = bn.move_sum((y), window=n, min_count=min_count, axis=0)
aa_sum = bn.move_sum((x * x), window=n, min_count=min_count, axis=0)
bb_sum = bn.move_sum((y * y), window=n, min_count=min_count, axis=0)
res = (ab_sum * d_count - a_sum * b_sum) / (np.sqrt(d_count * aa_sum - a_sum ** 2) * np.sqrt(d_count * bb_sum - b_sum ** 2))
return res
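    # Sanity-check note (illustrative, not part of the primitive set): on a full
    # window with no NaNs the moving-sum formula above reduces to the ordinary
    # Pearson correlation, so it can be cross-checked against
    # pd.DataFrame(x).rolling(n).corr(pd.DataFrame(y)).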
@staticmethod
def delta(x: np.ndarray, n: int) -> np.ndarray:
res = x - Functions.delay(x, n)
return res
@staticmethod
def add(x: np.ndarray, y: np.ndarray) -> np.ndarray:
res = x + y
return res
@staticmethod
def sub(x: np.ndarray, y: np.ndarray) -> np.ndarray:
res = x - y
return res
@staticmethod
def mul(x: np.ndarray, y: np.ndarray) -> np.ndarray:
res = x * y
return res
@staticmethod
def div(x: np.ndarray, y: np.ndarray) -> np.ndarray:
res = x / y
res[np.isinf(res)] = np.nan
return res
@staticmethod
def log(x: np.ndarray) -> np.ndarray:
        res = np.log(x - min(0, np.nanmin(x)) + 1e-20)  # shift so the minimum is non-negative before taking the log
return res
@staticmethod
def abs(x: np.ndarray) -> np.ndarray:
res = np.abs(x)
return res
@staticmethod
def neg(x: np.ndarray) -> np.ndarray:
res = x * -1
return res
@staticmethod
def sign(x: np.ndarray) -> np.ndarray:
res = np.sign(x)
return res
@staticmethod
    # arctangent
def arctan(x : np.ndarray) -> np.ndarray:
res = np.arctan(x)
return res
@staticmethod
    # utility: expose a 2-D matrix as rolling 3-D windows
def rolling_to_3d(mat, window, chunk_num=1):
s0, s1 = mat.strides
r, c = mat.shape
max_chunk_num = int(np.floor(r/window))
chunk_num = min(max_chunk_num, chunk_num)
def rolling(m):
shape0 = m.shape[0]-window+1
if shape0 <= 0:
shape0 = m.shape[0]
return as_strided(m, shape=(shape0, window, c), strides=(s0, s0, s1), writeable=False)
if chunk_num == 1:
yield rolling(mat)
else:
chunk_size = r // chunk_num
first_chunk = mat[:chunk_size]
yield rolling(first_chunk)
chunks = as_strided(
mat[chunk_size-window+1:],
shape=(chunk_num-1, chunk_size+window-1, c),
strides=(s0*chunk_size, s0, s1),
writeable=False
)
for sub_mat in chunks:
yield rolling(sub_mat)
left_rows = r % chunk_num
if left_rows > 0:
sub_mat = mat[-(left_rows+window-1):]
yield rolling(sub_mat)
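    # Usage note (illustrative): rolling_to_3d is a generator of read-only strided
    # views, so callers iterate over it chunk by chunk, e.g.
    #   for block in Functions.rolling_to_3d(mat, window=20, chunk_num=4):
    #       ...  # block has shape (rows_in_chunk, 20, n_cols)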
@staticmethod
    def decay_linear(x: np.ndarray, n: int) -> np.ndarray:
        # NOTE: despite the name, this is implemented as a plain moving mean,
        # not a linearly weighted moving average.
        res = bn.move_mean(x, window=n, min_count=max(1, n // 2), axis=0)
        return res
@staticmethod
def cov(x: np.ndarray, y: np.ndarray, n: int) -> np.ndarray:
res = np.full(x.shape, np.nan)
min_count = max(1, n // 2)
a, b = x, y
c = a * b
d_count = np.ones(c.shape)
d_count[np.isnan(c)] = np.nan
d_count = bn.move_sum(d_count, window=n, min_count=1, axis=0)
ab_sum = bn.move_sum((c), window=n, min_count=min_count, axis=0)
a_sum = bn.move_sum(a, window=n, min_count=min_count, axis=0)
b_sum = bn.move_sum(b, window=n, min_count=min_count, axis=0)
res = (ab_sum * d_count - a_sum * b_sum) / ((d_count-1) * d_count)
return res
@staticmethod
def ts_sum(x: np.ndarray, n: int) -> np.ndarray:
res = bn.move_sum(x, window=n, min_count=max(1, n // 2), axis=0)
return res
@staticmethod
def ts_mean(x: np.ndarray, n: int) -> np.ndarray:
res = bn.move_mean(x, window=n, min_count=max(1, n // 2), axis=0)
return res
@staticmethod
def ts_rank(x: np.ndarray, n: int) -> np.ndarray:
res = bn.move_rank(x, window=n, min_count=max(1, n // 2), axis=0)
return res
@staticmethod
def ts_min(x: np.ndarray, n: int) -> np.ndarray:
res = bn.move_min(x, window=n, min_count=max(1, n // 2), axis=0)
return res
@staticmethod
def ts_max(x: np.ndarray, n: int) -> np.ndarray:
res = bn.move_max(x, window=n, min_count=max(1, n // 2), axis=0)
return res
@staticmethod
def mean2(x: np.ndarray, y: np.ndarray) -> np.ndarray:
res = (x + y) / 2
return res
@staticmethod
def mean3(x: np.ndarray, y: np.ndarray, z: np.ndarray) -> np.ndarray:
res = (x + y + z) / 3
return res
@staticmethod
def argmax(x: np.ndarray, n: int) -> np.ndarray:
res = bn.move_argmax(x, window=n, min_count=max(1, n // 2), axis=0)
return res
@staticmethod
def argmin(x: np.ndarray, n: int) -> np.ndarray:
res = bn.move_argmin(x, window=n, min_count=max(1, n // 2), axis=0)
return res
@staticmethod
def power(x: np.ndarray, y: np.ndarray) -> np.ndarray:
y_a_min, y_a_max = bn.nanmin(y, axis=1), bn.nanmax(y, axis=1)
z = ((y.T - y_a_min) / (y_a_max - y_a_min)).T
x_a_min, x_a_max = bn.nanmin(x, axis=1), bn.nanmax(x, axis=1)
x = ((x.T - x_a_min) / (x_a_max - x_a_min)).T + 1
res = x ** z
return res
@staticmethod
def constant(type_int : int) -> int:
return type_int
@staticmethod
def standardation(x: np.ndarray) -> np.ndarray:
"""标准化算子"""
mean = bn.nanmean(x, axis=1).reshape(-1, 1)
std = bn.nanstd(x, axis=1, ddof=1).reshape(-1, 1)
with np.errstate(invalid='ignore'):
res = (x - mean) / std
return res
@staticmethod
def normalization(x: np.ndarray) -> np.ndarray:
"""归一化算子"""
x_min = bn.nanmin(x, axis=1).reshape(-1, 1)
x_max = bn.nanmax(x, axis=1).reshape(-1 ,1)
with np.errstate(invalid='ignore'):
res = (x - x_min) / (x_max - x_min)
return res
# @staticmethod
    # # percentage change over the past n days
# def pctchange_ts(n : int, x : np.ndarray) -> np.ndarray:
# res = Functions.delta(n, x) / x
# return res
@staticmethod
    # rolling regression slope (beta) of y on x over the past n days
def ts_regbeta(n : int, x : np.ndarray, y : np.ndarray) -> np.ndarray:
c = x * y
d_count = np.ones(c.shape)
d_count[np.isnan(c)] = np.nan
d_count = bn.move_sum(d_count, window=n, min_count=1, axis=0)
ab_sum = bn.move_sum((c), window=n, min_count=None, axis=0)
a_sum = bn.move_sum((x), window=n, min_count=None, axis=0)
b_sum = bn.move_sum((y), window=n, min_count=None, axis=0)
aa_sum = bn.move_sum((x * x), window=n, min_count=None, axis=0)
beta = (ab_sum * d_count - a_sum * b_sum) / \
(d_count * aa_sum - (a_sum) ** 2)
beta[np.isinf(beta)] = np.nan
return beta
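# Illustrative sketch: a new operator joins the GP primitive set simply by being
# added as an annotated static method of Functions (names starting with '_' are
# skipped, and the type annotations are what the typed pset relies on below).
# The hypothetical ts_median below follows that convention; it is deliberately
# not attached to the class here, so the mining run further down is unchanged.
def example_ts_median(x: np.ndarray, n: int) -> np.ndarray:
    """Rolling median of the past n rows (same calling convention as ts_mean)."""
    return bn.move_median(x, window=n, min_count=max(1, n // 2), axis=0)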
import random
import time
import numpy as np
import pandas as pd
import inspect
import multiprocessing
import operator
from deap import base, creator, gp, tools
import empyrical as em
from biglearning.api import tools as T
from bigdatasource.api import DataSource
import bottleneck as bn
from numpy.lib.stride_tricks import as_strided
import warnings
warnings.filterwarnings('ignore')  # suppress warnings in the log
# from biglearning.api.func import Functions
from bigtrader.sdk import *
print('start run ')
start run
class DataProcessor:
"""数据加载及处理类"""
def __init__(self, config):
self.bar1d_data_cols = ['close', 'high', 'low', 'open' , 'amount','volume']
from bigdata.api.datareader import D
self.ins = D.instruments(start_date= config.start_date, end_date= config.end_date)
        # data start date
        self.start_date = config.start_date
        # data end date
        self.end_date = config.end_date
        # share of the full sample used for training (vs. test)
        self.train_test_data_ratio = config.train_test_data_ratio
        self.train_validate_data_ratio = config.train_validate_data_ratio
self.config = config
        # base factor data stored as key/value pairs
        self.data = {}
        # index and columns of the data, kept for later alignment
        self.data_index = None
        self.data_cols = None
        # raw return data (DataFrame)
        self.raw_ret = pd.DataFrame()
        self._ret = None
        # load the data
        self._load_data()
        # split train / validation / test within the period according to the configured ratios
        self._split_train_and_test(self.raw_ret.shape[0], self.train_test_data_ratio, self.train_validate_data_ratio)
def _split_train_and_test(self, length, train_test_ratio, train_validate_ratio):
full_list = list(range(length))
offset_train_test = int(length * train_test_ratio)
offset_train_validate = int(offset_train_test * train_validate_ratio)
        if length == 0 or offset_train_test < 1:
            self.train_series, self.val_series = [], []
            self.test_series = full_list
            self.full_series = full_list
            return
self.train_series = full_list[:offset_train_validate]
self.val_series = full_list[offset_train_validate:offset_train_test]
self.test_series = full_list[offset_train_test:]
self.full_series = full_list
@property
def ret_values(self):
self._ret = self.raw_ret.values.astype(np.float64)
return self._ret
@property
def train_ret(self):
return self.ret_values[self.train_series]
@property
def validate_ret(self):
return self.ret_values[self.val_series]
@property
def test_ret(self):
return self.ret_values[self.test_series]
@property
def not_nan_num(self):
return pd.DataFrame(self.ret_values).count(axis=1).values
def _load_data(self):
print(f'loading data from {self.start_date} to {self.end_date}...')
        # read the base factor data
        # table_name = 'bar5m_CN_FUTURE'  # 5-minute bars
table_name = 'bar1d_CN_STOCK_A'
bar_data = DataSource(table_name).read(instruments=self.ins, start_date=self.start_date, end_date=self.end_date, fields=self.bar1d_data_cols)
bar_data['return'] = bar_data.groupby('instrument')['close'].apply(lambda x: x.pct_change().shift(-1).fillna(0))
bar_data_pivot_table = bar_data.pivot_table(index='date', columns='instrument')
raw_ret = bar_data_pivot_table['return']
self.raw_ret = raw_ret.fillna(method="ffill", axis=0)
        # if data_index / data_cols are not set yet, take them from the return data;
        # all subsequent data is reindexed to this index and these columns
        if self.data_index is None or self.data_cols is None:
self.data_index = self.raw_ret.index
self.data_cols = self.raw_ret.columns
        # add the base factor data to self.data
for col in self.bar1d_data_cols:
self.data[col] = bar_data_pivot_table[col]
self.data[col] = self.data[col].reindex(index=self.data_index, columns=self.data_cols)
def outlier_limit(self, data, n_extremum=5):
"""对传入数据进行去极值"""
median = bn.nanmedian(data, axis=1).reshape(-1, 1)
Dmad = bn.nanmedian(abs(data - median), axis=1).reshape(-1, 1)
upper = (median + n_extremum * Dmad)
lower = (median - n_extremum * Dmad)
with np.errstate(invalid='ignore'):
res = np.clip(data, lower, upper)
return res
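# Illustrative sketch of the MAD clipping performed by outlier_limit above, on a
# tiny hand-made cross-section (toy values, not the loaded market data):
_demo_row = np.array([[1.0, 2.0, 3.0, 100.0]])
_demo_med = bn.nanmedian(_demo_row, axis=1).reshape(-1, 1)                      # 2.5
_demo_mad = bn.nanmedian(np.abs(_demo_row - _demo_med), axis=1).reshape(-1, 1)  # 1.0
print(np.clip(_demo_row, _demo_med - 5 * _demo_mad, _demo_med + 5 * _demo_mad))
# -> [[1.  2.  3.  7.5]]  the outlier 100.0 is clipped to median + 5 * MAD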
class Fitnesses(object):
"""适应度函数类"""
_methods = ["sharpe_ratio"]
def __init__(self, config):
        # selected fitness function
        self.fitness = config.fitness
        self.window = 21  # look-back window (in days) for the time-series signal
        if self.fitness not in self._methods:
            raise Exception("please specify a valid fitness function type")
def _nan_drop(self, x, y):
"""删除缺失值的处理方法"""
merged = np.vstack((x, y)).T
merged = merged[~np.isnan(merged).any(1)].T
if merged.size == 0:
return None
return merged
    def _nan_fill(self, arr):
        """Forward-fill NaNs along the time axis (per instrument column)."""
        arr = arr.T
        mask = np.isnan(arr)
        idx = np.where(~mask, np.arange(mask.shape[1]), 0)
        np.maximum.accumulate(idx, axis=1, out=idx)
        return arr[np.arange(idx.shape[0])[:, None], idx].T
def _calculate_daily_return(self, factor, ret):
        ## fitness helper: turn factor values into daily strategy returns
def rolling_window(a, window):
shape = a.shape[:-1]+(a.shape[-1]-window+1, window)
strides = a.strides+(a.strides[-1],)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
        # Rolling breakout positions: an instrument goes long on the day its factor makes
        # a new high over the trailing `window` days, short on a new low, and is flat
        # otherwise; weights are scaled so their absolute values sum to one, and a 1bp
        # cost is charged on each unit of turnover.
roll_max = np.array([np.nanmax(rolling_window(factor[:,k], self.window),axis=1) for k in range(factor.shape[1])]).T
roll_min = np.array([np.nanmin(rolling_window(factor[:,k], self.window),axis=1) for k in range(factor.shape[1])]).T
condition = np.array([(factor[self.window:,k]>=roll_max[1:,k]).astype(int)+ (factor[self.window:,k]<=roll_min[1:,k]).astype(int)*(-1) for k in range(factor.shape[1])]).T
position = np.array([np.insert(condition[:,k],0,[0]*(self.window)) for k in range(factor.shape[1])]).T
position = np.array([k/(abs(k).sum()) if abs(k).sum()>0 else k for k in position])
daily_ret = position * ret - 0.0001*np.array([np.insert(abs(np.diff(position[:,k])),0,[0]) for k in range(position.shape[1])]).T
        daily_ret[np.isnan(daily_ret)] = 0.0
counts = (np.array([abs(np.diff(position[:,k],1)).sum() for k in range(position.shape[1])])).sum() / position.shape[1]/2
return np.nansum(daily_ret, axis=1), counts
def calculate_longshort_index(self, factor, longshort, eval_type):
"""计算多空收益、多头收益的sharpe、总收益、波动率指标"""
if eval_type == 'train':
data = data_dp
ret_values = data.train_ret
elif eval_type == "val":
data = data_dp
ret_values = data.validate_ret
elif eval_type == 'test':
data = data_dp
ret_values = data.test_ret
returns, counts = self._calculate_daily_return(factor, ret_values)
nav = np.cumsum(returns) + 1
        nav_cummax = np.maximum.accumulate(nav)
        # # trade-count / max-drawdown requirements (disabled)
# if returns.size == 0 or counts < returns.size/(6*240/5) or counts > 10*returns.size/(6*240/5) or max(1-nav/nav_cummax) >0.2:
# return np.nan
returns = returns[~np.isnan(returns)]
if returns.size == 0:
return np.nan
return em.sharpe_ratio(returns, 0.035/252)
def fitness_choose(self, factor, longshort, eval_type):
fit = self.calculate_longshort_index(factor=factor, longshort=longshort, eval_type=eval_type)
return fit
def evaluate_factor(self, individual, eval_type):
"""计算传入个体的因子值"""
if eval_type == 'train': # 训练集
data = data_dp
index_series = data.train_series
ret_values = data.train_ret
elif eval_type == "val": # 验证集
data = data_dp
index_series = data.val_series
ret_values = data.validate_ret
elif eval_type == 'test': # 测试集
data = data_dp
index_series = data.test_series
ret_values = data.test_ret
        # compile the individual into a callable expression
        func = toolbox.compile(expr=individual)
        # read the parameter names of the compiled function
        func_names = list(inspect.signature(func).parameters.keys())
        # build the matching keyword arguments from the loaded base data;
        # remember to cast the data to np.float64 before passing it in
        param = {i: data.data[i].values.astype(np.float64) for i in func_names}
        # unpack the keyword arguments into func to compute the expression's factor values
        factor = func(**param)[index_series]
        # np.inf mostly comes from float-type issues: numpy's default conversion yields
        # float32, so the data is cast to float64 above and remaining inf is treated as NaN
        factor[np.isinf(factor)] = np.nan
        if (factor != factor).sum() == (factor.shape[0] * factor.shape[1]):
            return np.nan, None
        # winsorize only when the factor takes many distinct values (i.e. is not discrete-valued)
        if len(np.unique(factor[~np.isnan(factor)])) >= 10000:
            factor = data.outlier_limit(factor)
        return factor
return factor
def evaluate_fitness(self, individual, longshort, eval_type):
"""计算传入个体的因子值并计算 IR 值"""
factor = self.evaluate_factor(individual=individual, eval_type=eval_type)
if isinstance(factor, float):
if factor != factor:
return factor, individual
if type(factor) == tuple:
return np.nan, individual
factor = self._nan_fill(factor)
fit = self.fitness_choose(factor, longshort, eval_type=eval_type)
return fit, individual
def compare_fitness(self, Threshold, fit):
"""比较阈值和适应度值"""
if self.fitness.endswith("vol"): # 波动率要反着来
if Threshold > fit:
return True
elif Threshold < fit:
return True
else:
return False
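# Standalone illustration (toy data) of the forward-fill index trick used in
# Fitnesses._nan_fill: within each column, a NaN is replaced by the latest
# preceding valid value, while leading NaNs are left untouched.
_demo_arr = np.array([[1.0, np.nan],
                      [np.nan, 2.0],
                      [3.0, np.nan]]).T
_demo_mask = np.isnan(_demo_arr)
_demo_idx = np.where(~_demo_mask, np.arange(_demo_mask.shape[1]), 0)
np.maximum.accumulate(_demo_idx, axis=1, out=_demo_idx)
print(_demo_arr[np.arange(_demo_idx.shape[0])[:, None], _demo_idx].T)
# -> [[ 1. nan]
#     [ 1.  2.]
#     [ 3.  2.]]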
def cross_mutation_handle(population):
"""对传入的种群进行交叉、子树变异、提升变异以及点变异等操作"""
offspring = [toolbox.clone(ind) for ind in population]
    # crossover
for i in range(1, len(offspring), 2):
if random.random() < config.cross_prob:
offspring[i - 1], offspring[i] = toolbox.mate(offspring[i - 1], offspring[i])
del offspring[i - 1].fitness.values, offspring[i].fitness.values
    # subtree mutation
for i in range(len(offspring)):
if random.random() < config.mutation_prob:
offspring[i], = toolbox.mutate(offspring[i])
del offspring[i].fitness.values
    # shrink mutation
for i in range(len(offspring)):
if random.random() < config.boost_mutation_prob:
offspring[i], = toolbox.mutate_shrink(offspring[i])
del offspring[i].fitness.values
    # point mutation
for i in range(len(offspring)):
if random.random() < config.point_mutation_prob:
offspring[i], = toolbox.mutate_NodeReplacement(offspring[i])
del offspring[i].fitness.values
return offspring
def drop_duplicates(individuals):
"""移除生成表达式相同的个体"""
ind_dict = {}
for ind in individuals:
expr = str(ind)
        if expr in ind_dict:
continue
ind_dict[expr] = ind
return list(ind_dict.values())
class Config:
    # date range of the (short-window) data load
    start_date = '2021-01-01'
    end_date = '2021-10-01'
    return_field = 'close'
    train_test_data_ratio = 3/4
    train_validate_data_ratio = 3/4
    # initial population size
    init_ind_num = 200
    # number of generations
    num_gen = 1
    # fitness threshold on the training set
    train_fitness = 1
    # fitness threshold on the validation set
    val_fitness = 0.8
    # fitness threshold on the test set
    test_fitness = 0.55
    fitness = 'sharpe_ratio'
    # crossover probability
    cross_prob = 0.8
    # subtree mutation probability
    mutation_prob = 0.6
    # shrink mutation probability
    boost_mutation_prob = 0.6
    # point mutation probability
    point_mutation_prob = 0.6
    # constant terminals
    constant_ranges = list(range(1, 11))
# Initialize the runtime instances and define the GA structure
# set the random seed
random_seed_num = 44  # should:49
random.seed(random_seed_num)
# config instance; all parameters of the GA run are controlled by it
config = Config()
data_dp = DataProcessor(config)
# fitness evaluator
fitness = Fitnesses(config)
# create the individual type
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax)
# Build the primitive set: register the operator functions, the constant terminals
# and the base factor inputs into the pset object
pset = gp.PrimitiveSetTyped("MAIN", (np.ndarray,) * len(list(data_dp.data.keys())), np.ndarray)
funcs = inspect.getmembers(Functions)
for name, func in funcs:
if not name.startswith('_'):
bool_ = True
sig = inspect.signature(func)
params=sig.parameters
pa = []
for param in list(params.keys()):
if params[param].annotation == inspect._empty:
bool_ = False
pa.append(params[param].annotation)
if bool_:
# print(pa, sig.return_annotation, name, func)
pset.addPrimitive(func, pa, sig.return_annotation)
for i in config.constant_ranges:
pset.addTerminal(i, int, str(i))
args_dict = {f'ARG{index}': key_name for index, key_name in enumerate(list(data_dp.data.keys()))}
pset.renameArguments(**args_dict)
loading data from 2021-01-01 to 2021-10-01...
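# Quick sanity check of the primitive set (illustrative; the expression below is
# hand-written rather than mined by the GA): parse an expression string against
# pset and compile it into a callable over the renamed base-factor arguments,
# which is the same path evaluate_factor takes via toolbox.compile.
sample_expr = gp.PrimitiveTree.from_string("sub(ts_mean(close, 5), delay(close, 5))", pset)
sample_func = gp.compile(sample_expr, pset)
print(str(sample_expr), inspect.signature(sample_func))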
# Build the toolbox
# register the GA helper methods on the toolbox so they can be called directly later
toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=5)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mate", gp.cxOnePoint) # 交叉
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset) #变异
toolbox.register("mutate_shrink", gp.mutShrink) # 提升变异
toolbox.register("mutate_NodeReplacement", gp.mutNodeReplacement, pset=pset) # 点变异
toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=10))
toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=10))
toolbox.decorate("mutate_shrink", gp.staticLimit(key=operator.attrgetter("height"), max_value=10))
toolbox.decorate("mutate_NodeReplacement", gp.staticLimit(key=operator.attrgetter("height"), max_value=10))
toolbox.register("evaluate_train", fitness.evaluate_fitness, longshort=True, eval_type='train')
toolbox.register("evaluate_val", fitness.evaluate_fitness, longshort=True, eval_type='val')
toolbox.register("evaluate_test", fitness.evaluate_fitness, longshort=True, eval_type='test')
# Run the defined genetic algorithm to mine factors
# store the factor expressions that pass all checks
saved_factor_exprs = {}
# initialize the population
population = toolbox.population(n=config.init_ind_num)
# declare the statistics to track
stats = tools.Statistics(key=lambda ind: ind.fitness.values)
stats.register("avg", bn.nanmean)
stats.register("std", bn.nanstd)
stats.register("min", bn.nanmin)
stats.register("max", bn.nanmax)
# logbook; used together with the statistics object to conveniently print the recorded metrics
logbook = tools.Logbook()
logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])
for gen in range(1, config.num_gen + 1):
random.seed(random_seed_num)
    # hall of fame: keeps up to the given number of the fittest individuals, sorted by fitness according to the objective
pass_hall_of_fame = tools.HallOfFame(config.init_ind_num)
final_hall_of_fame = tools.HallOfFame(int(config.init_ind_num * 0.1))
invalid_ind = [ind for ind in population if not ind.fitness.valid]
    # de-duplicate identical expressions before evaluation to avoid redundant computation
invalid_ind = drop_duplicates(invalid_ind)
    print('==============================Generation {0}: mining factors=============================='.format(gen))
fitnesses_train_lst = []
for i in invalid_ind:
fitnesses_train = toolbox.evaluate_train(i)
fitnesses_train_lst.append(fitnesses_train)
pass1check_population = []
record_population = []
    print('start for loop')
    # filter individuals by the configured train/validation fitness thresholds and collect them in pass1check_population
    for fit, ind in fitnesses_train_lst:
        ind.fitness.values = (fit,)
        record_population.append(ind)
        print(f"factor {ind}: train-set fitness {fit}")
        if fitness.compare_fitness(config.train_fitness, fit):
            # only evaluate the validation set for individuals that passed the training check
            val_fitness = toolbox.evaluate_val(ind)[0]
            if fitness.compare_fitness(config.val_fitness, val_fitness):
                print(f"factor {ind} passed the first check: train-set fitness {fit}, validation-set fitness {val_fitness}")
                pass1check_population.append(ind)
    print(f'{len(pass1check_population)} expressions passed the train/validation checks: {[str(pop) for pop in pass1check_population]}')
    print('-- finished evaluating expression fitness on the training data --')
pass_hall_of_fame.update(pass1check_population)
fitnesses_test_lst = []
for j in pass1check_population:
fitnesses_test = toolbox.evaluate_test(j)
fitnesses_test_lst.append(fitnesses_test)
pass2check_population = []
pass2check_population_dict = {}
    # filter individuals by the configured test fitness threshold and collect the survivors
for fit, ind in fitnesses_test_lst:
ind.fitness.values = (fit,)
if fitness.compare_fitness(config.test_fitness, fit):
print(f"通过第二层检查的因子 {ind} 在测试集适应度值为{fit}")
pass2check_population.append(ind)
pass2check_population_dict[str(ind)] = (ind,fit)
    # record the individuals of this generation that passed both checks
saved_factor_exprs[gen] = pass2check_population_dict
    print(f'{len(pass2check_population)} expressions passed the test-data check: {[str(pop) for pop in pass2check_population]}')
    print('-- finished evaluating expression fitness on the test data --')
# Update the final_hall_of_fame with the generated individuals
final_hall_of_fame.update(pass2check_population)
    # replace the original random individuals with those that passed the checks
population = pass2check_population
    # if no individual survived, fall back to this generation's parent population for crossover and mutation
if not population:
population = record_population
print(f"pass:{len(pass2check_population)}, record:{len(record_population)}, population: {len(pass2check_population)}")
record = stats.compile(population) if stats else {}
logbook.record(gen=gen, nevals=len(invalid_ind), **record)
    print(f'per-generation statistics: {repr(logbook)}')
    print('-- starting the next generation --')
offspring = toolbox.select(population, config.init_ind_num)
offspring = cross_mutation_handle(offspring)
population[:] = offspring
"""绘制每代的统计指标折线图"""
df = pd.DataFrame(logbook)
df.set_index("gen", inplace=True)
df['avg'].plot(title='每代适应度统计指标')
print(f'=====================因子挖掘结束=======================')
==============================Generation 1: mining factors==============================
start for loop
[... per-expression train-set fitness printouts for the 199 unique expressions omitted; the 14 expressions that also passed the validation check are listed below ...]
factor standardation(ts_regbeta(constant(4), sign(amount), arctan(low))) passed the first check: train-set fitness 7.943467839572381, validation-set fitness 1.3632407818276155
factor argmin(low, 9) passed the first check: train-set fitness 1.3143307612923878, validation-set fitness 1.9904242815294244
factor prod(normalization(high), 1) passed the first check: train-set fitness 1.1347742462038337, validation-set fitness 5.581777905473629
factor log(close) passed the first check: train-set fitness 1.9416553658541473, validation-set fitness 6.033746183451963
factor add(close, ts_regbeta(constant(constant(2)), argmax(neg(volume), constant(8)), normalization(open))) passed the first check: train-set fitness 1.48964779831689, validation-set fitness 4.8921394148687
factor min(min(high, close), neg(volume)) passed the first check: train-set fitness 2.2460589740293178, validation-set fitness 2.521669917073375
factor abs(close) passed the first check: train-set fitness 1.615526519480938, validation-set fitness 5.238416867218753
factor corr(close, amount, 4) passed the first check: train-set fitness 2.2339527188184345, validation-set fitness 2.663737831331714
factor min(argmin(close, 3), sign(open)) passed the first check: train-set fitness 1.0770365545430634, validation-set fitness 4.27452683186898
factor normalization(high) passed the first check: train-set fitness 1.1347742462038337, validation-set fitness 5.581777905473629
factor prod(log(abs(delta(corr(amount, low, 7), constant(8)))), constant(constant(constant(constant(4))))) passed the first check: train-set fitness 1.8898050220206184, validation-set fitness 1.3310236211885684
factor arctan(close) passed the first check: train-set fitness 1.6168291715440435, validation-set fitness 6.036044091532433
factor neg(argmin(amount, 4)) passed the first check: train-set fitness 1.4011895105260896, validation-set fitness 3.591314612432694
factor standardation(min(close, rank(argmin(power(low, close), constant(9))))) passed the first check: train-set fitness 1.663498960806158, validation-set fitness 5.592606647068409
14 expressions passed the train/validation checks: ['standardation(ts_regbeta(constant(4), sign(amount), arctan(low)))', 'argmin(low, 9)', 'prod(normalization(high), 1)', 'log(close)', 'add(close, ts_regbeta(constant(constant(2)), argmax(neg(volume), constant(8)), normalization(open)))', 'min(min(high, close), neg(volume))', 'abs(close)', 'corr(close, amount, 4)', 'min(argmin(close, 3), sign(open))', 'normalization(high)', 'prod(log(abs(delta(corr(amount, low, 7), constant(8)))), constant(constant(constant(constant(4)))))', 'arctan(close)', 'neg(argmin(amount, 4))', 'standardation(min(close, rank(argmin(power(low, close), constant(9)))))']
-- finished evaluating expression fitness on the training data --
factor standardation(ts_regbeta(constant(4), sign(amount), arctan(low))) passed the second check: test-set fitness 3.1131828208453345
factor argmin(low, 9) passed the second check: test-set fitness 0.5735898327715538
factor prod(normalization(high), 1) passed the second check: test-set fitness 2.411480967169883
factor log(close) passed the second check: test-set fitness 1.3535637652949488
factor add(close, ts_regbeta(constant(constant(2)), argmax(neg(volume), constant(8)), normalization(open))) passed the second check: test-set fitness 0.623058947301701
factor abs(close) passed the second check: test-set fitness 1.2737237062279403
factor normalization(high) passed the second check: test-set fitness 2.411480967169883
factor arctan(close) passed the second check: test-set fitness 1.393816186971015
factor standardation(min(close, rank(argmin(power(low, close), constant(9))))) passed the second check: test-set fitness 6.443255129418495
9 expressions passed the test-data check: ['standardation(ts_regbeta(constant(4), sign(amount), arctan(low)))', 'argmin(low, 9)', 'prod(normalization(high), 1)', 'log(close)', 'add(close, ts_regbeta(constant(constant(2)), argmax(neg(volume), constant(8)), normalization(open)))', 'abs(close)', 'normalization(high)', 'arctan(close)', 'standardation(min(close, rank(argmin(power(low, close), constant(9)))))']
-- finished evaluating expression fitness on the test data --
pass:9, record:199, population: 9
per-generation statistics: [{'gen': 1, 'nevals': 199, 'avg': 2.177461369241195, 'std': 1.7088111844089575, 'min': 0.5735898327715538, 'max': 6.443255129418495}]
-- starting the next generation --
=====================factor mining finished=======================
fitness_record_df =pd.DataFrame()
for j in range(1,config.num_gen+1):
if len(saved_factor_exprs[j]) ==0:
continue
fit = [saved_factor_exprs[j][i][1] for i in saved_factor_exprs[j].keys()]
max_f = np.max(fit)
min_f = np.min(fit)
mean_f = np.mean(fit)
num_f = len(fit)
exprs = saved_factor_exprs[j]
exprs_dict = {i:exprs[i][1] for i in exprs.keys()}
sorted_dict = dict(sorted(exprs_dict.items(), key=lambda x: x[1], reverse=True))
top_expr = list(sorted_dict.keys())[0]
    tmp = pd.DataFrame({'generation': [j], 'mean': [mean_f], 'num_factors': [num_f], 'max': [max_f], 'min': [min_f], 'best_factor': top_expr})
fitness_record_df = fitness_record_df.append(tmp)
fitness_record_df.index = range(len(fitness_record_df))
print('Mining results:', fitness_record_df)
Mining results:    generation      mean  num_factors       max      min                                        best_factor
0                           1  2.177461            9  6.443255  0.57359  standardation(min(close, rank(argmin(power(low...
fitness_record_df
def stock_backtest(_ind, eval_type):
factor_array_test_set = fitness.evaluate_factor(_ind, eval_type)
factor_df_test_set = pd.DataFrame(factor_array_test_set , columns=data_dp.data_cols, index=data_dp.data_index[data_dp.test_series])
factor_df_stack = factor_df_test_set.stack().reset_index().rename(columns={0:'factor'})
st = factor_df_stack.date.min().strftime('%Y-%m-%d')
et = factor_df_stack.date.max().strftime('%Y-%m-%d')
from biglearning.api import M
def m1_initialize_bigquant_run(context):
msg = "initialize:"
context.PRINT = 1
context.write_log(msg, stdout=context.PRINT)
context.all_data = context.options["data"].read()
        context.rebalance_period = 1  # rebalancing period (in trading days)
        context.mono = True  # factor direction: True = long names with small factor values (short large); False = the opposite
        # number of stocks to hold
        context.target_hold_count = 100
        # target weight per stock
        context.target_percent_per_instrument = 1.0 / context.target_hold_count
def m1_before_trading_start_bigquant_run(context, data):
pass
    # trading engine: tick handler, called once per tick
def m1_handle_tick_bigquant_run(context, tick):
pass
def m1_handle_data_bigquant_run(context, data):
context.today = data.trading_day_dt.strftime('%Y-%m-%d')
context.today_data = context.all_data[context.all_data.date==context.today]
if context.trading_day_index % context.rebalance_period == 0:
r1 = context.today_data
r1.sort_values(by='factor', ascending=context.mono, inplace=True)
            # context.ins_to_long = r1.instrument[:context.num_trades].tolist()  # long the top-ranked names after sorting
            # take the top-ranked names
            # target holdings for the day
            target_hold_instruments = set(r1.instrument[:context.target_hold_count].tolist())
            # currently held stocks
            current_hold_instruments = set(context.get_account_positions().keys())
            # sell holdings that are no longer in the target list
            for instrument in current_hold_instruments - target_hold_instruments:
                context.order_target_percent(instrument, 0)
            # buy targets that are not currently held
            for instrument in target_hold_instruments - current_hold_instruments:
                context.order_target_percent(instrument, context.target_percent_per_instrument)
    # trading engine: post-market handler, runs once per day after the close
def m1_after_trading_bigquant_run(context, data):
pass
m6 = M.instruments.v2(
start_date=T.live_run_param('trading_date', st),
end_date=T.live_run_param('trading_date', et),
market='CN_STOCK_A',
instrument_list='',
max_count=0)
m1 = M.hftrade.v2(
instruments=m6.data,
options_data=DataSource.write_df(factor_df_stack),
start_date='',
end_date='',
initialize=m1_initialize_bigquant_run,
before_trading_start=m1_before_trading_start_bigquant_run,
handle_data=m1_handle_data_bigquant_run,
after_trading=m1_after_trading_bigquant_run,
        capital_base=10000000 + np.random.randint(0, 100),  # base capital with a small random offset
frequency='daily',
price_type='真实价格',
product_type='股票',
before_start_days='0',
volume_limit=1,
order_price_field_buy='open',
order_price_field_sell='open',
plot_charts=True,
disable_cache=True,
replay_bdb=False,
show_debug_info=False,
backtest_only=False,
m_cached=False
)
return m1
fine_inds = fitness_record_df['best_factor'].tolist()  # all best factors
fine_inds = list(set(fine_inds))  # de-duplicate
tail_ind = fine_inds[-1]  # the last factor
def get_factor_data(expr):
    factor_array_test_set = fitness.evaluate_factor(gp.PrimitiveTree.from_string(expr, pset), eval_type='test')
factor_df_test_set = pd.DataFrame(factor_array_test_set , columns=data_dp.data_cols, index=data_dp.data_index[data_dp.test_series])
factor_df = factor_df_test_set.stack().reset_index().rename(columns={0:'factor'})
return factor_df
base_factor_df = get_factor_data(tail_ind)  # factor data of the last factor
cnt = 0
for expr_ind in fine_inds:
cnt += 1
cur_factor_df = get_factor_data(expr_ind)
corr = cur_factor_df['factor'].corr(base_factor_df['factor'])
    if corr < 0.3 or expr_ind == tail_ind:  # only backtest factors whose correlation with the base factor is below 0.3
        print('Current backtest factor:', expr_ind)
_ind = gp.PrimitiveTree.from_string(expr_ind, pset)
from biglearning.api import M
stock_backtest(_ind , 'test')
Current backtest factor: standardation(min(close, rank(argmin(power(low, close), constant(9)))))
[2023-12-14 11:06:42.817038] INFO: moduleinvoker:2848179974.py:60:stock_backtest instruments.v2 开始运行.. [2023-12-14 11:06:42.828218] INFO: moduleinvoker:2848179974.py:60:stock_backtest 命中缓存 [2023-12-14 11:06:42.834245] INFO: moduleinvoker:2848179974.py:60:stock_backtest instruments.v2 运行完成[0.017224s]. [2023-12-14 11:06:43.050223] INFO: moduleinvoker:2848179974.py:68:stock_backtest hfbacktest.v1 开始运行..
[2023-12-14 11:06:43.056065] INFO hfbacktest: biglearning V1.5.6
[2023-12-14 11:06:43.062846] INFO hfbacktest: bigtrader v1.10.6 2023-12-11
2023-12-14 11:06:49.878285 strategy(bkt000,): initialize: [2023-12-14 11:06:58.372687] INFO hfbacktest: backtest done, raw_perf_ds:DataSource(ac039848fba44eb7a8211863651a9bceT)
[2023-12-14 11:06:59.915515] INFO: bigcharts.impl.render:render.py:408:render_chart Data is None, skip loading it to chart.
[2023-12-14 11:07:00.844632] INFO: moduleinvoker:2848179974.py:68:stock_backtest hfbacktest.v1 运行完成[17.794343s]. [2023-12-14 11:07:00.850134] INFO: moduleinvoker:2848179974.py:68:stock_backtest hftrade.v2 运行完成[17.840806s].