import numpy as np
import pandas as pd
np.random.seed(7)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from gplearn.genetic import SymbolicTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import warnings
warnings.filterwarnings('ignore')
def data_prepare():
    # NOTE: load_boston was removed in scikit-learn 1.2, so this code
    # requires scikit-learn < 1.2 (see the alternative loader below)
    boston = load_boston()
    boston_feature = pd.DataFrame(boston.data, columns=boston.feature_names)
    boston_label = pd.Series(boston.target).to_frame("TARGET")
    boston = pd.concat([boston_label, boston_feature], axis=1)
    return boston
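# On scikit-learn >= 1.2 load_boston no longer exists. A minimal sketch of an
# equivalent loader, following the raw-data recipe from scikit-learn's own
# deprecation notice (CMU URL, two physical rows per sample):
def data_prepare_from_source():
    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
    # Each sample is spread across two physical rows in the raw file
    features = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]
    names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
             'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
    boston_feature = pd.DataFrame(features, columns=names)
    boston_label = pd.Series(target).to_frame("TARGET")
    return pd.concat([boston_label, boston_feature], axis=1)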
data = data_prepare()
print(data.shape)
# Inspect the distribution of the existing features
def plot_dist(df, feature, pic_name='dist_plot.png'):
    fcols = 2
    frows = len(feature) + 1
    print(fcols, frows)
    plt.figure(figsize=(5 * fcols, 4 * frows))
    i = 0
    for col in feature:
        # Left panel: feature vs. target scatter
        i += 1
        ax = plt.subplot(frows, fcols, i)
        plt.scatter(df[col], df['TARGET'])
        plt.xlabel(col)
        plt.ylabel('price')
        # Right panel: feature distribution with a fitted normal curve
        i += 1
        ax = plt.subplot(frows, fcols, i)
        # distplot was removed in seaborn 0.14; a close replacement on newer
        # versions is sns.histplot(df[col].dropna(), kde=True, stat='density')
        sns.distplot(df[col].dropna(), fit=stats.norm)
        plt.xlabel(col)
    plt.tight_layout()
plot_dist(data, data.columns)
# Use a log1p transform to bring the features onto roughly the same scale for modelling
for col in data.columns.drop('TARGET'):
    data[col] = np.log1p(data[col])
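# A quick sanity check on the transform: compare each column's skewness
# before and after log1p (re-loading the raw frame only for the comparison)
raw = data_prepare()
for col in raw.columns.drop('TARGET'):
    print('{:>8s}  raw skew: {:6.2f}   log1p skew: {:6.2f}'.format(
        col, stats.skew(raw[col]), stats.skew(np.log1p(raw[col]))))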
# Inspect the distributions of the transformed features
plot_dist(data, data.columns)
corr = data.corr('spearman')
corr
sns.heatmap(corr)
threshold = 0.85
correlated_pairs = {}
for col in corr:
    # Find correlations above the threshold
    above_threshold_vars = [x for x in list(corr.index[abs(corr[col]) > threshold]) if x != col]
    correlated_pairs[col] = above_threshold_vars
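# As a quick usage check, show only the features that actually have a
# partner above the threshold
for col, partners in correlated_pairs.items():
    if partners:
        print(col, '->', partners)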
plt.scatter(data['DIS'], data['NOX'])
corr['TARGET'].sort_values().plot.barh()
features = data.columns.drop('TARGET')
x_train, x_test, y_train, y_test = train_test_split(data[features], data['TARGET'].to_frame(),
                                                    test_size=0.3, shuffle=True)
def Preds(x, y, x_test, y_test, alpha, n_splits=4, random_state=23, verbose=0):
    feature_importance = pd.DataFrame(columns=['feature', 'importance', 'fold'])
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_preds, sub_preds = np.zeros(x.shape[0]), np.zeros(x_test.shape[0])
    if verbose > 0:
        print(x.shape, x_test.shape)
    train_scores = []
    valid_scores = []
    test_scores = []
    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(x, y)):
        trn_x, trn_y = x.iloc[trn_idx, :], y.iloc[trn_idx, :]
        val_x, val_y = x.iloc[val_idx, :], y.iloc[val_idx, :]
        model = Ridge(alpha=alpha)
        model.fit(trn_x.values, trn_y.values.ravel())
        trn_preds = model.predict(trn_x)
        val_preds = model.predict(val_x)
        test_preds = model.predict(x_test)
        # Out-of-fold predictions for the validation split; test predictions
        # are averaged across folds
        oof_preds[val_idx] = val_preds
        sub_preds += test_preds / folds.n_splits
        train_scores.append(mean_squared_error(trn_y, trn_preds))
        valid_scores.append(mean_squared_error(val_y, val_preds))
        test_scores.append(mean_squared_error(y_test, test_preds))
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        feature_importance = pd.concat([feature_importance, pd.DataFrame({
            'importance': model.coef_,
            'fold': [n_fold + 1] * x.shape[1],
            'feature': x.columns.tolist()
        })], ignore_index=True)
    feature_importance['importance'] = feature_importance['importance'].astype(float)
    fi = feature_importance.groupby(['feature']).agg(['mean'])['importance'].sort_values(by=['mean'], ascending=False)
    fold_names = list(range(folds.n_splits))
    fold_names.append('overall')
    # 'overall' row: out-of-fold MSE for valid, per-fold means for train/test
    valid_scores.append(mean_squared_error(y, oof_preds))
    train_scores.append(np.mean(train_scores))
    test_scores.append(np.mean(test_scores))
    # Assemble the score-tracking DataFrame
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores,
                            'test': test_scores})
    oof_preds = pd.Series(oof_preds.flatten(), index=x.index).rename('TARGET')
    sub_preds = pd.Series(sub_preds.flatten(), index=x_test.index).rename('TARGET')
    return metrics, fi
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
for alpha in alphas:
    metrics, fi = Preds(x_train, y_train, x_test, y_test, alpha)
    print('\n{}'.format(alpha))
    print(metrics)
# alpha=0.01 is best on the validation set
metrics, fi = Preds(x_train, y_train, x_test, y_test, 0.01)
fi
fi.plot.barh()
function_set = ['add', 'sub', 'mul', 'div', 'log', 'sqrt', 'abs', 'neg', 'max', 'min']
gp1 = SymbolicTransformer(generations=10, population_size=1000,
                          hall_of_fame=100, n_components=10,
                          function_set=function_set,
                          parsimony_coefficient=0.0005,
                          max_samples=0.9, verbose=1,
                          random_state=0, n_jobs=3)
train_idx = x_train.index
test_idx = x_test.index
gp1.fit(x_train, y_train)
print(gp1)
from IPython.display import Image
import pydotplus
graph = gp1._best_programs[0].export_graphviz()
graph = pydotplus.graphviz.graph_from_dot_data(graph)
Image(graph.create_png())
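# Besides the graphviz rendering, the selected programs can be printed
# directly as expressions. _best_programs is a private gplearn attribute
# (already relied on above), so this sketch may break across versions.
for prog in gp1._best_programs:
    print(prog, '| raw fitness:', prog.raw_fitness_)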
gp_train_feature = gp1.transform(x_train)
gp_test_feature = gp1.transform(x_test)
new_feature_name = [str(i)+'V' for i in range(1, 11)]
train_new_feature = pd.DataFrame(gp_train_feature, columns=new_feature_name, index=train_idx)
test_new_feature = pd.DataFrame(gp_test_feature, columns=new_feature_name, index=test_idx)
x_train_0 = pd.concat([x_train, train_new_feature], axis=1)
x_test_0 = pd.concat([x_test, test_new_feature], axis=1)
new_x_data = pd.concat([x_train_0, x_test_0], axis=0)
new_data = pd.concat([data['TARGET'], new_x_data], axis=1)
new_data.columns
plot_dist(new_data, new_feature_name)
new_corr = new_data.corr('spearman')
new_corr
sns.heatmap(new_corr)
new_corr['TARGET'].sort_values().plot.barh()
The new features all correlate strongly with the label.
new_features_corr = new_data[new_feature_name].corr('spearman')
new_features_corr
sns.heatmap(new_features_corr)
The new features also correlate strongly with each other, all above 0.95.
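To back that observation with numbers, summarise the absolute pairwise correlations over the upper triangle of the matrix computed above:
# Describe |spearman rho| across all distinct pairs of generated features
mask = np.triu(np.ones(new_features_corr.shape, dtype=bool), k=1)
print(new_features_corr.abs().where(mask).stack().describe())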
alphas = [1e-4, 5e-4, 0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1]
for alpha in alphas:
    metrics, fi = Preds(x_train_0, y_train, x_test_0, y_test, alpha)
    print('\n{}'.format(alpha))
    print(metrics)
# alpha=0.001 is best on the validation set
metrics, fi = Preds(x_train_0, y_train, x_test_0, y_test, 0.001)
fi.plot.barh()
Among the generated features, 1V, 3V and 5V have relatively low importance while 2V and 4V rank higher, but none of them outweighs the original features.
metrics
Compared with the MSE scores of the model trained on the original features (val 19.636047, test 19.362313), performance is essentially unchanged.
In this example the model did not actually improve. The likely cause is that the generated features are too homogeneous: the genetic program's sole objective is to fit the label, so the resulting features end up highly correlated with one another.
Lowering the number of generations should preserve feature diversity and reduce the correlation between the generated features.
Next, reduce the number of generations to one and rebuild the model.
function_set = ['add', 'sub', 'mul', 'div', 'log', 'sqrt', 'abs', 'neg', 'max', 'min']
gp2 = SymbolicTransformer(generations=1, population_size=1000,
                          hall_of_fame=100, n_components=10,
                          function_set=function_set,
                          parsimony_coefficient=0.0005,
                          max_samples=0.9, verbose=1,
                          random_state=0, n_jobs=3)
train_idx = x_train.index
test_idx = x_test.index
gp2.fit(x_train, y_train)
print(gp2)
gp_train_feature_1 = gp2.transform(x_train)
gp_test_feature_1 = gp2.transform(x_test)
new_feature_name_W = [str(i)+'W' for i in range(1, 11)]
train_new_feature_1 = pd.DataFrame(gp_train_feature_1, columns=new_feature_name_W, index=train_idx)
test_new_feature_1 = pd.DataFrame(gp_test_feature_1, columns=new_feature_name_W, index=test_idx)
train_new_feature_1 = pd.concat([x_train, train_new_feature_1], axis=1)
test_new_feature_1 = pd.concat([x_test, test_new_feature_1], axis=1)
new_data_1 = pd.concat([train_new_feature_1, test_new_feature_1], axis=0)
new_data_1 = pd.concat([new_data_1, data['TARGET']], axis=1)
plot_dist(new_data_1, new_feature_name_W)
new_corr_1 = new_data_1.corr('spearman')
new_corr_1
sns.heatmap(new_corr_1)
Only with evolution limited to a single generation (which is effectively no evolution at all) do the correlations finally come down somewhat.
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1]
for alpha in alphas:
    metrics, fi = Preds(train_new_feature_1, y_train, test_new_feature_1, y_test, alpha)
    print('\n{}'.format(alpha))
    print(metrics)
# alpha=0.01 is best on the validation set
metrics, fi = Preds(train_new_feature_1, y_train, test_new_feature_1, y_test, 0.01)
fi
metrics
实验名 特征数量 验证集分数 测试集分数
控制组 13 19.636047 19.362313
实验一 23 14.220659 19.142288
实验二 23 16.628614 16.854483
Experiments 1 and 2 differ only in the number of genetic-programming generations. Experiment 1 evolved for ten generations: its generated features correlate strongly with the target, but also strongly with one another. Experiment 2 ran a single generation, i.e. no evolutionary iteration at all: its features correlate less strongly with the target, but they are more diverse, and they actually improved the model more.
This suggests two strategies: evolve for many generations and output a small number of features, or skip evolution and randomly generate a large number of features; the two kinds of features can then be combined for modelling.
Experiment 3 therefore combines the feature-generation approaches of the first two experiments to see whether this improves the model.
# Build new symbolic transformer objects
gp3 = SymbolicTransformer(generations=1, population_size=1000,  # increase the variety of randomly generated features
                          hall_of_fame=100, n_components=10,    # and keep a larger number of screened features
                          function_set=function_set,
                          parsimony_coefficient=0.0005,
                          max_samples=0.9, verbose=1,
                          random_state=0, n_jobs=3)
gp3.fit(x_train, y_train)
print(gp3)
gp4 = SymbolicTransformer(generations=3, population_size=1000,
                          hall_of_fame=100, n_components=1,  # reduce the number of output features to 1
                          function_set=function_set,
                          parsimony_coefficient=0.0005,
                          max_samples=0.9, verbose=1,
                          random_state=0, n_jobs=3)
gp4.fit(x_train, y_train)
gp_train_feature_3 = gp3.transform(x_train)
gp_test_feature_3 = gp3.transform(x_test)
new_feature_name_M = [str(i)+'M' for i in range(1, 11)]
train_new_feature_3 = pd.DataFrame(gp_train_feature_3, columns=new_feature_name_M, index=train_idx)
test_new_feature_3 = pd.DataFrame(gp_test_feature_3, columns=new_feature_name_M, index=test_idx)
gp_train_feature_4 = gp4.transform(x_train)
gp_test_feature_4 = gp4.transform(x_test)
new_feature_name_N = [str(i)+'N' for i in range(1, 2)]
train_new_feature_4 = pd.DataFrame(gp_train_feature_4, columns=new_feature_name_N, index=train_idx)
test_new_feature_4 = pd.DataFrame(gp_test_feature_4, columns=new_feature_name_N, index=test_idx)
train_new_feature_34 = pd.concat([x_train, train_new_feature_3, train_new_feature_4], axis=1)
test_new_feature_34 = pd.concat([x_test, test_new_feature_3, test_new_feature_4], axis=1)
print(train_new_feature_34.shape)
new_data_34 = pd.concat([train_new_feature_34, test_new_feature_34], axis=0)
new_data_34 = pd.concat([new_data_34, data['TARGET']], axis=1)
features_34 = new_feature_name_M + new_feature_name_N
plot_dist(new_data_34, new_feature_name_M + new_feature_name_N)
corr_34 = new_data_34[new_feature_name_M + new_feature_name_N + ['TARGET']].corr('spearman')
corr_34
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1]
for alpha in alphas:
    metrics, fi = Preds(train_new_feature_34, y_train, test_new_feature_34, y_test, alpha)
    print('\n{}'.format(alpha))
    print(metrics)
# alpha=0.01 is best on the validation set
metrics, fi = Preds(train_new_feature_34, y_train, test_new_feature_34, y_test, 0.01)
metrics
fi
Experiment      #Features   Valid MSE    Test MSE
Control         13          19.636047    19.362313
Experiment 1    23          14.220659    19.142288
Experiment 2    23          16.628614    16.854483
Experiment 3    24          16.598239    16.837614
Performance improved slightly. One could keep increasing the number of generated features to see whether it improves further, but I will stop here.
There are also ways to control the complexity of the gplearn programs that I have not explored. Presumably, if the complexity of the generated formulas were constrained, the number of generations could be raised somewhat while still producing features that preserve diversity, as sketched below.
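As a closing sketch of that idea (the settings are assumptions, not tuned results): strengthen the size penalty via parsimony_coefficient and cap init_depth so programs stay simple, then allow a few more generations.
# Sketch: penalise program size more heavily so that extra generations do not
# collapse all features onto near-duplicates of the label fit
gp5 = SymbolicTransformer(generations=5, population_size=1000,
                          hall_of_fame=100, n_components=10,
                          function_set=function_set,
                          init_depth=(2, 4),            # shallower initial programs
                          parsimony_coefficient=0.005,  # 10x stronger size penalty
                          max_samples=0.9, verbose=1,
                          random_state=0, n_jobs=3)
# gp5.fit(x_train, y_train)  # then transform and evaluate exactly as above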