# Imports
import numpy as np
from sklearn import preprocessing
# Basic settings
class conf:
    """Shared configuration for the feature-extraction steps below."""

    # Date boundaries, written as 'YYYY-MM-DD' strings.
    start_date = '2015-01-01'
    end_date = '2017-08-10'
    split_date = '2016-01-01'

    # Instrument universe requested for the start_date..split_date window.
    instruments = D.instruments(start_date, split_date)

    # Names of the factor columns to extract.
    features = ['fs_current_assets_0', 'market_cap_0', 'close_0']
# Compute the feature data for the configured instruments over the
# start_date..split_date window.
m2 = M.general_feature_extractor.v5(
    instruments=conf.instruments,
    start_date=conf.start_date,
    end_date=conf.split_date,
    features=conf.features,
)

# Data preprocessing: missing-data handling and normalization.
# T.get_stock_ranker_default_transforms supplies the preprocessing
# transforms intended for the StockRanker model.
m3 = M.transform.v2(
    data=m2.data,
    transforms=T.get_stock_ranker_default_transforms(),
    drop_null=True,
    astype='int32',
    except_columns=['date', 'instrument'],
)
# All extracted feature rows, read into a single DataFrame.
all_data = m2.data.read_df()

# Data for one trading day. An explicit copy is taken so that the edits below
# are applied to this frame itself rather than to a temporary slice of
# all_data (chained assignment on a slice is unreliable and is a no-op under
# pandas Copy-on-Write).
df = all_data[all_data['date'] == '2015-01-05'].copy()

# Histogram of that day's raw data (disabled): the raw values are coarse and
# contain extreme values, so some cleaning and transformation is needed.
# df[df.dtypes[df.dtypes == 'float32'].index.values].hist(bins=50, figsize=[15,12])

factor_cols = ['close_0', 'market_cap_0', 'fs_current_assets_0']

## 1. Missing values: replace NaN with the column's NaN-ignoring mean.
for factor in factor_cols:
    df[factor] = df[factor].fillna(np.nanmean(df[factor]))

## 2. Extreme values: clamp each column to its [5th, 95th] percentile range.
for factor in factor_cols:
    p_95 = np.percentile(df[factor], 95)
    p_5 = np.percentile(df[factor], 5)
    df[factor] = df[factor].clip(lower=p_5, upper=p_95)
## 3. Standardization
# Restart from the raw one-day slice, this time dropping every row that
# contains a NaN (the results of steps 1 and 2 are not carried over).
cols = ['close_0', 'market_cap_0', 'fs_current_assets_0']
df = all_data[all_data['date'] == '2015-01-05'].dropna()

# Z-score each factor column: subtract the column mean, divide by its std.
df[cols] = (df[cols] - df[cols].mean()) / df[cols].std()
df[cols].values

# The standardization above gives roughly the same result as sklearn's scale
# (scale differs slightly in its internal implementation details).
preprocessing.scale(df[cols])

## 4. Normalization
preprocessing.normalize(df[cols])
# Factor preprocessing function.
def preprocess(df, factors=('close_0', 'market_cap_0', 'fs_current_assets_0'),
               lower_pct=5, upper_pct=95):
    """Impute, winsorize and z-score the factor columns of ``df``.

    For each column named in ``factors`` the following steps run in order:

    1. NaN values are replaced with the column's NaN-ignoring mean.
    2. Values are clamped to the ``[lower_pct, upper_pct]`` percentile range.
    3. The column is standardized: ``(x - mean) / std``.

    Every step writes back with a direct column assignment. Chained
    assignment (``df[c][mask] = v``) and ``inplace`` calls on a selected
    column are avoided because they act on a temporary object and do nothing
    under pandas Copy-on-Write.

    Args:
        df: DataFrame containing the columns listed in ``factors``. It is
            modified in place.
        factors: Names of the columns to process.
        lower_pct: Lower percentile (0-100) used as the clamp floor.
        upper_pct: Upper percentile (0-100) used as the clamp ceiling.

    Returns:
        The same ``df`` object, with the factor columns replaced.
    """
    # Nothing to compute for an empty frame; percentiles would be undefined.
    if df.empty:
        return df

    for factor in factors:
        col = df[factor]
        # Missing values -> column mean.
        col = col.fillna(np.nanmean(col))
        # Extreme values -> clamp to the percentile bounds.
        lower = np.percentile(col, lower_pct)
        upper = np.percentile(col, upper_pct)
        col = col.clip(lower=lower, upper=upper)
        # Standardize; a column with zero spread yields NaN, as before.
        df[factor] = (col - col.mean()) / col.std()
    return df
# Run the factor preprocessing separately for each trading day. Once this is
# done, the preprocessed result can be fed into further machine-learning
# algorithms.
# group_keys=False keeps the original row index; without it, pandas >= 2.0
# prepends the 'date' group key as an extra index level.
processed_data = all_data.groupby('date', group_keys=False).apply(preprocess)
# Bare expression so a notebook cell still displays the result.
processed_data