克隆策略

因子预处理

In [51]:
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import seaborn as sns
# Plot a sample drawn from the standard normal distribution.
def demo1():
    """Draw 10,000 samples from N(0, 1) and visualize them as a
    histogram plus a seaborn density plot."""
    mu, sigma = 0, 1
    sample_no = 10000
    np.random.seed(0)  # fixed seed so the figure is reproducible
    s = np.random.normal(mu, sigma, sample_no)
    plt.hist(s, bins=100)
    plt.show()
    # sns.distplot was deprecated in seaborn 0.11 and later removed;
    # histplot(..., kde=True, stat="density") is the modern equivalent.
    sns.histplot(s, kde=True, stat="density")

demo1()

一、基础数据获取并观察分布

In [52]:
# Sampling window for the valuation data
start_date = '2015-01-01'
end_date = '2015-08-10'
# Pull valuation factors for all China A-share stocks over the window
all_data = DataSource('market_value_CN_STOCK_A').read(start_date=start_date, end_date=end_date)
In [53]:
# Preview the first 5 rows
all_data.head()
Out[53]:
instrument date pb_lf pe_ttm market_cap_float ps_ttm market_cap pe_lyr
0 300390.SZA 2015-01-05 8.866899 71.789818 5.646192e+08 7.475822 2.825995e+09 76.708939
1 300357.SZA 2015-01-05 11.273505 61.554371 1.378448e+09 24.156355 5.513792e+09 78.138496
2 600162.SHA 2015-01-05 2.886564 19.859253 5.190413e+09 1.429412 5.190413e+09 26.972548
3 600851.SHA 2015-01-05 3.201072 81.585381 6.540506e+09 9.175343 1.069452e+10 163.542542
4 600820.SHA 2015-01-05 1.718861 19.929968 1.951967e+10 1.067159 2.647329e+10 20.532928
In [54]:
# Slice out the cross-section for a single trading day
day_mask = all_data['date'] == '2015-01-05'
df = all_data[day_mask]
# Two equivalent spellings of the same selection
df = all_data.loc[all_data.date == '2015-01-05']
df = all_data.query("date == '2015-01-05'")
In [55]:
# Density plot of one day's float market cap: the raw distribution shows a
# long right tail, because large-cap companies are scarce.
# NOTE: the redundant mid-notebook `import seaborn as sns` was removed;
# seaborn is already imported in the first cell, and scattering imports
# through a notebook hides dependencies.
sns.kdeplot(df['market_cap_float'])
Out[55]:
<AxesSubplot:xlabel='market_cap_float', ylabel='Density'>

二、数据的分布转换

右侧长尾分布的话可以对所有数据取对数、取平方根等,它的原理是因为这样的变换的导数是逐渐减小的,也就是说它的增速逐渐减缓,所以就可以把大的数据向左移,使数据接近正态分布。如果左侧长尾分布的话可以取相反数后转化为右偏的情况。
In [56]:
df['log_market_cap_float'] = np.log(df['market_cap_float'])
sns.kdeplot(df['log_market_cap_float'])
Out[56]:
<AxesSubplot:xlabel='log_market_cap_float', ylabel='Density'>

三、数据处理

1. 缺失值处理

In [57]:
# Count missing values per column
df.isna().sum()
Out[57]:
instrument                0
date                      0
pb_lf                   230
pe_ttm                  230
market_cap_float        224
ps_ttm                  230
market_cap              234
pe_lyr                  270
log_market_cap_float    224
dtype: int64
In [58]:
# Handle missing values: forward-fill (each NaN takes the last previously
# observed value, so no future data leaks into the past), then drop any
# rows that are still NaN (those with no prior observation).
# Chained non-inplace calls avoid both `inplace=True` and the
# SettingWithCopyWarning that in-place mutation of a sliced frame causes.
df = df.ffill().dropna()
In [59]:
# Confirm that no missing values remain after fill + drop
df.isnull().sum()
Out[59]:
instrument              0
date                    0
pb_lf                   0
pe_ttm                  0
market_cap_float        0
ps_ttm                  0
market_cap              0
pe_lyr                  0
log_market_cap_float    0
dtype: int64

2. 极值处理

In [60]:
p_95 = df['log_market_cap_float'].quantile(0.95)
p_5 = df['log_market_cap_float'].quantile(0.05)
# Two ways to treat extreme values: drop them, or cap (winsorize) them.
in_band = (df.log_market_cap_float < p_95) & (df.log_market_cap_float > p_5)
df_winsorize_1 = df[in_band]  # truncation: keep only the middle 90%
# Capping: any value beyond the 5th/95th percentile is replaced by the bound.
df_winsorize_2 = df.copy()
df_winsorize_2['log_market_cap_float'] = df['log_market_cap_float'].clip(lower=p_5, upper=p_95)
# Compare the distributions after the two treatments
sns.kdeplot(df_winsorize_1['log_market_cap_float'], color='r')
sns.kdeplot(df_winsorize_2['log_market_cap_float'], color='b')
Out[60]:
<AxesSubplot:xlabel='log_market_cap_float', ylabel='Density'>

3. 标准化

In [61]:
df_standalize = df_winsorize_1.copy()
col = df_standalize['log_market_cap_float']

# Method 1 — z-score (StandardScaler): (x - mean) / std, giving zero mean
# and unit variance. ddof=0 matches np.std (population standard deviation).
df_standalize['log_market_cap_float_1'] = (col - col.mean()) / col.std(ddof=0)

# Method 2 — min-max (MinMaxScaler): (x - min) / (max - min), rescaling
# the column into the [0, 1] interval.
df_standalize['log_market_cap_float_2'] = (col - col.min()) / (col.max() - col.min())
In [62]:
'''查看标准化处理后的数据范围分布 近似均值为0的分布'''
# Distributions after standardization: both are centered near zero /
# compressed into a narrow range.
ax = sns.kdeplot(df_standalize['log_market_cap_float_1'], color='r')
sns.kdeplot(df_standalize['log_market_cap_float_2'], color='b', ax=ax)
Out[62]:
<AxesSubplot:xlabel='log_market_cap_float_1', ylabel='Density'>

四、整合预处理

In [63]:
# NOTE(review): `df_preprocess` is only defined in the cell below (In [64]),
# so this cell raises NameError under Restart & Run All — move it after the
# cell that builds `df_preprocess`.
df_preprocess
Out[63]:
instrument date pb_lf pe_ttm market_cap_float ps_ttm market_cap pe_lyr log_market_cap_float
date
2015-01-05 1 300357.SZA 2015-01-05 11.273505 61.554371 1.378448e+09 24.156355 5.513792e+09 78.138496 0.076095
2 600162.SHA 2015-01-05 2.886564 19.859253 5.190413e+09 1.429412 5.190413e+09 26.972548 0.430564
3 600851.SHA 2015-01-05 3.201072 81.585381 6.540506e+09 9.175343 1.069452e+10 163.542542 0.492376
4 600820.SHA 2015-01-05 1.718861 19.929968 1.951967e+10 1.067159 2.647329e+10 20.532928 0.784700
5 300051.SZA 2015-01-05 6.454600 228.134033 2.018901e+09 11.065672 3.181110e+09 231.250488 0.178115
... ... ... ... ... ... ... ... ... ...
2856 002529.SZA 2015-01-05 2.440132 -24123.927734 2.345921e+09 11.473438 2.404800e+09 274.601166 0.218251
2857 600592.SHA 2015-01-05 2.116207 61.928097 3.784229e+09 5.558588 4.091429e+09 80.572807 0.346089
2859 000099.SZA 2015-01-05 3.059600 37.799770 8.491047e+09 6.621035 8.491047e+09 44.180542 0.562154
2860 002362.SZA 2015-01-05 5.011569 -24.517582 2.971363e+09 10.978460 3.624760e+09 -16.752464 0.281438
2861 300226.SZA 2015-01-05 20.814474 807.668518 7.719532e+09 2.669812 8.968440e+09 415.382751 0.536687

2574 rows × 9 columns

In [64]:
# Factor preprocessing function, applied per trading day.
def preprocess(df, factor):
    """Preprocess one factor column of a single-date cross-section.

    Steps: fill missing values with the cross-sectional mean, drop rows
    outside the strict 5th-95th percentile band, then min-max scale the
    survivors into [0, 1].

    Args:
        df: DataFrame holding the cross-section for one date.
        factor: name of the factor column to process.

    Returns:
        A new DataFrame; the caller's frame is NOT mutated (the original
        version's `fillna(inplace=True)` wrote into the caller's data and
        raised SettingWithCopyWarning on groupby slices).
    """
    df = df.copy()  # never mutate the group slice passed in by groupby.apply
    # Missing values: fill with the column mean (Series.mean skips NaN,
    # matching np.nanmean)
    df[factor] = df[factor].fillna(df[factor].mean())
    # Extreme values: keep only rows strictly inside the 5%-95% band
    p_95 = df[factor].quantile(0.95)
    p_5 = df[factor].quantile(0.05)
    df = df[(df[factor] < p_95) & (df[factor] > p_5)].copy()
    # Min-max scaling into [0, 1]; a constant column yields NaN/inf here,
    # same as the original implementation
    lo, hi = df[factor].min(), df[factor].max()
    df[factor] = (df[factor] - lo) / (hi - lo)
    return df

# Run the preprocessing cross-sectionally, one trading day at a time.
# groupby.apply forwards extra keyword arguments to the function, which
# replaces the lambda wrapper.
df_preprocess = df.groupby('date').apply(preprocess, factor='log_market_cap_float')
In [65]:
# Density of the fully preprocessed factor (min-max scaled, so values lie in [0, 1])
sns.kdeplot(df_preprocess['log_market_cap_float'], color='r')
Out[65]:
<AxesSubplot:xlabel='log_market_cap_float', ylabel='Density'>