import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import seaborn as sns
# Draw samples from a normal distribution and plot them.
def demo1():
    """Plot 10,000 seeded draws from a normal distribution with mean 0, std 1.

    A 100-bin matplotlib histogram is shown first; the same sample is then
    passed to seaborn's ``distplot``. NumPy's global random state is seeded
    with 0, so every call draws the same sample.
    """
    mean, std_dev = 0, 1
    n_samples = 10000
    np.random.seed(0)
    draws = np.random.normal(mean, std_dev, n_samples)

    plt.hist(draws, bins=100)
    plt.show()
    # NOTE(review): distplot is deprecated in recent seaborn releases; check
    # the installed version before switching to histplot/displot.
    sns.distplot(draws)


demo1()
start_date = '2015-01-01'
end_date = '2015-08-10'
# Load the valuation data for the date range.
# NOTE(review): DataSource is not imported anywhere in the visible code;
# presumably the hosting environment injects it -- confirm.
all_data = DataSource('market_value_CN_STOCK_A').read(start_date=start_date, end_date=end_date)
# Peek at the first 5 rows (only displayed when run as a notebook cell).
all_data.head()
# Select the rows of one particular day.
df = all_data[all_data['date'] == '2015-01-05']
# Equivalent spellings of the same selection.
df = all_data[all_data.date == '2015-01-05']
# Take an explicit copy of the filtered rows: `df` gets new columns and is
# cleaned further down, and writing into a frame produced by filtering
# another frame triggers pandas' SettingWithCopyWarning.
df = all_data.query("date == '2015-01-05'").copy()
# Density plot of one day's float market cap. The raw values show a long
# right tail, because large-cap companies are few.
import seaborn as sns
sns.kdeplot(df.market_cap_float)
# Store the natural log of the float market cap as a new column and plot
# its density as well.
df['log_market_cap_float'] = np.log(df.market_cap_float)
sns.kdeplot(df.log_market_cap_float)
# Count the missing values in every column.
'''缺失值查看'''
df.isnull().sum()
# Missing-value handling: forward-fill first, then drop whatever rows still
# contain NaN. The results are rebound to `df` instead of using
# `inplace=True`, so a frame derived by filtering is never mutated in place.
'''缺失值去除 或者向后填充(避免未来的数据泄露到历史)后去除历史的缺失值 '''
# NOTE(review): `df` holds a single date here (it was selected with
# date == '2015-01-05'), so ffill copies values from the preceding *row*
# rather than from an earlier date -- presumably a different instrument.
# Confirm this is intended; forward fill is normally applied per instrument
# along time.
df = df.ffill()   # forward fill
df = df.dropna()  # drop rows that still have missing values
# Count the missing values again after cleaning.
'''缺失值查看'''
df.isnull().sum()
# 5% and 95% quantiles of the log float market cap, used as outlier bounds.
p_5 = df['log_market_cap_float'].quantile(0.05)
p_95 = df['log_market_cap_float'].quantile(0.95)
# Two ways of treating the extremes: trim them away, or replace them.
'''两种处理方式: 截断或者替换'''
# Trimming: keep only the rows strictly inside (p_5, p_95).
df_winsorize_1 = df[(df['log_market_cap_float'] > p_5) & (df['log_market_cap_float'] < p_95)]
# Replacement: work on a copy and clamp the column into [p_5, p_95], so
# values above p_95 become p_95 and values below p_5 become p_5.
df_winsorize_2 = df.copy()
df_winsorize_2['log_market_cap_float'] = df['log_market_cap_float'].clip(lower=p_5, upper=p_95)
# Compare the two resulting distributions (red: trimmed, blue: replaced).
'''查看极值截断处理后的分布'''
sns.kdeplot(df_winsorize_1['log_market_cap_float'], color='r')
sns.kdeplot(df_winsorize_2['log_market_cap_float'], color='b')
# Standardise the trimmed data on a copy so df_winsorize_1 stays untouched.
df_standalize = df_winsorize_1.copy()
log_cap = df_standalize['log_market_cap_float']
# Method 1, z-score (StandardScaler style): (x - mean) / std. ddof=0 keeps
# the population standard deviation that np.std(x) used before.
''' 标准化方法1(StandardScaler): (x- 均值)/标准差 也叫归一化 均值为0,方差为1'''
df_standalize['log_market_cap_float_1'] = (log_cap - log_cap.mean()) / log_cap.std(ddof=0)
# Method 2, min-max (MinMaxScaler style): (x - min) / (max - min), which
# rescales the values into [0, 1].
'''标准化方法2(MinMaxScaler): (x- xmin)/(xmax-xmin) 缩放到0和1之间'''
df_standalize['log_market_cap_float_2'] = (log_cap - log_cap.min()) / (log_cap.max() - log_cap.min())
# Plot both standardised columns (red: z-score, blue: min-max).
'''查看标准化处理后的数据范围分布 近似均值为0的分布'''
sns.kdeplot(df_standalize['log_market_cap_float_1'], color='r')
sns.kdeplot(df_standalize['log_market_cap_float_2'], color='b')
# A bare `df_preprocess` expression used to sit here. That name is not
# assigned until the per-date preprocessing step further down, so evaluating
# it at this point raised NameError when the file ran top to bottom.
# Factor preprocessing function
def preprocess(df, factor):
    """Fill, trim and min-max scale one factor column.

    Steps, applied in order:
      1. Missing values in ``factor`` are filled with the column mean.
      2. Rows whose value is not strictly between the 5% and 95% quantiles
         are dropped.
      3. The remaining values are rescaled to [0, 1] with
         (x - min) / (max - min).

    Args:
        df: DataFrame that contains the column ``factor``.
        factor: Name of the column to preprocess.

    Returns:
        A new DataFrame holding the surviving rows with the rescaled
        ``factor`` column. The input frame is left unmodified.
    """
    # Work on a copy: the previous chained `df[factor].fillna(..., inplace=True)`
    # both mutated the caller's frame and is not guaranteed to write through
    # (it is a no-op under pandas Copy-on-Write).
    out = df.copy()

    # Missing values: fill with the mean of the non-missing values.
    out[factor] = out[factor].fillna(out[factor].mean())

    # Extreme values: keep only rows strictly inside the 5%-95% quantile band.
    p_95 = out[factor].quantile(0.95)
    p_5 = out[factor].quantile(0.05)
    inside = (out[factor] < p_95) & (out[factor] > p_5)
    # Copy again so the assignment below targets an independent frame and
    # does not raise SettingWithCopyWarning.
    out = out[inside].copy()

    # Standardisation: min-max scaling.
    lowest = out[factor].min()
    span = out[factor].max() - lowest
    if span == 0:
        # All surviving values are equal; avoid 0/0 and map them to 0.0.
        out[factor] = 0.0
    else:
        out[factor] = (out[factor] - lowest) / span
    return out
# Run the factor preprocessing separately for each trading day (one group
# per 'date'); after this step the factor preprocessing is complete.
df_preprocess = df.groupby('date').apply(preprocess, factor='log_market_cap_float')
# Density of the preprocessed factor.
sns.kdeplot(df_preprocess.log_market_cap_float, color='r')