from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from sklearn import linear_model
from matplotlib import rc
rc('mathtext', default='regular')
import seaborn as sns
sns.set_style('white')
from matplotlib import dates
import numpy as np
import pandas as pd
import statsmodels.api as sm
import time
import datetime as DT
import scipy.stats as st
from math import sqrt
在估算因子IC的协方差矩阵时,出现极大极小值情况,此处需要去极值。计算IC时采用的是皮尔逊相关系数。
# 去级值函数
def winsorize_series(se):
q = se.quantile([0.025, 0.975])
if isinstance(q, pd.Series) and len(q) == 2:
se[se < q.iloc[0]] = q.iloc[0]
se[se > q.iloc[1]] = q.iloc[1]
return se
# 皮尔逊相关系数函数(IC)
def multiply(a,b):
sum_ab=0.0
for i in range(len(a)):
temp=a[i]*b[i]
sum_ab+=temp
return sum_ab
def cal_pearson(x,y):
n=len(x)
sum_x=sum(x)
sum_y=sum(y)
sum_xy=multiply(x,y)
sum_x2 = sum([pow(i,2) for i in x])
sum_y2 = sum([pow(j,2) for j in y])
molecular=sum_xy-(float(sum_x)*float(sum_y)/n)
denominator=sqrt((sum_x2-float(sum_x**2)/n)*(sum_y2-float(sum_y**2)/n))
return molecular/denominator
# 起始日期
start_date = '2010-01-01'
# 结束日期
end_date = '2017-09-29'
rebalance_period = 5
instruments = D.instruments() #获取所有股票列表
factors = ['return_5','avg_amount_20','pb_lf_0','ps_ttm_0','pe_ttm_0','volatility_10_0','avg_turn_20']
features_data = D.features(instruments=instruments, start_date=start_date, end_date=end_date, fields=factors)
mydata = features_data.set_index(['date','instrument'])[factors].unstack()
for col in mydata['return_5'].columns:
mydata['return_5'][col] = mydata['return_5'][col].shift(-rebalance_period)
data_ret = mydata['return_5']
group_fac = []
for fac in factors[1:]:
data_tmp = mydata[fac]
for col in data_ret.T.columns[:-rebalance_period]:
tmp_df = pd.DataFrame(index=data_ret.T.index)
tmp_df['fac'] = data_tmp.T[col]
tmp_df['ret'] = data_ret.T[col]
tmp_df = tmp_df.dropna()
coff = cal_pearson(tmp_df['fac'],tmp_df['ret'])
dict_temp = {'factor':fac,'date':col,'coff':coff}
group_fac.append(dict_temp)
_ic = pd.DataFrame(group_fac,columns=['factor','date','coff'])
ic_temp = _ic.set_index(['date','factor']).unstack()
ic_df = ic_temp['coff']
ic_df.head()
先把IC_DF的图画一下:
ic_df.plot(figsize=(12,6), title='Factor weight using sample covariance')
这个图看出,如果直接用因子IC去做预测的话,会比较乱。下面用两种方法估算因子IC的协方差矩阵,进而得到两种不同的因子权重:
ic_weight_df,使用Sample 协方差矩阵估算
ic_weight_shrink_df,使用 Ledoit-Wolf 压缩方法得到的协方差矩阵估算
下面计算中的滚动窗口为120天,即计算每一天的因子权重时,使用了之前6个月的IC时间序列来计算IC均值向量和IC协方差矩阵。
# ---------------------------------------------------------------------------
# unshrunk covariance
n = 120
ic_weight_df = pd.DataFrame(index=ic_df.index, columns=ic_df.columns)
for dt in ic_df.index:
ic_dt = ic_df[ic_df.index<dt].tail(n)
if len(ic_dt) < n:
continue
ic_cov_mat = np.mat(np.cov(ic_dt.T.as_matrix()).astype(float))
inv_ic_cov_mat = np.linalg.inv(ic_cov_mat)
weight = inv_ic_cov_mat*np.mat(ic_dt.mean()).reshape(len(inv_ic_cov_mat),1)
weight = np.array(weight.reshape(len(weight),))[0]
ic_weight_df.ix[dt] = weight/np.sum(weight)
# ---------------------------------------------------------------------------
# Ledoit-Wolf shrink covariance
n = 120
ic_weight_shrink_df = pd.DataFrame(index=ic_df.index, columns=ic_df.columns)
lw = LedoitWolf()
for dt in ic_df.index:
ic_dt = ic_df[ic_df.index<dt].tail(n)
if len(ic_dt) < n:
continue
ic_cov_mat = lw.fit(ic_dt.as_matrix()).covariance_
inv_ic_cov_mat = np.linalg.inv(ic_cov_mat)
weight = inv_ic_cov_mat*np.mat(ic_dt.mean()).reshape(len(inv_ic_cov_mat),1)
weight = np.array(weight.reshape(len(weight),))[0]
ic_weight_shrink_df.ix[dt] = weight/np.sum(weight)
ic_weight_df = ic_weight_df.apply(winsorize_series)
ic_weight_shrink_df = ic_weight_shrink_df.apply(winsorize_series)
ic_weight_shrink_df.tail()
# 因子权重作图展示
ic_weight_df.plot(figsize=(12,6), title='Factor weight using sample covariance')
ic_weight_shrink_df.plot(figsize=(12,6), title='Factor weight using shrink covariance')
后续:下一节将用以上因子和权重计算方法,采用5天、10天、20天周期调仓,进行2010-2017年回测,查看这些因子的Alpha预测能力