使用Python计算变量之间的相关系数和绘图¶

In [1]:

# 导入包
import numpy as np
import statsmodels.tsa.stattools as sts
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

1、随机变量¶

In [2]:

# Draw two independent standard-normal samples of 1000 points each.
X, Y = np.random.randn(1000), np.random.randn(1000)

# Scatter plot of one sample against the other.
plt.scatter(X, Y)
plt.show()

print("correlation of X and Y is ")
# The off-diagonal entry of the correlation matrix is corr(X, Y);
# for independently drawn samples it should be close to zero.
np.corrcoef(X, Y)[0][1]

correlation of X and Y is

Out[2]:

0.010505052938688659

2、生成相关序列，并加入正态分布的噪声¶

In [3]:

# Y is X plus Gaussian noise (mean 0, standard deviation 0.1), so the two
# series are expected to be strongly linearly related.
X = np.random.randn(1000)
Y = X + np.random.normal(loc=0, scale=0.1, size=1000)

# Scatter plot of the noisy series against the original one.
plt.scatter(X, Y)
plt.show()

print("correlation of X and Y is ")
# The off-diagonal entry of the correlation matrix is corr(X, Y).
np.corrcoef(X, Y)[0][1]

correlation of X and Y is

Out[3]:

0.99509473689553696

3、转向更为实际的对象¶

我们来探索两只股票之间的相关关系。在金融市场上，人们较少直接分析价格，而更多关注收益率，因此这里也从收益率的角度来衡量相关性。

In [6]:

# Daily returns of two stocks: percentage change of the closing price.
# The first element is dropped because pct_change() has no previous value
# to compare against there.
# NOTE(review): `D` is not defined in this notebook; presumably it is a data
# API injected by the hosting platform — confirm before running elsewhere.
Stock1 = (
    D.history_data(
        ["601186.SHA"],
        start_date='2016-12-01',
        end_date='2017-05-01',
        fields=['close'],
    )['close']
    .pct_change()[1:]
)
Stock2 = (
    D.history_data(
        ["601390.SHA"],
        start_date='2016-12-01',
        end_date='2017-05-01',
        fields=['close'],
    )['close']
    .pct_change()[1:]
)

In [17]:

# Scatter of the two daily-return series, one stock per axis.
plt.scatter(x=Stock1, y=Stock2)
plt.ylabel("601390.SHA daily return")
plt.xlabel("601186.SHA daily return")
plt.show()

# The printed message is kept verbatim; the cell's last expression displays
# the correlation coefficient between the two return series.
print("the corrlation for two stocks is: ")
Stock2.corr(Stock1)

the corrlation for two stocks is:

Out[17]:

0.8846017046026351

4、计算滚动相关系数¶

相关系数的计算离不开时间窗口；通过滚动时间窗口，我们还能观察相关性随时间的变动情况。

In [8]:

# Re-load the daily returns of both stocks over a longer history
# (2010-01-01 to 2017-05-01); the first element is dropped because
# pct_change() has no previous value to compare against there.
# NOTE(review): `D` is not defined in this notebook; presumably it is a data
# API injected by the hosting platform — confirm before running elsewhere.
Stock1 = (
    D.history_data(
        ["601186.SHA"],
        start_date='2010-01-01',
        end_date='2017-05-01',
        fields=['close'],
    )['close']
    .pct_change()[1:]
)
Stock2 = (
    D.history_data(
        ["601390.SHA"],
        start_date='2010-01-01',
        end_date='2017-05-01',
        fields=['close'],
    )['close']
    .pct_change()[1:]
)

In [11]:

# Rolling correlation between the two daily-return series over a window of
# 60 observations.
# The top-level pd.rolling_corr() helper was deprecated and then removed from
# pandas; Series.rolling(...).corr(...) is the supported equivalent.
rolling_corr = Stock1.rolling(window=60).corr(Stock2)
# Re-label the result with calendar dates. The first trading day is skipped to
# match the [1:] slice applied when the return series were built.
# NOTE(review): assumes both stocks have a row for every trading day in the
# range, otherwise the lengths will not match — confirm against the data source.
rolling_corr.index = D.trading_days(start_date='2010-01-01',end_date='2017-05-01').date[1:]

In [18]:

# Line plot of the rolling correlation series against its index.
plt.plot(rolling_corr)
plt.ylabel('60-day Rolling Correlation')
plt.xlabel('Day')
plt.show()

但是对于成百上千的股票，怎样才能找到高度相关的股票对？

In [19]:

# Use the first 10 instruments as a small example.
instruments = D.instruments()[:10]

# Work with returns rather than closing prices: fetch the closes, reshape them
# into one column per instrument indexed by date, then take the percentage
# change down each column.
Stock_matrix = (
    D.history_data(
        instruments,
        start_date='2016-01-01',
        end_date='2016-09-01',
        fields=['close'],
    )
    .pivot_table(values='close', index=['date'], columns=['instrument'])
    .pct_change()
)
Stock_matrix.head()

Out[19]:

instrument	000001.SZA	000002.SZA	000004.SZA	000005.SZA	000006.SZA	000007.SZA	000008.SZA	000009.SZA	000010.SZA	000011.SZA
date
2016-01-04	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2016-01-05	0.006178	0.0	-0.063665	-0.015487	-0.032755	0.0	0.018850	-0.047030	-0.056044	-0.042081
2016-01-06	0.011404	0.0	0.012926	0.031461	0.025897	0.0	0.013876	0.036364	0.040745	0.022364
2016-01-07	-0.051171	0.0	-0.100051	-0.099129	-0.100000	0.0	-0.088504	-0.100251	-0.099553	-0.100000
2016-01-08	0.016453	0.0	0.006239	0.003628	0.009709	0.0	-0.002002	0.009749	0.001242	0.006944

In [20]:

# Pairwise Pearson correlation matrix of the daily-return columns.
Stock_matrix.corr(method='pearson')

Out[20]:

instrument	000001.SZA	000002.SZA	000004.SZA	000005.SZA	000006.SZA	000007.SZA	000008.SZA	000009.SZA	000010.SZA	000011.SZA
instrument
000001.SZA	1.000000	0.018993	0.595322	0.600269	0.622749	0.027863	0.531736	0.657898	0.591505	0.458707
000002.SZA	0.018993	1.000000	0.000170	0.050937	0.138133	0.169131	0.026653	0.018328	0.054138	0.072238
000004.SZA	0.595322	0.000170	1.000000	0.597882	0.659429	-0.000203	0.528496	0.621535	0.642140	0.544813
000005.SZA	0.600269	0.050937	0.597882	1.000000	0.665327	0.060434	0.590306	0.681779	0.665582	0.568800
000006.SZA	0.622749	0.138133	0.659429	0.665327	1.000000	0.055961	0.507439	0.681861	0.670731	0.777092
000007.SZA	0.027863	0.169131	-0.000203	0.060434	0.055961	1.000000	0.054658	0.043501	0.032836	0.002523
000008.SZA	0.531736	0.026653	0.528496	0.590306	0.507439	0.054658	1.000000	0.554532	0.562442	0.421347
000009.SZA	0.657898	0.018328	0.621535	0.681779	0.681861	0.043501	0.554532	1.000000	0.672703	0.523347
000010.SZA	0.591505	0.054138	0.642140	0.665582	0.670731	0.032836	0.562442	0.672703	1.000000	0.591624
000011.SZA	0.458707	0.072238	0.544813	0.568800	0.777092	0.002523	0.421347	0.523347	0.591624	1.000000

5、通过相关关系热力图可视化股票相关性¶

In [15]:

# Draw a heatmap of the return correlation matrix.
# Compute the matrix once and reuse it for both the mask and the plot.
corr_matrix = Stock_matrix.corr()
# Boolean mask that hides the upper triangle (diagonal included), since the
# correlation matrix is symmetric. The builtin `bool` is used because the
# `np.bool` alias has been removed from NumPy.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Diverging colour map so that the two ends of the value range get
# contrasting hues.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr_matrix, mask=mask, cmap=cmap)
plt.show()