克隆策略

任务二: 获取历史数据并初步统计

1、首先通过API获取历史数据

In [3]:
df = DataSource('bar1d_CN_STOCK_A').read(['000001.SZA','300023.SZA'],'2017-02-10','2017-02-20',fields=['close','open','high','low','amount'])
In [4]:
df.head(6)
Out[4]:
instrument amount close low open high date
0 000001.SZA 450701345.0 977.394470 975.299316 976.346924 980.537231 2017-02-10
1 300023.SZA 59811422.0 67.790512 65.680298 66.320900 67.790512 2017-02-10
2 000001.SZA 599951024.0 985.775146 977.394470 978.442078 988.917908 2017-02-13
3 300023.SZA 157240617.0 74.573334 67.376007 68.016609 74.573334 2017-02-13
4 000001.SZA 340318431.0 984.727600 981.584839 985.775146 986.822754 2017-02-14
5 300023.SZA 252524730.0 75.213928 70.993507 73.103722 77.060364 2017-02-14

2、分组统计groupby+agg的使用

'测试'

In [10]:
'''查看按股票分组数据'''
groups = df.groupby('instrument')
for k in groups:
    print(k)
('000001.SZA',     instrument       amount        close         low        open         high  \
0   000001.SZA  450701345.0   977.394470  975.299316  976.346924   980.537231   
2   000001.SZA  599951024.0   985.775146  977.394470  978.442078   988.917908   
4   000001.SZA  340318431.0   984.727600  981.584839  985.775146   986.822754   
7   000001.SZA  716759655.0   989.965515  983.679993  984.727600   999.393738   
9   000001.SZA  388636225.0   991.013062  986.822754  989.965515   995.203430   
11  000001.SZA  400161795.0   983.679993  981.584839  991.013062   994.155823   
12  000001.SZA  855710805.0  1001.488892  984.727600  984.727600  1003.584045   

         date  
0  2017-02-10  
2  2017-02-13  
4  2017-02-14  
7  2017-02-15  
9  2017-02-16  
11 2017-02-17  
12 2017-02-20  )
('300023.SZA',     instrument       amount      close        low       open       high  \
1   300023.SZA   59811422.0  67.790512  65.680298  66.320900  67.790512   
3   300023.SZA  157240617.0  74.573334  67.376007  68.016609  74.573334   
5   300023.SZA  252524730.0  75.213928  70.993507  73.103722  77.060364   
6   300023.SZA  175938876.0  72.387756  72.086296  73.744316  76.042938   
8   300023.SZA  109264512.0  71.370331  70.126816  72.350075  72.689217   
10  300023.SZA   93135166.0  69.561584  69.184761  71.219604  71.973251   
13  300023.SZA   72052822.0  70.089134  69.335487  69.599266  70.654366   

         date  
1  2017-02-10  
3  2017-02-13  
5  2017-02-14  
6  2017-02-15  
8  2017-02-16  
10 2017-02-17  
13 2017-02-20  )
In [19]:
'''查看按日期分组数据'''
groups = df.groupby('date')
for k in groups:
    print(k)
(Timestamp('2017-02-10 00:00:00'),    instrument       amount       close         low        open        high  \
0  000001.SZA  450701345.0  977.394470  975.299316  976.346924  980.537231   
1  300023.SZA   59811422.0   67.790512   65.680298   66.320900   67.790512   

        date  
0 2017-02-10  
1 2017-02-10  )
(Timestamp('2017-02-13 00:00:00'),    instrument       amount       close         low        open        high  \
2  000001.SZA  599951024.0  985.775146  977.394470  978.442078  988.917908   
3  300023.SZA  157240617.0   74.573334   67.376007   68.016609   74.573334   

        date  
2 2017-02-13  
3 2017-02-13  )
(Timestamp('2017-02-14 00:00:00'),    instrument       amount       close         low        open        high  \
4  000001.SZA  340318431.0  984.727600  981.584839  985.775146  986.822754   
5  300023.SZA  252524730.0   75.213928   70.993507   73.103722   77.060364   

        date  
4 2017-02-14  
5 2017-02-14  )
(Timestamp('2017-02-15 00:00:00'),    instrument       amount       close         low        open        high  \
6  300023.SZA  175938876.0   72.387756   72.086296   73.744316   76.042938   
7  000001.SZA  716759655.0  989.965515  983.679993  984.727600  999.393738   

        date  
6 2017-02-15  
7 2017-02-15  )
(Timestamp('2017-02-16 00:00:00'),    instrument       amount       close         low        open        high  \
8  300023.SZA  109264512.0   71.370331   70.126816   72.350075   72.689217   
9  000001.SZA  388636225.0  991.013062  986.822754  989.965515  995.203430   

        date  
8 2017-02-16  
9 2017-02-16  )
(Timestamp('2017-02-17 00:00:00'),     instrument       amount       close         low        open        high  \
10  300023.SZA   93135166.0   69.561584   69.184761   71.219604   71.973251   
11  000001.SZA  400161795.0  983.679993  981.584839  991.013062  994.155823   

         date  
10 2017-02-17  
11 2017-02-17  )
(Timestamp('2017-02-20 00:00:00'),     instrument       amount        close         low        open         high  \
12  000001.SZA  855710805.0  1001.488892  984.727600  984.727600  1003.584045   
13  300023.SZA   72052822.0    70.089134   69.335487   69.599266    70.654366   

         date  
12 2017-02-20  
13 2017-02-20  )
In [11]:
'''按股票分组计算每个股票一段时间的成交量和收盘价中位数'''
df.groupby('instrument').agg({'amount':'mean','close':'median'})
Out[11]:
amount close
instrument
000001.SZA 5.360342e+08 985.775146
300023.SZA 1.314240e+08 71.370331
In [12]:
'''若聚合函数相同可以简化写法'''
df.groupby('instrument')[['amount','close']].mean()
Out[12]:
amount close
instrument
000001.SZA 5.360342e+08 987.720642
300023.SZA 1.314240e+08 71.569511
In [13]:
'''按日期分组计算每日所有股票的成交量和收盘价平均值'''
df.groupby('date')[['amount','close']].mean()
Out[13]:
amount close
date
2017-02-10 255256383.5 522.592468
2017-02-13 378595820.5 530.174255
2017-02-14 296421580.5 529.970764
2017-02-15 446349265.5 531.176636
2017-02-16 248950368.5 531.191711
2017-02-17 246648480.5 526.620789
2017-02-20 463881813.5 535.789001
In [23]:
'''按股票分组计算某列的标量统计量,eg.和、数量、中位数、分位数、最小值、最大值、平均值、方差、标准差、峰度、平均绝对离差'''
df.groupby('instrument')[['close']].agg({'sum','count','median','min','max','mean','var','std','skew','mad'})
Out[23]:
close
var count median min sum max mean std mad skew
instrument
000001.SZA 56.857349 7 985.775146 977.394470 6914.044922 1001.488892 987.720642 7.540381 5.515851 0.79229
300023.SZA 7.253007 7 71.370331 67.790512 500.986572 75.213928 71.569511 2.693141 2.133281 0.12830
In [24]:
'''按股票分组后对多列进行不同的聚合计算'''
df.groupby('instrument').agg({'amount':['sum','min'],'close':['max','std']})
Out[24]:
amount close
sum min max std
instrument
000001.SZA 3.752239e+09 340318431.0 1001.488892 7.540381
300023.SZA 9.199681e+08 59811422.0 75.213928 2.693141
In [56]:
'''多层索引的处理'''
datas = df.groupby('instrument').agg({'amount':['sum','min'],'close':['max','std']})
print(datas.columns)
columns = [k+'_'+v for k,v in datas.columns]
datas.columns = columns
datas
MultiIndex([('amount', 'sum'),
            ('amount', 'min'),
            ( 'close', 'max'),
            ( 'close', 'std')],
           )
Out[56]:
amount_sum amount_min close_max close_std
instrument
000001.SZA 3.752239e+09 340318431.0 1001.488892 7.540381
300023.SZA 9.199681e+08 59811422.0 75.213928 2.693141
In [69]:
''''按股票分组计算统计量序列,eg.累积和、累乘、峰度、偏度, 累积最小、累积最大、一阶差分、变化率'''
df_tj = df.groupby('instrument').agg({'low':['cumsum','cumprod','cummin','cummax','diff','pct_change']})
df_tj.head()
Out[69]:
low
cumsum cumprod cummin cummax diff pct_change
0 975.299316 9.752993e+02 975.299316 975.299316 NaN NaN
1 65.680298 6.568030e+01 65.680298 65.680298 NaN NaN
2 1952.693848 9.532522e+05 975.299316 977.394470 2.095154 0.002148
3 133.056305 4.425276e+03 65.680298 67.376007 1.695709 0.025818
4 2934.278564 9.356979e+08 975.299316 981.584839 4.190369 0.004287
In [16]:
'''常用统计量的分组快速查看'''
df.groupby('instrument')[['amount','close']].describe()
Out[16]:
amount close
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
instrument
000001.SZA 7.0 5.360342e+08 1.935261e+08 340318431.0 394399010.0 450701345.0 658355339.5 855710805.0 7.0 987.720703 7.540381 977.394470 984.203796 985.775146 990.489288 1001.488892
300023.SZA 7.0 1.314240e+08 6.822003e+07 59811422.0 82593994.0 109264512.0 166589746.5 252524730.0 7.0 71.569511 2.693141 67.790512 69.825359 71.370331 73.480545 75.213928
案例:通过groupby将历史数据按照股票分组,计算滚动最近2日的成交额平均值
In [17]:
df.head()
Out[17]:
open low high instrument date close amount
0 976.346924 975.299316 980.537231 000001.SZA 2017-02-10 977.394470 450701345.0
1 66.320900 65.680298 67.790512 300023.SZA 2017-02-10 67.790512 59811422.0
2 978.442078 977.394470 988.917908 000001.SZA 2017-02-13 985.775146 599951024.0
3 68.016609 67.376007 74.573334 300023.SZA 2017-02-13 74.573334 157240617.0
4 985.775146 981.584839 986.822754 000001.SZA 2017-02-14 984.727600 340318431.0
In [18]:
'''股票分组后计算成交量和收盘价序列的滚动两日平均值(移动均线)'''
df[['avg_amount','avg_close']] = df.groupby('instrument')[['amount','close']].apply(lambda x:x.rolling(2,min_periods=1).mean())
df.head()
Out[18]:
open low high instrument date close amount avg_amount avg_close
0 976.346924 975.299316 980.537231 000001.SZA 2017-02-10 977.394470 450701345.0 450701345.0 977.394470
1 66.320900 65.680298 67.790512 300023.SZA 2017-02-10 67.790512 59811422.0 59811422.0 67.790512
2 978.442078 977.394470 988.917908 000001.SZA 2017-02-13 985.775146 599951024.0 525326184.5 981.584808
3 68.016609 67.376007 74.573334 300023.SZA 2017-02-13 74.573334 157240617.0 108526019.5 71.181923
4 985.775146 981.584839 986.822754 000001.SZA 2017-02-14 984.727600 340318431.0 470134727.5 985.251373

3、计算昨日收盘价

In [19]:
df['close_1'] = df.groupby('instrument')['close'].apply(lambda x:x.shift(1))
df.head()
Out[19]:
open low high instrument date close amount avg_amount avg_close close_1
0 976.346924 975.299316 980.537231 000001.SZA 2017-02-10 977.394470 450701345.0 450701345.0 977.394470 NaN
1 66.320900 65.680298 67.790512 300023.SZA 2017-02-10 67.790512 59811422.0 59811422.0 67.790512 NaN
2 978.442078 977.394470 988.917908 000001.SZA 2017-02-13 985.775146 599951024.0 525326184.5 981.584808 977.394470
3 68.016609 67.376007 74.573334 300023.SZA 2017-02-13 74.573334 157240617.0 108526019.5 71.181923 67.790512
4 985.775146 981.584839 986.822754 000001.SZA 2017-02-14 984.727600 340318431.0 470134727.5 985.251373 985.775146

4、日收益率的计算、净值计算和累计收益率计算

In [20]:
'''将DataFrame转化为收盘价的日期序列表格'''
close_table = df.set_index(['date','instrument'])['close'].unstack() # 或者使用 pd.pivot_table(df, index='date',columns='instrument',values='close')
close_table
Out[20]:
instrument 000001.SZA 300023.SZA
date
2017-02-10 977.394470 67.790512
2017-02-13 985.775146 74.573334
2017-02-14 984.727600 75.213928
2017-02-15 989.965515 72.387756
2017-02-16 991.013062 71.370331
2017-02-17 983.679993 69.561584
2017-02-20 1001.488892 70.089134
In [21]:
'''计算日收益率 即 收盘价的1日变化率'''
daily_ret_table = close_table.pct_change()
daily_ret_table
Out[21]:
instrument 000001.SZA 300023.SZA
date
2017-02-10 NaN NaN
2017-02-13 0.008574 0.100056
2017-02-14 -0.001063 0.008590
2017-02-15 0.005319 -0.037575
2017-02-16 0.001058 -0.014055
2017-02-17 -0.007400 -0.025343
2017-02-20 0.018104 0.007584
In [22]:
'''第一天的日收益率填充为0'''
daily_ret_table.fillna(0,inplace=True)
daily_ret_table
Out[22]:
instrument 000001.SZA 300023.SZA
date
2017-02-10 0.000000 0.000000
2017-02-13 0.008574 0.100056
2017-02-14 -0.001063 0.008590
2017-02-15 0.005319 -0.037575
2017-02-16 0.001058 -0.014055
2017-02-17 -0.007400 -0.025343
2017-02-20 0.018104 0.007584
In [23]:
'''计算当日净值 = 上一日净值 * (1+日收益率), 这里默认第一天的净值为1.0'''
net_table = (daily_ret_table+1).cumprod()
net_table
Out[23]:
instrument 000001.SZA 300023.SZA
date
2017-02-10 1.000000 1.000000
2017-02-13 1.008574 1.100056
2017-02-14 1.007503 1.109505
2017-02-15 1.012862 1.067815
2017-02-16 1.013934 1.052807
2017-02-17 1.006431 1.026126
2017-02-20 1.024652 1.033908
In [24]:
'''计算累计收益率 = 每日净值 - 1'''
cum_ret = net_table - 1
cum_ret
Out[24]:
instrument 000001.SZA 300023.SZA
date
2017-02-10 0.000000 0.000000
2017-02-13 0.008574 0.100056
2017-02-14 0.007503 0.109505
2017-02-15 0.012862 0.067815
2017-02-16 0.013934 0.052807
2017-02-17 0.006431 0.026126
2017-02-20 0.024652 0.033908
In [25]:
'''整合后的快速计算'''
cum_ret = (df.set_index(['date','instrument'])['close'].unstack().pct_change().fillna(0) + 1).cumprod() - 1
cum_ret
Out[25]:
instrument 000001.SZA 300023.SZA
date
2017-02-10 0.000000 0.000000
2017-02-13 0.008574 0.100056
2017-02-14 0.007503 0.109505
2017-02-15 0.012862 0.067815
2017-02-16 0.013934 0.052807
2017-02-17 0.006431 0.026126
2017-02-20 0.024652 0.033908
In [27]:
'''绘制累计收益率曲线'''
T.plot(cum_ret, title= '股票累计收益率曲线')

5、查看instrument列的唯一值(股票代码去重)

In [28]:
df['instrument'].unique()
Out[28]:
array(['000001.SZA', '300023.SZA'], dtype=object)

6、行运算举例:计算收盘价和开盘价的平均值

In [29]:
'''使用axis=1参数改变计算方向为沿行计算'''
df['avg_price'] = df[['close','open']].mean(axis=1)
df.head()
Out[29]:
open low high instrument date close amount avg_amount avg_close close_1 avg_price
0 976.346924 975.299316 980.537231 000001.SZA 2017-02-10 977.394470 450701345.0 450701345.0 977.394470 NaN 976.870728
1 66.320900 65.680298 67.790512 300023.SZA 2017-02-10 67.790512 59811422.0 59811422.0 67.790512 NaN 67.055710
2 978.442078 977.394470 988.917908 000001.SZA 2017-02-13 985.775146 599951024.0 525326184.5 981.584808 977.394470 982.108643
3 68.016609 67.376007 74.573334 300023.SZA 2017-02-13 74.573334 157240617.0 108526019.5 71.181923 67.790512 71.294968
4 985.775146 981.584839 986.822754 000001.SZA 2017-02-14 984.727600 340318431.0 470134727.5 985.251373 985.775146 985.251343

7、数据拼接concat

In [32]:
'''纵向拼接'''
pd.concat([df,df,df]).reset_index(drop=True)
Out[32]:
open low high instrument date close amount avg_amount avg_close close_1 avg_price
0 976.346924 975.299316 980.537231 000001.SZA 2017-02-10 977.394470 450701345.0 450701345.0 977.394470 NaN 976.870728
1 66.320900 65.680298 67.790512 300023.SZA 2017-02-10 67.790512 59811422.0 59811422.0 67.790512 NaN 67.055710
2 978.442078 977.394470 988.917908 000001.SZA 2017-02-13 985.775146 599951024.0 525326184.5 981.584808 977.394470 982.108643
3 68.016609 67.376007 74.573334 300023.SZA 2017-02-13 74.573334 157240617.0 108526019.5 71.181923 67.790512 71.294968
4 985.775146 981.584839 986.822754 000001.SZA 2017-02-14 984.727600 340318431.0 470134727.5 985.251373 985.775146 985.251343
5 73.103722 70.993507 77.060364 300023.SZA 2017-02-14 75.213928 252524730.0 204882673.5 74.893631 74.573334 74.158829
6 73.744316 72.086296 76.042938 300023.SZA 2017-02-15 72.387756 175938876.0 214231803.0 73.800842 75.213928 73.066040
7 984.727600 983.679993 999.393738 000001.SZA 2017-02-15 989.965515 716759655.0 528539043.0 987.346558 984.727600 987.346558
8 72.350075 70.126816 72.689217 300023.SZA 2017-02-16 71.370331 109264512.0 142601694.0 71.879044 72.387756 71.860199
9 989.965515 986.822754 995.203430 000001.SZA 2017-02-16 991.013062 388636225.0 552697940.0 990.489288 989.965515 990.489258
10 71.219604 69.184761 71.973251 300023.SZA 2017-02-17 69.561584 93135166.0 101199839.0 70.465958 71.370331 70.390594
11 991.013062 981.584839 994.155823 000001.SZA 2017-02-17 983.679993 400161795.0 394399010.0 987.346527 991.013062 987.346558
12 984.727600 984.727600 1003.584045 000001.SZA 2017-02-20 1001.488892 855710805.0 627936300.0 992.584442 983.679993 993.108276
13 69.599266 69.335487 70.654366 300023.SZA 2017-02-20 70.089134 72052822.0 82593994.0 69.825359 69.561584 69.844200
14 976.346924 975.299316 980.537231 000001.SZA 2017-02-10 977.394470 450701345.0 450701345.0 977.394470 NaN 976.870728
15 66.320900 65.680298 67.790512 300023.SZA 2017-02-10 67.790512 59811422.0 59811422.0 67.790512 NaN 67.055710
16 978.442078 977.394470 988.917908 000001.SZA 2017-02-13 985.775146 599951024.0 525326184.5 981.584808 977.394470 982.108643
17 68.016609 67.376007 74.573334 300023.SZA 2017-02-13 74.573334 157240617.0 108526019.5 71.181923 67.790512 71.294968
18 985.775146 981.584839 986.822754 000001.SZA 2017-02-14 984.727600 340318431.0 470134727.5 985.251373 985.775146 985.251343
19 73.103722 70.993507 77.060364 300023.SZA 2017-02-14 75.213928 252524730.0 204882673.5 74.893631 74.573334 74.158829
20 73.744316 72.086296 76.042938 300023.SZA 2017-02-15 72.387756 175938876.0 214231803.0 73.800842 75.213928 73.066040
21 984.727600 983.679993 999.393738 000001.SZA 2017-02-15 989.965515 716759655.0 528539043.0 987.346558 984.727600 987.346558
22 72.350075 70.126816 72.689217 300023.SZA 2017-02-16 71.370331 109264512.0 142601694.0 71.879044 72.387756 71.860199
23 989.965515 986.822754 995.203430 000001.SZA 2017-02-16 991.013062 388636225.0 552697940.0 990.489288 989.965515 990.489258
24 71.219604 69.184761 71.973251 300023.SZA 2017-02-17 69.561584 93135166.0 101199839.0 70.465958 71.370331 70.390594
25 991.013062 981.584839 994.155823 000001.SZA 2017-02-17 983.679993 400161795.0 394399010.0 987.346527 991.013062 987.346558
26 984.727600 984.727600 1003.584045 000001.SZA 2017-02-20 1001.488892 855710805.0 627936300.0 992.584442 983.679993 993.108276
27 69.599266 69.335487 70.654366 300023.SZA 2017-02-20 70.089134 72052822.0 82593994.0 69.825359 69.561584 69.844200
28 976.346924 975.299316 980.537231 000001.SZA 2017-02-10 977.394470 450701345.0 450701345.0 977.394470 NaN 976.870728
29 66.320900 65.680298 67.790512 300023.SZA 2017-02-10 67.790512 59811422.0 59811422.0 67.790512 NaN 67.055710
30 978.442078 977.394470 988.917908 000001.SZA 2017-02-13 985.775146 599951024.0 525326184.5 981.584808 977.394470 982.108643
31 68.016609 67.376007 74.573334 300023.SZA 2017-02-13 74.573334 157240617.0 108526019.5 71.181923 67.790512 71.294968
32 985.775146 981.584839 986.822754 000001.SZA 2017-02-14 984.727600 340318431.0 470134727.5 985.251373 985.775146 985.251343
33 73.103722 70.993507 77.060364 300023.SZA 2017-02-14 75.213928 252524730.0 204882673.5 74.893631 74.573334 74.158829
34 73.744316 72.086296 76.042938 300023.SZA 2017-02-15 72.387756 175938876.0 214231803.0 73.800842 75.213928 73.066040
35 984.727600 983.679993 999.393738 000001.SZA 2017-02-15 989.965515 716759655.0 528539043.0 987.346558 984.727600 987.346558
36 72.350075 70.126816 72.689217 300023.SZA 2017-02-16 71.370331 109264512.0 142601694.0 71.879044 72.387756 71.860199
37 989.965515 986.822754 995.203430 000001.SZA 2017-02-16 991.013062 388636225.0 552697940.0 990.489288 989.965515 990.489258
38 71.219604 69.184761 71.973251 300023.SZA 2017-02-17 69.561584 93135166.0 101199839.0 70.465958 71.370331 70.390594
39 991.013062 981.584839 994.155823 000001.SZA 2017-02-17 983.679993 400161795.0 394399010.0 987.346527 991.013062 987.346558
40 984.727600 984.727600 1003.584045 000001.SZA 2017-02-20 1001.488892 855710805.0 627936300.0 992.584442 983.679993 993.108276
41 69.599266 69.335487 70.654366 300023.SZA 2017-02-20 70.089134 72052822.0 82593994.0 69.825359 69.561584 69.844200
In [34]:
'''横向拼接, 指定按公共轴date和instrument对齐,返回索引交集结果'''
df1 = df[['date','instrument','close']].set_index(['date','instrument']).head(3)
print(df1)
df2 = df[['date','instrument','open']].set_index(['date','instrument'])
print(df2)
pd.concat([df1,df2], axis=1,join='inner') # inner表示对于索引不同步时,返回交集结果
                            close
date       instrument            
2017-02-10 000001.SZA  977.394470
           300023.SZA   67.790512
2017-02-13 000001.SZA  985.775146
                             open
date       instrument            
2017-02-10 000001.SZA  976.346924
           300023.SZA   66.320900
2017-02-13 000001.SZA  978.442078
           300023.SZA   68.016609
2017-02-14 000001.SZA  985.775146
           300023.SZA   73.103722
2017-02-15 300023.SZA   73.744316
           000001.SZA  984.727600
2017-02-16 300023.SZA   72.350075
           000001.SZA  989.965515
2017-02-17 300023.SZA   71.219604
           000001.SZA  991.013062
2017-02-20 000001.SZA  984.727600
           300023.SZA   69.599266
Out[34]:
close open
date instrument
2017-02-10 000001.SZA 977.394470 976.346924
300023.SZA 67.790512 66.320900
2017-02-13 000001.SZA 985.775146 978.442078
In [35]:
'''横向拼接, 指定按公共轴date和instrument对齐,返回索引并集结果'''
pd.concat([df1,df2], axis=1,join='outer') # outer表示对于索引不同步时,返回并集结果
Out[35]:
close open
date instrument
2017-02-10 000001.SZA 977.394470 976.346924
300023.SZA 67.790512 66.320900
2017-02-13 000001.SZA 985.775146 978.442078
300023.SZA NaN 68.016609
2017-02-14 000001.SZA NaN 985.775146
300023.SZA NaN 73.103722
2017-02-15 000001.SZA NaN 984.727600
300023.SZA NaN 73.744316
2017-02-16 000001.SZA NaN 989.965515
300023.SZA NaN 72.350075
2017-02-17 000001.SZA NaN 991.013062
300023.SZA NaN 71.219604
2017-02-20 000001.SZA NaN 984.727600
300023.SZA NaN 69.599266
  • 回到之前的问题,计算了多个股票的累和等序列如何跟原数据表df拼接?
In [74]:
df_tj = df.groupby('instrument').agg({'low':['cumsum','cumprod','cummin','cummax','diff','pct_change']})
columns = [k+'_'+v for k,v in df_tj.columns]
df_tj.columns = columns
df_tj.head()
Out[74]:
low_cumsum low_cumprod low_cummin low_cummax low_diff low_pct_change
0 975.299316 9.752993e+02 975.299316 975.299316 NaN NaN
1 65.680298 6.568030e+01 65.680298 65.680298 NaN NaN
2 1952.693848 9.532522e+05 975.299316 977.394470 2.095154 0.002148
3 133.056305 4.425276e+03 65.680298 67.376007 1.695709 0.025818
4 2934.278564 9.356979e+08 975.299316 981.584839 4.190369 0.004287
In [76]:
'''groupby操作后索引顺序没有改变,直接按照索引拼接'''
pd.concat([df, df_tj],axis=1, join='inner')
Out[76]:
instrument amount close low open high date low_cumsum low_cumprod low_cummin low_cummax low_diff low_pct_change
0 000001.SZA 450701345.0 977.394470 975.299316 976.346924 980.537231 2017-02-10 975.299316 9.752993e+02 975.299316 975.299316 NaN NaN
1 300023.SZA 59811422.0 67.790512 65.680298 66.320900 67.790512 2017-02-10 65.680298 6.568030e+01 65.680298 65.680298 NaN NaN
2 000001.SZA 599951024.0 985.775146 977.394470 978.442078 988.917908 2017-02-13 1952.693848 9.532522e+05 975.299316 977.394470 2.095154 0.002148
3 300023.SZA 157240617.0 74.573334 67.376007 68.016609 74.573334 2017-02-13 133.056305 4.425276e+03 65.680298 67.376007 1.695709 0.025818
4 000001.SZA 340318431.0 984.727600 981.584839 985.775146 986.822754 2017-02-14 2934.278564 9.356979e+08 975.299316 981.584839 4.190369 0.004287
5 300023.SZA 252524730.0 75.213928 70.993507 73.103722 77.060364 2017-02-14 204.049805 3.141659e+05 65.680298 70.993507 3.617500 0.053691
6 300023.SZA 175938876.0 72.387756 72.086296 73.744316 76.042938 2017-02-15 276.136108 2.264705e+07 65.680298 72.086296 1.092789 0.015393
7 000001.SZA 716759655.0 989.965515 983.679993 984.727600 999.393738 2017-02-15 3917.958496 9.204273e+11 975.299316 983.679993 2.095154 0.002134
8 300023.SZA 109264512.0 71.370331 70.126816 72.350075 72.689217 2017-02-16 346.262939 1.588166e+09 65.680298 72.086296 -1.959480 -0.027182
9 000001.SZA 388636225.0 991.013062 986.822754 989.965515 995.203430 2017-02-16 4904.781250 9.082985e+14 975.299316 986.822754 3.142761 0.003195
10 300023.SZA 93135166.0 69.561584 69.184761 71.219604 71.973251 2017-02-17 415.447693 1.098769e+11 65.680298 72.086296 -0.942055 -0.013434
11 000001.SZA 400161795.0 983.679993 981.584839 991.013062 994.155823 2017-02-17 5886.366211 8.915721e+17 975.299316 986.822754 -5.237915 -0.005308
12 000001.SZA 855710805.0 1001.488892 984.727600 984.727600 1003.584045 2017-02-20 6871.093750 8.779556e+20 975.299316 986.822754 3.142761 0.003202
13 300023.SZA 72052822.0 70.089134 69.335487 69.599266 70.654366 2017-02-20 484.783173 7.618367e+12 65.680298 72.086296 0.150726 0.002179

8.数据合并merge

In [79]:
'''使用merge指定按公共列date和instrument对齐合并,返回的结果方式包括inner、outer、left、right、cross'''
df1 = df[['date','instrument','close']].head(3)
df2 = df[['date','instrument','open']]
df_merge = df1.merge(df2, on=['date', 'instrument'], how='inner')
df_merge
Out[79]:
date instrument close open
0 2017-02-10 000001.SZA 977.394470 976.346924
1 2017-02-10 300023.SZA 67.790512 66.320900
2 2017-02-13 000001.SZA 985.775146 978.442078
In [80]:
df_merge = df1.merge(df2, on=['date', 'instrument'], how='outer')
df_merge
Out[80]:
date instrument close open
0 2017-02-10 000001.SZA 977.394470 976.346924
1 2017-02-10 300023.SZA 67.790512 66.320900
2 2017-02-13 000001.SZA 985.775146 978.442078
3 2017-02-13 300023.SZA NaN 68.016609
4 2017-02-14 000001.SZA NaN 985.775146
5 2017-02-14 300023.SZA NaN 73.103722
6 2017-02-15 300023.SZA NaN 73.744316
7 2017-02-15 000001.SZA NaN 984.727600
8 2017-02-16 300023.SZA NaN 72.350075
9 2017-02-16 000001.SZA NaN 989.965515
10 2017-02-17 300023.SZA NaN 71.219604
11 2017-02-17 000001.SZA NaN 991.013062
12 2017-02-20 000001.SZA NaN 984.727600
13 2017-02-20 300023.SZA NaN 69.599266