克隆策略

Pandas库两种常用的基本数据结构:Series和DataFrame

In [17]:
import pandas as pd
'''Series 的构建案例1:股票持仓变动'''
s1 = pd.Series([100,-120,130],index=['000001.SZA','600010.SHA','300001.SZA'])
s1
Out[17]:
000001.SZA    100
600010.SHA   -120
300001.SZA    130
dtype: int64
In [18]:
'''Series 的构建案例2:混合数据类型'''
s2 = pd.Series([1,'小明',{'a':1,'b':2}],index=range(3))
s2
Out[18]:
0                   1
1                  小明
2    {'a': 1, 'b': 2}
dtype: object
In [19]:
'''DataFrame 的构建案例1:股票所属板块'''
df1 = pd.DataFrame({'instrument':['000001','600010','300001'],'plate':['深圳主板','上证主板','创业板']}, index=range(3))
df1
Out[19]:
instrument plate
0 000001 深圳主板
1 600010 上证主板
2 300001 创业板
In [49]:
'''DataFrame 的构建案例2:时间日期索引的构建'''
dates  = pd.date_range('2/6/2017', periods=5, freq='D')
df2 = pd.DataFrame( np.random.randint(1,100,(5,4)), index=dates, columns=['open','hgih','low','close'])
df2
Out[49]:
open hgih low close
2017-02-06 94 36 33 83
2017-02-07 53 62 41 58
2017-02-08 84 43 97 98
2017-02-09 39 28 96 75
2017-02-10 63 82 74 37
In [47]:
'''DataFrame 的文件保存输出,默认新建文件/覆盖原文件'''
df2.to_csv('xxx.csv')
In [50]:
'''DataFrame 的文件保存输出,mode来控制追加写入'''
df2.to_csv('xxx.csv',mode='a', header=None)

任务一: 获取历史数据并设置索引,熟悉排序、数据切片功能

1、首先通过API获取历史数据

In [51]:
df = DataSource('bar1d_CN_STOCK_A').read(['000001.SZA','300023.SZA'],'2017-02-10','2017-02-15',fields=['close','open','high','low','adjust_factor'])
In [52]:
'''查看一个DataFrame的前/后N行数据, 使用head(N)或者tail(N)'''
df.head(5)
Out[52]:
close open date low high adjust_factor instrument
0 977.394470 976.346924 2017-02-10 975.299316 980.537231 104.758255 000001.SZA
1 67.790512 66.320900 2017-02-10 65.680298 67.790512 3.768233 300023.SZA
2 985.775146 978.442078 2017-02-13 977.394470 988.917908 104.758255 000001.SZA
3 74.573334 68.016609 2017-02-13 67.376007 74.573334 3.768233 300023.SZA
4 984.727600 985.775146 2017-02-14 981.584839 986.822754 104.758255 000001.SZA
前复权:前复权是以当前价为基准复权,复权后价格=(复权前价格-现金红利)÷(1+流通股份变动比例),最后一天的价格不变,根据分红除权数据处理之前的价格数据;优点最新价是实际价和看盘结果一致。
后复权:上市第一天的价格不变,根据分红配股数据处理之后的价格数据,复权后价格=复权前价格×(1+流通股份变动比例)+现金红利,因此会导致最后一天的价格显示出来不是实际成交价。
如果计算收益率,则后复权计算更方便,前复权需要更新历史价格数据,平台默认使用历史后复权数据
In [55]:
'''真实收盘价格的计算'''
df['close_actual']=df['close']/df['adjust_factor']
df.head()
Out[55]:
close open date low high adjust_factor instrument close_actual
0 977.394470 976.346924 2017-02-10 975.299316 980.537231 104.758255 000001.SZA 9.330000
1 67.790512 66.320900 2017-02-10 65.680298 67.790512 3.768233 300023.SZA 17.990000
2 985.775146 978.442078 2017-02-13 977.394470 988.917908 104.758255 000001.SZA 9.410000
3 74.573334 68.016609 2017-02-13 67.376007 74.573334 3.768233 300023.SZA 19.790001
4 984.727600 985.775146 2017-02-14 981.584839 986.822754 104.758255 000001.SZA 9.400000
In [56]:
'''获取DataFrame的列, 返回带索引的Series'''
df.date # 或 df['date']
Out[56]:
0   2017-02-10
1   2017-02-10
2   2017-02-13
3   2017-02-13
4   2017-02-14
5   2017-02-14
6   2017-02-15
7   2017-02-15
Name: date, dtype: datetime64[ns]

2、字符串时间、TimeStamp格式时间和np.datetime64格式时间的转换

In [62]:
'''平台默认获取数据得到的时间date列类型为numpy的datetime64格式'''
df.date.values[0]
Out[62]:
numpy.datetime64('2017-02-10T00:00:00.000000000')
In [63]:
'''datetime64格式转化为字符串格式'''
np.datetime_as_string(df.date.values[0]) # 或 str(df.date.values[0])
Out[63]:
'2017-02-10T00:00:00.000000000'
In [65]:
'''numpy.datetime64格式转为pandas.Timestamp格式'''
pd.to_datetime(df.date.values[0])
Out[65]:
Timestamp('2017-02-10 00:00:00')
In [66]:
'''Timestamp格式转为字符串格式'''
pd.to_datetime(df.date.values[0]).strftime('%Y%m%d')
Out[66]:
'20170210'
In [68]:
'''Timestamp格式做日期加减'''
pd.to_datetime(df.date.values[0]) + pd.Timedelta(days=1)
Out[68]:
Timestamp('2017-02-11 00:00:00')
In [69]:
'''计算时间差'''
delta = (pd.to_datetime('2019-01-10') - pd.to_datetime('2019-01-05'))
print('时间间隔天数:',delta/np.timedelta64(1,'D'),'时间间隔分钟数:',delta/np.timedelta64(1,'m'),'时间间隔秒数:',delta/np.timedelta64(1,'s'))
时间间隔天数: 5.0 时间间隔分钟数: 7200.0 时间间隔秒数: 432000.0
In [72]:
'''时间的批量处理'''
df['date_str'] = np.datetime_as_string(df['date'])
df['date_Timestamp'] = pd.to_datetime(df['date'])
df[['date','date_str','date_Timestamp']]
Out[72]:
date date_str date_Timestamp
0 2017-02-10 2017-02-10T00:00:00.000000000 2017-02-10
1 2017-02-10 2017-02-10T00:00:00.000000000 2017-02-10
2 2017-02-13 2017-02-13T00:00:00.000000000 2017-02-13
3 2017-02-13 2017-02-13T00:00:00.000000000 2017-02-13
4 2017-02-14 2017-02-14T00:00:00.000000000 2017-02-14
5 2017-02-14 2017-02-14T00:00:00.000000000 2017-02-14
6 2017-02-15 2017-02-15T00:00:00.000000000 2017-02-15
7 2017-02-15 2017-02-15T00:00:00.000000000 2017-02-15

3、设置/取消日期date列为索引

In [73]:
df.head()
Out[73]:
close open date low high adjust_factor instrument close_actual date_str date_Timestamp
0 977.394470 976.346924 2017-02-10 975.299316 980.537231 104.758255 000001.SZA 9.330000 2017-02-10T00:00:00.000000000 2017-02-10
1 67.790512 66.320900 2017-02-10 65.680298 67.790512 3.768233 300023.SZA 17.990000 2017-02-10T00:00:00.000000000 2017-02-10
2 985.775146 978.442078 2017-02-13 977.394470 988.917908 104.758255 000001.SZA 9.410000 2017-02-13T00:00:00.000000000 2017-02-13
3 74.573334 68.016609 2017-02-13 67.376007 74.573334 3.768233 300023.SZA 19.790001 2017-02-13T00:00:00.000000000 2017-02-13
4 984.727600 985.775146 2017-02-14 981.584839 986.822754 104.758255 000001.SZA 9.400000 2017-02-14T00:00:00.000000000 2017-02-14
In [74]:
df.set_index('date',inplace=True)
df.head(5)
Out[74]:
close open low high adjust_factor instrument close_actual date_str date_Timestamp
date
2017-02-10 977.394470 976.346924 975.299316 980.537231 104.758255 000001.SZA 9.330000 2017-02-10T00:00:00.000000000 2017-02-10
2017-02-10 67.790512 66.320900 65.680298 67.790512 3.768233 300023.SZA 17.990000 2017-02-10T00:00:00.000000000 2017-02-10
2017-02-13 985.775146 978.442078 977.394470 988.917908 104.758255 000001.SZA 9.410000 2017-02-13T00:00:00.000000000 2017-02-13
2017-02-13 74.573334 68.016609 67.376007 74.573334 3.768233 300023.SZA 19.790001 2017-02-13T00:00:00.000000000 2017-02-13
2017-02-14 984.727600 985.775146 981.584839 986.822754 104.758255 000001.SZA 9.400000 2017-02-14T00:00:00.000000000 2017-02-14
In [75]:
df.reset_index(inplace=True)
df
Out[75]:
date close open low high adjust_factor instrument close_actual date_str date_Timestamp
0 2017-02-10 977.394470 976.346924 975.299316 980.537231 104.758255 000001.SZA 9.330000 2017-02-10T00:00:00.000000000 2017-02-10
1 2017-02-10 67.790512 66.320900 65.680298 67.790512 3.768233 300023.SZA 17.990000 2017-02-10T00:00:00.000000000 2017-02-10
2 2017-02-13 985.775146 978.442078 977.394470 988.917908 104.758255 000001.SZA 9.410000 2017-02-13T00:00:00.000000000 2017-02-13
3 2017-02-13 74.573334 68.016609 67.376007 74.573334 3.768233 300023.SZA 19.790001 2017-02-13T00:00:00.000000000 2017-02-13
4 2017-02-14 984.727600 985.775146 981.584839 986.822754 104.758255 000001.SZA 9.400000 2017-02-14T00:00:00.000000000 2017-02-14
5 2017-02-14 75.213928 73.103722 70.993507 77.060364 3.768233 300023.SZA 19.959999 2017-02-14T00:00:00.000000000 2017-02-14
6 2017-02-15 72.387756 73.744316 72.086296 76.042938 3.768233 300023.SZA 19.209999 2017-02-15T00:00:00.000000000 2017-02-15
7 2017-02-15 989.965515 984.727600 983.679993 999.393738 104.758255 000001.SZA 9.450000 2017-02-15T00:00:00.000000000 2017-02-15

3、按日期和收盘价价格排序

In [76]:
df.sort_values(by=['date','close'],ascending=[False,False])
Out[76]:
date close open low high adjust_factor instrument close_actual date_str date_Timestamp
7 2017-02-15 989.965515 984.727600 983.679993 999.393738 104.758255 000001.SZA 9.450000 2017-02-15T00:00:00.000000000 2017-02-15
6 2017-02-15 72.387756 73.744316 72.086296 76.042938 3.768233 300023.SZA 19.209999 2017-02-15T00:00:00.000000000 2017-02-15
4 2017-02-14 984.727600 985.775146 981.584839 986.822754 104.758255 000001.SZA 9.400000 2017-02-14T00:00:00.000000000 2017-02-14
5 2017-02-14 75.213928 73.103722 70.993507 77.060364 3.768233 300023.SZA 19.959999 2017-02-14T00:00:00.000000000 2017-02-14
2 2017-02-13 985.775146 978.442078 977.394470 988.917908 104.758255 000001.SZA 9.410000 2017-02-13T00:00:00.000000000 2017-02-13
3 2017-02-13 74.573334 68.016609 67.376007 74.573334 3.768233 300023.SZA 19.790001 2017-02-13T00:00:00.000000000 2017-02-13
0 2017-02-10 977.394470 976.346924 975.299316 980.537231 104.758255 000001.SZA 9.330000 2017-02-10T00:00:00.000000000 2017-02-10
1 2017-02-10 67.790512 66.320900 65.680298 67.790512 3.768233 300023.SZA 17.990000 2017-02-10T00:00:00.000000000 2017-02-10

4、按列值过滤行,选取close列>100的行

In [113]:
df[df.close>10]
Out[113]:
date instrument open close low high adjust_factor close_actual date_str date_Timestamp
0 2017-02-10 000001.SZA 976.346924 977.394470 975.299316 980.537231 104.758255 9.330000 2017-02-10T00:00:00.000000000 2017-02-10
1 2017-02-10 300023.SZA 66.320900 67.790512 65.680298 67.790512 3.768233 17.990000 2017-02-10T00:00:00.000000000 2017-02-10
2 2017-02-13 000001.SZA 978.442078 985.775146 977.394470 988.917908 104.758255 9.410000 2017-02-13T00:00:00.000000000 2017-02-13
3 2017-02-13 300023.SZA 68.016609 74.573334 67.376007 74.573334 3.768233 19.790001 2017-02-13T00:00:00.000000000 2017-02-13
4 2017-02-14 000001.SZA 985.775146 984.727600 981.584839 986.822754 104.758255 9.400000 2017-02-14T00:00:00.000000000 2017-02-14
5 2017-02-14 300023.SZA 73.103722 75.213928 70.993507 77.060364 3.768233 19.959999 2017-02-14T00:00:00.000000000 2017-02-14
6 2017-02-15 300023.SZA 73.744316 72.387756 72.086296 76.042938 3.768233 19.209999 2017-02-15T00:00:00.000000000 2017-02-15
7 2017-02-15 000001.SZA 984.727600 989.965515 983.679993 999.393738 104.758255 9.450000 2017-02-15T00:00:00.000000000 2017-02-15
In [80]:
'''使用loc[行过滤条件,列过滤]方法实现行和列的过滤, 例如获取close_actual大于10的各行数据,并截取instrument、low、high三列'''
df.loc[df['close_actual']>10, ['instrument','low','high']]
Out[80]:
instrument low high
1 300023.SZA 65.680298 67.790512
3 300023.SZA 67.376007 74.573334
5 300023.SZA 70.993507 77.060364
6 300023.SZA 72.086296 76.042938

5、按行号/列号切片过滤

In [81]:
'''选取第一行的数据'''
df.iloc[0]
Out[81]:
date                        2017-02-10 00:00:00
close                                 977.39447
open                                 976.346924
low                                  975.299316
high                                 980.537231
adjust_factor                        104.758255
instrument                           000001.SZA
close_actual                               9.33
date_str          2017-02-10T00:00:00.000000000
date_Timestamp              2017-02-10 00:00:00
Name: 0, dtype: object
In [82]:
'''按位置选取第一行、第一列的元素'''
df.iloc[0,0]
Out[82]:
Timestamp('2017-02-10 00:00:00')

6、按索引和列名过滤数据

In [83]:
df.set_index('date',inplace=True)
df.head()
Out[83]:
close open low high adjust_factor instrument close_actual date_str date_Timestamp
date
2017-02-10 977.394470 976.346924 975.299316 980.537231 104.758255 000001.SZA 9.330000 2017-02-10T00:00:00.000000000 2017-02-10
2017-02-10 67.790512 66.320900 65.680298 67.790512 3.768233 300023.SZA 17.990000 2017-02-10T00:00:00.000000000 2017-02-10
2017-02-13 985.775146 978.442078 977.394470 988.917908 104.758255 000001.SZA 9.410000 2017-02-13T00:00:00.000000000 2017-02-13
2017-02-13 74.573334 68.016609 67.376007 74.573334 3.768233 300023.SZA 19.790001 2017-02-13T00:00:00.000000000 2017-02-13
2017-02-14 984.727600 985.775146 981.584839 986.822754 104.758255 000001.SZA 9.400000 2017-02-14T00:00:00.000000000 2017-02-14
In [84]:
'''获取2017-02-14到2017-02-15的所有数据'''
df.loc['2017-02-14':'2017-02-15',['instrument','close']] # 或 df.loc[np.datetime64('2017-02-14'):np.datetime64('2017-02-15'),['instrument','close']]
Out[84]:
instrument close
date
2017-02-14 000001.SZA 984.727600
2017-02-14 300023.SZA 75.213928
2017-02-15 300023.SZA 72.387756
2017-02-15 000001.SZA 989.965515