AI量化知识树

10分钟学会Pandas

由xiaoshao创建,最终由qxiao 被浏览 1276 用户

SELECT date, open, high, low, close

FROM bar1d_CN_STOCK_A

WHERE instrument = '000005.SZA'

AND date BETWEEN '2017-01-06' AND '2017-02-10'

ORDER BY date;

10分钟学会Pandas

Pandas最初被作为金融数据分析工具而开发出来,在金融领域被广泛使用。Pandas纳入了大量库和一些标准的数据模型,提供了高效地操作大型数据集所需的工具、函数和方法。

本文是针对pandas新手的快速入门学习指南。你可以在 AI量化平台-编写策略 里,一步一步的学习和实践。

# 导入库
import pandas as pd
import numpy as np

Pandas数据结构

主要数据结构:Series和DataFrame,Series是一种类似于一维数组的对象,它由一组数据以及一组与之相关的数据标签(即索引)组成。DataFrame是一个表格型 的数据结构,可以简单地理解为是由多个具有相同的索引的Series组成。DataFrame是金融数据分析中特别常用的数据结构,也是我们平台最常遇到的数据结构。

# 创建Series,以日期为索引
dates = pd.date_range('2/6/2017', periods=5, freq='D')
s = pd.Series([1,3,5,np.nan,6,],index=dates)
s
2017-02-06    1.0
2017-02-07    3.0
2017-02-08    5.0
2017-02-09    NaN
2017-02-10    6.0

Freq: D, dtype: float64
# 创建DataFrame,以日期为索引
df = pd.DataFrame(np.random.randn(5,4), index=dates, columns=list('ABCD'))
df
A B C D
2017-02-06 -1.738581 -1.750260 -0.532282 -0.409738
2017-02-07 -0.871625 0.188268 -0.689427 1.128257
2017-02-08 1.504632 -0.229489 -2.775763 1.867307
2017-02-09 -1.753426 1.252952 0.053105 0.870436
2017-02-10 -1.162485 -0.376172 1.548791 -0.928139
sql=f""
SELECT date, open, high, low, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000005.SZA' 
  AND date BETWEEN '2017-01-06' AND '2017-01-16'
ORDER BY date;""
# 选择某一列数据,它会返回一个Series,等同于df.close:
df['close']
df
date instrument open high low close
2017-01-06 000005.SZA 64.502495 66.077988 64.224464 65.151230
2017-01-09 000005.SZA 64.502495 65.892639 64.409821 65.521935
2017-01-10 000005.SZA 65.521935 65.614609 64.965874 64.965874
2017-01-11 000005.SZA 64.965874 64.965874 63.575737 63.575737
2017-01-12 000005.SZA 63.205032 63.761089 62.741653 62.927006
2017-01-13 000005.SZA 62.927006 63.205032 61.907570 61.907570
2017-01-16 000005.SZA 62.185596 62.278271 56.810387 59.683346

查看数据

# 显示DataFrame前几行(默认是5)的数据
df.head()
date instrument open high low close
2017-01-06 000005.SZA 64.502495 66.077988 64.224464 65.151230
2017-01-09 000005.SZA 64.502495 65.892639 64.409821 65.521935
2017-01-10 000005.SZA 65.521935 65.614609 64.965874 64.965874
2017-01-11 000005.SZA 64.965874 64.965874 63.575737 63.575737
2017-01-12 000005.SZA 63.205032 63.761089 62.741653 62.927006
# 显示DataFrame后行的数据
df.tail(2)
date instrument open high low close
2017-01-13 000005.SZA 62.927006 63.205032 61.907570 61.907570
2017-01-16 000005.SZA 62.185596 62.278271 56.810387 59.683346

显示索引、列名以及底层的numpy数据

# 数据索引
df.index
# output:
DatetimeIndex(['2017-01-06', '2017-01-09', '2017-01-10', '2017-01-11',
​                '2017-01-12', '2017-01-13', '2017-01-16'],
​              dtype='datetime64[ns]', name='date', freq=None)
# 数据列名
df.columns
# output:
Index(['instrument', 'open', 'high', 'low', 'close'], dtype='object')
# 底层numpy数据
df.values
# output:
array([['000005.SZA', 64.50249481201172, 66.07798767089844,
​        64.2244644165039, 65.15122985839844],
​       ['000005.SZA', 64.50249481201172, 65.89263916015625,
​        64.40982055664062, 65.52193450927734],
​       ['000005.SZA', 65.52193450927734, 65.61460876464844,
​        64.96587371826172, 64.96587371826172],
​       ['000005.SZA', 64.96587371826172, 64.96587371826172,
​        63.57573699951172, 63.57573699951172],
​       ['000005.SZA', 63.20503234863281, 63.76108932495117,
​        62.74165344238281, 62.927005767822266],
​       ['000005.SZA', 62.927005767822266, 63.20503234863281,
​        61.907569885253906, 61.907569885253906],
​       ['000005.SZA', 62.18559646606445, 62.27827072143555,
​        56.810386657714844, 59.683345794677734]], dtype=object)
# 数据类型
df.dtypes
# output:

instrument         object
open               float32
high                float32
low                  float32
close               float32
dtype:             object

转置、排序、偏移

# 转置
df.T
date 2017-01-06 00: 00: 00 2017-01-09 00: 00: 00 2017-01-10 00: 00: 00 2017-01-11 00: 00: 00 2017-01-12 00: 00: 00 2017-01-13 00: 00: 00 2017-01-16 00: 00: 00
instrument 000005.SZA 000005.SZA 000005.SZA 000005.SZA 000005.SZA 000005.SZA 000005.SZA
open 64.5025 64.5025 65.5219 64.9659 63.205 62.927 62.1856
high 66.078 65.8926 65.6146 64.9659 63.7611 63.205 62.2783
low 64.2245 64.4098 64.9659 63.5757 62.7417 61.9076 56.8104
close 65.1512 65.5219 64.9659 63.5757 62.927 61.9076 59.6833
# 按轴进行排序
df.sort_index(axis=1, ascending=False)
date open low instrument high close
2017-01-06 64.502495 64.224464 000005.SZA 66.077988 65.151230
2017-01-09 64.502495 64.409821 000005.SZA 65.892639 65.521935
2017-01-10 65.521935 64.965874 000005.SZA 65.614609 64.965874
2017-01-11 64.965874 63.575737 000005.SZA 64.965874 63.575737
2017-01-12 63.205032 62.741653 000005.SZA 63.761089 62.927006
2017-01-13 62.927006 61.907570 000005.SZA 63.205032 61.907570
2017-01-16 62.185596 56.810387 000005.SZA 62.278271 59.683346
# 按值进行排序
df.sort_values(by='close',ascending=False) # ascending默认升序,ascending=False降序
date instrument open high low close
2017-01-09 000005.SZA 64.502495 65.892639 64.409821 65.521935
2017-01-06 000005.SZA 64.502495 66.077988 64.224464 65.151230
2017-01-10 000005.SZA 65.521935 65.614609 64.965874 64.965874
2017-01-11 000005.SZA 64.965874 64.965874 63.575737 63.575737
2017-01-12 000005.SZA 63.205032 63.761089 62.741653 62.927006
2017-01-13 000005.SZA 62.927006 63.205032 61.907570 61.907570
2017-01-16 000005.SZA 62.185596 62.278271 56.810387 59.683346

shift数据偏移(可理解为回溯)

# 利用pct_change计算每日收益
sql=f""SELECT date, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000001.SZA'
  AND date BETWEEN '2010-01-01' AND '2016-12-31'
ORDER BY date;""
rets.head()
date
2010-01-04         NaN
2010-01-05   -0.017292
2010-01-06   -0.017167
2010-01-07   -0.010917
2010-01-08   -0.002208
Name: close, dtype: float32
# 计算每日收益,等价于pct_change函数
daily_returns = df / df.shift(1) -1 # 正数表示向前偏移,负数表示向后偏移
daily_returns.head()
date
2010-01-04         NaN
2010-01-05   -0.017292
2010-01-06   -0.017167
2010-01-07   -0.010917
2010-01-08   -0.002208
Name: close, dtype: float32

数据选择

选取

sql=""WITH df1 AS(
SELECT date, open, high, low, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000005.SZA'
  AND date BETWEEN '2017-01-06' AND '2017-02-10'
ORDER BY date;)""
df = df.set_index('date')

# 选择某一列数据,它会返回一个Series,等同于df.close:
df['close']
date
2017-01-06    65.151230
2017-01-09    65.521935
2017-01-10    64.965874
2017-01-11    63.575737
2017-01-12    62.927006
2017-01-13    61.907570
2017-01-16    59.683346
2017-01-17    60.054047
2017-01-18    60.702782
2017-01-19    59.683346
2017-01-20    60.239399
2017-01-23    60.517429
2017-01-24    60.332077
2017-01-25    60.424751
2017-01-26    60.795456
2017-02-03    60.517429
2017-02-06    60.795456
2017-02-07    61.166161
2017-02-08    61.073483
2017-02-09    61.444187
2017-02-10    61.536865
Name: close, dtype: float32
# 获取2017-02-06到2017-02-09的所有数据
df.loc['20170206':'20170209']
date	instrument	open	high	low	close
2017-02-06	000005.SZA	60.517429	60.795456	60.146725	60.795456
2017-02-07	000005.SZA	60.795456	61.629539	60.424751	61.166161
2017-02-08	000005.SZA	60.980808	61.166161	60.424751	61.073483
2017-02-09	000005.SZA	61.073483	61.536865	60.888130	61.444187
# 获取2017-02-06的所有数据
df.loc['2017-02-06':'2017-02-06']
date instrument open high low close
2017-02-06 000005.SZA 60.517429 60.795456 60.146725 60.795456
# 获取2017-02-06的开盘价和收盘价数据
df.loc['2017-02-06':'2017-02-06'][['open','close']]
date open close
2017-02-06 60.517429 60.795456
# 获取2017-02-06和2017-02-09间断两天的开盘价和收盘价数据
df.loc['2017-02-06':'2017-02-09'][['open','close']]
date open close
2017-02-06 60.517429 60.795456
2017-02-07 60.795456 61.166161
2017-02-08 60.980808 61.073483
2017-02-09 61.073483 61.444187
# 获取2017-02-06的开盘价,返回DataFrame
df.loc['2017-02-06':'2017-02-06'][['open']]
date	open
2017-02-06	60.517429
# 获取2017-02-06的收盘价,返回标量
df.loc['2017-02-06','close']
60.517429

布尔索引

# 获取最高价大于64的交易日数据
df[df.high > 64]
date instrument open high low close
2017-01-06 000005.SZA 64.502495 66.077988 64.224464 65.151230
2017-01-09 000005.SZA 64.502495 65.892639 64.409821 65.521935
2017-01-10 000005.SZA 65.521935 65.614609 64.965874 64.965874
2017-01-11 000005.SZA 64.965874 64.965874 63.575737 63.575737
2017-01-18 000005.SZA 60.888130 64.687851 60.517429 60.702782

where 索引

# 选出大于60的元素
df[df > 60]
date instrument open high low close
2017-01-06 000005.SZA 64.502495 66.077988 64.224464 65.151230
2017-01-09 000005.SZA 64.502495 65.892639 64.409821 65.521935
2017-01-10 000005.SZA 65.521935 65.614609 64.965874 64.965874
2017-01-11 000005.SZA 64.965874 64.965874 63.575737 63.575737
2017-01-12 000005.SZA 63.205032 63.761089 62.741653 62.927006
2017-01-13 000005.SZA 62.927006 63.205032 61.907570 61.907570
2017-01-16 000005.SZA 62.185596 62.278271 NaN NaN
2017-01-17 000005.SZA NaN 60.424751 NaN 60.054047
2017-01-18 000005.SZA 60.888130 64.687851 60.517429 60.702782
2017-01-19 000005.SZA NaN 60.980808 NaN NaN
2017-01-20 000005.SZA 60.239399 60.610104 NaN 60.239399
2017-01-23 000005.SZA 60.332077 60.888130 60.146725 60.517429
2017-01-24 000005.SZA 60.610104 60.610104 60.239399 60.332077
2017-01-25 000005.SZA 60.424751 60.517429 NaN 60.424751
2017-01-26 000005.SZA 60.517429 60.795456 60.424751 60.795456
2017-02-03 000005.SZA 60.888130 60.888130 60.239399 60.517429
2017-02-06 000005.SZA 60.517429 60.795456 60.146725 60.795456
2017-02-07 000005.SZA 60.795456 61.629539 60.424751 61.166161
2017-02-08 000005.SZA 60.980808 61.166161 60.424751 61.073483
2017-02-09 000005.SZA 61.073483 61.536865 60.888130 61.444187
2017-02-10 000005.SZA 61.629539 61.629539 61.258835 61.536865

丢弃数据

# 丢弃行
df.drop([pd.Timestamp('2017-01-06')]) # 丢弃行
date instrument open high low close
2017-01-09 000005.SZA 64.502495 65.892639 64.409821 65.521935
2017-01-10 000005.SZA 65.521935 65.614609 64.965874 64.965874
2017-01-11 000005.SZA 64.965874 64.965874 63.575737 63.575737
2017-01-12 000005.SZA 63.205032 63.761089 62.741653 62.927006
2017-01-13 000005.SZA 62.927006 63.205032 61.907570 61.907570
2017-01-16 000005.SZA 62.185596 62.278271 56.810387 59.683346
2017-01-17 000005.SZA 59.497993 60.424751 58.385880 60.054047
2017-01-18 000005.SZA 60.888130 64.687851 60.517429 60.702782
2017-01-19 000005.SZA 59.776020 60.980808 59.497993 59.683346
2017-01-20 000005.SZA 60.239399 60.610104 59.776020 60.239399
2017-01-23 000005.SZA 60.332077 60.888130 60.146725 60.517429
2017-01-24 000005.SZA 60.610104 60.610104 60.239399 60.332077
2017-01-25 000005.SZA 60.424751 60.517429 59.961372 60.424751
2017-01-26 000005.SZA 60.517429 60.795456 60.424751 60.795456
2017-02-03 000005.SZA 60.888130 60.888130 60.239399 60.517429
2017-02-06 000005.SZA 60.517429 60.795456 60.146725 60.795456
2017-02-07 000005.SZA 60.795456 61.629539 60.424751 61.166161
2017-02-08 000005.SZA 60.980808 61.166161 60.424751 61.073483
2017-02-09 000005.SZA 61.073483 61.536865 60.888130 61.444187
2017-02-10 000005.SZA 61.629539 61.629539 61.258835 61.536865
# 丢弃列 ,del df['open'],原地修改
df.drop('open',axis=1)  # 非原地修改
date instrument high low close
2017-01-06 000005.SZA 66.077988 64.224464 65.151230
2017-01-09 000005.SZA 65.892639 64.409821 65.521935
2017-01-10 000005.SZA 65.614609 64.965874 64.965874
2017-01-11 000005.SZA 64.965874 63.575737 63.575737
2017-01-12 000005.SZA 63.761089 62.741653 62.927006
2017-01-13 000005.SZA 63.205032 61.907570 61.907570
2017-01-16 000005.SZA 62.278271 56.810387 59.683346
2017-01-17 000005.SZA 60.424751 58.385880 60.054047
2017-01-18 000005.SZA 64.687851 60.517429 60.702782
2017-01-19 000005.SZA 60.980808 59.497993 59.683346
2017-01-20 000005.SZA 60.610104 59.776020 60.239399
2017-01-23 000005.SZA 60.888130 60.146725 60.517429
2017-01-24 000005.SZA 60.610104 60.239399 60.332077
2017-01-25 000005.SZA 60.517429 59.961372 60.424751
2017-01-26 000005.SZA 60.795456 60.424751 60.795456
2017-02-03 000005.SZA 60.888130 60.239399 60.517429
2017-02-06 000005.SZA 60.795456 60.146725 60.795456
2017-02-07 000005.SZA 61.629539 60.424751 61.166161
2017-02-08 000005.SZA 61.166161 60.424751 61.073483
2017-02-09 000005.SZA 61.536865 60.888130 61.444187
2017-02-10 000005.SZA 61.629539 61.258835 61.536865

对齐、合并、赋值

sql=""WITH df1 AS (
    SELECT 
        date, 
        open, 
        high, 
        low, 
        close
    FROM 
        bar1d_CN_STOCK_A
    WHERE 
        instrument = '000005.SZA'
        AND date BETWEEN '2017-02-01' AND '2017-02-10'
) , df2 AS (
    SELECT 
        date, 
        market_cap, 
        pe_ttm
    FROM 
        market_value_CN_STOCK_A
    WHERE 
        instrument = '000005.SZA'
        AND date BETWEEN '2017-01-01' AND '2017-02-10'
)""
# 取索引的交集进行合并
df1.merge(df2,left_index=True,right_index=True,how='inner')
date instrument_x open high low close instrument_y market_cap pe_ttm
2017-02-03 000005.SZA 60.888130 60.888130 60.239399 60.517429 000005.SZA 6.912246e+09 63.786068
2017-02-06 000005.SZA 60.517429 60.795456 60.146725 60.795456 000005.SZA 6.944002e+09 64.079109
2017-02-07 000005.SZA 60.795456 61.629539 60.424751 61.166161 000005.SZA 6.986343e+09 64.469841
2017-02-08 000005.SZA 60.980808 61.166161 60.424751 61.073483 000005.SZA 6.975758e+09 64.372154
2017-02-09 000005.SZA 61.073483 61.536865 60.888130 61.444187 000005.SZA 7.018099e+09 64.762886
2017-02-10 000005.SZA 61.629539 61.629539 61.258835 61.536865 000005.SZA 7.028685e+09 64.860565
# 取索引的并集进行合并
df1.merge(df2,left_index=True,right_index=True,how='outer')
date instrument_x open high low close instrument_y market_cap pe_ttm
2017-01-03 NaN NaN NaN NaN NaN 000005.SZA 7.229807e+09 66.716515
2017-01-04 NaN NaN NaN NaN NaN 000005.SZA 7.420343e+09 68.474785
2017-01-05 NaN NaN NaN NaN NaN 000005.SZA 7.378002e+09 68.084053
2017-01-06 NaN NaN NaN NaN NaN 000005.SZA 7.441514e+09 68.670143
2017-01-09 NaN NaN NaN NaN NaN 000005.SZA 7.483855e+09 69.060875
2017-01-10 NaN NaN NaN NaN NaN 000005.SZA 7.420343e+09 68.474785
2017-01-11 NaN NaN NaN NaN NaN 000005.SZA 7.261563e+09 67.009560
2017-01-12 NaN NaN NaN NaN NaN 000005.SZA 7.187465e+09 66.325790
2017-01-13 NaN NaN NaN NaN NaN 000005.SZA 7.071026e+09 65.251289
2017-01-16 NaN NaN NaN NaN NaN 000005.SZA 6.816977e+09 62.906933
2017-01-17 NaN NaN NaN NaN NaN 000005.SZA 6.859319e+09 63.297661
2017-01-18 NaN NaN NaN NaN NaN 000005.SZA 6.933416e+09 63.981430
2017-01-19 NaN NaN NaN NaN NaN 000005.SZA 6.816977e+09 62.906933
2017-01-20 NaN NaN NaN NaN NaN 000005.SZA 6.880489e+09 63.493023
2017-01-23 NaN NaN NaN NaN NaN 000005.SZA 6.912246e+09 63.786068
2017-01-24 NaN NaN NaN NaN NaN 000005.SZA 6.891075e+09 63.590702
2017-01-25 NaN NaN NaN NaN NaN 000005.SZA 6.901660e+09 63.688385
2017-01-26 NaN NaN NaN NaN NaN 000005.SZA 6.944002e+09 64.079109
2017-02-03 000005.SZA 60.888130 60.888130 60.239399 60.517429 000005.SZA 6.912246e+09 63.786068
2017-02-06 000005.SZA 60.517429 60.795456 60.146725 60.795456 000005.SZA 6.944002e+09 64.079109
2017-02-07 000005.SZA 60.795456 61.629539 60.424751 61.166161 000005.SZA 6.986343e+09 64.469841
2017-02-08 000005.SZA 60.980808 61.166161 60.424751 61.073483 000005.SZA 6.975758e+09 64.372154
2017-02-09 000005.SZA 61.073483 61.536865 60.888130 61.444187 000005.SZA 7.018099e+09 64.762886
2017-02-10 000005.SZA 61.629539 61.629539 61.258835 61.536865 000005.SZA 7.028685e+09 64.860565
# 通过标签赋值
df1.loc['2017-02-03','market_cap'] = 0
date instrument open high low close market_cap
2017-02-03 000005.SZA 60.888130 60.888130 60.239399 60.517429 0.0
2017-02-06 000005.SZA 60.517429 60.795456 60.146725 60.795456 NaN
2017-02-07 000005.SZA 60.795456 61.629539 60.424751 61.166161 NaN
2017-02-08 000005.SZA 60.980808 61.166161 60.424751 61.073483 NaN
2017-02-09 000005.SZA 61.073483 61.536865 60.888130 61.444187 NaN
2017-02-10 000005.SZA 61.629539 61.629539 61.258835 61.536865 NaN
# 通过位置赋值
df1.iat[0,0] ='600519.SHA'
df1
date instrument open high low close market_cap
2017-02-03 600519.SHA 60.888130 60.888130 60.239399 60.517429 0.0
2017-02-06 000005.SZA 60.517429 60.795456 60.146725 60.795456 NaN
2017-02-07 000005.SZA 60.795456 61.629539 60.424751 61.166161 NaN
2017-02-08 000005.SZA 60.980808 61.166161 60.424751 61.073483 NaN
2017-02-09 000005.SZA 61.073483 61.536865 60.888130 61.444187 NaN
2017-02-10 000005.SZA 61.629539 61.629539 61.258835 61.536865 NaN
# 通过where操作来赋值
df1[df1['market_cap'] == 0] = 111
df1.head(4)
date instrument open high low close market_cap
2017-02-03 111 111.000000 111.000000 111.000000 111.000000 111.0
2017-02-06 000005.SZA 60.517429 60.795456 60.146725 60.795456 NaN
2017-02-07 000005.SZA 60.795456 61.629539 60.424751 61.166161 NaN
2017-02-08 000005.SZA 60.980808 61.166161 60.424751 61.073483 NaN

缺失值处理

在pandas中,用np.nan来代表缺失值,这些值默认不会参与运算。

# 通过set_index设置某一列为索引
sql=""WITH df2 AS(
SELECT date, open, high, low, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000005.SZA'
  AND date BETWEEN '2016-11-15' AND '2016-12-01'
ORDER BY date;)""
df2
date instrument open high low close
2016-11-15 000005.SZA 69.785027 70.619110 69.507004 70.526436
2016-11-16 000005.SZA 71.360519 71.545876 69.785027 70.155731
2016-11-17 000005.SZA 69.970383 70.155731 69.136299 69.970383
2016-11-18 000005.SZA 69.970383 70.433762 69.321648 70.063057
2016-11-21 000005.SZA 69.970383 70.711784 69.692352 69.877701
2016-11-22 000005.SZA 69.970383 70.619110 69.692352 70.341087
2016-11-23 000005.SZA 70.433762 70.897141 69.785027 70.526436
2016-11-24 000005.SZA 70.155731 70.619110 69.507004 69.970383
2016-11-25 000005.SZA 69.970383 70.433762 68.672913 70.063057
2016-11-28 000005.SZA 70.433762 71.638550 69.785027 70.711784
2016-11-29 000005.SZA 70.433762 71.082489 69.228973 69.321648
2016-11-30 000005.SZA 69.043617 69.599678 68.580238 68.765594
2016-12-01 000005.SZA 68.950943 69.321648 68.672913 69.043617
# 通过resset_index来将DataFrame的索引转化为列
df2.reset_index()
date instrument open high low close
0 2016-11-15 000005.SZA 69.785027 70.619110 69.507004 70.526436
1 2016-11-16 000005.SZA 71.360519 71.545876 69.785027 70.155731
2 2016-11-17 000005.SZA 69.970383 70.155731 69.136299 69.970383
3 2016-11-18 000005.SZA 69.970383 70.433762 69.321648 70.063057
4 2016-11-21 000005.SZA 69.970383 70.711784 69.692352 69.877701
5 2016-11-22 000005.SZA 69.970383 70.619110 69.692352 70.341087
6 2016-11-23 000005.SZA 70.433762 70.897141 69.785027 70.526436
7 2016-11-24 000005.SZA 70.155731 70.619110 69.507004 69.970383
8 2016-11-25 000005.SZA 69.970383 70.433762 68.672913 70.063057
9 2016-11-28 000005.SZA 70.433762 71.638550 69.785027 70.711784
10 2016-11-29 000005.SZA 70.433762 71.082489 69.228973 69.321648
11 2016-11-30 000005.SZA 69.043617 69.599678 68.580238 68.765594
12 2016-12-01 000005.SZA 68.950943 69.321648 68.672913 69.043617
# 将2016-11-22以后的数据用缺失值代替
df2.loc['2016-11-22':,:] = np.nan
df2
date instrument open high low close
2016-11-15 000005.SZA 69.785027 70.619110 69.507004 70.526436
2016-11-16 000005.SZA 71.360519 71.545876 69.785027 70.155731
2016-11-17 000005.SZA 69.970383 70.155731 69.136299 69.970383
2016-11-18 000005.SZA 69.970383 70.433762 69.321648 70.063057
2016-11-21 000005.SZA 69.970383 70.711784 69.692352 69.877701
2016-11-22 NaN NaN NaN NaN NaN
2016-11-23 NaN NaN NaN NaN NaN
2016-11-24 NaN NaN NaN NaN NaN
2016-11-25 NaN NaN NaN NaN NaN
2016-11-28 NaN NaN NaN NaN NaN
2016-11-29 NaN NaN NaN NaN NaN
2016-11-30 NaN NaN NaN NaN NaN
2016-12-01 NaN NaN NaN NaN NaN
# 剔除所有包含缺失值的行数据
df2.dropna(how='any')
date instrument open high low close
2016-11-15 000005.SZA 69.785027 70.619110 69.507004 70.526436
2016-11-16 000005.SZA 71.360519 71.545876 69.785027 70.155731
2016-11-17 000005.SZA 69.970383 70.155731 69.136299 69.970383
2016-11-18 000005.SZA 69.970383 70.433762 69.321648 70.063057
2016-11-21 000005.SZA 69.970383 70.711784 69.692352 69.877701
# 填充缺失值
df2.fillna(value=154)
date instrument open high low close
2016-11-15 000005.SZA 69.785027 70.619110 69.507004 70.526436
2016-11-16 000005.SZA 71.360519 71.545876 69.785027 70.155731
2016-11-17 000005.SZA 69.970383 70.155731 69.136299 69.970383
2016-11-18 000005.SZA 69.970383 70.433762 69.321648 70.063057
2016-11-21 000005.SZA 69.970383 70.711784 69.692352 69.877701
2016-11-22 154 154.000000 154.000000 154.000000 154.000000
2016-11-23 154 154.000000 154.000000 154.000000 154.000000
2016-11-24 154 154.000000 154.000000 154.000000 154.000000
2016-11-25 154 154.000000 154.000000 154.000000 154.000000
2016-11-28 154 154.000000 154.000000 154.000000 154.000000
2016-11-29 154 154.000000 154.000000 154.000000 154.000000
2016-11-30 154 154.000000 154.000000 154.000000 154.000000
2016-12-01 154 154.000000 154.000000 154.000000 154.000000
# 获取值是否为nan的布尔标记
pd.isnull(df2)
date instrument open high low close
2016-11-15 False False False False False
2016-11-16 False False False False False
2016-11-17 False False False False False
2016-11-18 False False False False False
2016-11-21 False False False False False
2016-11-22 True True True True True
2016-11-23 True True True True True
2016-11-24 True True True True True
2016-11-25 True True True True True
2016-11-28 True True True True True
2016-11-29 True True True True True
2016-11-30 True True True True True
2016-12-01 True True True True True

运算

# describe()能对数据做一个快速统计汇总
df2.describe()
open high low close
count 5.000000 5.000000 5.000000 5.000000
mean 70.211342 70.693253 69.488464 70.118668
std 0.647406 0.521793 0.265382 0.250397
min 69.785027 70.155731 69.136299 69.877701
25% 69.970383 70.433762 69.321648 69.970383
50% 69.970383 70.619110 69.507004 70.063057
75% 69.970383 70.711784 69.692352 70.155731
max 71.360519 71.545876 69.785027 70.526436

运算过程中,通常不包含缺失值。

# 按列计算平均值
df2.mean()
open     70.211342
high     70.693253
low      69.488464
close    70.118668
dtype: float32
# 按行计算平均值
df2.mean(1)
date
2016-11-15    70.109398
2016-11-16    70.711792
2016-11-17    69.808197
2016-11-18    69.947212
2016-11-21    70.063049
2016-11-22          NaN
2016-11-23          NaN
2016-11-24          NaN
2016-11-25          NaN
2016-11-28          NaN
2016-11-29          NaN
2016-11-30          NaN
2016-12-01          NaN
dtype: float32
# pandas会按照指定的索引对齐
dates = ['2016-11-15','2016-11-16','2016-11-17','2016-11-18','2016-11-23']
for i in range(len(dates)):
    dates[i] = pd.Timestamp(dates[i])
pd.DataFrame(df2,index=dates)
instrument open high low close
2016-11-15 000005.SZA 69.785027 70.619110 69.507004 70.526436
2016-11-16 000005.SZA 71.360519 71.545876 69.785027 70.155731
2016-11-17 000005.SZA 69.970383 70.155731 69.136299 69.970383
2016-11-18 000005.SZA 69.970383 70.433762 69.321648 70.063057
2016-11-23 NaN NaN NaN NaN NaN

Apply 函数

# 通过apply函数求每个交易日高开低收价格的平均值
df3 = df2[['open','high','low','close']]
df3.apply(np.mean,axis=1)
date
2016-11-15    70.109398
2016-11-16    70.711792
2016-11-17    69.808197
2016-11-18    69.947212
2016-11-21    70.063049
2016-11-22          NaN
2016-11-23          NaN
2016-11-24          NaN
2016-11-25          NaN
2016-11-28          NaN
2016-11-29          NaN
2016-11-30          NaN
2016-12-01          NaN
dtype: float64
# 计算每日股价振幅,振幅=最高价-最低价
df3.apply(lambda x:x.max()-x.min())
open     1.575493
high     1.390144
low      0.648727
close    0.648735
dtype: float64
# 通过apply对各个元素调用字符串方法,比如将股票代码的后缀改成小写
temp = df2.ix['2016-11-15':'2016-11-21']
temp['instrument'].apply(lambda x : x.lower())
date
2016-11-15    000005.sza
2016-11-16    000005.sza
2016-11-17    000005.sza
2016-11-18    000005.sza
2016-11-21    000005.sza
Name: instrument,
dtype: object

rolling_apply的使用

# 获取数据
sql=""WITH data AS(
SELECT date, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000001.SZA'
  AND date BETWEEN '2016-01-01' AND '2016-10-01'
ORDER BY date;)""
# 计算5日均线和20日均线
data['MA_' + str(5)] = data['close'].rolling(5).mean()
data['MA_' + str(20)] = data['close'].rolling(20).mean()
data.head(12)
instrument close MA_5 MA_20
date
2016-01-04 000001.SZA 974.312988 NaN NaN
2016-01-05 000001.SZA 980.332520 NaN NaN
2016-01-06 000001.SZA 991.511780 NaN NaN
2016-01-07 000001.SZA 940.775269 NaN NaN
2016-01-08 000001.SZA 956.254211 968.637354 NaN
2016-01-11 000001.SZA 925.296326 958.834021 NaN
2016-01-12 000001.SZA 929.596008 948.686719 NaN
2016-01-13 000001.SZA 920.996643 934.583691 NaN
2016-01-14 000001.SZA 926.156250 931.659888 NaN
2016-01-15 000001.SZA 899.498108 920.308667 NaN
2016-01-18 000001.SZA 895.198425 914.289087 NaN
2016-01-19 000001.SZA 920.996643 912.569214 NaN

合并

Concat 连接

# 通过concat()来连接pandas对象
a = df2.loc['2016-11-15':'2016-11-16']
b = df2.loc['2016-11-20':'2016-11-26']
pd.concat([a,b])
date instrument open high low close
2016-11-15 000005.SZA 69.785027 70.619110 69.507004 70.526436
2016-11-16 000005.SZA 71.360519 71.545876 69.785027 70.155731
2016-11-21 000005.SZA 69.970383 70.711784 69.692352 69.877701
2016-11-22 NaN NaN NaN NaN NaN
2016-11-23 NaN NaN NaN NaN NaN
2016-11-24 NaN NaN NaN NaN NaN
2016-11-25 NaN NaN NaN NaN NaN

Join 合并

# 合并,和merge功能类似
sql=""WITH df5 AS (
    SELECT 
        date, 
        open, 
        high, 
        low, 
        close
    FROM 
        bar1d_CN_STOCK_A
    WHERE 
        instrument = '000005.SZA'
        AND date BETWEEN '2017-02-01' AND '2017-02-10'
) , df6 AS (
    SELECT 
        date, 
        market_cap, 
        pe_ttm
    FROM 
        market_value_CN_STOCK_A
    WHERE 
        instrument = '000005.SZA'
        AND date BETWEEN '2017-01-01' AND '2017-02-10'
)""

# 删除股票代码列
del df5['instrument']
del df6['instrument']

# 按日期轴合并数据
df5.join(df6)
date open high low close market_cap pe_ttm
2017-02-03 60.888130 60.888130 60.239399 60.517429 6.912246e+09 63.786068
2017-02-06 60.517429 60.795456 60.146725 60.795456 6.944002e+09 64.079109
2017-02-07 60.795456 61.629539 60.424751 61.166161 6.986343e+09 64.469841
2017-02-08 60.980808 61.166161 60.424751 61.073483 6.975758e+09 64.372154
2017-02-09 61.073483 61.536865 60.888130 61.444187 7.018099e+09 64.762886
2017-02-10 61.629539 61.629539 61.258835 61.536865 7.028685e+09 64.860565


Append 添加

# 通过append方法,将若干行添加到dataFrame后面
c = df1.loc['2017-02-03':'2017-02-06']
d = df1.loc['2017-02-08':'2017-02-10']
c.append(d)
date instrument open high low close market_cap
2017-02-03 111 111.000000 111.000000 111.000000 111.000000 111.0
2017-02-06 000005.SZA 60.517429 60.795456 60.146725 60.795456 NaN
2017-02-08 000005.SZA 60.980808 61.166161 60.424751 61.073483 NaN
2017-02-09 000005.SZA 61.073483 61.536865 60.888130 61.444187 NaN
2017-02-10 000005.SZA 61.629539 61.629539 61.258835 61.536865 NaN

数据透视表

也许大多数人都有在Excel中使用数据透视表的经历,其实Pandas也提供了一个类似的功能,名为pivot_table

df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})
df
A \n B C D E
0 one A foo -2.216199 0.288395
1 one B foo 1.572496 0.159725
2 two C foo -0.559229 0.204685
3 three A bar 2.044115 -0.032094
4 one B bar 1.022778 1.650399
5 one C bar -0.558943 1.074547
6 two A foo -0.147329 0.150598
7 three B foo 0.659760 1.779248
8 one C foo 0.506593 -0.343586
9 one A bar 0.941787 -0.277070
10 two B bar 1.662566 -0.558867
11 three C bar -1.671826 -0.939832
# 我们可以轻松得到数据透视表
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
\n C bar foo
A B
one A 0.941787 -2.216199
B 1.022778 1.572496
C -0.558943 0.506593
three A 2.044115 NaN
B NaN 0.659760
C -1.671826 NaN
two A NaN -0.147329
B 1.662566 NaN
C NaN -0.559229
# 查看每个交易日多只股票的收盘价数据(每列为股票)
df = D.history_data(['000001.SZA','000002.SZA'],'2016-12-15','2017-01-01',fields=['close'],groupped_by_instrument=False)
pd.pivot_table(df,values='close',index=['date'],columns=['instrument'])
instrument 000001.SZA 000002.SZA
date
2016-12-15 969.013855 2965.032471
2016-12-16 969.013855 2982.294189
2016-12-19 963.775940 2801.710205
2016-12-20 954.347656 2699.467529
2016-12-21 959.585571 2719.385010
2016-12-22 957.490417 2736.646729
2016-12-23 951.204956 2695.484131
2016-12-26 955.395264 2741.958008
2016-12-27 951.204956 2844.200439
2016-12-28 949.109802 2814.988281
2016-12-29 951.204956 2767.186768
2016-12-30 953.300110 2728.679688
# 查看不同股票每个交易日的收盘价数据(每列为交易日)
pd.pivot_table(df,values = 'close',index=['instrument'],columns=['date'])
date 2016-12-15 00 :00 :00 2016-12-16 00 :00 :00 2016-12-19 00 :00 :00 2016-12-20 00 :00 :00 2016-12-21 00 :00 :00 2016-12-22 00 :00 :00 2016-12-23 00 :00 :00 2016-12-26 00 :00 :00 2016-12-27 00 :00 :00 2016-12-28 00 :00 :00 2016-12-29 00 :00 :00 2016-12-30 00 :00 :00
instrument
000001.SZA 969.013855 969.013855 963.775940 954.347656 959.585571 957.490417 951.204956 955.395264 951.204956 949.109802 951.204956 953.300110
000002.SZA 2965.032471 2982.294189 2801.710205 2699.467529 2719.385010 2736.646729 2695.484131 2741.958008 2844.200439 2814.988281 2767.186768 2728.679688

分组

对于“group by”操作,我们通常是指以下一个或几个步骤: * 划分 按照某些标准将数据分为不同的组 * 应用 对每组数据分别执行一个函数 * 组合 将结果组合到一个数据结构

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'bar'],
                   'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df
A B C D
0 foo one -0.160349 -0.086323
1 bar one -0.994189 -1.444457
2 foo two -0.095365 0.227690
3 bar three 1.502585 -0.286937
4 foo two -0.258329 -0.123057
5 bar two 0.944706 -0.211243
6 foo one 1.542726 -0.164639
7 bar three 0.874764 -0.075149
# 分组并对每个分组应用sum函数
df.groupby('A').sum()
A C D
bar 2.327866 -2.017786
foo 1.028683 -0.146328
# 按多个列分组形成层级索引,然后应用函数
df.groupby(['A','B']).sum()
A B C D
bar one -0.994189 -1.444457
three 2.377349 -0.362086
two 0.944706 -0.211243
foo one 1.382377 -0.250962
two -0.353694 0.104634
# 通过groupby函数来计算各股票和大盘指数的相关系数(按年划分)

# 获取数据
stocks = ['000005.SZA','000010.SZA','603998.SHA','603898.SHA']
sql=""WITH df AS(
SELECT instrument, date, close
FROM bar1d_CN_STOCK_A
WHERE instrument IN ('000005.SZA','000010.SZA','603998.SHA','603898.SHA')
  AND date BETWEEN '2010-01-01' AND '2016-12-31'
ORDER BY instrument, date;)""

# 每列转化为每日多只股票的收盘价数据
df = pd.pivot_table(df,values='close',index=['date'],columns=['instrument'])

# 计算日收益率
rets = df.pct_change().dropna()

# 定义求相关系数的函数
corr_func = lambda x:x.corrwith(x['000005.SZA'])

# 求每只股票和大盘相关性
by_year = rets.groupby(lambda x:x.year)
by_year.apply(corr_func)
instrument 000005.SZA 000010.SZA 603898.SHA 603998.SHA
2015 1.0 0.628609 0.178597 0.523903
2016 1.0 0.643711 0.627512 0.548929

时间序列

pandas在对频率转换进行重新采样时拥有着简单,强大而且高效的功能(例如把按秒采样的数据转换为按5分钟采样的数据)。这在金融领域很常见,但又不限于此。

# 按秒产生5000个时间、起始时间是2017-01-01的时间序列
rng = pd.date_range('1/1/2017', periods=5000, freq='S')
rng
DatetimeIndex(['2017-01-01 00 :00 :00 ', '2017-01-01 00 :00 :01 ',
​               '2017-01-01 00 :00 :02 ', '2017-01-01 00 :00 :03 ',
​               '2017-01-01 00 :00 :04 ', '2017-01-01 00 :00 :05 ',
​               '2017-01-01 00 :00 :06 ', '2017-01-01 00 :00 :07 ',
​               '2017-01-01 00 :00 :08 ', '2017-01-01 00 :00 :09 ',
​               ...
​               '2017-01-01 01 :23 :10 ', '2017-01-01 01 :23 :11 ',
​               '2017-01-01 01 :23 :12 ', '2017-01-01 01 :23 :13 ',
​               '2017-01-01 01 :23 :14 ', '2017-01-01 01 :23 :15 ',
​               '2017-01-01 01 :23 :16 ', '2017-01-01 01 :23 :17 ',
​               '2017-01-01 01 :23 :18 ', '2017-01-01 01 :23 :19 '],
​              dtype='datetime64[ns]', length=5000, freq='S')
# 重采样,计算每五分钟的总和
ts = pd.Series(np.random.randint(0,5000,len(rng)), index=rng)
ts.resample('5Min', how='sum')
2017-01-01 00:00:00    765477
2017-01-01 00:05:00    755782
2017-01-01 00:10:00    759474
2017-01-01 00:15:00    738611
2017-01-01 00:20:00    782165
2017-01-01 00:25:00    741331
2017-01-01 00:30:00    771220
2017-01-01 00:35:00    790782
2017-01-01 00:40:00    739392
2017-01-01 00:45:00    800815
2017-01-01 00:50:00    725954
2017-01-01 00:55:00    725840
2017-01-01 01:00:00    739548
2017-01-01 01:05:00    756409
2017-01-01 01:10:00    727530
2017-01-01 01:15:00    776613
2017-01-01 01:20:00    522169
Freq: 5T, dtype: int64
# 频率转化,将日K线数据装换成周K
sql=""WITH df AS(
SELECT date, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000001.SZA'
  AND date BETWEEN '2016-11-01' AND '2016-12-31'
ORDER BY date;)""
df.resample('W').ohlc()
close
date open high low close
2016-11-06 957.490417 957.490417 950.157349 954.347656
2016-11-13 955.395264 961.680786 950.157349 961.680786
2016-11-20 965.871094 966.918701 961.680786 961.680786
2016-11-27 967.966248 1007.774414 967.966248 1007.774414
2016-12-04 1008.821960 1008.821960 1000.441345 1000.441345
2016-12-11 991.013062 1010.917114 991.013062 1010.917114
2016-12-18 995.203430 995.203430 969.013855 969.013855
2016-12-25 963.775940 963.775940 951.204956 951.204956
2017-01-01 955.395264 955.395264 949.109802 953.300110

绘图

数据可视化:可交互图表 (文档: 数据可视化)

# 绘制沪深300走势图
history_hs300 = DataSource('bar1d_CN_STOCK_A').read(instruments = ['000300.HIX'], start_date='2016-01-01', end_date='2017-01-01', fields=['close'])
T.plot(history_hs300[['date', 'close']].set_index('date'), chart_type='spline')

{w:100}

# 绘制多只股票走势图
sql="" WITH data AS(
SELECT instrument, date, close
FROM bar1d_CN_STOCK_A
WHERE instrument IN ('000005.SZA', '000010.SZA', '603998.SHA', '603898.SHA')
  AND date BETWEEN '2016-01-01' AND '2017-01-01'
ORDER BY instrument, date;)""
data = pd.pivot_table(data,values='close',index=['date'],columns=['instrument'])
T.plot(data, chart_type='spline')

{w:100}

标签

函数pandas
评论
  • 谢谢,受教了!
  • 为什么DataSource本地运行直接报错,只能在平台上写代码???
  • api函数也不对
{link}