In [2]:
import pandas as pd
import numpy as np
from biglearning.module2.common.data import Outputs
from zipline.finance.commission import PerOrder
import os
from bigdatasource.api import DataSource
from biglearning.api import M
from biglearning.api import tools as T

import warnings
warnings.filterwarnings('ignore')


from joblib import Parallel, delayed
In [3]:
import fai
# connect to and initialize fai
fai.init(cluster="fai-anthonywan-cptlfirs.fai-anthonywan",token="1VhioLNOp7BgqtDJAMsa4NuPwi7wGihZ")
In [4]:
sd = '2022-01-01'
ed = '2023-05-09'

3.1 Definition of the "tide" (潮汐)

We define the "rising tide" (涨潮) and "ebbing tide" (退潮) from the highs and lows of a stock's minute-frequency trading volume, as follows:

1) Drop the opening and closing data and use only the intraday minute bars. To reduce the impact of isolated outliers, we first compute, for each minute, the sum of that minute's volume and the volumes of the 4 minutes before and after it (9 minutes in total), called that minute's "neighborhood volume".

2) Suppose the neighborhood volume reaches its maximum at minute t; this minute is called the "peak moment".

3) Within minutes 5 to t-1, the neighborhood volume reaches its minimum at minute m, where the neighborhood volume is Vm and the close is Cm; this minute is the "rising-tide moment", and the move from the rising-tide moment to the peak moment is a "rising tide".

4) Within minutes t+1 to 233, the neighborhood volume reaches its minimum at minute n, where the neighborhood volume is Vn and the close is Cn; this minute is the "ebbing-tide moment", and the move from the peak moment to the ebbing-tide moment is an "ebbing tide".

5) The whole process from the rising-tide moment to the ebbing-tide moment is one "tide".
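In symbols (a restatement of steps 1)–5) above, writing $V_t$ and $C_t$ for the volume and close of intraday minute $t$):

$$
\bar V_t = \sum_{k=-4}^{4} V_{t+k},\qquad
t^{*} = \arg\max_{t}\ \bar V_t,\qquad
m = \arg\min_{5 \le t \le t^{*}-1} \bar V_t,\qquad
n = \arg\min_{t^{*}+1 \le t \le 233} \bar V_t .
$$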

3.2 Price-change speed over the "tide"

We first examine the speed of price change over the "tide" process and use it to construct the "full-tide" (全潮汐) factor, as follows:

1) As defined above, the "rising-tide moment" occurs at minute m with close Cm, and the "ebbing-tide moment" occurs at minute n with close Cn.

2) The price change rate over the whole "tide" is then (Cn - Cm)/Cm.

3) The price-change speed over the whole "tide" is therefore (Cn - Cm)/Cm/(n - m), which we take as a daily proxy for how strongly investors want to buy or sell the stock.

4) We average this price-change speed over the most recent 20 trading days and call the result the "full-tide" factor.

We then run a single-factor test of the "full-tide" factor on the full A-share sample at a monthly frequency, orthogonalizing the factor against market cap and industry; the test window is April 2013 to February 2022 (same below). The factor's performance is shown below.
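Written out, the daily price-change speed on day $d$ and the 20-day "full-tide" factor are

$$
\text{speed}_d = \frac{C_n - C_m}{C_m\,(n-m)},\qquad
\text{FullTide}_d = \frac{1}{20}\sum_{i=0}^{19}\text{speed}_{d-i}.
$$

The per-day speed is what the calc_data function below stores in its 'factor' column.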

In [5]:
@fai.remote
def calc_data(instrument, sd, ed):
    import warnings
    warnings.filterwarnings('ignore')
    print(instrument)
    # 1-minute bars for a single instrument
    df = DataSource('bar1m_CN_STOCK_A').read(start_date=sd, end_date=ed, instruments=instrument)
    try:
        df_ = df.copy()
        df_['day'] = df_['date'].dt.to_period('D')
    except Exception:
        # no data returned for this instrument
        return

    def calc_one_day(df):
        df.reset_index(inplace=True, drop=True)

        # fallback values (first bar of the day), used when the volume peak sits at index 0
        vs = df.loc[0, 'volume']
        cs = df.loc[0, 'close']

        # "neighborhood volume": each minute's volume plus the volumes of the
        # 4 minutes before and after it (9 minutes in total)
        df['邻域'] = df['volume'].rolling(9, center=True).sum()

        df.dropna(inplace=True)
        df.reset_index(inplace=True, drop=True)
        idxmax = df['邻域'].idxmax()  # peak moment t

        if idxmax != 0:
            df_1 = df.loc[:idxmax]   # up to and including the peak
            df_2 = df.loc[idxmax:]   # the peak and after

            idxmin_m = df_1['邻域'].idxmin()  # rising-tide moment m
            idxmin_n = df_2['邻域'].idxmin()  # ebbing-tide moment n

            vm = df.loc[idxmin_m, 'volume']
            cm = df.loc[idxmin_m, 'close']
            vn = df.loc[idxmin_n, 'volume']
            cn = df.loc[idxmin_n, 'close']

            df['vm'] = vm
            df['cm'] = cm
            df['vn'] = vn
            df['cn'] = cn
            df['nm'] = idxmin_n - idxmin_m   # n - m

        else:
            # peak at the very first row: use the first bar as the rising-tide moment
            idxmin = df['邻域'].idxmin()

            vn = df.loc[idxmin, 'volume']
            cn = df.loc[idxmin, 'close']

            df['vm'] = vs
            df['cm'] = cs
            df['vn'] = vn
            df['cn'] = cn
            df['nm'] = idxmin

        # daily price-change speed over the tide: (Cn - Cm) / Cm / (n - m)
        df['factor'] = (df['cn'] - df['cm']) / df['cm'] / df['nm']
        return df

    try:
        df_ = df_.groupby('day').apply(calc_one_day)
        df_.reset_index(inplace=True, drop=True)
        # every row of a day carries the same factor value, so keep one row per day
        df_.drop_duplicates(subset=['day'], inplace=True)
        return df_

    except Exception:
        return
In [6]:
# use daily bars to list all A-share instruments available in the window
tmp = DataSource('bar1d_CN_STOCK_A').read(start_date=sd, end_date=ed)
ins_list = tmp.instrument.unique()
In [7]:
import time
time0 = time.time()
fai.log_silent(True)

# submit one remote task per instrument
remainings = [calc_data.remote(ins, sd, ed) for ins in ins_list]
done = 0
ready_list = []
print('Submission time:', time.time() - time0)

# collect finished tasks as they become ready
while remainings:
    ready, remainings = fai.wait(remainings)
    ready_list += ready
    done += len(ready)
    if done % 100 == 0:
        print(f"{time.time() - time0}, {done}/{len(remainings) + done}")
print('Computation time:', time.time() - time0)

time1 = time.time()
# fetch the per-instrument results and concatenate them into one DataFrame
df = pd.concat(fai.get(ready_list))
print('Subtask merge time:', time.time() - time1)
Submission time: 3.502511739730835
72.5324432849884, 100/5220
112.80881810188293, 200/5220
152.13598370552063, 300/5220
189.12619376182556, 400/5220
231.671724319458, 500/5220
269.972279548645, 600/5220
309.133460521698, 700/5220
349.521497964859, 800/5220
389.31311774253845, 900/5220
429.5872299671173, 1000/5220
470.62226843833923, 1100/5220
511.10495471954346, 1200/5220
552.7588784694672, 1300/5220
593.6794936656952, 1400/5220
631.4529659748077, 1500/5220
674.0429482460022, 1600/5220
710.2315633296967, 1700/5220
749.3886156082153, 1800/5220
790.1398203372955, 1900/5220
830.3366467952728, 2000/5220
868.4557855129242, 2100/5220
905.8010051250458, 2200/5220
945.9148850440979, 2300/5220
989.7080097198486, 2400/5220
1027.384386062622, 2500/5220
1066.4245028495789, 2600/5220
1105.1222043037415, 2700/5220
1146.8430831432343, 2800/5220
1187.1940641403198, 2900/5220
1226.5458617210388, 3000/5220
1266.569834470749, 3100/5220
1307.9403958320618, 3200/5220
1345.0098867416382, 3300/5220
1382.7026841640472, 3400/5220
1424.5511016845703, 3500/5220
1466.2177591323853, 3600/5220
1505.7667565345764, 3700/5220
1545.130047082901, 3800/5220
1586.0551328659058, 3900/5220
1627.2407739162445, 4000/5220
1664.7209911346436, 4100/5220
1704.4398725032806, 4200/5220
1746.8999288082123, 4300/5220
1787.5217778682709, 4400/5220
1827.986206293106, 4500/5220
1845.785433292389, 4600/5220
1873.5754194259644, 4700/5220
1901.2115919589996, 4800/5220
1923.469750881195, 4900/5220
1936.281195640564, 5000/5220
1944.1335709095001, 5100/5220
1948.2896678447723, 5200/5220
Computation time: 1948.9932589530945
Subtask merge time: 25.052433729171753
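
The loop above distributes one task per instrument across the fai cluster. If a cluster is not available, the same per-instrument computation could be run locally with joblib (imported at the top of the notebook but not otherwise used here). A minimal sketch, assuming calc_data_local is a hypothetical name for calc_data's body without the @fai.remote decorator:

from joblib import Parallel, delayed

# run one task per instrument on local cores instead of the fai cluster (sketch;
# calc_data_local is assumed to be calc_data without the @fai.remote decorator)
results = Parallel(n_jobs=-1)(
    delayed(calc_data_local)(ins, sd, ed) for ins in ins_list
)

# instruments with no data return None; drop them before concatenating
df = pd.concat([r for r in results if r is not None])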
In [8]:
df
Out[8]:
instrument date open close low high amount volume day 邻域 vm cm vn cn nm factor
0 000001.SZA 2022-01-04 09:35:00 16.309999 16.350000 16.309999 16.350000 13450956.0 823700 2022-01-04 14330820.0 3692400 16.459999 72300 16.340000 98 -0.000074
232 000001.SZA 2022-01-05 09:35:00 16.830000 16.820000 16.809999 16.840000 37579002.0 2233200 2022-01-05 22701552.0 1572400 16.870001 138200 17.120001 198 0.000075
464 000001.SZA 2022-01-06 09:35:00 17.049999 17.010000 17.010000 17.059999 32483309.0 1908135 2022-01-06 16526187.0 4701984 17.059999 155318 17.110001 151 0.000019
696 000001.SZA 2022-01-07 09:35:00 17.150000 17.139999 17.129999 17.170000 8717223.0 508500 2022-01-07 6913817.0 113400 17.150000 156000 17.200001 93 0.000031
928 000001.SZA 2022-01-10 09:35:00 17.389999 17.400000 17.370001 17.400000 40702587.0 2340475 2022-01-10 17570870.0 3828381 17.270000 49600 17.129999 81 -0.000100
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3944 301203.SZA 2023-04-28 09:35:00 36.110001 36.110001 36.049999 36.110001 54107.0 1500 2023-04-28 28293.0 1800 36.049999 1000 36.209999 191 0.000023
4176 301203.SZA 2023-05-04 09:35:00 36.119999 36.119999 36.119999 36.150002 65041.0 1800 2023-05-04 46257.0 8200 35.930000 0 36.110001 141 0.000036
4408 301203.SZA 2023-05-05 09:35:00 36.090000 36.090000 36.090000 36.250000 83082.0 2300 2023-05-05 16300.0 4900 36.099998 400 35.820000 135 -0.000057
4640 301203.SZA 2023-05-08 09:35:00 35.700001 35.709999 35.700001 35.860001 107120.0 3000 2023-05-08 25300.0 1200 35.799999 0 35.959999 138 0.000032
4872 301203.SZA 2023-05-09 09:35:00 35.889999 35.880001 35.880001 35.889999 81848.0 2281 2023-05-09 11540.0 0 35.959999 0 36.119999 139 0.000032

1553427 rows × 16 columns

In [9]:
# keep one factor value per instrument per day, converting the Period day back to a Timestamp
df['date'] = df['day']
df = df[['instrument','date','factor']]
df['date'] = df['date'].astype(str)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
In [10]:
df
Out[10]:
instrument date factor
0 000001.SZA 2022-01-04 -0.000074
232 000001.SZA 2022-01-05 0.000075
464 000001.SZA 2022-01-06 0.000019
696 000001.SZA 2022-01-07 0.000031
928 000001.SZA 2022-01-10 -0.000100
... ... ... ...
3944 301203.SZA 2023-04-28 0.000023
4176 301203.SZA 2023-05-04 0.000036
4408 301203.SZA 2023-05-05 -0.000057
4640 301203.SZA 2023-05-08 0.000032
4872 301203.SZA 2023-05-09 0.000032

1553427 rows × 3 columns
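The 'factor' column above is the daily price-change speed. The report's "full-tide" factor (section 3.2, step 4) is the 20-trading-day mean of this speed. A minimal sketch of that aggregation, using the hypothetical names df_20d and full_tide (the notebook itself keeps working with the daily values below):

# 20-trading-day mean of the daily speed, per instrument (sketch, not used below)
df_20d = df.sort_values(['instrument', 'date']).copy()
df_20d['full_tide'] = df_20d.groupby('instrument')['factor'].transform(
    lambda s: s.rolling(20, min_periods=20).mean()
)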

In [11]:
# instrument universe: all A-share stocks in the window
m1 = M.instruments.v2(
    start_date=sd,
    end_date=ed,
    market='CN_STOCK_A',
    instrument_list='',
    max_count=0
)

# features used downstream: close, market cap and SW level-1 industry
m2 = M.input_features.v1(
    features="""
收盘=close_0

市值=market_cap_0

行业=industry_sw_level1_0

"""
)

# extract the base features
m3 = M.general_feature_extractor.v7(
    instruments=m1.data,
    features=m2.data,
    start_date='',
    end_date='',
    before_start_days=500
)

# compute the derived features
m4 = M.derived_feature_extractor.v3(
    input_data=m3.data,
    features=m2.data,
    date_col='date',
    instrument_col='instrument',
    drop_na=False,
    remove_extra_columns=False,
    user_functions={}
)

# keep main boards and ChiNext, normal (non-ST), non-delisted stocks
m5 = M.chinaa_stock_filter.v1(
    input_data=m4.data,
    index_constituent_cond=['全部'],
    board_cond=['上证主板', '深证主板', '创业板'],
    industry_cond=['全部'],
    st_cond=['正常'],
    delist_cond=['非退市'],
    output_left_data=False
)

df_ = m5.data.read()

# attach close, market cap and industry to the daily factor values
df = pd.merge(df, df_[['instrument', 'date', '收盘', '市值', '行业']], on=['instrument', 'date'], how='left')

df.sort_values(by='date', inplace=True)
df
[2023-05-11 14:47:17.051985] INFO moduleinvoker: instruments.v2 开始运行..
[2023-05-11 14:47:17.073515] INFO moduleinvoker: 命中缓存
[2023-05-11 14:47:17.075331] INFO moduleinvoker: instruments.v2 运行完成[0.033861s].
[2023-05-11 14:47:17.082416] INFO moduleinvoker: input_features.v1 开始运行..
[2023-05-11 14:47:17.097371] INFO moduleinvoker: 命中缓存
[2023-05-11 14:47:17.099238] INFO moduleinvoker: input_features.v1 运行完成[0.016831s].
[2023-05-11 14:47:17.142849] INFO moduleinvoker: general_feature_extractor.v7 开始运行..
[2023-05-11 14:47:17.149568] INFO moduleinvoker: 命中缓存
[2023-05-11 14:47:17.151096] INFO moduleinvoker: general_feature_extractor.v7 运行完成[0.008292s].
[2023-05-11 14:47:17.171122] INFO moduleinvoker: derived_feature_extractor.v3 开始运行..
[2023-05-11 14:47:17.178547] INFO moduleinvoker: 命中缓存
[2023-05-11 14:47:17.180099] INFO moduleinvoker: derived_feature_extractor.v3 运行完成[0.008992s].
[2023-05-11 14:47:17.197896] INFO moduleinvoker: chinaa_stock_filter.v1 开始运行..
[2023-05-11 14:47:17.208025] INFO moduleinvoker: 命中缓存
[2023-05-11 14:47:17.209324] INFO moduleinvoker: chinaa_stock_filter.v1 运行完成[0.01145s].
Out[11]:
instrument date factor 收盘 市值 行业
0 000001.SZA 2022-01-04 -0.000074 1864.619995 3.233026e+11 480000.0
1149166 603013.SHA 2022-01-04 0.000063 18.726503 8.667807e+09 280000.0
168211 002032.SZA 2022-01-04 -0.000117 526.558594 4.972564e+10 330000.0
1149490 601965.SHA 2022-01-04 0.000121 31.856092 1.789526e+10 280000.0
1149814 601989.SHA 2022-01-04 0.000044 7.161360 9.690865e+10 650000.0
... ... ... ... ... ... ...
1262233 603698.SHA 2023-05-09 0.000072 20.232754 7.986251e+09 640000.0
114784 000905.SZA 2023-05-09 0.000235 20.790482 5.852878e+09 420000.0
570753 300277.SZA 2023-05-09 0.000052 40.384846 2.539300e+09 710000.0
1542230 001298.SZA 2023-05-09 0.000052 32.970001 3.165120e+09 270000.0
1553426 301203.SZA 2023-05-09 0.000032 35.849998 2.868000e+09 760000.0

1553427 rows × 6 columns
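Section 3.2 notes that the factor is orthogonalized against market cap and industry before testing; that step is not shown in this notebook. A minimal cross-sectional sketch of such a neutralization, using the merged 市值 and 行业 columns and assuming statsmodels is available (it is not imported elsewhere here; neutralize_one_day, factor_neutral and df_neutral are illustrative names):

import statsmodels.api as sm

def neutralize_one_day(g):
    # per-date cross-section: regress the factor on log market cap and industry
    # dummies, keep the residual as the neutralized factor (sketch)
    g = g.dropna(subset=['factor', '市值', '行业']).copy()
    X = pd.get_dummies(g['行业'].astype(int), prefix='ind', drop_first=True, dtype=float)
    X['log_mcap'] = np.log(g['市值'])
    X = sm.add_constant(X)
    g['factor_neutral'] = sm.OLS(g['factor'], X).fit().resid
    return g

df_neutral = df.groupby('date', group_keys=False).apply(neutralize_one_day)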

In [12]:
import plotly.graph_objs as go


# plot the probability distribution of the factor values with a Histogram trace
fig = go.Figure(data=[go.Histogram(x=df.factor, histnorm='probability')])

# set the figure title and axis labels
fig.update_layout(title='Probability Distribution', xaxis_title='Value', yaxis_title='Probability')

# show the figure
fig.show()