In [2]:
import pandas as pd
import numpy as np
from biglearning.module2.common.data import Outputs
from zipline.finance.commission import PerOrder
import os
from bigdatasource.api import DataSource
from biglearning.api import M
from biglearning.api import tools as T

import warnings
warnings.filterwarnings('ignore')


from joblib import Parallel, delayed
In [3]:
import fai
# connect to and initialize fai
fai.init(cluster="fai-anthonywan-cptlfirs.fai-anthonywan",token="1VhioLNOp7BgqtDJAMsa4NuPwi7wGihZ")
In [4]:
sd = '2022-01-01'
ed = '2023-05-09'

3.1 Definition of the "tide" (潮汐)

We define the "rising tide" (涨潮) and "ebbing tide" (退潮) from the highs and lows of a stock's minute-frequency trading volume, as follows:

1) Drop the opening and closing data and use only the intraday minute bars. To reduce the impact of isolated outliers, we first compute, for each minute, the sum of that minute's volume and the volumes of the 4 minutes before and after it (9 minutes in total), called that minute's "neighborhood volume".

2) Suppose the neighborhood volume reaches its maximum at minute t; this minute is called the "peak moment".

3) Within minutes 5 to t-1, the neighborhood volume reaches its minimum at minute m, where the neighborhood volume is Vm and the close is Cm; this minute is the "rising-tide moment", and the move from the rising-tide moment to the peak moment is a "rising tide".

4) Within minutes t+1 to 233, the neighborhood volume reaches its minimum at minute n, where the neighborhood volume is Vn and the close is Cn; this minute is the "ebbing-tide moment", and the move from the peak moment to the ebbing-tide moment is an "ebbing tide".

5) The whole process from the rising-tide moment to the ebbing-tide moment is one "tide".
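In symbols (a restatement of steps 1)–5) above, writing $V_t$ and $C_t$ for the volume and close of intraday minute $t$):

$$
\bar V_t = \sum_{k=-4}^{4} V_{t+k},\qquad
t^{*} = \arg\max_{t}\ \bar V_t,\qquad
m = \arg\min_{5 \le t \le t^{*}-1} \bar V_t,\qquad
n = \arg\min_{t^{*}+1 \le t \le 233} \bar V_t .
$$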

3.2 Price-change speed over the "tide"

We first examine the speed of price change over the "tide" process and use it to construct the "full-tide" (全潮汐) factor, as follows:

1) As defined above, the "rising-tide moment" occurs at minute m with close Cm, and the "ebbing-tide moment" occurs at minute n with close Cn.

2) The price change rate over the whole "tide" is then (Cn - Cm)/Cm.

3) The price-change speed over the whole "tide" is therefore (Cn - Cm)/Cm/(n - m), which we take as a daily proxy for how strongly investors want to buy or sell the stock.

4) We average this price-change speed over the most recent 20 trading days and call the result the "full-tide" factor.

We then run a single-factor test of the "full-tide" factor on the full A-share sample at a monthly frequency, orthogonalizing the factor against market cap and industry; the test window is April 2013 to February 2022 (same below). The factor's performance is shown below.
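Written out, the daily price-change speed on day $d$ and the 20-day "full-tide" factor are

$$
\text{speed}_d = \frac{C_n - C_m}{C_m\,(n-m)},\qquad
\text{FullTide}_d = \frac{1}{20}\sum_{i=0}^{19}\text{speed}_{d-i}.
$$

The per-day speed is what the calc_data function below stores in its 'factor' column.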

In [5]:
@fai.remote
def calc_data(instrument, sd, ed):
    import warnings
    warnings.filterwarnings('ignore')
    print(instrument)
    # 1-minute bars for a single instrument
    df = DataSource('bar1m_CN_STOCK_A').read(start_date=sd, end_date=ed, instruments=instrument)
    try:
        df_ = df.copy()
        df_['day'] = df_['date'].dt.to_period('D')
    except Exception:
        # no data returned for this instrument
        return

    def calc_one_day(df):
        df.reset_index(inplace=True, drop=True)

        # fallback values (first bar of the day), used when the volume peak sits at index 0
        vs = df.loc[0, 'volume']
        cs = df.loc[0, 'close']

        # "neighborhood volume": each minute's volume plus the volumes of the
        # 4 minutes before and after it (9 minutes in total)
        df['邻域'] = df['volume'].rolling(9, center=True).sum()

        df.dropna(inplace=True)
        df.reset_index(inplace=True, drop=True)
        idxmax = df['邻域'].idxmax()  # peak moment t

        if idxmax != 0:
            df_1 = df.loc[:idxmax]   # up to and including the peak
            df_2 = df.loc[idxmax:]   # the peak and after

            idxmin_m = df_1['邻域'].idxmin()  # rising-tide moment m
            idxmin_n = df_2['邻域'].idxmin()  # ebbing-tide moment n

            vm = df.loc[idxmin_m, 'volume']
            cm = df.loc[idxmin_m, 'close']
            vn = df.loc[idxmin_n, 'volume']
            cn = df.loc[idxmin_n, 'close']

            df['vm'] = vm
            df['cm'] = cm
            df['vn'] = vn
            df['cn'] = cn
            df['nm'] = idxmin_n - idxmin_m   # n - m

        else:
            # peak at the very first row: use the first bar as the rising-tide moment
            idxmin = df['邻域'].idxmin()

            vn = df.loc[idxmin, 'volume']
            cn = df.loc[idxmin, 'close']

            df['vm'] = vs
            df['cm'] = cs
            df['vn'] = vn
            df['cn'] = cn
            df['nm'] = idxmin

        # daily price-change speed over the tide: (Cn - Cm) / Cm / (n - m)
        df['factor'] = (df['cn'] - df['cm']) / df['cm'] / df['nm']
        return df

    try:
        df_ = df_.groupby('day').apply(calc_one_day)
        df_.reset_index(inplace=True, drop=True)
        # every row of a day carries the same factor value, so keep one row per day
        df_.drop_duplicates(subset=['day'], inplace=True)
        return df_

    except Exception:
        return
In [6]:
# use daily bars to list all A-share instruments available in the window
tmp = DataSource('bar1d_CN_STOCK_A').read(start_date=sd, end_date=ed)
ins_list = tmp.instrument.unique()
In [7]:
import time
time0 = time.time()
fai.log_silent(True)

# submit one remote task per instrument
remainings = [calc_data.remote(ins, sd, ed) for ins in ins_list]
done = 0
ready_list = []
print('Submission time:', time.time() - time0)

# collect finished tasks as they become ready
while remainings:
    ready, remainings = fai.wait(remainings)
    ready_list += ready
    done += len(ready)
    if done % 100 == 0:
        print(f"{time.time() - time0}, {done}/{len(remainings) + done}")
print('Computation time:', time.time() - time0)

time1 = time.time()
# fetch the per-instrument results and concatenate them into one DataFrame
df = pd.concat(fai.get(ready_list))
print('Subtask merge time:', time.time() - time1)
Submission time: 3.502511739730835
72.5324432849884, 100/5220
112.80881810188293, 200/5220
152.13598370552063, 300/5220
189.12619376182556, 400/5220
231.671724319458, 500/5220
269.972279548645, 600/5220
309.133460521698, 700/5220
349.521497964859, 800/5220
389.31311774253845, 900/5220
429.5872299671173, 1000/5220
470.62226843833923, 1100/5220
511.10495471954346, 1200/5220
552.7588784694672, 1300/5220
593.6794936656952, 1400/5220
631.4529659748077, 1500/5220
674.0429482460022, 1600/5220
710.2315633296967, 1700/5220
749.3886156082153, 1800/5220
790.1398203372955, 1900/5220
830.3366467952728, 2000/5220
868.4557855129242, 2100/5220
905.8010051250458, 2200/5220
945.9148850440979, 2300/5220
989.7080097198486, 2400/5220
1027.384386062622, 2500/5220
1066.4245028495789, 2600/5220
1105.1222043037415, 2700/5220
1146.8430831432343, 2800/5220
1187.1940641403198, 2900/5220
1226.5458617210388, 3000/5220
1266.569834470749, 3100/5220
1307.9403958320618, 3200/5220
1345.0098867416382, 3300/5220
1382.7026841640472, 3400/5220
1424.5511016845703, 3500/5220
1466.2177591323853, 3600/5220
1505.7667565345764, 3700/5220
1545.130047082901, 3800/5220
1586.0551328659058, 3900/5220
1627.2407739162445, 4000/5220
1664.7209911346436, 4100/5220
1704.4398725032806, 4200/5220
1746.8999288082123, 4300/5220
1787.5217778682709, 4400/5220
1827.986206293106, 4500/5220
1845.785433292389, 4600/5220
1873.5754194259644, 4700/5220
1901.2115919589996, 4800/5220
1923.469750881195, 4900/5220
1936.281195640564, 5000/5220
1944.1335709095001, 5100/5220
1948.2896678447723, 5200/5220
Computation time: 1948.9932589530945
Subtask merge time: 25.052433729171753
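
The loop above distributes one task per instrument across the fai cluster. If a cluster is not available, the same per-instrument computation could be run locally with joblib (imported at the top of the notebook but not otherwise used here). A minimal sketch, assuming calc_data_local is a hypothetical name for calc_data's body without the @fai.remote decorator:

from joblib import Parallel, delayed

# run one task per instrument on local cores instead of the fai cluster (sketch;
# calc_data_local is assumed to be calc_data without the @fai.remote decorator)
results = Parallel(n_jobs=-1)(
    delayed(calc_data_local)(ins, sd, ed) for ins in ins_list
)

# instruments with no data return None; drop them before concatenating
df = pd.concat([r for r in results if r is not None])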
In [8]:
df
Out[8]:
instrument date open close low high amount volume day 邻域 vm cm vn cn nm factor
0 000001.SZA 2022-01-04 09:35:00 16.309999 16.350000 16.309999 16.350000 13450956.0 823700 2022-01-04 14330820.0 3692400 16.459999 72300 16.340000 98 -0.000074
232 000001.SZA 2022-01-05 09:35:00 16.830000 16.820000 16.809999 16.840000 37579002.0 2233200 2022-01-05 22701552.0 1572400 16.870001 138200 17.120001 198 0.000075
464 000001.SZA 2022-01-06 09:35:00 17.049999 17.010000 17.010000 17.059999 32483309.0 1908135 2022-01-06 16526187.0 4701984 17.059999 155318 17.110001 151 0.000019
696 000001.SZA 2022-01-07 09:35:00 17.150000 17.139999 17.129999 17.170000 8717223.0 508500 2022-01-07 6913817.0 113400 17.150000 156000 17.200001 93 0.000031
928 000001.SZA 2022-01-10 09:35:00 17.389999 17.400000 17.370001 17.400000 40702587.0 2340475 2022-01-10 17570870.0 3828381 17.270000 49600 17.129999 81 -0.000100
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3944 301203.SZA 2023-04-28 09:35:00 36.110001 36.110001 36.049999 36.110001 54107.0 1500 2023-04-28 28293.0 1800 36.049999 1000 36.209999 191 0.000023
4176 301203.SZA 2023-05-04 09:35:00 36.119999 36.119999 36.119999 36.150002 65041.0 1800 2023-05-04 46257.0 8200 35.930000 0 36.110001 141 0.000036
4408 301203.SZA 2023-05-05 09:35:00 36.090000 36.090000 36.090000 36.250000 83082.0 2300 2023-05-05 16300.0 4900 36.099998 400 35.820000 135 -0.000057
4640 301203.SZA 2023-05-08 09:35:00 35.700001 35.709999 35.700001 35.860001 107120.0 3000 2023-05-08 25300.0 1200 35.799999 0 35.959999 138 0.000032
4872 301203.SZA 2023-05-09 09:35:00 35.889999 35.880001 35.880001 35.889999 81848.0 2281 2023-05-09 11540.0 0 35.959999 0 36.119999 139 0.000032

1553427 rows × 16 columns

In [9]:
# keep one factor value per instrument per day, converting the Period day back to a Timestamp
df['date'] = df['day']
df = df[['instrument','date','factor']]
df['date'] = df['date'].astype(str)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
In [10]:
df
Out[10]:
instrument date factor
0 000001.SZA 2022-01-04 -0.000074
232 000001.SZA 2022-01-05 0.000075
464 000001.SZA 2022-01-06 0.000019
696 000001.SZA 2022-01-07 0.000031
928 000001.SZA 2022-01-10 -0.000100
... ... ... ...
3944 301203.SZA 2023-04-28 0.000023
4176 301203.SZA 2023-05-04 0.000036
4408 301203.SZA 2023-05-05 -0.000057
4640 301203.SZA 2023-05-08 0.000032
4872 301203.SZA 2023-05-09 0.000032

1553427 rows × 3 columns
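The 'factor' column above is the daily price-change speed. The report's "full-tide" factor (section 3.2, step 4) is the 20-trading-day mean of this speed. A minimal sketch of that aggregation, using the hypothetical names df_20d and full_tide (the notebook itself keeps working with the daily values below):

# 20-trading-day mean of the daily speed, per instrument (sketch, not used below)
df_20d = df.sort_values(['instrument', 'date']).copy()
df_20d['full_tide'] = df_20d.groupby('instrument')['factor'].transform(
    lambda s: s.rolling(20, min_periods=20).mean()
)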

In [11]:
# instrument universe: all A-share stocks in the window
m1 = M.instruments.v2(
    start_date=sd,
    end_date=ed,
    market='CN_STOCK_A',
    instrument_list='',
    max_count=0
)

# features used downstream: close, market cap and SW level-1 industry
m2 = M.input_features.v1(
    features="""
收盘=close_0

市值=market_cap_0

行业=industry_sw_level1_0

"""
)

# extract the base features
m3 = M.general_feature_extractor.v7(
    instruments=m1.data,
    features=m2.data,
    start_date='',
    end_date='',
    before_start_days=500
)

# compute the derived features
m4 = M.derived_feature_extractor.v3(
    input_data=m3.data,
    features=m2.data,
    date_col='date',
    instrument_col='instrument',
    drop_na=False,
    remove_extra_columns=False,
    user_functions={}
)

# keep main boards and ChiNext, normal (non-ST), non-delisted stocks
m5 = M.chinaa_stock_filter.v1(
    input_data=m4.data,
    index_constituent_cond=['全部'],
    board_cond=['上证主板', '深证主板', '创业板'],
    industry_cond=['全部'],
    st_cond=['正常'],
    delist_cond=['非退市'],
    output_left_data=False
)

df_ = m5.data.read()

# attach close, market cap and industry to the daily factor values
df = pd.merge(df, df_[['instrument', 'date', '收盘', '市值', '行业']], on=['instrument', 'date'], how='left')

df.sort_values(by='date', inplace=True)
df
[2023-05-11 14:47:17.051985] INFO moduleinvoker: instruments.v2 开始运行..
[2023-05-11 14:47:17.073515] INFO moduleinvoker: 命中缓存
[2023-05-11 14:47:17.075331] INFO moduleinvoker: instruments.v2 运行完成[0.033861s].
[2023-05-11 14:47:17.082416] INFO moduleinvoker: input_features.v1 开始运行..
[2023-05-11 14:47:17.097371] INFO moduleinvoker: 命中缓存
[2023-05-11 14:47:17.099238] INFO moduleinvoker: input_features.v1 运行完成[0.016831s].
[2023-05-11 14:47:17.142849] INFO moduleinvoker: general_feature_extractor.v7 开始运行..
[2023-05-11 14:47:17.149568] INFO moduleinvoker: 命中缓存
[2023-05-11 14:47:17.151096] INFO moduleinvoker: general_feature_extractor.v7 运行完成[0.008292s].
[2023-05-11 14:47:17.171122] INFO moduleinvoker: derived_feature_extractor.v3 开始运行..
[2023-05-11 14:47:17.178547] INFO moduleinvoker: 命中缓存
[2023-05-11 14:47:17.180099] INFO moduleinvoker: derived_feature_extractor.v3 运行完成[0.008992s].
[2023-05-11 14:47:17.197896] INFO moduleinvoker: chinaa_stock_filter.v1 开始运行..
[2023-05-11 14:47:17.208025] INFO moduleinvoker: 命中缓存
[2023-05-11 14:47:17.209324] INFO moduleinvoker: chinaa_stock_filter.v1 运行完成[0.01145s].
Out[11]:
instrument date factor 收盘 市值 行业
0 000001.SZA 2022-01-04 -0.000074 1864.619995 3.233026e+11 480000.0
1149166 603013.SHA 2022-01-04 0.000063 18.726503 8.667807e+09 280000.0
168211 002032.SZA 2022-01-04 -0.000117 526.558594 4.972564e+10 330000.0
1149490 601965.SHA 2022-01-04 0.000121 31.856092 1.789526e+10 280000.0
1149814 601989.SHA 2022-01-04 0.000044 7.161360 9.690865e+10 650000.0
... ... ... ... ... ... ...
1262233 603698.SHA 2023-05-09 0.000072 20.232754 7.986251e+09 640000.0
114784 000905.SZA 2023-05-09 0.000235 20.790482 5.852878e+09 420000.0
570753 300277.SZA 2023-05-09 0.000052 40.384846 2.539300e+09 710000.0
1542230 001298.SZA 2023-05-09 0.000052 32.970001 3.165120e+09 270000.0
1553426 301203.SZA 2023-05-09 0.000032 35.849998 2.868000e+09 760000.0

1553427 rows × 6 columns
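Section 3.2 notes that the factor is orthogonalized against market cap and industry before testing; that step is not shown in this notebook. A minimal cross-sectional sketch of such a neutralization, using the merged 市值 and 行业 columns and assuming statsmodels is available (it is not imported elsewhere here; neutralize_one_day, factor_neutral and df_neutral are illustrative names):

import statsmodels.api as sm

def neutralize_one_day(g):
    # per-date cross-section: regress the factor on log market cap and industry
    # dummies, keep the residual as the neutralized factor (sketch)
    g = g.dropna(subset=['factor', '市值', '行业']).copy()
    X = pd.get_dummies(g['行业'].astype(int), prefix='ind', drop_first=True, dtype=float)
    X['log_mcap'] = np.log(g['市值'])
    X = sm.add_constant(X)
    g['factor_neutral'] = sm.OLS(g['factor'], X).fit().resid
    return g

df_neutral = df.groupby('date', group_keys=False).apply(neutralize_one_day)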

In [12]:
import plotly.graph_objs as go


# plot the probability distribution of the factor values with a Histogram trace
fig = go.Figure(data=[go.Histogram(x=df.factor, histnorm='probability')])

# set the figure title and axis labels
fig.update_layout(title='Probability Distribution', xaxis_title='Value', yaxis_title='Probability')

# show the figure
fig.show()