一个自定义因子就足足跑了27分钟,有哪位大佬能优化一下吗?

用户成长系列
标签: #<Tag:0x00007fc0649d4b90>

(chaoskey) #1

这段代码源自:研报比赛模版。 (最后有完整可克隆的代码)

目标代码
%E7%9B%AE%E6%A0%87%E4%BB%A3%E7%A0%81
自定义表达式
%E8%87%AA%E5%AE%9A%E4%B9%89%E8%A1%A8%E8%BE%BE%E5%BC%8F
待测试因子
%E5%BE%85%E6%B5%8B%E8%AF%95%E5%9B%A0%E5%AD%90
运行结果(一个因子就足足跑了27分钟)

克隆策略
In [2]:
def fu_exp_wgt_return(N, df, turn_0):
    '''
    Compute the exp_wgt_return_Nm factor.

    For every instrument, a rolling window of ``N * 20`` rows of ``turn_0``
    is combined into a weighted sum with the fixed weights
    ``exp(-j / N / 4)`` for ``j = 0 .. N*20 - 1``, where ``j`` is the row's
    position inside the window (``j = 0`` is the first row of the window and
    therefore carries the largest weight).

    The factor is described by its author as "exponentially decaying weights
    over the stock's most recent N months applied to the daily turnover
    rate", with N = 1, 3, 6, 12.

    Parameters
    ----------
    N : int
        Number of months; the window length is ``N * 20`` rows.
    df : pandas.DataFrame
        Must contain the columns ``'date'``, ``'instrument'`` and ``'turn_0'``.
    turn_0 : object
        Unused; the values are read from ``df['turn_0']``.

    Returns
    -------
    pandas.Series
        Series named ``'exp_wgt_return'`` with a fresh 0..n-1 index.  Rows
        come out in the order produced by ``groupby('instrument')`` (grouped
        by instrument), which is not necessarily the row order of ``df``.
        The first ``N * 20 - 1`` rows of every instrument are NaN.

    NOTE(review): callers that align the result with ``df`` by position should
    confirm that ``df`` is already sorted by instrument and date.
    '''
    window = N * 20

    # The weights depend only on N, so build them once instead of once per
    # rolling window (the original rebuilt them inside the apply callback).
    weights = np.exp(-np.arange(window) / N / 4)

    def cal(x):
        # Weighted sum of one full window of turn_0 values.
        return np.dot(x, weights)

    # Keep only the needed columns; 'date' becomes the index so it survives
    # the group-wise rolling computation.
    dff = df[['date', 'instrument', 'turn_0']].set_index('date')

    # Per-instrument rolling weighted sum.
    dff = dff.groupby('instrument').rolling(window)['turn_0'].apply(cal)

    # Flatten the (instrument, date) index back into columns.
    dff = dff.reset_index().rename(columns={'turn_0': 'exp_wgt_return'})

    # Only the factor column is returned.
    return dff['exp_wgt_return']

    {"Description":"实验创建于2017/8/26","Summary":"","Graph":{"EdgesInternal":[{"DestinationInputPortId":"-215:instruments","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-8:data"},{"DestinationInputPortId":"-222:input_data","SourceOutputPortId":"-215:data"},{"DestinationInputPortId":"-215:features","SourceOutputPortId":"-7117:data"},{"DestinationInputPortId":"-222:features","SourceOutputPortId":"-7117:data"}],"ModuleNodes":[{"Id":"287d2cb0-f53c-4101-bdf8-104b137c8601-8","ModuleId":"BigQuantSpace.instruments.instruments-v2","ModuleParameters":[{"Name":"start_date","Value":"2010-01-01","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"2016-12-31","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"market","Value":"CN_STOCK_A","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"instrument_list","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"max_count","Value":"0","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"rolling_conf","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-8"}],"OutputPortsInternal":[{"Name":"data","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-8","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":1,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-215","ModuleId":"BigQuantSpace.general_feature_extractor.general_feature_extractor-v7","ModuleParameters":[{"Name":"start_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"before_start_days","Value":"480","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"instruments","NodeId":"-215"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-215"}],"Ou
tputPortsInternal":[{"Name":"data","NodeId":"-215","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":15,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-222","ModuleId":"BigQuantSpace.derived_feature_extractor.derived_feature_extractor-v3","ModuleParameters":[{"Name":"date_col","Value":"date","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"instrument_col","Value":"instrument","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"drop_na","Value":"True","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"remove_extra_columns","Value":"False","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"user_functions","Value":"bigquant_run = {\n 'fu_trade_num': (lambda df,n:fu_trade_num(n, df)),\n 'fu_exp_wgt_return': (lambda df,N,turn_0:fu_exp_wgt_return(N, df,turn_0))\n}\n","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_data","NodeId":"-222"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-222"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-222","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":16,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-7117","ModuleId":"BigQuantSpace.input_features.input_features-v1","ModuleParameters":[{"Name":"features","Value":"fu_exp_wgt_return(12,turn_0)","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features_ds","NodeId":"-7117"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-7117","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":6,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true}],"SerializedClientData":"<?xml version='1.0' encoding='utf-16'?><DataV1 xmlns:xsd='http://www.w3.org/2001/XMLSchema' 
xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><Meta /><NodePositions><NodePosition Node='287d2cb0-f53c-4101-bdf8-104b137c8601-8' Position='210,95,200,200'/><NodePosition Node='-215' Position='312,230,200,200'/><NodePosition Node='-222' Position='450,341,200,200'/><NodePosition Node='-7117' Position='556,95,200,200'/></NodePositions><NodeGroups /></DataV1>"},"IsDraft":true,"ParentExperimentId":null,"WebService":{"IsWebServiceExperiment":false,"Inputs":[],"Outputs":[],"Parameters":[{"Name":"交易日期","Value":"","ParameterDefinition":{"Name":"交易日期","FriendlyName":"交易日期","DefaultValue":"","ParameterType":"String","HasDefaultValue":true,"IsOptional":true,"ParameterRules":[],"HasRules":false,"MarkupType":0,"CredentialDescriptor":null}}],"WebServiceGroupId":null,"SerializedClientData":"<?xml version='1.0' encoding='utf-16'?><DataV1 xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><Meta /><NodePositions></NodePositions><NodeGroups /></DataV1>"},"DisableNodesUpdate":false,"Category":"user","Tags":[],"IsPartialRun":true}
    In [3]:
    # Auto-generated by the visual strategy environment on 2019-07-14 15:49.
    # This cell can only be edited in visual mode; alternatively, copy the code
    # into a new code cell or strategy and modify it there.


    # Custom functions handed to the derived-feature extractor.  Each wrapper
    # receives the DataFrame first and forwards it to the real implementation,
    # which expects it as its second argument.
    m16_user_functions_bigquant_run = dict(
        fu_trade_num=lambda df, n: fu_trade_num(n, df),
        fu_exp_wgt_return=lambda df, N, turn_0: fu_exp_wgt_return(N, df, turn_0),
    )


    # Instrument selection: CN_STOCK_A market, 2010-01-01 .. 2016-12-31.
    m1 = M.instruments.v2(
        market='CN_STOCK_A',
        start_date='2010-01-01',
        end_date='2016-12-31',
        instrument_list='',
        max_count=0,
    )

    # Single feature expression: the custom factor called with N=12.
    m6 = M.input_features.v1(features='fu_exp_wgt_return(12,turn_0)')

    # Feature extraction for the selected instruments, with
    # before_start_days=480 (presumably extra history loaded before
    # start_date so the rolling window can fill -- confirm against the
    # platform documentation).
    m15 = M.general_feature_extractor.v7(
        features=m6.data,
        instruments=m1.data,
        before_start_days=480,
        start_date='',
        end_date='',
    )

    # Derived-feature extraction over m15's output, using the custom
    # functions registered above.
    m16 = M.derived_feature_extractor.v3(
        features=m6.data,
        input_data=m15.data,
        user_functions=m16_user_functions_bigquant_run,
        instrument_col='instrument',
        date_col='date',
        remove_extra_columns=False,
        drop_na=True,
    )
    

    (chaoskey) #2

    此问题已经被我自己解决了,基本思路:

    将原来的 权重矢量 挨个和 turn_0矢量 相乘,

    修改成

    权重矢量 和 turn_0块矩阵相乘。

    效果明显: 原来计算这单个因子要27分钟,现在只要76s

    def rolling_window(a, window, axis=0):
        """
        Return all length-``window`` sliding windows of a 2-D array, stacked
        along a new leading axis.

        The result is built with ``np.lib.stride_tricks.as_strided`` and is
        therefore a view that shares memory with ``a``; no data is copied.
        Treat it as read-only: overlapping windows alias the same elements.

        Parameters
        ----------
        a : numpy.ndarray
            2-D input array.
        window : int
            Window length, at least 1.
        axis : int, optional
            Axis to slide along: 0 (rows, default) or 1 (columns).

        Returns
        -------
        numpy.ndarray
            ``axis == 0``: shape ``(a.shape[0] - window + 1, window, a.shape[1])``;
            element ``i`` is ``a[i:i + window, :]``.
            ``axis == 1``: shape ``(a.shape[1] - window + 1, a.shape[0], window)``;
            element ``i`` is ``a[:, i:i + window]``.

        Raises
        ------
        ValueError
            If ``a`` is not 2-D, ``axis`` is not 0 or 1, ``window`` is smaller
            than 1, or ``window`` exceeds the length of ``axis`` by more than
            one (which would require a negative number of windows).
        """
        if a.ndim != 2:
            raise ValueError("rolling_window expects a 2-D array, got %d-D" % a.ndim)
        if axis not in (0, 1):
            # Previously any other axis fell through to the return statement
            # and failed with an UnboundLocalError.
            raise ValueError("axis must be 0 or 1, got %r" % (axis,))
        if window < 1:
            raise ValueError("window must be at least 1, got %r" % (window,))

        n_windows = a.shape[axis] - window + 1
        if n_windows < 0:
            raise ValueError(
                "window (%d) is too large for axis %d of length %d"
                % (window, axis, a.shape[axis])
            )

        row_stride, col_stride = a.strides
        if axis == 0:
            # Stepping to the next window advances by one row.
            shape = (n_windows, window, a.shape[1])
            strides = (row_stride, row_stride, col_stride)
        else:
            # Stepping to the next window advances by one column.
            shape = (n_windows, a.shape[0], window)
            strides = (col_stride, row_stride, col_stride)
        return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
    
    def fu_exp_wgt_return(N, df, turn_0):
        '''
        Compute the exp_wgt_return_Nm factor using whole-matrix operations.

        ``turn_0`` is pivoted into a (date x instrument) matrix.  For every
        window of ``N * 20`` consecutive rows of that matrix, each column is
        combined into a weighted sum with the weights ``exp(-j / N / 4)``,
        ``j = 0 .. N*20 - 1`` (``j = 0`` is the first row of the window).  The
        result is then mapped back onto the rows of ``df``.

        The factor is described by its author as "exponentially decaying
        weights over the stock's most recent N months applied to the daily
        turnover rate", with N = 1, 3, 6, 12.

        Because the windows run over the pivoted matrix, a date that is
        missing for an instrument appears as NaN there, and every window that
        contains it yields NaN for that instrument.

        Parameters
        ----------
        N : int
            Number of months; the window length is ``N * 20`` rows.
        df : pandas.DataFrame
            Must contain ``'date'``, ``'instrument'`` and ``'turn_0'``; the
            (date, instrument) pairs must be unique for the pivot to work.
        turn_0 : object
            Unused; the values are read from ``df['turn_0']``.

        Returns
        -------
        pandas.Series
            Series named ``'x'`` with a fresh 0..n-1 index, in the same row
            order as ``df``.  Positions without a full window are NaN; if
            there are fewer than ``N * 20`` distinct dates, every value is NaN.
        '''
        window = N * 20
        weights = np.exp(-np.arange(window) / N / 4)

        # Pivot to a (date x instrument) matrix of turn_0 values.
        dff = df[['date', 'instrument', 'turn_0']].set_index(['date', 'instrument'])
        dfx = dff['turn_0'].unstack()
        mx = dfx.values

        # Rows before the first complete window stay NaN.
        ret = np.full(mx.shape, np.nan)
        if mx.shape[0] >= window:
            mx_rolling = rolling_window(mx, window, axis=0)
            # np.stack needs a real sequence: handing it a map iterator is
            # deprecated and rejected by newer NumPy releases.
            ret[window - 1:, :] = np.stack([np.dot(weights, x) for x in mx_rolling])

        # Write the results back into the pivoted frame.
        dfx.iloc[:] = ret

        # Back to a (date, instrument) index.
        dfx = dfx.stack()
        # Index-aligned assignment keeps the original row order of df.
        dff["x"] = dfx

        # Drop the two-level index so the result lines up with df's rows.
        dfx = dff["x"].reset_index(['date', 'instrument'])
        # Only the factor column is returned.
        return dfx["x"]
    

    (yangziriver) #3

    好思路,值得学习!NumPy和Pandas要用好了真不错。刚入门的新手真羡慕。


    #4

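    可以试试直接用表达式引擎来生成因子。【学院教程】利用表达式引擎批量生成因子(注意:下面示例中的表达式写的是 exp(-1*{}/1/4) 和 range(20),对应 N=1 的 20 日窗口;若要与上文 N=12 的结果对比,需改为 exp(-1*{}/12/4) 和 range(240),并相应加大 before_start_days。)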

    克隆策略

      {"Description":"实验创建于2019/7/15","Summary":"","Graph":{"EdgesInternal":[{"DestinationInputPortId":"-2321:instruments","SourceOutputPortId":"-2109:data"},{"DestinationInputPortId":"-2321:features","SourceOutputPortId":"-2316:data"},{"DestinationInputPortId":"-2328:features","SourceOutputPortId":"-2316:data"},{"DestinationInputPortId":"-2328:input_data","SourceOutputPortId":"-2321:data"}],"ModuleNodes":[{"Id":"-2109","ModuleId":"BigQuantSpace.instruments.instruments-v2","ModuleParameters":[{"Name":"start_date","Value":"2010-01-01","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"2016-12-31","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"market","Value":"CN_STOCK_A","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"instrument_list","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"max_count","Value":0,"ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"rolling_conf","NodeId":"-2109"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-2109","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":1,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-2316","ModuleId":"BigQuantSpace.input_features.input_features-v1","ModuleParameters":[{"Name":"features","Value":"\n# #号开始的表示注释,注释需单独一行\n# 多个特征,每行一个,可以包含基础特征和衍生特征,特征须为本平台特征\nalpha1='+'.join(['exp(-1*{}/1/4)*shift(turn_0,{})'.format(k,k) for k in 
range(20)])\n","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features_ds","NodeId":"-2316"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-2316","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":2,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-2321","ModuleId":"BigQuantSpace.general_feature_extractor.general_feature_extractor-v7","ModuleParameters":[{"Name":"start_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"before_start_days","Value":"200","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"instruments","NodeId":"-2321"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-2321"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-2321","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":3,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-2328","ModuleId":"BigQuantSpace.derived_feature_extractor.derived_feature_extractor-v3","ModuleParameters":[{"Name":"date_col","Value":"date","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"instrument_col","Value":"instrument","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"drop_na","Value":"False","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"remove_extra_columns","Value":"False","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"user_functions","Value":"{}","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_data","NodeId":"-2328"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-2328"}],"OutputPorts
Internal":[{"Name":"data","NodeId":"-2328","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":4,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true}],"SerializedClientData":"<?xml version='1.0' encoding='utf-16'?><DataV1 xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><Meta /><NodePositions><NodePosition Node='-2109' Position='383,170,200,200'/><NodePosition Node='-2316' Position='838,165,200,200'/><NodePosition Node='-2321' Position='600,306,200,200'/><NodePosition Node='-2328' Position='636,463,200,200'/></NodePositions><NodeGroups /></DataV1>"},"IsDraft":true,"ParentExperimentId":null,"WebService":{"IsWebServiceExperiment":false,"Inputs":[],"Outputs":[],"Parameters":[{"Name":"交易日期","Value":"","ParameterDefinition":{"Name":"交易日期","FriendlyName":"交易日期","DefaultValue":"","ParameterType":"String","HasDefaultValue":true,"IsOptional":true,"ParameterRules":[],"HasRules":false,"MarkupType":0,"CredentialDescriptor":null}}],"WebServiceGroupId":null,"SerializedClientData":"<?xml version='1.0' encoding='utf-16'?><DataV1 xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><Meta /><NodePositions></NodePositions><NodeGroups /></DataV1>"},"DisableNodesUpdate":false,"Category":"user","Tags":[],"IsPartialRun":true}
      In [57]:
      # Auto-generated by the visual strategy environment on 2019-07-15 11:58.
      # This cell can only be edited in visual mode; alternatively, copy the code
      # into a new code cell or strategy and modify it there.


      # Instrument selection: CN_STOCK_A market, 2010-01-01 .. 2016-12-31.
      m1 = M.instruments.v2(
          market='CN_STOCK_A',
          start_date='2010-01-01',
          end_date='2016-12-31',
          instrument_list='',
          max_count=0,
      )

      # Feature definition for the expression engine.  The string below is
      # passed to the platform verbatim.
      # NOTE(review): the expression hard-codes N=1 -- the weights are
      # exp(-k/1/4) for k in range(20) -- whereas the custom function earlier
      # in this thread was invoked with N=12 (a 240-row window).  The term for
      # k multiplies shift(turn_0, k); presumably that is the k-period lag,
      # which would give the largest weight to the most recent value -- confirm.
      m2 = M.input_features.v1(
          features="""
      # #号开始的表示注释,注释需单独一行
      # 多个特征,每行一个,可以包含基础特征和衍生特征,特征须为本平台特征
      alpha1='+'.join(['exp(-1*{}/1/4)*shift(turn_0,{})'.format(k,k) for k in range(20)])
      """
      )

      # Feature extraction for the selected instruments, with
      # before_start_days=200.
      m3 = M.general_feature_extractor.v7(
          features=m2.data,
          instruments=m1.data,
          before_start_days=200,
          start_date='',
          end_date='',
      )

      # Derived-feature extraction over m3's output; no custom functions are
      # registered and rows with NaN are kept (drop_na=False).
      m4 = M.derived_feature_extractor.v3(
          features=m2.data,
          input_data=m3.data,
          user_functions={},
          instrument_col='instrument',
          date_col='date',
          remove_extra_columns=False,
          drop_na=False,
      )
      

      (chaoskey) #5

      还是你的方法好! 您的方法生成这个因子只要17s.