【模板案例】互信息计算

策略分享
meetup
标签: #<Tag:0x00007f8c6807c848> #<Tag:0x00007f8c6807c6b8>

(iQuant) #1

11月19日Meetup模板案例分享:

克隆策略

互信息的概念来自概率论和信息论,常用于度量两个随机变量之间的关联程度。不同于相关系数仅能捕捉两个随机变量之间的线性相关性,互信息可以捕捉两个变量之间的任何统计依赖性。两个离散随机变量 X 和 Y 的互信息定义为:

$$I(X;Y)=\sum_{y\in Y}\sum_{x\in X}p(x,y)\log\left(\frac{p(x,y)}{p(x)\,p(y)}\right)$$

其中,p(x, y) 是 X 和 Y 的联合概率分布函数,p(x) 和 p(y) 分别是 X 和 Y 的边缘概率分布函数。

在连续随机变量的情形下,求和替换为二重定积分:

$$I(X;Y)=\int_{Y}\int_{X}p(x,y)\log\left(\frac{p(x,y)}{p(x)\,p(y)}\right)\,dx\,dy$$

其中,p(x, y) 是 X 和 Y 的联合概率密度函数,p(x) 和 p(y) 分别是 X 和 Y 的边缘概率密度函数。

    {"Description":"实验创建于2017/8/26","Summary":"","Graph":{"EdgesInternal":[{"DestinationInputPortId":"-215:instruments","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-8:data"},{"DestinationInputPortId":"-215:features","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24:data"},{"DestinationInputPortId":"-222:features","SourceOutputPortId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24:data"},{"DestinationInputPortId":"-222:input_data","SourceOutputPortId":"-215:data"}],"ModuleNodes":[{"Id":"287d2cb0-f53c-4101-bdf8-104b137c8601-8","ModuleId":"BigQuantSpace.instruments.instruments-v2","ModuleParameters":[{"Name":"start_date","Value":"2010-01-01","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"2015-01-01","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"market","Value":"CN_STOCK_A","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"instrument_list","Value":"000001.SZA\n000002.SZA","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"max_count","Value":"0","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"rolling_conf","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-8"}],"OutputPortsInternal":[{"Name":"data","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-8","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":1,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"287d2cb0-f53c-4101-bdf8-104b137c8601-24","ModuleId":"BigQuantSpace.input_features.input_features-v1","ModuleParameters":[{"Name":"features","Value":"# #号开始的表示注释\n# 多个特征,每行一个,可以包含基础特征和衍生特征\nts_mi(close_0, return_0, 
20)","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features_ds","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24"}],"OutputPortsInternal":[{"Name":"data","NodeId":"287d2cb0-f53c-4101-bdf8-104b137c8601-24","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":3,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-215","ModuleId":"BigQuantSpace.general_feature_extractor.general_feature_extractor-v7","ModuleParameters":[{"Name":"start_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"before_start_days","Value":90,"ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"instruments","NodeId":"-215"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-215"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-215","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":15,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true},{"Id":"-222","ModuleId":"BigQuantSpace.derived_feature_extractor.derived_feature_extractor-v3","ModuleParameters":[{"Name":"date_col","Value":"date","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"instrument_col","Value":"instrument","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"drop_na","Value":"True","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"remove_extra_columns","Value":"True","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"user_functions","Value":"def ts_mi(df, x, y, window):\n from sklearn.metrics import mutual_info_score as mis\n \n def group_func(df1, window):\n a = x[df1.index].values\n shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)\n strides = a.strides + (a.strides[-1],)\n 
try:\n c_x = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)\n c_y = np.lib.stride_tricks.as_strided(y[df1.index].values, shape=shape, strides=strides)\n except:\n return pd.Series([np.nan] * len(df), index=df1.index)\n d = []\n for i, j in zip(c_x, c_y):\n d.append(mis(i, j))\n return pd.Series([np.nan] * (window - 1) + d, index=df1.index)\n \n return df.groupby(\"instrument\", as_index=False, group_keys=False).apply(group_func, window=window)\n\nbigquant_run = {\n \"ts_mi\": ts_mi\n}\n","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_data","NodeId":"-222"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-222"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-222","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":16,"IsPartOfPartialRun":null,"Comment":"","CommentCollapsed":true}],"SerializedClientData":"<?xml version='1.0' encoding='utf-16'?><DataV1 xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><Meta /><NodePositions><NodePosition Node='287d2cb0-f53c-4101-bdf8-104b137c8601-8' Position='410,258,200,200'/><NodePosition Node='287d2cb0-f53c-4101-bdf8-104b137c8601-24' Position='742,259,200,200'/><NodePosition Node='-215' Position='625,365,200,200'/><NodePosition Node='-222' Position='629,458,200,200'/></NodePositions><NodeGroups /></DataV1>"},"IsDraft":true,"ParentExperimentId":null,"WebService":{"IsWebServiceExperiment":false,"Inputs":[],"Outputs":[],"Parameters":[{"Name":"交易日期","Value":"","ParameterDefinition":{"Name":"交易日期","FriendlyName":"交易日期","DefaultValue":"","ParameterType":"String","HasDefaultValue":true,"IsOptional":true,"ParameterRules":[],"HasRules":false,"MarkupType":0,"CredentialDescriptor":null}}],"WebServiceGroupId":null,"SerializedClientData":"<?xml version='1.0' encoding='utf-16'?><DataV1 
xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><Meta /><NodePositions></NodePositions><NodeGroups /></DataV1>"},"DisableNodesUpdate":false,"Category":"user","Tags":[],"IsPartialRun":true}
    In [3]:
    # 本代码由可视化策略环境自动生成 2020年11月20日 18:30
    # 本代码单元只能在可视化模式下编辑。您也可以拷贝代码,粘贴到新建的代码单元或者策略,然后修改。
    
    
    def ts_mi(df, x, y, window):
        """Compute rolling mutual information between ``x`` and ``y`` per instrument.

        Rows of ``df`` are grouped by the ``instrument`` column. Within each
        group, every run of ``window`` consecutive rows is scored with
        ``sklearn.metrics.mutual_info_score`` and the score is assigned to the
        last row of that run.

        Args:
            df: DataFrame with an ``instrument`` column. Its index labels are
                used to select the matching entries of ``x`` and ``y``.
            x: Series holding the first variable, indexable by ``df``'s labels.
            y: Series holding the second variable, indexed like ``x``.
            window: Number of consecutive rows per window; must be >= 1.

        Returns:
            The per-group Series combined by ``groupby(...).apply``. In each
            group the first ``window - 1`` entries are NaN; a group with fewer
            than ``window`` rows is NaN throughout.

        Raises:
            ValueError: If ``window`` is smaller than 1.

        NOTE(review): ``mutual_info_score`` is documented as a score between
        two label assignments. Values are passed through unbinned here, so
        each distinct value acts as its own label -- confirm this is intended
        for continuous inputs such as prices and returns.
        """
        if window < 1:
            raise ValueError("window must be >= 1, got %r" % (window,))

        def group_func(df1, window):
            xs = x[df1.index].values
            ys = y[df1.index].values
            n_windows = len(xs) - window + 1

            # Not enough rows for a single window: return NaN sized to THIS
            # group (not the whole frame) so it lines up with the group index.
            if n_windows < 1:
                return pd.Series(np.nan, index=df1.index, dtype="float64")

            # Imported lazily: only needed once a group has a full window.
            from sklearn.metrics import mutual_info_score as mis

            # Plain slices window x and y independently, so the result does
            # not depend on the two arrays sharing dtype or memory layout.
            scores = [
                mis(xs[start:start + window], ys[start:start + window])
                for start in range(n_windows)
            ]
            return pd.Series([np.nan] * (window - 1) + scores, index=df1.index)

        return df.groupby("instrument", as_index=False, group_keys=False).apply(group_func, window=window)
    
    # Maps the name used in feature expressions ("ts_mi") to its Python
    # implementation; handed to m16 below through user_functions.
    m16_user_functions_bigquant_run = {
        "ts_mi": ts_mi
    }
    
    
    # m1: instrument selection -- two tickers on market CN_STOCK_A between
    # 2010-01-01 and 2015-01-01.
    # NOTE(review): max_count=0 presumably means "no cap" -- confirm.
    m1 = M.instruments.v2(
        start_date='2010-01-01',
        end_date='2015-01-01',
        market='CN_STOCK_A',
        instrument_list="""000001.SZA
    000002.SZA""",
        max_count=0
    )
    
    # m3: feature list; its only non-comment line is the expression
    # ts_mi(close_0, return_0, 20).
    m3 = M.input_features.v1(
        features="""# #号开始的表示注释
    # 多个特征,每行一个,可以包含基础特征和衍生特征
    ts_mi(close_0, return_0, 20)"""
    )
    
    # m15: feature extraction fed by m1's instruments and m3's features.
    # start_date/end_date are left empty here.
    # NOTE(review): before_start_days=90 presumably requests extra look-back
    # history before the start date -- confirm.
    m15 = M.general_feature_extractor.v7(
        instruments=m1.data,
        features=m3.data,
        start_date='',
        end_date='',
        before_start_days=90
    )
    
    # m16: derived-feature extraction over m15's output using m3's feature
    # list, with ts_mi supplied through user_functions.
    m16 = M.derived_feature_extractor.v3(
        input_data=m15.data,
        features=m3.data,
        date_col='date',
        instrument_col='instrument',
        drop_na=True,
        remove_extra_columns=True,
        user_functions=m16_user_functions_bigquant_run
    )
    
    In [12]:
    # Read the "turn" and "close" fields for 000001.SZA from the
    # bar1d_CN_STOCK_A data source, 2015-01-01 through 2020-03-01.
    df = DataSource("bar1d_CN_STOCK_A").read("000001.SZA", start_date="2015-01-01", end_date="2020-03-01", fields=["turn", "close"])
    
    In [13]:
    # Row-over-row percentage change of "close"; the first row has no
    # predecessor and is therefore NaN.
    df["return"] = df["close"].pct_change()
    # Drop, in place, every row that contains a NaN in any column.
    df.dropna(inplace=True)
    
    In [21]:
    from sklearn.metrics import mutual_info_score
    # NOTE(review): mutual_info_score is documented as a score between two
    # label assignments. "turn" and "return" are passed unbinned, so nearly
    # every value presumably acts as its own label and the result tends
    # toward the log of the sample count rather than measuring dependence --
    # confirm, and consider discretising the inputs first.
    mutual_info_score(df["turn"].values, df["return"].values)
    
    Out[21]:
    6.9871900066935915

    互信息计算出错
    BigQuant AI量化专家Meetup(更新至11月19日)