在股票数据上做本福特定律测试

策略分享
标签: #<Tag:0x00007f8c7424b398>

(think) #1

本福特定律,也称为本福特法则,说明一堆从实际生活得出的数据中,以1为首位数字的数的出现概率约为总数的三成,接近直觉得出之期望值1/9的3倍。推广来说,越大的数,以它为首几位的数出现的概率就越低。精确的数学表述为:在b进位制中,以数n起头的数出现的机率为 $\log_b(n+1) - \log_b(n)$。它可用于检查各种数据是否有造假。

这个定律是一个非常神奇的定律,它的适用范围异常广泛,几乎所有日常生活中没有人为规则的统计数据都满足这个定律。比如说世界各国人口数量、各国国土面积、账本、物理化学常数、数学物理课本后面的答案、放射性半衰期等等数据居然都符合本福特定律。值得一提的是,科学家还发现,统计物理的三个重要分布,Boltzmann-Gibbs分布,Bose-Einstein分布,Fermi-Dirac分布,也基本上满足Benford定律!

本福特定律产生的根源,就在于指数增长。下图可以直观地显示:如果一个变量随时间成指数增长,那么这个变量的首位数字随时间的变化如下图所示(横轴代表时间,纵轴代表该变量):

image

我们使用A股数据来验证本福特定律,代码如下,结论是非常符合。

编码视频:https://www.bilibili.com/video/BV1RT4y1F7cu/

克隆策略

    {"Description":"实验创建于2020/11/7","Summary":"","Graph":{"EdgesInternal":[{"DestinationInputPortId":"-15:input_1","SourceOutputPortId":"-5:data"}],"ModuleNodes":[{"Id":"-5","ModuleId":"BigQuantSpace.use_datasource.use_datasource-v1","ModuleParameters":[{"Name":"datasource_id","Value":"bar1d_CN_STOCK_A","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"start_date","Value":"2020-11-06","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"end_date","Value":"2020-11-06","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"instruments","NodeId":"-5"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"features","NodeId":"-5"}],"OutputPortsInternal":[{"Name":"data","NodeId":"-5","OutputType":null}],"UsePreviousResults":true,"moduleIdForCode":1,"Comment":"","CommentCollapsed":true},{"Id":"-15","ModuleId":"BigQuantSpace.cached.cached-v3","ModuleParameters":[{"Name":"run","Value":"# Python 代码入口函数,input_1/2/3 对应三个输入端,data_1/2/3 对应三个输出端\ndef bigquant_run(input_1, input_2, input_3):\n from collections import Counter\n\n df = input_1.read()\n df['change'] = df['close'] / df['open']\n result_df = pd.DataFrame(index=range(1, 10))\n result_df.index = result_df.index.map(lambda x: str(x))\n result_df.index.name = '首位数字'\n for col in ['adjust_factor', 'amount', 'close', 'deal_number', 'turn', 'volume', 'change']:\n counts = Counter(df[col].apply(lambda x: str(x)[0]))\n if '0' in counts:\n del counts['0']\n rdf = pd.DataFrame(sorted(list(counts.items())), columns=['首位数字', '数量'])\n rdf.set_index('首位数字', inplace=True)\n result_df[f'{col}'] = rdf['数量']\n\n T.plot(result_df, title='本福特定律-股票数据测试')\n\n return Outputs(data_1=result_df)\n","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"post_run","Value":"# 后处理函数,可选。输入是主函数的输出,可以在这里对数据做处理,或者返回更友好的outputs数据格式。此函数输出不会被缓存。\ndef bigquant_run(outputs):\n return 
outputs\n","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"input_ports","Value":"","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"params","Value":"{}","ValueType":"Literal","LinkedGlobalParameter":null},{"Name":"output_ports","Value":"","ValueType":"Literal","LinkedGlobalParameter":null}],"InputPortsInternal":[{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_1","NodeId":"-15"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_2","NodeId":"-15"},{"DataSourceId":null,"TrainedModelId":null,"TransformModuleId":null,"Name":"input_3","NodeId":"-15"}],"OutputPortsInternal":[{"Name":"data_1","NodeId":"-15","OutputType":null},{"Name":"data_2","NodeId":"-15","OutputType":null},{"Name":"data_3","NodeId":"-15","OutputType":null}],"UsePreviousResults":false,"moduleIdForCode":2,"Comment":"本福特定律测试","CommentCollapsed":false}],"SerializedClientData":"<?xml version='1.0' encoding='utf-16'?><DataV1 xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><Meta /><NodePositions><NodePosition Node='-5' Position='222,117,200,200'/><NodePosition Node='-15' Position='211,232,200,200'/></NodePositions><NodeGroups /></DataV1>"},"IsDraft":true,"ParentExperimentId":null,"WebService":{"IsWebServiceExperiment":false,"Inputs":[],"Outputs":[],"Parameters":[{"Name":"交易日期","Value":"","ParameterDefinition":{"Name":"交易日期","FriendlyName":"交易日期","DefaultValue":"","ParameterType":"String","HasDefaultValue":true,"IsOptional":true,"ParameterRules":[],"HasRules":false,"MarkupType":0,"CredentialDescriptor":null}}],"WebServiceGroupId":null,"SerializedClientData":"<?xml version='1.0' encoding='utf-16'?><DataV1 xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><Meta /><NodePositions></NodePositions><NodeGroups /></DataV1>"},"DisableNodesUpdate":false,"Category":"user","Tags":[],"IsPartialRun":true}
    In [29]:
    # 本代码由可视化策略环境自动生成 2020年11月7日 23:20
    # 本代码单元只能在可视化模式下编辑。您也可以拷贝代码,粘贴到新建的代码单元或者策略,然后修改。
    
    
    # Python entry function: input_1/2/3 map to the three input ports,
    # data_1/2/3 map to the three output ports.
    def m2_run_bigquant_run(input_1, input_2, input_3):
        """Tally the leading significant digit (1-9) of several bar columns.

        Reads a DataFrame from ``input_1``, adds a ``change`` column
        (``close / open``), and for every inspected column counts how many
        values have each digit 1..9 as their first significant digit. The
        resulting table is plotted with ``T.plot`` and returned.

        Args:
            input_1: Object whose ``read()`` returns a DataFrame. It is assumed
                to contain the columns listed in the loop below (schema is not
                visible in this file -- confirm against the datasource).
            input_2: Unused.
            input_3: Unused.

        Returns:
            ``Outputs(data_1=result_df)`` where ``result_df`` is indexed by the
            digit strings ``'1'``..``'9'`` (index name ``'首位数字'``) and has
            one column of counts per inspected input column. A digit that never
            occurs is reported as ``0``.
        """
        from collections import Counter

        digits = [str(d) for d in range(1, 10)]

        def leading_digit(value):
            # Values with magnitude below 1 render as "0.0123", so taking the
            # first character would yield "0" and the observation would be
            # lost. Strip the sign, leading zeros and the decimal point so the
            # first *significant* digit is returned instead. Zero yields "",
            # and NaN/inf yield a letter; neither is in `digits`, so both are
            # ignored by the tally below.
            return str(value).lstrip('-0.')[:1]

        df = input_1.read()
        df['change'] = df['close'] / df['open']

        result_df = pd.DataFrame(index=pd.Index(digits, name='首位数字'))
        for col in ['adjust_factor', 'amount', 'close', 'deal_number', 'turn', 'volume', 'change']:
            counts = Counter(leading_digit(value) for value in df[col])
            # Counter returns 0 for digits that were never seen.
            result_df[col] = [counts[digit] for digit in digits]

        T.plot(result_df, title='本福特定律-股票数据测试')

        return Outputs(data_1=result_df)
    
    # Optional post-processing hook. It receives the main function's outputs
    # and may transform them or return a friendlier outputs format. The result
    # of this function is not cached.
    def m2_post_run_bigquant_run(outputs):
        """Return the main function's outputs unchanged."""
        passthrough = outputs
        return passthrough
    
    
    # Module 1: select the 'bar1d_CN_STOCK_A' datasource restricted to the
    # single date 2020-11-06 (start_date == end_date).
    m1 = M.use_datasource.v1(
        datasource_id='bar1d_CN_STOCK_A',
        start_date='2020-11-06',
        end_date='2020-11-06'
    )
    
    # Module 2: feed m1's data into m2_run_bigquant_run, then pass its result
    # through m2_post_run_bigquant_run. No extra ports or params are configured.
    # NOTE(review): m_cached=False presumably disables reuse of a cached result
    # for this module -- confirm against the platform documentation.
    m2 = M.cached.v3(
        input_1=m1.data,
        run=m2_run_bigquant_run,
        post_run=m2_post_run_bigquant_run,
        input_ports='',
        params='{}',
        output_ports='',
        m_cached=False
    )