强化学习入门demo分享

策略分享
强化学习
标签: #<Tag:0x00007f4cdabe3e48> #<Tag:0x00007f4cdabe3c68>

(胖大帅) #1

最近在研究,深度强化学习,先分享一个强化学习的入门demo

克隆策略
In [26]:
prices = DataSource('bar1d_CN_STOCK_A').read(instruments=['000333.SZA'],start_date='2010-01-01',fields=['close'])['close']
prices.index = range(len(prices))
prices.plot()
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcd453164e0>
In [27]:
class RandomDecisionPolicy:
    def __init__(self, actions):
        self.actions = actions
    def select_action(self, current_state, step):
        # 三种行为 0,1,-1
        action = self.actions[random.randint(0, len(self.actions) -1)]
        return action  
    def update_q(self, state, action, reward, next_state):
        pass
    
import tensorflow as tf
class QLearningDecisionPolicy:
    def __init__(self, actions, input_dim):
        self.epsilon = 0.9
        self.gamma = 0.001
        self.actions = actions
        output_dim = len(actions)
        h1_dim = 200
        self.x = tf.placeholder(tf.float32, [None, input_dim])
        self.y = tf.placeholder(tf.float32, [output_dim])
        W1 = tf.Variable(tf.random_normal([input_dim, h1_dim]))
        b1 = tf.Variable(tf.constant(0.1, shape=[h1_dim]))

        h1 = tf.nn.relu(tf.matmul(self.x, W1) + b1)
        W2 = tf.Variable(tf.random_normal([h1_dim, output_dim]))
        b2 = tf.Variable(tf.constant(0.1, shape=[output_dim]))
        self.q = tf.nn.relu(tf.matmul(h1, W2) + b2)
        loss = tf.square(self.y - self.q)
        self.train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
        self.sess = tf.Session()
        self.sess.run(tf.initialize_all_variables())
        
    def select_action(self, current_state, step):
        '''
        epsilon为一个超参数,该参数的目的在于避免智能体选择同一个action
        '''
        threshold = min(self.epsilon, step / 1000.)
        if random.random() < threshold:
            action_q_vals = self.sess.run(self.q, feed_dict={self.x: current_state})
            action_idx = np.argmax(action_q_vals)
            action = self.actions[action_idx]
        else:
            # 随机选择一个行为
            action = self.actions[random.randint(0, len(self.actions) - 1)]
       
        return action
    
    def update_q(self, state, action, reward, next_state):
        action_q_vals = self.sess.run(self.q, feed_dict={self.x:state})
        next_action_q_vals = self.sess.run(self.q, feed_dict={self.x: next_state})
        next_action_idx = np.argmax(next_action_q_vals)
        action_q_vals[0, next_action_idx] = reward + self.gamma *next_action_q_vals[0, next_action_idx]
        action_q_vals = np.squeeze(np.asarray(action_q_vals))
        self.sess.run(self.train_op, feed_dict={self.x: state, self.y:action_q_vals})

# portfolio = budget + number of stocks * share value
# reward = new_portfolio - current_portfolio

def run_simulation(policy, initial_budget, initial_num_stocks, prices, hist, debug=False):
    budget = initial_budget
    num_stocks = initial_num_stocks
    share_value = 0
    transitions = list()
 
    for i in range(len(prices) - hist - 1):
#          if i % 500 == 0:
#             print('progress {:.2f}%'.format(float(100*i) /(len(prices) - hist - 1)))
            
        current_state = np.asmatrix(np.hstack((prices[i:i+hist], budget, num_stocks)))
        current_portfolio = budget + num_stocks * share_value
      
        action = policy.select_action(current_state, i)
        
        share_value = float(prices[i + hist + 1])
        if action == 'Buy' and budget >= share_value:
            budget -= share_value
            num_stocks += 1
        elif action == 'Sell' and num_stocks > 0:
            budget += share_value
            num_stocks -= 1
        else:
            action = 'Hold'
            
        new_portfolio = budget + num_stocks * share_value
        reward = new_portfolio - current_portfolio
        next_state = np.asmatrix(np.hstack((prices[i+1:i+hist+1], budget, num_stocks)))
        transitions.append((current_state, action, reward, next_state))
        portfolio = budget + num_stocks * share_value
        policy.update_q(current_state, action, reward, next_state)
        if debug:
            print('${}\t{} shares'.format(budget, num_stocks))
    return portfolio

def run_simulations(policy, budget, num_stocks, prices, hist):
    num_tries = 10  # 运行100次看效果
    final_portfolios = list()
    for i in range(num_tries):
        final_portfolio = run_simulation(policy, budget, num_stocks,
   prices, hist)
        final_portfolios.append(final_portfolio)
    avg, std = np.mean(final_portfolios), np.std(final_portfolios)
    return avg, std
In [28]:
actions = ['Buy', 'Sell', 'Hold']
hist = 200
policy = RandomDecisionPolicy(actions)
budget = 1000.0
num_stocks = 0
avg,std=run_simulations(policy,budget,num_stocks,prices, hist)
# 因为决策是随机的所以不稳定,故而关注多次模拟的平均情形
print('mean_portfolio_value:', avg, 'std_portfolio_value:' , std)
mean_portfolio_value: 2106.2798206329344 std_portfolio_value: 448.9551499798718
In [29]:
actions = ['Buy', 'Sell', 'Hold']
hist = 200
policy = QLearningDecisionPolicy(actions, 202) 
budget = 1000.0
num_stocks = 0
avg, std=run_simulations(policy, budget, num_stocks, prices, hist)
print('mean_portfolio_value:', avg, 'std_portfolio_value:' , std)
WARNING:tensorflow:From /usr/local/python3/lib/python3.5/site-packages/tensorflow/python/util/tf_should_use.py:118: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
mean_portfolio_value: 3651.7990772247313 std_portfolio_value: 498.4919781407758

DQN个股择时策略研究
(yangziriver) #2

太深奥了,能介绍一下如何使用这个策略吗?