import numpy as np
import tensorflow as tf
# based on example at: https://keras.io/examples/rl/ddpg_pendulum/
class OUActionNoise:
"""
To implement better exploration by the Actor network, we use noisy perturbations,
specifically an **Ornstein-Uhlenbeck process** for generating noise, as described in the paper.
It samples noise from a correlated normal distribution.
"""
def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
self.theta = theta
self.mean = mean
self.std_dev = std_deviation
self.dt = dt
self.x_initial = x_initial
self.reset()
def __call__(self):
# formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
        x = (
            self.x_prev
            + self.theta * (self.mean - self.x_prev) * self.dt
            + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
# store x into x_prev - makes next noise dependent on current one
self.x_prev = x
return x
def reset(self):
if self.x_initial is not None:
self.x_prev = self.x_initial
else:
self.x_prev = np.zeros_like(self.mean)
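# A minimal usage sketch (illustrative; the 0.2 noise scale is an assumed value,
# not one taken from this module):
#
#     ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=0.2 * np.ones(1))
#     sample = ou_noise()       # first draw starts from x_initial (zeros here)
#     next_sample = ou_noise()  # correlated with the previous draw
#     ou_noise.reset()          # restart the process between episodes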
class Buffer:
"""
The `Buffer` class implements Experience Replay.
**Critic loss** - Mean Squared Error of `y - Q(s, a)`
where `y` is the expected return as seen by the Target network,
    and `Q(s, a)` is the action value predicted by the Critic network. `y` is a moving target
that the critic model tries to achieve; we make this target
stable by updating the Target model slowly.
**Actor loss** - This is computed using the mean of the value given by the Critic network
for the actions taken by the Actor network. We seek to maximize this quantity.
Hence we update the Actor network so that it produces actions that get
the maximum predicted value as seen by the Critic, for a given state.
"""
def __init__(self, env, agent, buffer_capacity=100000, batch_size=64, gamma=0.99):
# store optimizers
self.actor_optimizer = agent.actor_optimizer
self.critic_optimizer = agent.critic_optimizer
self.critic2_optimizer = agent.critic2_optimizer
self.TD3 = agent.TD3
self.num_states = env.observation_space.shape
self.num_actions = env.action_space.shape[0]
self.num_res_states = env.reservoir_space.shape
# number of "experiences" to store at max
self.buffer_capacity = buffer_capacity
# num of tuples to train on.
self.batch_size = batch_size
self.gamma = gamma
# num of times record() was called.
self.buffer_counter = 0
        # instead of a list of tuples, as in the classic experience-replay formulation, we use a separate np.array for each tuple element
        self.state_buffer = np.zeros((self.buffer_capacity,) + self.num_states)
        self.res_state_buffer = np.zeros((self.buffer_capacity,) + self.num_res_states)
        self.action_buffer = np.zeros((self.buffer_capacity, self.num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity,) + self.num_states)
        self.next_res_state_buffer = np.zeros((self.buffer_capacity,) + self.num_res_states)
    # takes a (state, res_state, action, reward, next_state, next_res_state) observation tuple as input
def record(self, obs_tuple):
        # wrap the index around once buffer_capacity is exceeded, overwriting the oldest records
index = self.buffer_counter % self.buffer_capacity
self.state_buffer[index] = obs_tuple[0]
self.res_state_buffer[index] = obs_tuple[1]
self.action_buffer[index] = obs_tuple[2]
self.reward_buffer[index] = obs_tuple[3]
self.next_state_buffer[index] = obs_tuple[4]
self.next_res_state_buffer[index] = obs_tuple[5]
self.buffer_counter += 1
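    # A minimal usage sketch (illustrative; the variables follow the warmup loop below):
    #
    #     buffer.record((prev_state, prev_res_state, action, reward, state, res_state))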
# Eager execution is turned on by default in TensorFlow 2. Decorating with tf.function allows
# TensorFlow to build a static graph out of the logic and computations in our function.
    # This provides a large speedup for blocks of code that contain many small TensorFlow operations, such as this one.
@tf.function
    def update_critic(self, res_state_batch, action_batch, reward_batch, next_res_state_batch, agent):
        # train and update the Critic network(s)
        if self.TD3:
            with tf.GradientTape(persistent=True) as tape:
                # target-policy smoothing: add clipped Gaussian noise to the target action
                target_action_noise = tf.clip_by_value(
                    tf.random.normal([next_res_state_batch.shape[0], 1], mean=0., stddev=1.), -3., 3.)
                target_actions = tf.clip_by_value(
                    agent.target_actor(next_res_state_batch, training=True) + target_action_noise,
                    agent.lower_bound, agent.upper_bound)
                # clipped double-Q: take the minimum of the two target critics
                target_critic = tf.reduce_min(
                    tf.stack([agent.target_critic([next_res_state_batch, target_actions], training=True),
                              agent.target_critic2([next_res_state_batch, target_actions], training=True)],
                             axis=-1),
                    axis=-1)
                y = reward_batch + self.gamma * target_critic
                critic_value = agent.critic([res_state_batch, action_batch], training=True)
                critic2_value = agent.critic2([res_state_batch, action_batch], training=True)
                critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
                critic2_loss = tf.math.reduce_mean(tf.math.square(y - critic2_value))
            critic_grad = tape.gradient(critic_loss, agent.critic.trainable_variables)
            critic2_grad = tape.gradient(critic2_loss, agent.critic2.trainable_variables)
            del tape
            self.critic_optimizer.apply_gradients(zip(critic_grad, agent.critic.trainable_variables))
            self.critic2_optimizer.apply_gradients(zip(critic2_grad, agent.critic2.trainable_variables))
        else:
            with tf.GradientTape() as tape:
                target_actions = agent.target_actor(next_res_state_batch, training=True)
                y = reward_batch + self.gamma * agent.target_critic(
                    [next_res_state_batch, target_actions], training=True)
                critic_value = agent.critic([res_state_batch, action_batch], training=True)
                # error between the target value for the next state and the critic's value for the current state
                critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
            critic_grad = tape.gradient(critic_loss, agent.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(zip(critic_grad, agent.critic.trainable_variables))
@tf.function
    def update_actor(self, res_state_batch, agent):
with tf.GradientTape() as tape:
actions = agent.actor(res_state_batch, training=True)
critic_value = agent.critic([res_state_batch, actions], training=True)
# used `-value` as we want to maximize the value given by the critic for our actions
actor_loss = -tf.math.reduce_mean(critic_value) # critic prediction is actor loss
actor_grad = tape.gradient(actor_loss, agent.actor.trainable_variables)
self.actor_optimizer.apply_gradients(zip(actor_grad, agent.actor.trainable_variables))
    def learn_actor_critic(self, agent):
# get sampling range
record_range = min(self.buffer_counter, self.buffer_capacity)
# randomly sample indices
batch_indices = np.random.choice(record_range, self.batch_size)
# convert to tensors
# state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
res_state_batch = tf.convert_to_tensor(self.res_state_buffer[batch_indices])
action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
reward_batch = tf.cast(reward_batch, dtype=tf.float32)
# next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
next_res_state_batch = tf.convert_to_tensor(self.next_res_state_buffer[batch_indices])
self.update_critic(res_state_batch, action_batch, reward_batch, next_res_state_batch, agent)
self.update_actor(res_state_batch, agent)
    def learn_critic(self, agent):
# get sampling range
record_range = min(self.buffer_counter, self.buffer_capacity)
# randomly sample indices
batch_indices = np.random.choice(record_range, self.batch_size)
# convert to tensors
# state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
res_state_batch = tf.convert_to_tensor(self.res_state_buffer[batch_indices])
action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
reward_batch = tf.cast(reward_batch, dtype=tf.float32)
# next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
next_res_state_batch = tf.convert_to_tensor(self.next_res_state_buffer[batch_indices])
self.update_critic(res_state_batch, action_batch, reward_batch, next_res_state_batch, agent)
@tf.function
    def update(self, res_state_batch, action_batch, reward_batch, next_res_state_batch, agent):
        # train and update the Actor & Critic networks in one step (combined DDPG update - see the DDPG pseudocode)
with tf.GradientTape() as tape:
target_actions = agent.target_actor(next_res_state_batch, training=True)
y = reward_batch + self.gamma * agent.target_critic([next_res_state_batch, target_actions], training=True)
critic_value = agent.critic([res_state_batch, action_batch], training=True)
critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
# error between target_critic on next state and critic_model on current state
critic_grad = tape.gradient(critic_loss, agent.critic.trainable_variables)
self.critic_optimizer.apply_gradients(zip(critic_grad, agent.critic.trainable_variables))
with tf.GradientTape() as tape:
actions = agent.actor(res_state_batch, training=True)
critic_value = agent.critic([res_state_batch, actions], training=True)
# used `-value` as we want to maximize the value given by the critic for our actions
actor_loss = -tf.math.reduce_mean(critic_value) # critic prediction is actor loss
actor_grad = tape.gradient(actor_loss, agent.actor.trainable_variables)
self.actor_optimizer.apply_gradients(zip(actor_grad, agent.actor.trainable_variables))
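    # Note: `update` above is the combined DDPG update; it matches
    # `update_critic` (with TD3 disabled) followed by `update_actor`.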
# compute the loss and update parameters
    def learn(self, agent):
# get sampling range
record_range = min(self.buffer_counter, self.buffer_capacity)
# randomly sample indices
batch_indices = np.random.choice(record_range, self.batch_size)
# convert to tensors
# state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
res_state_batch = tf.convert_to_tensor(self.res_state_buffer[batch_indices])
action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
reward_batch = tf.cast(reward_batch, dtype=tf.float32)
# next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
next_res_state_batch = tf.convert_to_tensor(self.next_res_state_buffer[batch_indices])
self.update(res_state_batch, action_batch, reward_batch, next_res_state_batch,agent)
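    # A minimal training-loop sketch (illustrative; `env` and `agent` are assumed
    # to expose the interfaces used elsewhere in this module):
    #
    #     buffer = Buffer(env, agent)
    #     buffer.warmup(env, agent, OLD_DIR=None)
    #     prev_state = env.reset(agent)
    #     tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
    #     action, noise = agent.policy(tf_prev_state)
    #     state, reward, info = env.step(action, noise)
    #     buffer.record((prev_state, prev_state, action, reward, state, state))
    #     buffer.learn(agent)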
    def warmup(self, env, agent, OLD_DIR):
        # fill the replay buffer by running random-action episodes before training
        warmup, warmups, epi_done = True, 0, False
        # if OLD_DIR is None:
        while warmup:
            prev_state = env.reset(agent)
            prev_res_state = prev_state
            while not epi_done:
                # sample a uniformly random action; no exploration noise is added during warmup
                action, noise = [np.random.uniform(env.action_space.low[0], env.action_space.high[0])], 0
                state, reward, info = env.step(action, noise)
                res_state = state
                ens_done, epi_done = info['ens_done'], info['epi_done']
                self.record((prev_state, prev_res_state, action, reward, state, res_state))
                prev_state = state
                prev_res_state = res_state
            warmup = False
            if warmups < self.buffer_capacity / agent.epi_steps:
                warmups += 1
                epi_done = False
                warmup = True
# else:
# while warmup:
# prev_state = env.reset(agent)
# prev_res_state = prev_state
# while epi_done == False:
# tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
# action, noise = agent.policy(tf_prev_state)
# state, reward, info = env.step(action, noise)
# res_state = state
# ens_done, epi_done = info['ens_done'], info['epi_done']
# self.record((prev_state, prev_res_state, action, reward, state, res_state))
# prev_state = state
# prev_res_state = res_state
# warmup = False
# if warmups < self.buffer_capacity / agent.epi_steps:
# warmups += 1
# epi_done = False
# warmup = True
# update the target parameters slowly, based on rate `tau`, which is much less than one
@tf.function
def update_target(target_weights, weights, tau):
for (a, b) in zip(target_weights, weights):
a.assign(b * tau + a * (1. - tau))
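# A minimal usage sketch (illustrative; tau=0.005 is an assumed soft-update rate):
#
#     update_target(agent.target_actor.variables, agent.actor.variables, tau=0.005)
#     update_target(agent.target_critic.variables, agent.critic.variables, tau=0.005)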
def anneal_lr(optimizers, epi_count, max_epi, method='linear'):
    # note: the decay compounds across calls, since each call rescales the current learning rate
    for opt in optimizers:
        if method == 'linear':
            opt.lr = opt.lr * (1. - epi_count / max_epi)
        elif method == 'exponential':
            # guard against division by zero when epi_count is 0
            opt.lr = opt.lr * (1. / max(epi_count, 1))
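# A minimal usage sketch (illustrative; the optimizer list and episode counts are assumed):
#
#     anneal_lr([buffer.actor_optimizer, buffer.critic_optimizer], epi_count=10, max_epi=100, method='linear')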