Source code for gym_nethack.policies.rl
import random

import numpy as np

from gym_nethack.policies.core import Policy
# Assumed import (not present in the original listing): get_object_config is
# used by LinearAnnealedPolicy.get_config() below and is provided by keras-rl.
from rl.util import get_object_config
class LinearAnnealedPolicy(Policy):
    """Wrap another policy and linearly anneal one of its attributes
    (e.g. eps or tau) from value_max down to value_min over nb_steps training steps."""

    def __init__(self, inner_policy, attr, value_max, value_min, value_test, nb_steps):
        if not hasattr(inner_policy, attr):
            raise ValueError('Policy "{}" does not have attribute "{}".'.format(inner_policy, attr))

        self.inner_policy = inner_policy
        self.attr = attr
        self.value_max = value_max
        self.value_min = value_min
        self.value_test = value_test
        self.nb_steps = nb_steps
        super().__init__(name=inner_policy.name + str(value_max) + 'to' + str(value_min))
    def get_current_value(self):
        if self.agent.training:
            # Linear annealing: f(x) = ax + b, clamped below at value_min.
            a = -float(self.value_max - self.value_min) / float(self.nb_steps)
            b = float(self.value_max)
            value = max(self.value_min, a * float(self.agent.step) + b)
        else:
            value = self.value_test
        return value
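    # Worked example (illustrative, not part of the original source): with
    # value_max=1.0, value_min=0.1 and nb_steps=10000, the slope is
    # a = -0.00009, so the value at training step 5000 is
    # -0.00009 * 5000 + 1.0 = 0.55, and from step 10000 onward it stays
    # clamped at value_min = 0.1.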
    def select_action(self, **kwargs):
        # Push the current annealed value into the wrapped policy, then delegate.
        setattr(self.inner_policy, self.attr, self.get_current_value())
        return self.inner_policy.select_action(**kwargs)
    @property
    def metrics_names(self):
        return ['mean_{}'.format(self.attr)]

    @property
    def metrics(self):
        return [getattr(self.inner_policy, self.attr)]
    def get_config(self):
        config = super().get_config()
        config['attr'] = self.attr
        config['value_max'] = self.value_max
        config['value_min'] = self.value_min
        config['value_test'] = self.value_test
        config['nb_steps'] = self.nb_steps
        config['inner_policy'] = get_object_config(self.inner_policy)
        return config
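# Usage sketch (illustrative, not part of the original module): wrap the
# epsilon-greedy policy defined below so that its `eps` attribute is annealed
# from 0.5 to 0.05 over 10000 training steps. The `agent` object is normally
# attached by the training framework and is assumed here to expose
# `training` and `step` attributes.
#
#   inner = EpsGreedyPossibleQPolicy(eps=0.5)
#   policy = LinearAnnealedPolicy(inner, attr='eps', value_max=0.5,
#                                 value_min=0.05, value_test=0.01,
#                                 nb_steps=10000)
#   # During training, each call to policy.select_action(...) first updates
#   # inner.eps via get_current_value() and then delegates to inner.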
class EpsGreedyPossibleQPolicy(Policy):
    """Epsilon-greedy policy restricted to the currently valid (possible) actions."""

    def __init__(self, eps=.1):
        super().__init__(name='egreedy')
        self.eps = eps
    def select_action(self, q_values, valid_action_indices):
        if len(valid_action_indices) == 1:
            return valid_action_indices[0]

        assert q_values.ndim == 1
        if np.random.uniform() < self.eps:
            # Explore: pick uniformly among the valid actions only.
            action = np.random.choice(valid_action_indices)
        else:
            # Exploit: mask out invalid actions with a large negative value,
            # working on a copy so the caller's q_values are not modified.
            masked_q_values = np.array(q_values, dtype='float64')
            mask = np.ones(len(masked_q_values), dtype=bool)
            mask[valid_action_indices] = 0
            masked_q_values[mask] = -100
            action = np.argmax(masked_q_values)
        return action
    def get_config(self):
        config = super().get_config()
        config['eps'] = self.eps
        return config
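# Usage sketch (illustrative, not part of the original module; assumes the
# Policy base class needs only the name passed above): with eps=0 the policy
# always exploits, so the greedy action among the valid indices is returned.
#
#   >>> q = np.array([0.2, 1.5, -0.3, 0.7])
#   >>> greedy = EpsGreedyPossibleQPolicy(eps=0.)
#   >>> greedy.select_action(q_values=q, valid_action_indices=[0, 2, 3])
#   3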
class BoltzmannPossibleQPolicy(Policy):
    """Boltzmann (softmax) exploration over the currently valid (possible) actions,
    with temperature tau and clipping of the scaled Q-values to avoid overflow."""

    def __init__(self, tau=1., clip=(-500., 500.)):
        super().__init__(name='boltzmann')
        self.tau = tau
        self.clip = clip
    def select_action(self, q_values, valid_action_indices):
        assert q_values.ndim == 1
        q_values = q_values.astype('float64')
        nb_actions = q_values.shape[0]

        exp_values = np.exp(np.clip(q_values / self.tau, self.clip[0], self.clip[1]))
        # Zero out the exponentiated values of impossible actions so they
        # receive zero probability mass.
        valid_mask = np.zeros(nb_actions, dtype=bool)
        valid_mask[valid_action_indices] = True
        exp_values = np.where(valid_mask, exp_values, 0.)

        probs = exp_values / np.sum(exp_values)
        action = np.random.choice(range(nb_actions), p=probs)
        return action
    def get_config(self):
        config = super().get_config()
        config['tau'] = self.tau
        config['clip'] = self.clip
        return config
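# Usage sketch (illustrative, not part of the original module): with tau=1,
# q_values=[1.0, 2.0, 0.5] and valid_action_indices=[0, 1], action 2 gets zero
# probability, and actions 0 and 1 are sampled with probabilities proportional
# to exp(1.0) and exp(2.0), i.e. roughly 0.27 and 0.73.
#
#   >>> boltzmann = BoltzmannPossibleQPolicy(tau=1.)
#   >>> boltzmann.select_action(q_values=np.array([1.0, 2.0, 0.5]),
#   ...                         valid_action_indices=[0, 1])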