docs: polish params name
Gaiejj committed Oct 5, 2023
1 parent 131f438 commit 93d4267
Showing 19 changed files with 72 additions and 143 deletions.
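The renames are mechanical: safety_bound → cost_limit, kl_threshold → target_kl, ls_step → searching_steps, ppo_epoch → learning_iters, line_search_fraction → step_fraction. For configs written against the old names, a small compatibility shim could translate them on load; the helper below is a hypothetical sketch, not part of this commit, though the key mapping itself is taken directly from the diff.

```python
# Hypothetical migration helper (not part of this commit): translate
# configs that still use the pre-rename keys into the new names.
OLD_TO_NEW = {
    "safety_bound": "cost_limit",
    "kl_threshold": "target_kl",
    "ls_step": "searching_steps",
    "ppo_epoch": "learning_iters",
    "line_search_fraction": "step_fraction",
}

def migrate_config(config: dict) -> dict:
    """Return a copy of ``config`` with old parameter names replaced."""
    migrated = {}
    for key, value in config.items():
        if isinstance(value, dict):
            # Per-task override sections (e.g. ``mamujoco``) are nested
            # dicts and need the same translation.
            value = migrate_config(value)
        migrated[OLD_TO_NEW.get(key, key)] = value
    return migrated
```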
2 changes: 1 addition & 1 deletion docs/source/usage/train.rst
@@ -91,7 +91,7 @@ We provide the detailed description of the command line arguments in the followi
 +-------------------+--------------------------------+----------------------------------------------+
 | model-dir         | The model dir                  | ""                                           |
 +-------------------+--------------------------------+----------------------------------------------+
-| safety-bound      | Cost_limit                     | 25.0                                         |
+| cost-limit        | Cost_limit                     | 25.0                                         |
 +-------------------+--------------------------------+----------------------------------------------+
 | device            | The device to run the model on | "cpu"                                        |
 +-------------------+--------------------------------+----------------------------------------------+
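A minimal sketch of how the renamed flag might be consumed, assuming an argparse-based entry point; the parser itself is illustrative, and only the flag names and defaults come from the table above.

```python
# Illustrative only: an argparse parser exposing the documented flags,
# assuming the trainer consumes them under these exact names.
import argparse

parser = argparse.ArgumentParser(description="SafePO training entry point")
parser.add_argument("--model-dir", type=str, default="", help="The model dir")
parser.add_argument("--cost-limit", type=float, default=25.0,
                    help="Cost limit (previously --safety-bound)")
parser.add_argument("--device", type=str, default="cpu",
                    help="The device to run the model on")

args = parser.parse_args(["--cost-limit", "30.0"])
print(args.cost_limit)  # 30.0
```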
2 changes: 1 addition & 1 deletion safepo/multi_agent/happo.py
@@ -177,7 +177,7 @@ def train(self, buffer, logger):
std_advantages = torch.std(advantages_copy)
advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)

-for _ in range(self.config["ppo_epoch"]):
+for _ in range(self.config["learning_iters"]):
data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"])

for sample in data_generator:
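learning_iters plays the role the old ppo_epoch name suggested: the number of full passes over the collected buffer, each pass split into num_mini_batch minibatches. A schematic of that loop, with update_from as a hypothetical stand-in for the actual PPO losses and optimizer steps:

```python
# Schematic of the HAPPO/MAPPO update loop; ``update_from`` is a
# hypothetical stand-in for the per-minibatch gradient step.
def run_updates(trainer, buffer, advantages):
    for _ in range(trainer.config["learning_iters"]):  # passes over the buffer
        generator = buffer.feed_forward_generator(
            advantages, trainer.config["num_mini_batch"]
        )
        for sample in generator:  # one gradient step per minibatch
            trainer.update_from(sample)
```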
18 changes: 9 additions & 9 deletions safepo/multi_agent/macpo.py
@@ -231,7 +231,7 @@ def trpo_update(self, sample):
self.policy.cost_optimizer.step()


-rescale_constraint_val = (aver_episode_costs.mean() - self.config["safety_bound"]) * (1 - self.config["gamma"])
+rescale_constraint_val = (aver_episode_costs.mean() - self.config["cost_limit"]) * (1 - self.config["gamma"])

if rescale_constraint_val == 0:
rescale_constraint_val = 1e-8
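The renamed cost_limit enters MACPO through the rescaled constraint value, (average episode cost − cost_limit) · (1 − γ); the (1 − γ) factor converts an undiscounted episode-cost gap to the per-step scale of the discounted objective, and its sign tells the update whether the constraint is violated. A worked example with the defaults from the config below (the measured cost is hypothetical):

```python
# Worked example with the macpo defaults (cost_limit=25, gamma=0.96);
# the 1e-8 floor mirrors the guard in the code above.
aver_episode_costs = 30.0  # hypothetical measured average episode cost
cost_limit, gamma = 25.0, 0.96
rescale_constraint_val = (aver_episode_costs - cost_limit) * (1 - gamma)
if rescale_constraint_val == 0:
    rescale_constraint_val = 1e-8
print(rescale_constraint_val)  # ~0.2 > 0: the cost constraint is violated
```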
@@ -265,7 +265,7 @@ def trpo_update(self, sample):
r_coef = (reward_loss_grad * b_step_dir).sum(0, keepdim=True)
s_coef = (cost_loss_grad * b_step_dir).sum(0, keepdim=True)

-fraction = self.config["line_search_fraction"]
+fraction = self.config["step_fraction"]
loss_improve = 0

B_cost_loss_grad_dot = torch.dot(B_cost_loss_grad, B_cost_loss_grad)
@@ -285,7 +285,7 @@ def trpo_update(self, sample):
s_coef = 1e-8
positive_Cauchy_value = (
q_coef - (r_coef ** 2) / (1e-8 + s_coef))
-whether_recover_policy_value = 2 * self.config["kl_threshold"] - (
+whether_recover_policy_value = 2 * self.config["target_kl"] - (
rescale_constraint_val ** 2) / (
1e-8 + s_coef)
if rescale_constraint_val < 0 and whether_recover_policy_value < 0:
@@ -301,24 +301,24 @@ def trpo_update(self, sample):

if optim_case in [3, 4]:
lam = torch.sqrt(
-(q_coef / (2 * self.config["kl_threshold"])))
+(q_coef / (2 * self.config["target_kl"])))
nu = torch.tensor(0) # v_coef = 0
elif optim_case in [1, 2]:
LA, LB = [0, r_coef / rescale_constraint_val], [r_coef / rescale_constraint_val, np.inf]
LA, LB = (LA, LB) if rescale_constraint_val < 0 else (LB, LA)
proj = lambda x, L: max(L[0], min(L[1], x))
lam_a = proj(torch.sqrt(positive_Cauchy_value / whether_recover_policy_value), LA)
-lam_b = proj(torch.sqrt(q_coef / (torch.tensor(2 * self.config["kl_threshold"]))), LB)
+lam_b = proj(torch.sqrt(q_coef / (torch.tensor(2 * self.config["target_kl"]))), LB)

f_a = lambda lam: -0.5 * (positive_Cauchy_value / (
1e-8 + lam) + whether_recover_policy_value * lam) - r_coef * rescale_constraint_val / (
1e-8 + s_coef)
-f_b = lambda lam: -0.5 * (q_coef / (1e-8 + lam) + 2 * self.config["kl_threshold"] * lam)
+f_b = lambda lam: -0.5 * (q_coef / (1e-8 + lam) + 2 * self.config["target_kl"] * lam)
lam = lam_a if f_a(lam_a) >= f_b(lam_b) else lam_b
nu = max(0, lam * rescale_constraint_val - r_coef) / (1e-8 + s_coef)
else:
lam = torch.tensor(0)
-nu = torch.sqrt(torch.tensor(2 * self.config["kl_threshold"]) / (1e-8 + s_coef))
+nu = torch.sqrt(torch.tensor(2 * self.config["target_kl"]) / (1e-8 + s_coef))

x_a = (1. / (lam + 1e-8)) * (g_step_dir + nu * b_step_dir)
x_b = (nu * b_step_dir)
@@ -339,7 +339,7 @@ def trpo_update(self, sample):

flag = False
fraction_coef = self.config["fraction_coef"]
-for i in range(self.config["ls_step"]):
+for i in range(self.config["searching_steps"]):
x_norm = torch.norm(x)
if x_norm > 0.5:
x = x * 0.5 / x_norm
@@ -367,7 +367,7 @@ def trpo_update(self, sample):
available_actions_batch, active_masks_batch, new_actor=self.policy.actor, old_actor=old_actor
).mean()

-if ((kl < self.config["kl_threshold"]) and (loss_improve < 0 if optim_case > 1 else True)
+if ((kl < self.config["target_kl"]) and (loss_improve < 0 if optim_case > 1 else True)
and (new_cost_loss.mean() - cost_loss.mean() <= max(-rescale_constraint_val, 0))):
flag = True
break
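searching_steps bounds the backtracking line search: the proposed step is shrunk each iteration until the KL divergence stays under target_kl, the surrogate loss improves, and the cost constraint holds. A simplified sketch, with apply_step and evaluate as hypothetical callbacks for the parameter update and the KL/loss/cost measurements done above (the real acceptance test also conditions the loss check on optim_case):

```python
# Simplified backtracking line search; ``apply_step`` and ``evaluate``
# are hypothetical callbacks standing in for the parameter update and
# the KL / loss / cost measurements in trpo_update.
def line_search(x, config, apply_step, evaluate):
    coef = config["fraction_coef"]
    for i in range(config["searching_steps"]):
        step = x * (coef ** (i + 1))  # shrink the step on every retry
        apply_step(step)
        kl, loss_improve, cost_ok = evaluate()
        if kl < config["target_kl"] and loss_improve < 0 and cost_ok:
            return True  # accept this step
        apply_step(-step)  # roll back and try a smaller step
    return False  # no acceptable step found
```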
2 changes: 1 addition & 1 deletion safepo/multi_agent/mappo.py
@@ -169,7 +169,7 @@ def train(self, buffer, logger):
std_advantages = torch.std(advantages_copy)
advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)

-for _ in range(self.config["ppo_epoch"]):
+for _ in range(self.config["learning_iters"]):
data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"])

for sample in data_generator:
4 changes: 2 additions & 2 deletions safepo/multi_agent/mappolag.py
@@ -177,7 +177,7 @@ def ppo_update(self, sample):
actor_grad_norm = nn.utils.clip_grad_norm_(self.policy.actor.parameters(), self.config["max_grad_norm"])
self.policy.actor_optimizer.step()

-delta_lamda_lagr = -((aver_episode_costs.mean() - self.config["safety_bound"]) * (1 - self.config["gamma"]) + (imp_weights * cost_adv_targ)).mean().detach()
+delta_lamda_lagr = -((aver_episode_costs.mean() - self.config["cost_limit"]) * (1 - self.config["gamma"]) + (imp_weights * cost_adv_targ)).mean().detach()

R_Relu = torch.nn.ReLU()
new_lamda_lagr = R_Relu(self.lamda_lagr - (delta_lamda_lagr * self.config["lagrangian_coef_rate"]))
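The multiplier update is a projected gradient step: the Lagrange multiplier moves against delta_lamda_lagr at rate lagrangian_coef_rate, and the ReLU keeps it non-negative. A scalar sketch of the same rule; the numbers are hypothetical apart from the defaults in the config below.

```python
import torch

# Scalar sketch of the multiplier update above: a gradient step
# followed by a ReLU projection onto lambda >= 0.
def update_multiplier(lamda_lagr, delta_lamda_lagr, lr=1e-5):
    return torch.relu(lamda_lagr - delta_lamda_lagr * lr)

lam = torch.tensor(0.78)  # lamda_lagr default from the config below
# A violated constraint makes delta negative, so the multiplier grows.
lam = update_multiplier(lam, torch.tensor(-2.0))
print(lam)  # tensor(0.7800) -- i.e. 0.78002, nudged upward
```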
@@ -213,7 +213,7 @@ def train(self, buffer, logger):
std_cost_adv = torch.std(cost_adv_copy)
cost_adv = (cost_adv - mean_cost_adv) / (std_cost_adv + 1e-8)

-for _ in range(self.config["ppo_epoch"]):
+for _ in range(self.config["learning_iters"]):
data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"], cost_adv=cost_adv)

for sample in data_generator:
9 changes: 3 additions & 6 deletions safepo/multi_agent/marl_cfg/happo/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: happo
experiment_name: check
seed: 0
run_dir: ./runs
-use_centralized_V: True
-use_obs_instead_of_state: False
num_env_steps: 100000000
episode_length: 75
n_rollout_threads: 1
n_eval_rollout_threads: 1
-use_linear_lr_decay: False
hidden_size: 512
use_render: False
recurrent_N: 1
@@ -28,11 +25,11 @@ use_popart: True
use_valuenorm: True
use_proper_time_limits: False

-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
accept_ratio: 0.5
clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
num_mini_batch: 1
data_chunk_length:
value_loss_coef: 1
73 changes: 0 additions & 73 deletions safepo/multi_agent/marl_cfg/ippo/config.yaml

This file was deleted.

23 changes: 9 additions & 14 deletions safepo/multi_agent/marl_cfg/macpo/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: macpo
experiment_name: check
seed: 0
run_dir: ./runs
-use_centralized_V: True
-use_obs_instead_of_state: False
num_env_steps: 100000000
episode_length: 8
n_rollout_threads: 1
n_eval_rollout_threads: 1
-use_linear_lr_decay: False
hidden_size: 512
use_render: False
recurrent_N: 1
@@ -20,26 +17,26 @@ eval_interval: 25
log_interval: 25
eval_episodes: 10000

-safety_bound: 25
+cost_limit: 25
EPS: 1.e-8
safety_gamma: 0.09
-line_search_fraction: 0.5
+step_fraction: 0.5
g_step_dir_coef: 0.1
b_step_dir_coef: 0.1
-fraction_coef: 0.27
+fraction_coef: 0.1

gamma: 0.96
gae_lambda: 0.95
use_gae: True
use_popart: True
-use_valuenorm: False
+use_valuenorm: True
use_proper_time_limits: False

-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
accept_ratio: 0.5
clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
num_mini_batch: 1
data_chunk_length:
value_loss_coef: 1
@@ -72,7 +69,6 @@ std_x_coef: 1
std_y_coef: 0.5

mamujoco:
-  use_valuenorm: True
  layer_N: 1
  num_env_steps: 10000000
  episode_length: 1000
@@ -81,7 +77,6 @@ mamujoco:
  hidden_size: 128
  gamma: 0.99
  safety_gamma: 0.2
-  fraction_coef: 0.1
-  kl_threshold: 0.01
-  ppo_epoch: 15
+  target_kl: 0.01
+  learning_iters: 15 # Conjugate Gradient Iterations
  entropy_coef: 0.01
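The YAML layout keeps algorithm-wide defaults at the top level and per-task overrides nested under the task name (here mamujoco). A plausible loader under that assumption, using PyYAML; the function name and two-level merge are illustrative, not the repo's actual loader.

```python
# Plausible loader for the two-level layout above (assumes PyYAML);
# keys under the task section override the top-level defaults.
import yaml

def load_task_config(path: str, task: str) -> dict:
    with open(path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    overrides = cfg.pop(task, None) or {}
    cfg.update(overrides)  # e.g. learning_iters: 5 -> 15 for mamujoco
    return cfg

# cfg = load_task_config("safepo/multi_agent/marl_cfg/macpo/config.yaml", "mamujoco")
```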
9 changes: 3 additions & 6 deletions safepo/multi_agent/marl_cfg/mappo/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: mappo
experiment_name: check
seed: 0
run_dir: ./runs
-use_centralized_V: True
-use_obs_instead_of_state: False
num_env_steps: 100000000
episode_length: 8
n_rollout_threads: 80
n_eval_rollout_threads: 1
-use_linear_lr_decay: False
hidden_size: 512
use_render: False
recurrent_N: 1
@@ -28,11 +25,11 @@ use_popart: True
use_valuenorm: False
use_proper_time_limits: False

-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
accept_ratio: 0.5
clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
num_mini_batch: 1
data_chunk_length:
value_loss_coef: 1
11 changes: 4 additions & 7 deletions safepo/multi_agent/marl_cfg/mappolag/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: mappolag
experiment_name: check
seed: 0
run_dir: ./runs/
-use_centralized_V: True
-use_obs_instead_of_state: False
num_env_steps: 100000000
episode_length: 8
n_rollout_threads: 1
n_eval_rollout_threads: 1
-use_linear_lr_decay: False
hidden_size: 512
use_render: False
recurrent_N: 1
@@ -21,7 +18,7 @@ eval_interval: 25
log_interval: 25
eval_episodes: 10000

-safety_bound: 25
+cost_limit: 25
lagrangian_coef_rate: 1.e-5
lamda_lagr: 0.78

@@ -32,11 +29,11 @@ use_popart: True
use_valuenorm: True
use_proper_time_limits: False

-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
accept_ratio: 0.5
clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
num_mini_batch: 1
data_chunk_length:
value_loss_coef: 1