docs: polish params name #63

Merged
merged 1 commit on Oct 5, 2023
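The commit renames a handful of configuration keys across the docs, the multi-agent algorithms and their YAML configs. A hypothetical migration helper (RENAMED_KEYS and migrate_config are illustrative names, not part of SafePO or this PR) collects the full old-to-new mapping in one place:

```python
# Hypothetical helper, not part of this PR: rewrites old config keys to the new names.
RENAMED_KEYS = {
    "safety_bound": "cost_limit",
    "line_search_fraction": "step_fraction",
    "kl_threshold": "target_kl",
    "ls_step": "searching_steps",
    "ppo_epoch": "learning_iters",
}

def migrate_config(cfg: dict) -> dict:
    """Return a copy of cfg with any renamed keys rewritten."""
    return {RENAMED_KEYS.get(key, key): value for key, value in cfg.items()}

# Example: migrate_config({"ppo_epoch": 5, "gamma": 0.96}) == {"learning_iters": 5, "gamma": 0.96}
```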
2 changes: 1 addition & 1 deletion docs/source/usage/train.rst
@@ -91,7 +91,7 @@ We provide the detailed description of the command line arguments in the following
+-------------------+--------------------------------+----------------------------------------------+
| model-dir | The model dir | "" |
+-------------------+--------------------------------+----------------------------------------------+
-| safety-bound | Cost_limit | 25.0 |
+| cost-limit | Cost_limit | 25.0 |
+-------------------+--------------------------------+----------------------------------------------+
| device | The device to run the model on | "cpu" |
+-------------------+--------------------------------+----------------------------------------------+
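For readers updating launch scripts, the renamed flag would be declared roughly as in the argparse sketch below; the parser setup is illustrative, not copied from SafePO's entry points:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--model-dir", type=str, default="", help="The model dir")
# Renamed in this PR: --safety-bound -> --cost-limit (episode cost budget, default 25.0)
parser.add_argument("--cost-limit", type=float, default=25.0, help="Cost_limit")
parser.add_argument("--device", type=str, default="cpu", help="The device to run the model on")

args = parser.parse_args()
print(args.cost_limit)
```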
2 changes: 1 addition & 1 deletion safepo/multi_agent/happo.py
@@ -177,7 +177,7 @@ def train(self, buffer, logger):
std_advantages = torch.std(advantages_copy)
advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)

-for _ in range(self.config["ppo_epoch"]):
+for _ in range(self.config["learning_iters"]):
data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"])

for sample in data_generator:
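The renamed learning_iters key (formerly ppo_epoch) sets how many passes are made over each rollout, and num_mini_batch how many minibatches each pass yields, so one rollout produces learning_iters × num_mini_batch gradient steps. A self-contained sketch of that loop shape, with a toy generator standing in for buffer.feed_forward_generator:

```python
import torch

config = {"learning_iters": 5, "num_mini_batch": 4}        # "learning_iters" was "ppo_epoch"
advantages = torch.randn(128)
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

def feed_forward_generator(adv, num_mini_batch):
    # Yields num_mini_batch minibatches per pass over the rollout (stand-in, not SafePO's buffer).
    for idx in torch.randperm(adv.numel()).chunk(num_mini_batch):
        yield adv[idx]

steps = 0
for _ in range(config["learning_iters"]):                  # epochs over the same rollout
    for sample in feed_forward_generator(advantages, config["num_mini_batch"]):
        steps += 1                                          # one PPO gradient step per minibatch
print(steps)                                                # 5 * 4 = 20 gradient steps per rollout
```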
18 changes: 9 additions & 9 deletions safepo/multi_agent/macpo.py
@@ -231,7 +231,7 @@ def trpo_update(self, sample):
self.policy.cost_optimizer.step()


-rescale_constraint_val = (aver_episode_costs.mean() - self.config["safety_bound"]) * (1 - self.config["gamma"])
+rescale_constraint_val = (aver_episode_costs.mean() - self.config["cost_limit"]) * (1 - self.config["gamma"])

if rescale_constraint_val == 0:
rescale_constraint_val = 1e-8
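For context, the quantity computed here from the renamed cost_limit key is the discount-rescaled constraint violation used throughout the rest of the update. As a sketch, with J_C the mean episode cost, d the cost limit and gamma the discount factor:

```latex
c = \bigl(\hat{J}_C - d\bigr)\,(1 - \gamma)
```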
@@ -265,7 +265,7 @@ def trpo_update(self, sample):
r_coef = (reward_loss_grad * b_step_dir).sum(0, keepdim=True)
s_coef = (cost_loss_grad * b_step_dir).sum(0, keepdim=True)

-fraction = self.config["line_search_fraction"]
+fraction = self.config["step_fraction"]
loss_improve = 0

B_cost_loss_grad_dot = torch.dot(B_cost_loss_grad, B_cost_loss_grad)
@@ -285,7 +285,7 @@ def trpo_update(self, sample):
s_coef = 1e-8
positive_Cauchy_value = (
q_coef - (r_coef ** 2) / (1e-8 + s_coef))
-whether_recover_policy_value = 2 * self.config["kl_threshold"] - (
+whether_recover_policy_value = 2 * self.config["target_kl"] - (
rescale_constraint_val ** 2) / (
1e-8 + s_coef)
if rescale_constraint_val < 0 and whether_recover_policy_value < 0:
@@ -301,24 +301,24 @@ def trpo_update(self, sample):

if optim_case in [3, 4]:
lam = torch.sqrt(
-(q_coef / (2 * self.config["kl_threshold"])))
+(q_coef / (2 * self.config["target_kl"])))
nu = torch.tensor(0) # v_coef = 0
elif optim_case in [1, 2]:
LA, LB = [0, r_coef / rescale_constraint_val], [r_coef / rescale_constraint_val, np.inf]
LA, LB = (LA, LB) if rescale_constraint_val < 0 else (LB, LA)
proj = lambda x, L: max(L[0], min(L[1], x))
lam_a = proj(torch.sqrt(positive_Cauchy_value / whether_recover_policy_value), LA)
-lam_b = proj(torch.sqrt(q_coef / (torch.tensor(2 * self.config["kl_threshold"]))), LB)
+lam_b = proj(torch.sqrt(q_coef / (torch.tensor(2 * self.config["target_kl"]))), LB)

f_a = lambda lam: -0.5 * (positive_Cauchy_value / (
1e-8 + lam) + whether_recover_policy_value * lam) - r_coef * rescale_constraint_val / (
1e-8 + s_coef)
-f_b = lambda lam: -0.5 * (q_coef / (1e-8 + lam) + 2 * self.config["kl_threshold"] * lam)
+f_b = lambda lam: -0.5 * (q_coef / (1e-8 + lam) + 2 * self.config["target_kl"] * lam)
lam = lam_a if f_a(lam_a) >= f_b(lam_b) else lam_b
nu = max(0, lam * rescale_constraint_val - r_coef) / (1e-8 + s_coef)
else:
lam = torch.tensor(0)
-nu = torch.sqrt(torch.tensor(2 * self.config["kl_threshold"]) / (1e-8 + s_coef))
+nu = torch.sqrt(torch.tensor(2 * self.config["target_kl"]) / (1e-8 + s_coef))

x_a = (1. / (lam + 1e-8)) * (g_step_dir + nu * b_step_dir)
x_b = (nu * b_step_dir)
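The lam/nu branches above mirror the standard CPO dual solution. Writing q = g^T H^-1 g, r = g^T H^-1 b, s = b^T H^-1 b, delta for the renamed target_kl and c for the rescaled constraint value, the recovered multipliers are, up to the 1e-8 guards and sign conventions in the code (a sketch of the textbook expressions, not a transcription):

```latex
\lambda^{*} \in \left\{ \sqrt{\frac{q}{2\delta}},\ \ \sqrt{\frac{q - r^{2}/s}{2\delta - c^{2}/s}} \right\},
\qquad
\nu^{*} = \frac{\max\!\left(0,\ \lambda^{*} c - r\right)}{s}
```

optim_case 3/4 uses the first candidate, cases 1/2 compare both after projecting onto their feasible intervals (positive_Cauchy_value = q - r^2/s, whether_recover_policy_value = 2*delta - c^2/s), and case 0 falls back to a pure cost-recovery step along the cost direction.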
@@ -339,7 +339,7 @@ def trpo_update(self, sample):

flag = False
fraction_coef = self.config["fraction_coef"]
-for i in range(self.config["ls_step"]):
+for i in range(self.config["searching_steps"]):
x_norm = torch.norm(x)
if x_norm > 0.5:
x = x * 0.5 / x_norm
@@ -367,7 +367,7 @@ def trpo_update(self, sample):
available_actions_batch, active_masks_batch, new_actor=self.policy.actor, old_actor=old_actor
).mean()

-if ((kl < self.config["kl_threshold"]) and (loss_improve < 0 if optim_case > 1 else True)
+if ((kl < self.config["target_kl"]) and (loss_improve < 0 if optim_case > 1 else True)
and (new_cost_loss.mean() - cost_loss.mean() <= max(-rescale_constraint_val, 0))):
flag = True
break
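Spelled out, a candidate step from the backtracking search is accepted only if the KL to the old policy stays under the renamed target_kl, the reward surrogate loss improves (required only when optim_case > 1), and the cost surrogate does not grow by more than the allowed slack; schematically:

```latex
\mathrm{KL}\!\left(\pi_{\text{old}} \,\|\, \pi_{\theta}\right) < \delta,
\qquad
\texttt{loss\_improve} < 0 \ \ (\text{if optim\_case} > 1),
\qquad
\Delta L_{\text{cost}} \le \max\!\left(-c,\ 0\right)
```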
2 changes: 1 addition & 1 deletion safepo/multi_agent/mappo.py
@@ -169,7 +169,7 @@ def train(self, buffer, logger):
std_advantages = torch.std(advantages_copy)
advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)

-for _ in range(self.config["ppo_epoch"]):
+for _ in range(self.config["learning_iters"]):
data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"])

for sample in data_generator:
4 changes: 2 additions & 2 deletions safepo/multi_agent/mappolag.py
@@ -177,7 +177,7 @@ def ppo_update(self, sample):
actor_grad_norm = nn.utils.clip_grad_norm_(self.policy.actor.parameters(), self.config["max_grad_norm"])
self.policy.actor_optimizer.step()

-delta_lamda_lagr = -((aver_episode_costs.mean() - self.config["safety_bound"]) * (1 - self.config["gamma"]) + (imp_weights * cost_adv_targ)).mean().detach()
+delta_lamda_lagr = -((aver_episode_costs.mean() - self.config["cost_limit"]) * (1 - self.config["gamma"]) + (imp_weights * cost_adv_targ)).mean().detach()

R_Relu = torch.nn.ReLU()
new_lamda_lagr = R_Relu(self.lamda_lagr - (delta_lamda_lagr * self.config["lagrangian_coef_rate"]))
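The two lines above are a projected gradient step on the Lagrange multiplier; with the renamed cost_limit key and eta the lagrangian_coef_rate, a sketch of what they compute (J_C mean episode cost, d the cost limit, w the importance weights, A^C the cost advantage):

```latex
\Delta\lambda = -\,\mathbb{E}\!\left[\bigl(\hat{J}_C - d\bigr)(1-\gamma) + w\,A^{C}\right],
\qquad
\lambda \leftarrow \max\!\bigl(0,\ \lambda - \eta\,\Delta\lambda\bigr)
```

The ReLU keeps lambda nonnegative, so the cost penalty grows roughly while the average episode cost exceeds cost_limit and decays once it falls back under the budget.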
@@ -213,7 +213,7 @@ def train(self, buffer, logger):
std_cost_adv = torch.std(cost_adv_copy)
cost_adv = (cost_adv - mean_cost_adv) / (std_cost_adv + 1e-8)

-for _ in range(self.config["ppo_epoch"]):
+for _ in range(self.config["learning_iters"]):
data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"], cost_adv=cost_adv)

for sample in data_generator:
9 changes: 3 additions & 6 deletions safepo/multi_agent/marl_cfg/happo/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: happo
experiment_name: check
seed: 0
run_dir: ./runs
-use_centralized_V: True
-use_obs_instead_of_state: False
num_env_steps: 100000000
episode_length: 75
n_rollout_threads: 1
n_eval_rollout_threads: 1
-use_linear_lr_decay: False
hidden_size: 512
use_render: False
recurrent_N: 1
@@ -28,11 +25,11 @@ use_popart: True
use_valuenorm: True
use_proper_time_limits: False

-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
accept_ratio: 0.5
clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
num_mini_batch: 1
data_chunk_length:
value_loss_coef: 1
73 changes: 0 additions & 73 deletions safepo/multi_agent/marl_cfg/ippo/config.yaml

This file was deleted.

23 changes: 9 additions & 14 deletions safepo/multi_agent/marl_cfg/macpo/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: macpo
experiment_name: check
seed: 0
run_dir: ./runs
-use_centralized_V: True
-use_obs_instead_of_state: False
num_env_steps: 100000000
episode_length: 8
n_rollout_threads: 1
n_eval_rollout_threads: 1
-use_linear_lr_decay: False
hidden_size: 512
use_render: False
recurrent_N: 1
@@ -20,26 +17,26 @@ eval_interval: 25
log_interval: 25
eval_episodes: 10000

-safety_bound: 25
+cost_limit: 25
EPS: 1.e-8
safety_gamma: 0.09
-line_search_fraction: 0.5
+step_fraction: 0.5
g_step_dir_coef: 0.1
b_step_dir_coef: 0.1
-fraction_coef: 0.27
+fraction_coef: 0.1

gamma: 0.96
gae_lambda: 0.95
use_gae: True
use_popart: True
-use_valuenorm: False
+use_valuenorm: True
use_proper_time_limits: False

-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
accept_ratio: 0.5
clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
num_mini_batch: 1
data_chunk_length:
value_loss_coef: 1
@@ -72,7 +69,6 @@ std_x_coef: 1
std_y_coef: 0.5

mamujoco:
-use_valuenorm: True
layer_N: 1
num_env_steps: 10000000
episode_length: 1000
@@ -81,7 +77,6 @@ mamujoco:
hidden_size: 128
gamma: 0.99
safety_gamma: 0.2
-fraction_coef: 0.1
-kl_threshold: 0.01
-ppo_epoch: 15
+target_kl: 0.01
+learning_iters: 15 # Conjugate Gradient Iterations
entropy_coef: 0.01
9 changes: 3 additions & 6 deletions safepo/multi_agent/marl_cfg/mappo/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: mappo
experiment_name: check
seed: 0
run_dir: ./runs
-use_centralized_V: True
-use_obs_instead_of_state: False
num_env_steps: 100000000
episode_length: 8
n_rollout_threads: 80
n_eval_rollout_threads: 1
-use_linear_lr_decay: False
hidden_size: 512
use_render: False
recurrent_N: 1
@@ -28,11 +25,11 @@ use_popart: True
use_valuenorm: False
use_proper_time_limits: False

-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
accept_ratio: 0.5
clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
num_mini_batch: 1
data_chunk_length:
value_loss_coef: 1
11 changes: 4 additions & 7 deletions safepo/multi_agent/marl_cfg/mappolag/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: mappolag
experiment_name: check
seed: 0
run_dir: ./runs/
-use_centralized_V: True
-use_obs_instead_of_state: False
num_env_steps: 100000000
episode_length: 8
n_rollout_threads: 1
n_eval_rollout_threads: 1
-use_linear_lr_decay: False
hidden_size: 512
use_render: False
recurrent_N: 1
@@ -21,7 +18,7 @@ eval_interval: 25
log_interval: 25
eval_episodes: 10000

-safety_bound: 25
+cost_limit: 25
lagrangian_coef_rate: 1.e-5
lamda_lagr: 0.78

@@ -32,11 +29,11 @@ use_popart: True
use_valuenorm: True
use_proper_time_limits: False

-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
accept_ratio: 0.5
clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
num_mini_batch: 1
data_chunk_length:
value_loss_coef: 1