docs: polish params name #63

Merged
merged 1 commit on Oct 5, 2023
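The commit renames a handful of configuration keys across the docs, the multi-agent algorithms and their YAML configs. A hypothetical migration helper (RENAMED_KEYS and migrate_config are illustrative names, not part of SafePO or this PR) collects the full old-to-new mapping in one place:

```python
# Hypothetical helper, not part of this PR: rewrites old config keys to the new names.
RENAMED_KEYS = {
    "safety_bound": "cost_limit",
    "line_search_fraction": "step_fraction",
    "kl_threshold": "target_kl",
    "ls_step": "searching_steps",
    "ppo_epoch": "learning_iters",
}

def migrate_config(cfg: dict) -> dict:
    """Return a copy of cfg with any renamed keys rewritten."""
    return {RENAMED_KEYS.get(key, key): value for key, value in cfg.items()}

# Example: migrate_config({"ppo_epoch": 5, "gamma": 0.96}) == {"learning_iters": 5, "gamma": 0.96}
```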
2 changes: 1 addition & 1 deletion docs/source/usage/train.rst
@@ -91,7 +91,7 @@ We provide the detailed description of the command line arguments in the following
+-------------------+--------------------------------+----------------------------------------------+
| model-dir | The model dir | "" |
+-------------------+--------------------------------+----------------------------------------------+
-| safety-bound | Cost_limit | 25.0 |
+| cost-limit | Cost_limit | 25.0 |
+-------------------+--------------------------------+----------------------------------------------+
| device | The device to run the model on | "cpu" |
+-------------------+--------------------------------+----------------------------------------------+
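For readers updating launch scripts, the renamed flag would be declared roughly as in the argparse sketch below; the parser setup is illustrative, not copied from SafePO's entry points:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--model-dir", type=str, default="", help="The model dir")
# Renamed in this PR: --safety-bound -> --cost-limit (episode cost budget, default 25.0)
parser.add_argument("--cost-limit", type=float, default=25.0, help="Cost_limit")
parser.add_argument("--device", type=str, default="cpu", help="The device to run the model on")

args = parser.parse_args()
print(args.cost_limit)
```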
2 changes: 1 addition & 1 deletion safepo/multi_agent/happo.py
@@ -177,7 +177,7 @@ def train(self, buffer, logger):
std_advantages = torch.std(advantages_copy)
advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)

-for _ in range(self.config["ppo_epoch"]):
+for _ in range(self.config["learning_iters"]):
data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"])

for sample in data_generator:
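The renamed learning_iters key (formerly ppo_epoch) sets how many passes are made over each rollout, and num_mini_batch how many minibatches each pass yields, so one rollout produces learning_iters × num_mini_batch gradient steps. A self-contained sketch of that loop shape, with a toy generator standing in for buffer.feed_forward_generator:

```python
import torch

config = {"learning_iters": 5, "num_mini_batch": 4}        # "learning_iters" was "ppo_epoch"
advantages = torch.randn(128)
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

def feed_forward_generator(adv, num_mini_batch):
    # Yields num_mini_batch minibatches per pass over the rollout (stand-in, not SafePO's buffer).
    for idx in torch.randperm(adv.numel()).chunk(num_mini_batch):
        yield adv[idx]

steps = 0
for _ in range(config["learning_iters"]):                  # epochs over the same rollout
    for sample in feed_forward_generator(advantages, config["num_mini_batch"]):
        steps += 1                                          # one PPO gradient step per minibatch
print(steps)                                                # 5 * 4 = 20 gradient steps per rollout
```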
18 changes: 9 additions & 9 deletions safepo/multi_agent/macpo.py
@@ -231,7 +231,7 @@ def trpo_update(self, sample):
self.policy.cost_optimizer.step()


-rescale_constraint_val = (aver_episode_costs.mean() - self.config["safety_bound"]) * (1 - self.config["gamma"])
+rescale_constraint_val = (aver_episode_costs.mean() - self.config["cost_limit"]) * (1 - self.config["gamma"])

if rescale_constraint_val == 0:
rescale_constraint_val = 1e-8
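For context, the quantity computed here from the renamed cost_limit key is the discount-rescaled constraint violation used throughout the rest of the update. As a sketch, with J_C the mean episode cost, d the cost limit and gamma the discount factor:

```latex
c = \bigl(\hat{J}_C - d\bigr)\,(1 - \gamma)
```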
@@ -265,7 +265,7 @@ def trpo_update(self, sample):
r_coef = (reward_loss_grad * b_step_dir).sum(0, keepdim=True)
s_coef = (cost_loss_grad * b_step_dir).sum(0, keepdim=True)

-fraction = self.config["line_search_fraction"]
+fraction = self.config["step_fraction"]
loss_improve = 0

B_cost_loss_grad_dot = torch.dot(B_cost_loss_grad, B_cost_loss_grad)
@@ -285,7 +285,7 @@ def trpo_update(self, sample):
s_coef = 1e-8
positive_Cauchy_value = (
q_coef - (r_coef ** 2) / (1e-8 + s_coef))
-whether_recover_policy_value = 2 * self.config["kl_threshold"] - (
+whether_recover_policy_value = 2 * self.config["target_kl"] - (
rescale_constraint_val ** 2) / (
1e-8 + s_coef)
if rescale_constraint_val < 0 and whether_recover_policy_value < 0:
@@ -301,24 +301,24 @@ def trpo_update(self, sample):

if optim_case in [3, 4]:
lam = torch.sqrt(
-(q_coef / (2 * self.config["kl_threshold"])))
+(q_coef / (2 * self.config["target_kl"])))
nu = torch.tensor(0) # v_coef = 0
elif optim_case in [1, 2]:
LA, LB = [0, r_coef / rescale_constraint_val], [r_coef / rescale_constraint_val, np.inf]
LA, LB = (LA, LB) if rescale_constraint_val < 0 else (LB, LA)
proj = lambda x, L: max(L[0], min(L[1], x))
lam_a = proj(torch.sqrt(positive_Cauchy_value / whether_recover_policy_value), LA)
-lam_b = proj(torch.sqrt(q_coef / (torch.tensor(2 * self.config["kl_threshold"]))), LB)
+lam_b = proj(torch.sqrt(q_coef / (torch.tensor(2 * self.config["target_kl"]))), LB)

f_a = lambda lam: -0.5 * (positive_Cauchy_value / (
1e-8 + lam) + whether_recover_policy_value * lam) - r_coef * rescale_constraint_val / (
1e-8 + s_coef)
-f_b = lambda lam: -0.5 * (q_coef / (1e-8 + lam) + 2 * self.config["kl_threshold"] * lam)
+f_b = lambda lam: -0.5 * (q_coef / (1e-8 + lam) + 2 * self.config["target_kl"] * lam)
lam = lam_a if f_a(lam_a) >= f_b(lam_b) else lam_b
nu = max(0, lam * rescale_constraint_val - r_coef) / (1e-8 + s_coef)
else:
lam = torch.tensor(0)
-nu = torch.sqrt(torch.tensor(2 * self.config["kl_threshold"]) / (1e-8 + s_coef))
+nu = torch.sqrt(torch.tensor(2 * self.config["target_kl"]) / (1e-8 + s_coef))

x_a = (1. / (lam + 1e-8)) * (g_step_dir + nu * b_step_dir)
x_b = (nu * b_step_dir)
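The lam/nu branches above mirror the standard CPO dual solution. Writing q = g^T H^-1 g, r = g^T H^-1 b, s = b^T H^-1 b, delta for the renamed target_kl and c for the rescaled constraint value, the recovered multipliers are, up to the 1e-8 guards and sign conventions in the code (a sketch of the textbook expressions, not a transcription):

```latex
\lambda^{*} \in \left\{ \sqrt{\frac{q}{2\delta}},\ \ \sqrt{\frac{q - r^{2}/s}{2\delta - c^{2}/s}} \right\},
\qquad
\nu^{*} = \frac{\max\!\left(0,\ \lambda^{*} c - r\right)}{s}
```

optim_case 3/4 uses the first candidate, cases 1/2 compare both after projecting onto their feasible intervals (positive_Cauchy_value = q - r^2/s, whether_recover_policy_value = 2*delta - c^2/s), and case 0 falls back to a pure cost-recovery step along the cost direction.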
@@ -339,7 +339,7 @@ def trpo_update(self, sample):

flag = False
fraction_coef = self.config["fraction_coef"]
-for i in range(self.config["ls_step"]):
+for i in range(self.config["searching_steps"]):
x_norm = torch.norm(x)
if x_norm > 0.5:
x = x * 0.5 / x_norm
@@ -367,7 +367,7 @@ def trpo_update(self, sample):
available_actions_batch, active_masks_batch, new_actor=self.policy.actor, old_actor=old_actor
).mean()

-if ((kl < self.config["kl_threshold"]) and (loss_improve < 0 if optim_case > 1 else True)
+if ((kl < self.config["target_kl"]) and (loss_improve < 0 if optim_case > 1 else True)
and (new_cost_loss.mean() - cost_loss.mean() <= max(-rescale_constraint_val, 0))):
flag = True
break
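Spelled out, a candidate step from the backtracking search is accepted only if the KL to the old policy stays under the renamed target_kl, the reward surrogate loss improves (required only when optim_case > 1), and the cost surrogate does not grow by more than the allowed slack; schematically:

```latex
\mathrm{KL}\!\left(\pi_{\text{old}} \,\|\, \pi_{\theta}\right) < \delta,
\qquad
\texttt{loss\_improve} < 0 \ \ (\text{if optim\_case} > 1),
\qquad
\Delta L_{\text{cost}} \le \max\!\left(-c,\ 0\right)
```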
2 changes: 1 addition & 1 deletion safepo/multi_agent/mappo.py
@@ -169,7 +169,7 @@ def train(self, buffer, logger):
std_advantages = torch.std(advantages_copy)
advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)

-for _ in range(self.config["ppo_epoch"]):
+for _ in range(self.config["learning_iters"]):
data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"])

for sample in data_generator:
4 changes: 2 additions & 2 deletions safepo/multi_agent/mappolag.py
@@ -177,7 +177,7 @@ def ppo_update(self, sample):
actor_grad_norm = nn.utils.clip_grad_norm_(self.policy.actor.parameters(), self.config["max_grad_norm"])
self.policy.actor_optimizer.step()

-delta_lamda_lagr = -((aver_episode_costs.mean() - self.config["safety_bound"]) * (1 - self.config["gamma"]) + (imp_weights * cost_adv_targ)).mean().detach()
+delta_lamda_lagr = -((aver_episode_costs.mean() - self.config["cost_limit"]) * (1 - self.config["gamma"]) + (imp_weights * cost_adv_targ)).mean().detach()

R_Relu = torch.nn.ReLU()
new_lamda_lagr = R_Relu(self.lamda_lagr - (delta_lamda_lagr * self.config["lagrangian_coef_rate"]))
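The two lines above are a projected gradient step on the Lagrange multiplier; with the renamed cost_limit key and eta the lagrangian_coef_rate, a sketch of what they compute (J_C mean episode cost, d the cost limit, w the importance weights, A^C the cost advantage):

```latex
\Delta\lambda = -\,\mathbb{E}\!\left[\bigl(\hat{J}_C - d\bigr)(1-\gamma) + w\,A^{C}\right],
\qquad
\lambda \leftarrow \max\!\bigl(0,\ \lambda - \eta\,\Delta\lambda\bigr)
```

The ReLU keeps lambda nonnegative, so the cost penalty grows roughly while the average episode cost exceeds cost_limit and decays once it falls back under the budget.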
@@ -213,7 +213,7 @@ def train(self, buffer, logger):
std_cost_adv = torch.std(cost_adv_copy)
cost_adv = (cost_adv - mean_cost_adv) / (std_cost_adv + 1e-8)

-for _ in range(self.config["ppo_epoch"]):
+for _ in range(self.config["learning_iters"]):
data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"], cost_adv=cost_adv)

for sample in data_generator:
9 changes: 3 additions & 6 deletions safepo/multi_agent/marl_cfg/happo/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: happo
experiment_name: check
seed: 0
run_dir: ./runs
-use_centralized_V: True
-use_obs_instead_of_state: False
num_env_steps: 100000000
episode_length: 75
n_rollout_threads: 1
n_eval_rollout_threads: 1
-use_linear_lr_decay: False
hidden_size: 512
use_render: False
recurrent_N: 1
@@ -28,11 +25,11 @@ use_popart: True
use_valuenorm: True
use_proper_time_limits: False

-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
accept_ratio: 0.5
clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
num_mini_batch: 1
data_chunk_length:
value_loss_coef: 1
73 changes: 0 additions & 73 deletions safepo/multi_agent/marl_cfg/ippo/config.yaml

This file was deleted.

23 changes: 9 additions & 14 deletions safepo/multi_agent/marl_cfg/macpo/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: macpo
experiment_name: check
seed: 0
run_dir: ./runs
-use_centralized_V: True
-use_obs_instead_of_state: False
num_env_steps: 100000000
episode_length: 8
n_rollout_threads: 1
n_eval_rollout_threads: 1
-use_linear_lr_decay: False
hidden_size: 512
use_render: False
recurrent_N: 1
@@ -20,26 +17,26 @@ eval_interval: 25
log_interval: 25
eval_episodes: 10000

-safety_bound: 25
+cost_limit: 25
EPS: 1.e-8
safety_gamma: 0.09
-line_search_fraction: 0.5
+step_fraction: 0.5
g_step_dir_coef: 0.1
b_step_dir_coef: 0.1
-fraction_coef: 0.27
+fraction_coef: 0.1

gamma: 0.96
gae_lambda: 0.95
use_gae: True
use_popart: True
-use_valuenorm: False
+use_valuenorm: True
use_proper_time_limits: False

-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
accept_ratio: 0.5
clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
num_mini_batch: 1
data_chunk_length:
value_loss_coef: 1
@@ -72,7 +69,6 @@ std_x_coef: 1
std_y_coef: 0.5

mamujoco:
-use_valuenorm: True
layer_N: 1
num_env_steps: 10000000
episode_length: 1000
@@ -81,7 +77,6 @@ mamujoco:
hidden_size: 128
gamma: 0.99
safety_gamma: 0.2
-fraction_coef: 0.1
-kl_threshold: 0.01
-ppo_epoch: 15
+target_kl: 0.01
+learning_iters: 15 # Conjugate Gradient Iterations
entropy_coef: 0.01
9 changes: 3 additions & 6 deletions safepo/multi_agent/marl_cfg/mappo/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: mappo
experiment_name: check
seed: 0
run_dir: ./runs
-use_centralized_V: True
-use_obs_instead_of_state: False
num_env_steps: 100000000
episode_length: 8
n_rollout_threads: 80
n_eval_rollout_threads: 1
-use_linear_lr_decay: False
hidden_size: 512
use_render: False
recurrent_N: 1
@@ -28,11 +25,11 @@ use_popart: True
use_valuenorm: False
use_proper_time_limits: False

-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
accept_ratio: 0.5
clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
num_mini_batch: 1
data_chunk_length:
value_loss_coef: 1
11 changes: 4 additions & 7 deletions safepo/multi_agent/marl_cfg/mappolag/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: mappolag
experiment_name: check
seed: 0
run_dir: ./runs/
-use_centralized_V: True
-use_obs_instead_of_state: False
num_env_steps: 100000000
episode_length: 8
n_rollout_threads: 1
n_eval_rollout_threads: 1
-use_linear_lr_decay: False
hidden_size: 512
use_render: False
recurrent_N: 1
@@ -21,7 +18,7 @@ eval_interval: 25
log_interval: 25
eval_episodes: 10000

-safety_bound: 25
+cost_limit: 25
lagrangian_coef_rate: 1.e-5
lamda_lagr: 0.78

@@ -32,11 +29,11 @@ use_popart: True
use_valuenorm: True
use_proper_time_limits: False

-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
accept_ratio: 0.5
clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
num_mini_batch: 1
data_chunk_length:
value_loss_coef: 1