Source code for harl.runners.on_policy_ma_runner

"""Runner for on-policy MA algorithms."""
import numpy as np
import torch
from harl.runners.on_policy_base_runner import OnPolicyBaseRunner


class OnPolicyMARunner(OnPolicyBaseRunner):
    """Runner for on-policy MA algorithms."""

    def train(self):
        """Training procedure for MAPPO."""
        actor_train_infos = []

        # compute advantages
        if self.value_normalizer is not None:
            advantages = self.critic_buffer.returns[
                :-1
            ] - self.value_normalizer.denormalize(self.critic_buffer.value_preds[:-1])
        else:
            advantages = (
                self.critic_buffer.returns[:-1] - self.critic_buffer.value_preds[:-1]
            )

        # normalize advantages for FP, using only steps where agents are active
        if self.state_type == "FP":
            active_masks_collector = [
                self.actor_buffer[i].active_masks for i in range(self.num_agents)
            ]
            active_masks_array = np.stack(active_masks_collector, axis=2)
            advantages_copy = advantages.copy()
            advantages_copy[active_masks_array[:-1] == 0.0] = np.nan
            mean_advantages = np.nanmean(advantages_copy)
            std_advantages = np.nanstd(advantages_copy)
            advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)

        # update actors
        if self.share_param:
            # the shared actor is trained once; record the same train info for every agent
            actor_train_info = self.actor[0].share_param_train(
                self.actor_buffer, advantages.copy(), self.num_agents, self.state_type
            )
            for _ in torch.randperm(self.num_agents):
                actor_train_infos.append(actor_train_info)
        else:
            for agent_id in range(self.num_agents):
                if self.state_type == "EP":
                    actor_train_info = self.actor[agent_id].train(
                        self.actor_buffer[agent_id], advantages.copy(), "EP"
                    )
                elif self.state_type == "FP":
                    # under FP, each agent trains on its own slice of the advantages
                    actor_train_info = self.actor[agent_id].train(
                        self.actor_buffer[agent_id],
                        advantages[:, :, agent_id].copy(),
                        "FP",
                    )
                actor_train_infos.append(actor_train_info)

        # update critic
        critic_train_info = self.critic.train(self.critic_buffer, self.value_normalizer)

        return actor_train_infos, critic_train_info
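
For reference, the FP branch above normalizes advantages only over steps where an agent is active, by masking inactive entries with NaN before computing the statistics. Below is a minimal standalone NumPy sketch of that masking trick; the buffer shapes (episode_length + 1, n_threads, n_agents, 1) and random data are illustrative assumptions standing in for the real critic and actor buffers.

# Standalone sketch of the FP advantage normalization (shapes are assumed, not the real buffers).
import numpy as np

episode_length, n_threads, n_agents = 5, 2, 3
returns = np.random.randn(episode_length + 1, n_threads, n_agents, 1)
value_preds = np.random.randn(episode_length + 1, n_threads, n_agents, 1)
# 0 marks steps where an agent is inactive
active_masks = np.random.randint(
    0, 2, (episode_length + 1, n_threads, n_agents, 1)
).astype(float)

advantages = returns[:-1] - value_preds[:-1]

# mask out inactive steps before computing normalization statistics
advantages_copy = advantages.copy()
advantages_copy[active_masks[:-1] == 0.0] = np.nan
mean_advantages = np.nanmean(advantages_copy)
std_advantages = np.nanstd(advantages_copy)
normalized_advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)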