Train
To train and evaluate an RL algorithm using Ray, use the appropriate training script. Here are the commands for different configurations:
HRL (Hierarchical Reinforcement Learning) Configuration
python train_truly_hierarchical.py
HL+LLP (High Level + Low-Level Pretrained) Configuration
python baselines/train_geo_dcrl.py
HLO (High Level Only) Configuration
python baselines/train_hierarchical.py
Training Script
The provided training script train_truly_hierarchical.py uses Ray for distributed training. Here is a brief overview of the script for the PPO-based HRL configuration:
import os
import ray
from ray import air, tune
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from gymnasium.spaces import Discrete, Box
from envs.truly_heirarchical_env import TrulyHeirarchicalDCRL
from envs.heirarchical_env import HeirarchicalDCRL, DEFAULT_CONFIG
from create_trainable import create_wrapped_trainable
NUM_WORKERS = 1
NAME = "test"
RESULTS_DIR = './results/'
# Dummy env to get obs and action space
hdcrl_env = HeirarchicalDCRL()
CONFIG = (
    PPOConfig()
    .environment(
        env=TrulyHeirarchicalDCRL,
        env_config=DEFAULT_CONFIG
    )
    .framework("torch")
    .rollouts(
        num_rollout_workers=NUM_WORKERS,
        rollout_fragment_length=2,
    )
    .training(
        gamma=0.99,
        lr=1e-5,
        kl_coeff=0.2,
        clip_param=0.1,
        entropy_coeff=0.0,
        use_gae=True,
        train_batch_size=4096,
        num_sgd_iter=10,
        model={'fcnet_hiddens': [64, 64]},
        shuffle_sequences=True
    )
    .multi_agent(
        policies={
            # One PPO policy for the top-level agent, plus one load-shifting
            # policy per data center.
            "high_level_policy": (
                None,
                hdcrl_env.observation_space,
                hdcrl_env.action_space,
                PPOConfig()
            ),
            "DC1_ls_policy": (
                None,
                Box(-1.0, 1.0, (14,)),
                Discrete(3),
                PPOConfig()
            ),
            "DC2_ls_policy": (
                None,
                Box(-1.0, 1.0, (14,)),
                Discrete(3),
                PPOConfig()
            ),
            "DC3_ls_policy": (
                None,
                Box(-1.0, 1.0, (14,)),
                Discrete(3),
                PPOConfig()
            ),
        },
        # Each agent ID is mapped to the policy of the same name.
        policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: agent_id,
    )
    .resources(num_gpus=0)
    .debugging(seed=0)
)

if __name__ == "__main__":
    os.environ["RAY_DEDUP_LOGS"] = "0"
    ray.init(ignore_reinit_error=True)

    tune.Tuner(
        create_wrapped_trainable(PPO),
        param_space=CONFIG.to_dict(),
        run_config=air.RunConfig(
            stop={"timesteps_total": 100_000_000},
            verbose=0,
            local_dir=RESULTS_DIR,
            name=NAME,
            checkpoint_config=ray.air.CheckpointConfig(
                checkpoint_frequency=5,
                num_to_keep=5,
                checkpoint_score_attribute="episode_reward_mean",
                checkpoint_score_order="max"
            ),
        )
    ).fit()
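Ray Tune writes results and checkpoints under local_dir/name, i.e. ./results/test/ in this example. Since Tune also emits TensorBoard event files in that directory by default (provided tensorboardX is installed), training progress such as episode_reward_mean can be monitored by pointing TensorBoard at the results directory.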
This example assumes a DCC with three data centers. To use a different algorithm, such as A2C, replace PPOConfig with A2CConfig (or the appropriate config class for the chosen algorithm) and adjust the hyperparameters accordingly. For example:
from ray.rllib.algorithms.a2c import A2C, A2CConfig

CONFIG = (
    A2CConfig()
    .environment(
        env=TrulyHeirarchicalDCRL,
        env_config=DEFAULT_CONFIG
    )
    .framework("torch")
    .rollouts(
        num_rollout_workers=NUM_WORKERS,
        rollout_fragment_length=2,
    )
    .training(
        # A2C does not use PPO's KL penalty, surrogate clipping, or multiple
        # SGD passes, so kl_coeff, clip_param, and num_sgd_iter are not set.
        gamma=0.99,
        lr=1e-5,
        entropy_coeff=0.0,
        use_gae=True,
        train_batch_size=4096,
        model={'fcnet_hiddens': [64, 64]},
    )
    .multi_agent(
        policies={
            "high_level_policy": (
                None,
                hdcrl_env.observation_space,
                hdcrl_env.action_space,
                A2CConfig()
            ),
            "DC1_ls_policy": (
                None,
                Box(-1.0, 1.0, (14,)),
                Discrete(3),
                A2CConfig()
            ),
            "DC2_ls_policy": (
                None,
                Box(-1.0, 1.0, (14,)),
                Discrete(3),
                A2CConfig()
            ),
            "DC3_ls_policy": (
                None,
                Box(-1.0, 1.0, (14,)),
                Discrete(3),
                A2CConfig()
            ),
        },
        policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: agent_id,
    )
    .resources(num_gpus=0)
    .debugging(seed=1)
)

if __name__ == "__main__":
    os.environ["RAY_DEDUP_LOGS"] = "0"
    ray.init(ignore_reinit_error=True)

    tune.Tuner(
        create_wrapped_trainable(A2C),
        param_space=CONFIG.to_dict(),
        run_config=air.RunConfig(
            stop={"timesteps_total": 100_000_000},
            verbose=0,
            local_dir=RESULTS_DIR,
            name=NAME,
            checkpoint_config=ray.air.CheckpointConfig(
                checkpoint_frequency=5,
                num_to_keep=5,
                checkpoint_score_attribute="episode_reward_mean",
                checkpoint_score_order="max"
            ),
        )
    ).fit()
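After training, the saved checkpoints can be restored for evaluation. The snippet below is a minimal sketch using RLlib's Algorithm.from_checkpoint; it assumes a Ray 2.x installation, that CHECKPOINT_PATH (a placeholder) points to one of the checkpoint directories written under ./results/test/, and that TrulyHeirarchicalDCRL can be instantiated directly from DEFAULT_CONFIG and follows the standard RLlib multi-agent reset/step API:

# Evaluation sketch. Assumptions: Ray 2.x; CHECKPOINT_PATH is a placeholder for
# one of the checkpoint_... directories created under ./results/test/; the
# environment can be built directly from DEFAULT_CONFIG and returns dict
# observations keyed by agent ID.
from ray.rllib.algorithms.algorithm import Algorithm

from envs.truly_heirarchical_env import TrulyHeirarchicalDCRL
from envs.heirarchical_env import DEFAULT_CONFIG

CHECKPOINT_PATH = "./results/test/<trial_dir>/checkpoint_000005"  # placeholder

algo = Algorithm.from_checkpoint(CHECKPOINT_PATH)
env = TrulyHeirarchicalDCRL(DEFAULT_CONFIG)

obs, _ = env.reset()
done = {"__all__": False}
episode_reward = 0.0

while not done["__all__"]:
    # Route each agent's observation to the policy with the same name,
    # mirroring the policy_mapping_fn used during training.
    actions = {
        agent_id: algo.compute_single_action(
            agent_obs, policy_id=agent_id, explore=False
        )
        for agent_id, agent_obs in obs.items()
    }
    obs, rewards, done, truncated, infos = env.step(actions)
    episode_reward += sum(rewards.values())

print(f"Total episode reward: {episode_reward:.2f}")

If the environment exposes a different step signature (for example, an older Gym-style API without the separate truncated dictionary), adjust the rollout loop accordingly.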