Source code for envs.sustaindc.dc_gym

from typing import Optional, Tuple
import numpy as np
import pandas as pd

import gymnasium as gym
from gymnasium import spaces
from collections import deque

import envs.sustaindc.datacenter_model as DataCenter

class dc_gymenv(gym.Env):
    """Gymnasium environment evaluating data center IT and HVAC power.

    Each call to :meth:`step` takes a CRAC setpoint, runs the IT model
    (``DataCenter.DataCenter_ITModel``) and the HVAC model
    (``DataCenter.calculate_HVAC_power``) for the currently configured
    workload fractions and weather, and reports the results through the
    ``info`` dictionary. The reward is always 0 and the ``done`` /
    ``truncated`` flags are always False: they are managed by the caller.

    The workload and weather inputs are pushed in from outside through
    :meth:`update_workloads`, :meth:`set_ambient_temp` and
    :meth:`set_bat_SoC`.
    """

    def __init__(self,
                 observation_variables: list,
                 dc_memory_GB: float,
                 observation_space: spaces.Box,
                 action_variables: list,
                 action_space: spaces.Discrete,
                 action_mapping: dict,
                 ranges: dict,
                 add_cpu_usage: bool,
                 add_gpu_usage: bool,
                 min_temp: float,
                 max_temp: float,
                 action_definition: dict,
                 DC_Config,
                 seed: int = 123,
                 episode_length_in_time: Optional[pd.Timedelta] = None,
                 ):
        """Creates the data center environment.

        Args:
            observation_variables (list[str]): The partial list of variables
                that will be evaluated inside this environment. The actual
                gym space may include other variables like sine cosine of
                hours, day of year, cpu usage, carbon intensity and battery
                state of charge.
            dc_memory_GB (float): The DRAM memory in a datacenter.
            observation_space (spaces.Box): The gym observation space
                following the gymnasium standard.
            action_variables (list[str]): The list of action variables for
                the environment.
            action_space (spaces.Discrete): The gym action space following
                the gymnasium standard.
            action_mapping (dict): A mapping from agent discrete action
                choice to actual delta change in setpoint.
            ranges (dict[str, list]): The upper and lower bounds on the
                observation_variables.
            add_cpu_usage (bool): Whether to include CPU usage in the
                observation space.
            add_gpu_usage (bool): Whether to include GPU usage in the
                observation space.
            min_temp (float): The minimum temperature allowed for the CRAC
                setpoint.
            max_temp (float): The maximum temperature allowed for the CRAC
                setpoint.
            action_definition (dict): A mapping of the action name to the
                default or initialized value. Only
                ``action_definition['cooling setpoints']['initial_value']``
                is read here.
            DC_Config: Data center configuration object. It is accessed by
                attribute (``NUM_RACKS``, ``RACK_CPU_CONFIG``,
                ``MAX_W_PER_RACK``, ...), so it is not a plain ``dict``.
                ``RACK_GPU_CONFIG`` is optional.
            seed (int, optional): Seed used by :meth:`reset` when the caller
                does not supply one. Defaults to 123.
            episode_length_in_time (pd.Timedelta, optional): Accepted for
                interface compatibility; it is not used by this class.
                Defaults to None.
        """
        super().__init__()

        self.observation_variables = observation_variables
        self.observation_space = observation_space
        self.action_variables = action_variables
        self.action_space = action_space
        self.action_mapping = action_mapping
        self.dc_memory_GB = dc_memory_GB
        self.ranges = ranges
        self.seed = seed
        self.add_cpu_usage = add_cpu_usage
        self.add_gpu_usage = add_gpu_usage
        self.ambient_temp = 20
        # Observation scaling is off by default; obs_min / obs_max (and
        # optionally obs_delta) are expected to be filled in from outside
        # before scale_obs is switched on.
        self.scale_obs = False
        self.obs_max = []
        self.obs_min = []

        self.DC_Config = DC_Config

        # GPU support is optional: configurations without RACK_GPU_CONFIG
        # build a model without GPUs.
        gpu_config = getattr(self.DC_Config, 'RACK_GPU_CONFIG', None)

        self.dc = DataCenter.DataCenter_ITModel(
            num_racks=self.DC_Config.NUM_RACKS,
            dc_memory_GB=self.dc_memory_GB,
            rack_supply_approach_temp_list=self.DC_Config.RACK_SUPPLY_APPROACH_TEMP_LIST,
            rack_CPU_config=self.DC_Config.RACK_CPU_CONFIG,
            rack_GPU_config=gpu_config,
            max_W_per_rack=self.DC_Config.MAX_W_PER_RACK,
            DC_ITModel_config=self.DC_Config,
        )
        self.has_gpus = self.dc.has_gpus

        self._clear_power_state()

        self.cpu_load_frac = 0.5
        self.gpu_load_frac = 0.5
        self.mem_load_frac = 0.5
        self.bat_SoC = 300*1e3  # all units are SI

        self.raw_curr_state = None
        self.raw_next_state = None
        self.raw_curr_stpt = action_definition['cooling setpoints']['initial_value']
        self.max_temp = max_temp
        self.min_temp = min_temp

        self.consecutive_actions = 0
        self.last_action = None
        self.action_scaling_factor = 1  # Starts with a scale factor of 1

    def _clear_power_state(self):
        """Clear the per-step HVAC loads, rack-wise results and water usage."""
        self.CRAC_Fan_load = None
        self.CRAC_cooling_load = None
        self.Compressor_load = None
        self.CW_pump_load = None
        self.CT_pump_load = None
        self.rackwise_cpu_pwr = []
        self.rackwise_itfan_pwr = []
        self.rackwise_memory_power = []
        self.rackwise_gpu_pwr = []
        self.rackwise_outlet_temp = []
        self.water_usage = None

    def _maybe_normalize(self, raw_state):
        """Return the normalized state, or None when there is nothing to scale.

        None is returned when scaling is disabled or when ``raw_state`` has
        not been populated; calling :meth:`normalize` on None would raise.
        """
        if self.scale_obs and raw_state is not None:
            return self.normalize(raw_state)
        return None

    @staticmethod
    def _has_values(seq):
        """Return True for a non-empty sequence (list or numpy array)."""
        # len() is used instead of truthiness because the truth value of a
        # multi-element numpy array is ambiguous and raises ValueError.
        return seq is not None and len(seq) > 0

    @staticmethod
    def _checked_fraction(value, label):
        """Return ``value`` if it lies in [0.0, 1.0], otherwise raise.

        Raises:
            AssertionError: If ``value`` is outside [0.0, 1.0]. The exception
                is raised explicitly (not via ``assert``) so the check is not
                removed when Python runs with ``-O``.
        """
        if not 0.0 <= value <= 1.0:
            message = f'{label} load out of bounds'
            print(message)
            raise AssertionError(message)
        return value

    def reset(self, *, seed=None, options=None):
        """Reset `dc_gymenv` to its initial state.

        Args:
            seed (int, optional): Random seed. When None, the seed given to
                the constructor is used.
            options (dict, optional): Environment options (unused).

        Returns:
            observation: The normalized ``raw_curr_state`` when ``scale_obs``
                is enabled and a state is available, otherwise None.
            info (dict): Placeholder values for every reported quantity.
        """
        super().reset(seed=self.seed if seed is None else seed)

        self._clear_power_state()

        self.consecutive_actions = 0
        self.last_action = None
        self.action_scaling_factor = 1  # Starts with a scale factor of 1

        self.info = {
            'dc_ITE_total_power_kW': 0,
            'dc_CT_total_power_kW': 0,
            'dc_Compressor_total_power_kW': 0,
            'dc_HVAC_total_power_kW': 0,
            'dc_total_power_kW': 0,
            'dc_crac_setpoint_delta': 16,
            'dc_crac_setpoint': 16,
            'dc_cpu_workload_fraction': 1,
            'dc_gpu_workload_fraction': 1 if self.has_gpus else 0,
            'dc_mem_workload_fraction': 1,
            'dc_int_temperature': 16,
            'dc_exterior_ambient_temp': 16,
            'dc_CW_pump_power_kW': 0,
            'dc_CT_pump_power_kW': 0,
            'dc_water_usage': 0,
        }

        return self._maybe_normalize(self.raw_curr_state), self.info

    def step(self, raw_curr_stpt):
        """Evaluate the data center for one step at the given CRAC setpoint.

        :meth:`set_ambient_temp` must have been called beforehand, because
        the wet-bulb temperature read here is only set by that method.

        Args:
            raw_curr_stpt (float): CRAC setpoint to apply for this step.

        Returns:
            observation: The normalized ``raw_next_state`` when ``scale_obs``
                is enabled and a state is available, otherwise None.
            reward (float): Always 0.
            done (bool): Always False; managed by the caller.
            truncated (bool): Always False; managed by the caller.
            info (dict): Power, temperature, workload and water-usage values
                computed for this step.
        """
        self.raw_curr_stpt = raw_curr_stpt

        # Every rack receives the same utilization percentage.
        num_racks = self.DC_Config.NUM_RACKS
        ITE_load_pct_list = [self.cpu_load_frac*100] * num_racks
        mem_load_pct_list = [self.mem_load_frac*100] * num_racks
        GPU_load_pct_list = None
        if self.has_gpus:
            GPU_load_pct_list = [self.gpu_load_frac*100] * num_racks

        result = self.dc.compute_datacenter_IT_load_outlet_temp(
            ITE_load_pct_list=ITE_load_pct_list,
            CRAC_setpoint=self.raw_curr_stpt,
            GPU_load_pct_list=GPU_load_pct_list,
            MEMORY_load_pct_list=mem_load_pct_list,
        )

        # A 5-tuple carries GPU power; otherwise GPU power is taken as zero
        # for every rack.
        if len(result) == 5:
            (self.rackwise_cpu_pwr,
             self.rackwise_itfan_pwr,
             self.rackwise_memory_power,
             self.rackwise_gpu_pwr,
             self.rackwise_outlet_temp) = result
        else:
            (self.rackwise_cpu_pwr,
             self.rackwise_itfan_pwr,
             self.rackwise_memory_power,
             self.rackwise_outlet_temp) = result
            self.rackwise_gpu_pwr = [0] * len(self.rackwise_cpu_pwr)

        avg_CRAC_return_temp = DataCenter.calculate_avg_CRAC_return_temp(
            rack_return_approach_temp_list=self.DC_Config.RACK_RETURN_APPROACH_TEMP_LIST,
            rackwise_outlet_temp=self.rackwise_outlet_temp,
        )

        data_center_total_ITE_Load = (
            sum(self.rackwise_cpu_pwr)
            + sum(self.rackwise_itfan_pwr)
            + sum(self.rackwise_gpu_pwr)
            + sum(self.rackwise_memory_power)
        )

        (self.CRAC_Fan_load,
         self.CT_Cooling_load,
         self.CRAC_Cooling_load,
         self.Compressor_load,
         self.CW_pump_load,
         self.CT_pump_load) = DataCenter.calculate_HVAC_power(
            CRAC_setpoint=self.raw_curr_stpt,
            avg_CRAC_return_temp=avg_CRAC_return_temp,
            ambient_temp=self.ambient_temp,
            data_center_full_load=data_center_total_ITE_Load,
            DC_Config=self.DC_Config,
        )
        # Keep the attribute initialised by __init__/reset in sync; the
        # capitalised name is retained for existing readers.
        self.CRAC_cooling_load = self.CRAC_Cooling_load
        self.HVAC_load = self.CT_Cooling_load + self.Compressor_load

        # Inputs for the cooling tower water usage calculation.
        self.dc.hot_water_temp = avg_CRAC_return_temp
        self.dc.cold_water_temp = self.raw_curr_stpt
        self.dc.wet_bulb_temp = self.wet_bulb
        self.water_usage = self.dc.calculate_cooling_tower_water_usage()

        self.reward = 0

        # NOTE(review): the two pump entries are reported without the /1e3
        # conversion applied to the other *_kW entries - confirm the units
        # returned by calculate_HVAC_power.
        self.info = {
            'dc_ITE_total_power_kW': data_center_total_ITE_Load / 1e3,
            'dc_CT_total_power_kW': self.CT_Cooling_load / 1e3,
            'dc_Compressor_total_power_kW': self.Compressor_load / 1e3,
            'dc_HVAC_total_power_kW': (self.CT_Cooling_load + self.Compressor_load) / 1e3,
            'dc_total_power_kW': (data_center_total_ITE_Load + self.CT_Cooling_load + self.Compressor_load) / 1e3,
            'dc_crac_setpoint': self.raw_curr_stpt,
            'dc_cpu_workload_fraction': self.cpu_load_frac,
            'dc_gpu_workload_fraction': self.gpu_load_frac if self.has_gpus else 0,
            'dc_mem_workload_fraction': self.mem_load_frac,
            'dc_int_temperature': np.mean(self.rackwise_outlet_temp),
            'dc_exterior_ambient_temp': self.ambient_temp,
            'dc_CW_pump_power_kW': self.CW_pump_load,
            'dc_CT_pump_power_kW': self.CT_pump_load,
            'dc_water_usage': self.water_usage,
        }

        # Done and truncated are managed by the main class
        truncated = False
        done = False

        return self._maybe_normalize(self.raw_next_state), self.reward, done, truncated, self.info

    def normalize(self, obs):
        """Scale an observation as ``(obs - obs_min) / obs_delta``.

        ``obs_delta`` is used when it has been set on the instance; otherwise
        it is derived as ``obs_max - obs_min``.

        Args:
            obs: Observation values (list or array).

        Returns:
            The scaled observation converted with ``np.float32``.
        """
        lower = np.asarray(self.obs_min)
        span = getattr(self, 'obs_delta', None)
        if span is None:
            span = np.asarray(self.obs_max) - lower
        return np.float32((np.asarray(obs) - lower) / span)

    def get_obs(self):
        """Return the observation at the current time step.

        ``HVAC_load`` is only assigned by :meth:`step`, so this method must
        not be called before the first step.

        Returns:
            list: ``[ambient_temp, cooling_setpoint, zone_air_temp,
            hvac_power, it_power]``.
        """
        # Cooling setpoint: min_temp is the default for the reset state.
        zone_air_therm_cooling_stpt = self.min_temp
        if self.raw_curr_stpt is not None:
            zone_air_therm_cooling_stpt = self.raw_curr_stpt

        # Zone air temperature: mean rack outlet temperature, falling back
        # to the observation lower bound only when no outlet data exists.
        if self._has_values(self.rackwise_outlet_temp):
            zone_air_temp = sum(self.rackwise_outlet_temp)/len(self.rackwise_outlet_temp)
        else:
            zone_air_temp = self.obs_min[2]

        # 'Facility Total HVAC Electricity Demand Rate(Whole Building)' ie 'HVAC POWER'
        hvac_power = self.HVAC_load

        # 'Facility Total Building Electricity Demand Rate(Whole Building)' i.e. 'IT POWER'
        # NOTE(review): memory power is part of the IT total reported by
        # step() but is not summed here - confirm whether that is intended.
        it_power = 0
        for rack_power in (self.rackwise_cpu_pwr,
                           self.rackwise_itfan_pwr,
                           self.rackwise_gpu_pwr):
            if self._has_values(rack_power):
                it_power += sum(rack_power)

        # If no power components were available, use the fallback value
        if it_power == 0:
            it_power = self.ranges['Facility Total Building Electricity Demand Rate(Whole Building)'][0]

        return [self.ambient_temp, zone_air_therm_cooling_stpt, zone_air_temp, hvac_power, it_power]

    def update_workloads(self, cpu_load, mem_load, gpu_load):
        """Update the current CPU, GPU and memory utilization.

        All three values are validated before any of them is stored, so a
        rejected call leaves the previous workload fractions untouched.

        Args:
            cpu_load (float): CPU utilization, between 0.0 and 1.0.
            mem_load (float): Memory utilization, between 0.0 and 1.0.
            gpu_load (float): GPU utilization, between 0.0 and 1.0.

        Raises:
            AssertionError: If any value is outside [0.0, 1.0].
        """
        cpu_frac = self._checked_fraction(cpu_load, 'CPU')
        gpu_frac = self._checked_fraction(gpu_load, 'GPU')
        mem_frac = self._checked_fraction(mem_load, 'Memory')

        self.cpu_load_frac = cpu_frac
        self.gpu_load_frac = gpu_frac
        self.mem_load_frac = mem_frac

    def set_ambient_temp(self, ambient_temp, wet_bulb):
        """Update the external (ambient) and wet-bulb temperatures.

        Args:
            ambient_temp: Ambient temperature passed to the HVAC model.
            wet_bulb: Wet-bulb temperature used for the cooling tower water
                usage calculation.
        """
        self.ambient_temp = ambient_temp
        self.wet_bulb = wet_bulb

    def set_bat_SoC(self, bat_SoC):
        """Update the battery state of charge.

        Args:
            bat_SoC: New battery state of charge.
        """
        self.bat_SoC = bat_SoC