From 0c0e883521a609322e7ee08c21d75321f877f420 Mon Sep 17 00:00:00 2001 From: Sean Williams Date: Fri, 12 Jan 2024 11:22:51 -0600 Subject: [PATCH] Experiment with a reward penalty for walking into walls and people --- baselines/red_gym_env.py | 46 ++++++++++++++++++++++++++++-- baselines/run_baseline_parallel.py | 5 +++- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/baselines/red_gym_env.py b/baselines/red_gym_env.py index 1b667ff9c..d820bc83b 100644 --- a/baselines/red_gym_env.py +++ b/baselines/red_gym_env.py @@ -50,6 +50,8 @@ def __init__( self.s_path.mkdir(exist_ok=True) self.reset_count = 0 self.all_runs = [] + self.prev_x_pos = 0 + self.prev_y_pos = 0 # Set this in SOME subclasses self.metadata = {"render.modes": []} @@ -63,6 +65,13 @@ def __init__( WindowEvent.PRESS_BUTTON_A, WindowEvent.PRESS_BUTTON_B, ] + + self.move_actions = [ + WindowEvent.PRESS_ARROW_DOWN, + WindowEvent.PRESS_ARROW_LEFT, + WindowEvent.PRESS_ARROW_RIGHT, + WindowEvent.PRESS_ARROW_UP, + ] if self.extra_buttons: self.valid_actions.extend([ @@ -150,6 +159,7 @@ def reset(self, seed=None): self.max_event_rew = 0 self.max_level_rew = 0 self.last_health = 1 + self.noop_move = 0 self.total_healing_rew = 0 self.died_count = 0 self.party_size = 0 @@ -190,6 +200,7 @@ def render(self, reduce_res=True, add_memory=True, update_mem=True): axis=0) return game_pixels_render + # Implicitly called by P00 def step(self, action): self.run_action_on_emulator(action) @@ -262,6 +273,19 @@ def append_agent_stats(self, action): x_pos = self.read_m(0xD362) y_pos = self.read_m(0xD361) map_n = self.read_m(0xD35E) + + # If player was moved (up/down/left/right) but their position didn't change, + # it may mean they went up against an edge. For example, edge of a building, person, + # or the world. We should penalize this type of action in order to not waste exploration time. 
+ self.noop_move = 0 + if self.valid_actions[action] in self.move_actions \ + and self.prev_x_pos == x_pos and self.prev_y_pos == y_pos: + self.noop_move = 1 + + + self.prev_x_pos = x_pos + self.prev_y_pos = y_pos + levels = [self.read_m(a) for a in [0xD18C, 0xD1B8, 0xD1E4, 0xD210, 0xD23C, 0xD268]] if self.use_screen_explore: expl = ('frames', self.knn_index.get_current_count()) @@ -278,7 +302,8 @@ def append_agent_stats(self, action): 'hp': self.read_hp_fraction(), expl[0]: expl[1], 'deaths': self.died_count, 'badge': self.get_badges(), - 'event': self.progress_reward['event'], 'healr': self.total_healing_rew + 'event': self.progress_reward['event'], 'healr': self.total_healing_rew, + 'noop_move': self.noop_move, }) def update_frame_knn_index(self, frame_vec): @@ -434,6 +459,12 @@ def read_bit(self, addr, bit: int) -> bool: # add padding so zero will read '0b100000000' instead of '0b0' return bin(256 + self.read_m(addr))[-bit-1] == '1' + + def is_in_battle(self): + ''' Return boolean: true if player is in any type of battle, else false. ''' + return self.read_m(0xD057) == 12 + + # Iterate through each of the pokemon we're carrying? def get_levels_sum(self): poke_levels = [max(self.read_m(a) - 2, 0) for a in [0xD18C, 0xD1B8, 0xD1E4, 0xD210, 0xD23C, 0xD268]] return max(sum(poke_levels) - 4, 0) # subtract starting pokemon level @@ -457,6 +488,16 @@ def get_knn_reward(self): base = (self.base_explore if self.levels_satisfied else cur_size) * pre_rew post = (cur_size if self.levels_satisfied else 0) * post_rew return base + post + + def get_movement_reward(self): + ''' + Return a penalty of -1 if the player is not in battle and made an up/down/left/right move + that did not change the player's coordinates; otherwise return 0. The aim is to penalize + running into walls or people, which wastes time. 
+ ''' + if self.noop_move and not self.is_in_battle(): + return -1 + return 0 def get_badges(self): return self.bit_count(self.read_m(0xD356)) @@ -532,7 +573,8 @@ def get_game_state_reward(self, print_stats=False): #'op_poke': self.reward_scale*self.max_opponent_poke * 800, #'money': self.reward_scale* money * 3, #'seen_poke': self.reward_scale * seen_poke_count * 400, - 'explore': self.reward_scale * self.get_knn_reward() + 'explore': self.reward_scale * self.get_knn_reward(), + 'noop_move': self.reward_scale * self.get_movement_reward() } return state_scores diff --git a/baselines/run_baseline_parallel.py b/baselines/run_baseline_parallel.py index f4423a3a5..d34326ce7 100644 --- a/baselines/run_baseline_parallel.py +++ b/baselines/run_baseline_parallel.py @@ -27,10 +27,12 @@ def _init(): ep_length = 2048 * 8 +# ep_length = 500 sess_path = Path(f'session_{str(uuid.uuid4())[:8]}') env_config = { 'headless': True, 'save_final_state': True, 'early_stop': False, +# 'headless': False, 'save_final_state': True, 'early_stop': False, 'action_freq': 24, 'init_state': '../has_pokedex_nballs.state', 'max_steps': ep_length, 'print_rewards': True, 'save_video': False, 'fast_video': True, 'session_path': sess_path, 'gb_path': '../PokemonRed.gb', 'debug': False, 'sim_frame_dist': 2_000_000.0, @@ -38,7 +40,8 @@ def _init(): } - num_cpu = 44 #64 #46 # Also sets the number of episodes per training iteration + num_cpu = 46 #64 #46 # Also sets the number of episodes per training iteration +# num_cpu = 1 #64 #46 # Also sets the number of episodes per training iteration env = SubprocVecEnv([make_env(i, env_config) for i in range(num_cpu)]) checkpoint_callback = CheckpointCallback(save_freq=ep_length, save_path=sess_path,