diff --git a/epymarl_continuous_action_demo.ipynb b/epymarl_continuous_action_demo.ipynb new file mode 100644 index 00000000..7e8c8dc5 --- /dev/null +++ b/epymarl_continuous_action_demo.ipynb @@ -0,0 +1,1085 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "M3w1MGpYePHX", + "outputId": "7b9b80d6-4b6c-4c12-8ff4-380f0135229a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'epymarl_continuous'...\n", + "remote: Enumerating objects: 855, done.\u001b[K\n", + "remote: Counting objects: 100% (599/599), done.\u001b[K\n", + "remote: Compressing objects: 100% (201/201), done.\u001b[K\n", + "remote: Total 855 (delta 470), reused 405 (delta 398), pack-reused 256 (from 1)\u001b[K\n", + "Receiving objects: 100% (855/855), 3.16 MiB | 10.20 MiB/s, done.\n", + "Resolving deltas: 100% (544/544), done.\n", + "/content/epymarl_continuous\n" + ] + } + ], + "source": [ + "!git clone https://github.com/MatthewCWeston/epymarl_continuous\n", + "%cd /content/epymarl_continuous" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Install requirements" + ], + "metadata": { + "id": "ELZK_OjiqfKr" + } + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true, + "id": "hO_xEsVKiAjG", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "082e1c54-3f20-4efd-e846-2d2054f6b7f9" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m958.1/958.1 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m108.5/108.5 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m434.2/434.2 kB\u001b[0m \u001b[31m27.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.7/98.7 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m108.2/108.2 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m46.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m183.5/183.5 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for mpyq (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "!pip install -qr requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "dhxh-tLciF3d" + }, + "outputs": [], + "source": [ + "!pip install -qr env_requirements.txt" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Train a model in a discrete setting" + ], + "metadata": { + "id": "8lMQhOV-3HgZ" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Run training" + ], + "metadata": { + "id": "kIuu11MVpQxD" + } + }, + { + "cell_type": "code", + "source": [ + "%cd /content/epymarl_continuous\n", + "'''\n", + " batch_size_run: Number of CPUs running agents\n", + " t_max: Total timesteps to train\n", + " save_model: Save the model during training.\n", + " buffer_size, batch_size: size of replay buffer; timesteps to update weights after.\n", + " obs_agent_id: Append a one-hot boolean to the end of the observation\n", + " runner/learner/<> log interval: Rate at which runner/learner/system should log outputs\n", + " target_update_interval_or_tau: Determines hard or soft updates to target weights.\n", + "'''\n", + "!python src/main.py --config=mappo --env-config=mpe with env_args.key=\"pz-mpe-simple-v3\" env_args.time_limit=25 env_args.max_cycles=25 save_model=True save_model_interval=20000 t_max=20000 batch_size_run=1 buffer_size=100 batch_size=100 obs_agent_id=False runner_log_interval=1000 learner_log_interval=1000 log_interval=1000 use_rnn=False" + ], + "metadata": { + "id": "l-QStfvAhI-V", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2c6312f0-9d7a-4c2f-c733-ed2f6a8ae7d7", + "collapsed": true + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[Errno 2] No such file or directory: '/content/epymarl'\n", + "/content/epymarl_continuous\n", + "[DEBUG 07:00:43] git.cmd Popen(['git', 'version'], cwd=/content/epymarl_continuous, stdin=None, shell=False, 
universal_newlines=False)\n", + "[DEBUG 07:00:43] git.cmd Popen(['git', 'version'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:00:43] git.cmd Popen(['git', 'diff', '--cached', '--abbrev=40', '--full-index', '--raw'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:00:43] git.cmd Popen(['git', 'diff', '--abbrev=40', '--full-index', '--raw'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:00:43] git.cmd Popen(['git', 'cat-file', '--batch-check'], cwd=/content/epymarl_continuous, stdin=, shell=False, universal_newlines=False)\n", + "[DEBUG 07:00:43] git.cmd Popen(['git', 'diff', '--cached', '--abbrev=40', '--full-index', '--raw'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:00:43] git.cmd Popen(['git', 'diff', '--abbrev=40', '--full-index', '--raw'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:00:43] git.cmd Popen(['git', 'cat-file', '--batch-check'], cwd=/content/epymarl_continuous, stdin=, shell=False, universal_newlines=False)\n", + "[DEBUG 07:00:43] git.cmd Popen(['git', 'diff', '--cached', '--abbrev=40', '--full-index', '--raw'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:00:43] git.cmd Popen(['git', 'diff', '--abbrev=40', '--full-index', '--raw'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:00:43] git.cmd Popen(['git', 'cat-file', '--batch-check'], cwd=/content/epymarl_continuous, stdin=, shell=False, universal_newlines=False)\n", + "[INFO 07:00:43] root Saving to FileStorageObserver in results/sacred.\n", + "[DEBUG 07:00:43] pymarl Using capture mode \"fd\"\n", + "[INFO 07:00:43] pymarl Running command 'my_main'\n", + "[INFO 07:00:43] pymarl Started run with ID \"1\"\n", + 
"[DEBUG 07:00:43] pymarl Starting Heartbeat\n", + "[DEBUG 07:00:43] my_main Started\n", + "[WARNING 07:00:43] my_main CUDA flag use_cuda was switched OFF automatically because no CUDA devices are available!\n", + "[INFO 07:00:43] my_main Experiment Parameters:\n", + "[INFO 07:00:43] my_main \n", + "\n", + "{ 'action_selector': 'soft_policies',\n", + " 'add_value_last_step': True,\n", + " 'agent': 'rnn',\n", + " 'agent_output_type': 'pi_logits',\n", + " 'batch_size': 100,\n", + " 'batch_size_run': 1,\n", + " 'buffer_cpu_only': True,\n", + " 'buffer_size': 100,\n", + " 'checkpoint_path': '',\n", + " 'common_reward': True,\n", + " 'critic_type': 'cv_critic',\n", + " 'entropy_coef': 0.001,\n", + " 'env': 'gymma',\n", + " 'env_args': { 'continuous_actions': False,\n", + " 'key': 'pz-mpe-simple-v3',\n", + " 'max_cycles': 25,\n", + " 'pretrained_wrapper': None,\n", + " 'seed': 325454985,\n", + " 'time_limit': 25},\n", + " 'epochs': 4,\n", + " 'eps_clip': 0.2,\n", + " 'evaluate': False,\n", + " 'gamma': 0.99,\n", + " 'grad_norm_clip': 10,\n", + " 'hidden_dim': 128,\n", + " 'hypergroup': None,\n", + " 'label': 'default_label',\n", + " 'learner': 'ppo_learner',\n", + " 'learner_log_interval': 1000,\n", + " 'load_step': 0,\n", + " 'local_results_path': 'results',\n", + " 'log_interval': 1000,\n", + " 'lr': 0.0003,\n", + " 'mac': 'basic_mac',\n", + " 'mask_before_softmax': True,\n", + " 'name': 'mappo',\n", + " 'obs_agent_id': False,\n", + " 'obs_individual_obs': False,\n", + " 'obs_last_action': False,\n", + " 'optim_alpha': 0.99,\n", + " 'optim_eps': 1e-05,\n", + " 'q_nstep': 5,\n", + " 'render': False,\n", + " 'repeat_id': 1,\n", + " 'reward_scalarisation': 'sum',\n", + " 'runner': 'parallel',\n", + " 'runner_log_interval': 1000,\n", + " 'save_model': True,\n", + " 'save_model_interval': 20000,\n", + " 'save_replay': False,\n", + " 'seed': 325454985,\n", + " 'standardise_returns': False,\n", + " 'standardise_rewards': True,\n", + " 't_max': 20000,\n", + " 
'target_update_interval_or_tau': 0.01,\n", + " 'test_greedy': True,\n", + " 'test_interval': 50000,\n", + " 'test_nepisode': 100,\n", + " 'use_cuda': False,\n", + " 'use_rnn': False,\n", + " 'use_tensorboard': False,\n", + " 'use_wandb': False,\n", + " 'wandb_mode': 'offline',\n", + " 'wandb_project': None,\n", + " 'wandb_save_model': False,\n", + " 'wandb_team': None}\n", + "\n", + "!!! USING MPE ENVIRONMENT\n", + "error: XDG_RUNTIME_DIR not set in the environment.\n", + "ARGS.agent_output_type equals pi_logits\n", + "BUILDING AGENTS! ARGS.N_ACTIONS = 5\n", + "[INFO 07:00:54] my_main Beginning training for 20000 timesteps\n", + "/usr/local/lib/python3.11/dist-packages/gymnasium/utils/passive_env_checker.py:245: UserWarning: \u001b[33mWARN: The reward returned by `step()` must be a float, int, np.integer or np.floating, actual type: \u001b[0m\n", + " logger.warn(\n", + "[INFO 07:00:55] my_main t_env: 25 / 20000\n", + "[INFO 07:00:55] my_main Estimated time left: 0 seconds. Time passed: 0 seconds\n", + "[INFO 07:01:00] my_main Saving models to results/models/mappo_seed325454985_pz-mpe-simple-v3_2025-01-20 07:00:43.729418/25\n", + "[INFO 07:01:04] my_main Recent Stats | t_env: 1000 | Episode: 40\n", + "ep_length_mean: 25.0000\treturn_mean: -6.9297\treturn_std: 0.0000\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:01:06] my_main Recent Stats | t_env: 2000 | Episode: 80\n", + "ep_length_mean: 25.0000\treturn_mean: -22.9197\treturn_std: 15.6504\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:01:17] my_main Recent Stats | t_env: 3000 | Episode: 120\n", + "advantage_mean: -0.0328\tagent_grad_norm: 0.3232\tcritic_grad_norm: 3.3858\tcritic_loss: 20.0603\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0145\tpi_max: 0.2220\tq_taken_mean: 0.0048\n", + "return_mean: -31.6833\treturn_std: 22.2490\ttarget_mean: -0.0280\ttd_error_abs: 3.5779\n", + 
"test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:01:28] my_main Recent Stats | t_env: 4000 | Episode: 160\n", + "advantage_mean: 0.0081\tagent_grad_norm: 0.4809\tcritic_grad_norm: 5.6920\tcritic_loss: 14.9717\n", + "ep_length_mean: 25.0000\tpg_loss: -0.0434\tpi_max: 0.4981\tq_taken_mean: -0.4299\n", + "return_mean: -33.6645\treturn_std: 24.8194\ttarget_mean: -0.4218\ttd_error_abs: 2.6657\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:01:39] my_main Recent Stats | t_env: 5000 | Episode: 200\n", + "advantage_mean: 0.0019\tagent_grad_norm: 0.5715\tcritic_grad_norm: 4.0342\tcritic_loss: 11.8651\n", + "ep_length_mean: 25.0000\tpg_loss: -0.0302\tpi_max: 0.6113\tq_taken_mean: -0.2272\n", + "return_mean: -37.8320\treturn_std: 32.2903\ttarget_mean: -0.2253\ttd_error_abs: 2.1283\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:01:52] my_main Recent Stats | t_env: 6000 | Episode: 240\n", + "advantage_mean: -0.0031\tagent_grad_norm: 0.8296\tcritic_grad_norm: 3.1603\tcritic_loss: 10.1738\n", + "ep_length_mean: 25.0000\tpg_loss: -0.0214\tpi_max: 0.6753\tq_taken_mean: 0.1021\n", + "return_mean: -41.0851\treturn_std: 36.7894\ttarget_mean: 0.0990\ttd_error_abs: 1.8246\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:02:09] my_main Recent Stats | t_env: 7000 | Episode: 280\n", + "advantage_mean: -0.0026\tagent_grad_norm: 0.6884\tcritic_grad_norm: 2.5945\tcritic_loss: 8.2177\n", + "ep_length_mean: 25.0000\tpg_loss: -0.0180\tpi_max: 0.7182\tq_taken_mean: 0.7637\n", + "return_mean: -37.4185\treturn_std: 33.8437\ttarget_mean: 0.7611\ttd_error_abs: 1.5439\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:02:24] my_main Recent Stats | t_env: 8000 | Episode: 320\n", + "advantage_mean: 
0.0037\tagent_grad_norm: 0.6502\tcritic_grad_norm: 1.9938\tcritic_loss: 4.3034\n", + "ep_length_mean: 25.0000\tpg_loss: -0.0216\tpi_max: 0.8564\tq_taken_mean: 1.5671\n", + "return_mean: -31.4654\treturn_std: 31.7949\ttarget_mean: 1.5708\ttd_error_abs: 0.9187\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:02:37] my_main Recent Stats | t_env: 9000 | Episode: 360\n", + "advantage_mean: -0.0071\tagent_grad_norm: 0.6249\tcritic_grad_norm: 0.4502\tcritic_loss: 2.4553\n", + "ep_length_mean: 25.0000\tpg_loss: -0.0015\tpi_max: 0.8806\tq_taken_mean: 2.7026\n", + "return_mean: -27.4281\treturn_std: 29.2183\ttarget_mean: 2.6955\ttd_error_abs: 0.6726\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:02:48] my_main Recent Stats | t_env: 10000 | Episode: 400\n", + "advantage_mean: -0.0058\tagent_grad_norm: 0.5313\tcritic_grad_norm: 0.3594\tcritic_loss: 1.4662\n", + "ep_length_mean: 25.0000\tpg_loss: -0.0012\tpi_max: 0.8946\tq_taken_mean: 3.7236\n", + "return_mean: -20.2533\treturn_std: 21.7175\ttarget_mean: 3.7178\ttd_error_abs: 0.5728\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:02:59] my_main Recent Stats | t_env: 11000 | Episode: 440\n", + "advantage_mean: -0.0032\tagent_grad_norm: 0.4595\tcritic_grad_norm: 0.3135\tcritic_loss: 0.5808\n", + "ep_length_mean: 25.0000\tpg_loss: -0.0023\tpi_max: 0.8992\tq_taken_mean: 4.7322\n", + "return_mean: -19.3304\treturn_std: 21.8210\ttarget_mean: 4.7290\ttd_error_abs: 0.4926\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:03:14] my_main Recent Stats | t_env: 12000 | Episode: 480\n", + "advantage_mean: -0.0049\tagent_grad_norm: 0.4810\tcritic_grad_norm: 0.3128\tcritic_loss: 0.5945\n", + "ep_length_mean: 25.0000\tpg_loss: -0.0008\tpi_max: 0.9040\tq_taken_mean: 5.4564\n", + "return_mean: 
-18.0243\treturn_std: 21.7597\ttarget_mean: 5.4515\ttd_error_abs: 0.4952\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:03:27] my_main Recent Stats | t_env: 13000 | Episode: 520\n", + "advantage_mean: -0.0071\tagent_grad_norm: 0.4792\tcritic_grad_norm: 0.3474\tcritic_loss: 0.5579\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0018\tpi_max: 0.9006\tq_taken_mean: 6.3881\n", + "return_mean: -16.0734\treturn_std: 19.2666\ttarget_mean: 6.3809\ttd_error_abs: 0.4780\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:03:37] my_main Recent Stats | t_env: 14000 | Episode: 560\n", + "advantage_mean: -0.0082\tagent_grad_norm: 0.4032\tcritic_grad_norm: 0.3615\tcritic_loss: 0.5018\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0031\tpi_max: 0.8938\tq_taken_mean: 7.1837\n", + "return_mean: -15.2815\treturn_std: 18.8165\ttarget_mean: 7.1755\ttd_error_abs: 0.4494\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:03:47] my_main Recent Stats | t_env: 15000 | Episode: 600\n", + "advantage_mean: -0.0086\tagent_grad_norm: 0.3972\tcritic_grad_norm: 0.3543\tcritic_loss: 0.4522\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0038\tpi_max: 0.8918\tq_taken_mean: 7.9014\n", + "return_mean: -14.5954\treturn_std: 17.6852\ttarget_mean: 7.8928\ttd_error_abs: 0.4200\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:03:58] my_main Recent Stats | t_env: 16000 | Episode: 640\n", + "advantage_mean: -0.0104\tagent_grad_norm: 0.3606\tcritic_grad_norm: 0.4058\tcritic_loss: 0.5007\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0054\tpi_max: 0.8917\tq_taken_mean: 8.8111\n", + "return_mean: -13.0559\treturn_std: 16.1323\ttarget_mean: 8.8007\ttd_error_abs: 0.4266\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:04:10] my_main Recent 
Stats | t_env: 17000 | Episode: 680\n", + "advantage_mean: -0.0130\tagent_grad_norm: 0.4132\tcritic_grad_norm: 0.5269\tcritic_loss: 0.6327\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0079\tpi_max: 0.8950\tq_taken_mean: 9.9925\n", + "return_mean: -11.9053\treturn_std: 13.9895\ttarget_mean: 9.9795\ttd_error_abs: 0.4459\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:04:21] my_main Recent Stats | t_env: 18000 | Episode: 720\n", + "advantage_mean: -0.0125\tagent_grad_norm: 0.5489\tcritic_grad_norm: 0.5112\tcritic_loss: 0.6279\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0074\tpi_max: 0.9092\tq_taken_mean: 11.1412\n", + "return_mean: -11.8871\treturn_std: 13.2867\ttarget_mean: 11.1287\ttd_error_abs: 0.4471\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:04:33] my_main Recent Stats | t_env: 19000 | Episode: 760\n", + "advantage_mean: -0.0128\tagent_grad_norm: 0.5516\tcritic_grad_norm: 0.5379\tcritic_loss: 0.6009\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0083\tpi_max: 0.9253\tq_taken_mean: 12.4139\n", + "return_mean: -10.6144\treturn_std: 11.5925\ttarget_mean: 12.4012\ttd_error_abs: 0.4387\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:04:44] my_main Recent Stats | t_env: 20000 | Episode: 800\n", + "advantage_mean: -0.0149\tagent_grad_norm: 0.5191\tcritic_grad_norm: 0.6327\tcritic_loss: 0.5449\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0113\tpi_max: 0.9265\tq_taken_mean: 13.8211\n", + "return_mean: -9.2111\treturn_std: 9.2012\ttarget_mean: 13.8062\ttd_error_abs: 0.4166\n", + "test_ep_length_mean: 25.0000\ttest_return_mean: -33.4314\ttest_return_std: 31.3928\t\n", + "[INFO 07:04:44] my_main Saving models to results/models/mappo_seed325454985_pz-mpe-simple-v3_2025-01-20 07:00:43.729418/20025\n", + "[INFO 07:04:44] my_main Finished Training\n", + "Exiting Main\n", + "Stopping all threads\n", + 
"Thread Thread-1 is alive! Is daemon: False\n", + "Thread joined\n", + "Exiting script\n", + "[DEBUG 07:04:45] my_main Finished after 0:04:02.\n", + "[INFO 07:04:45] pymarl Completed after 0:04:02\n", + "[DEBUG 07:04:45] pymarl Stopping Heartbeat\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Imports" + ], + "metadata": { + "id": "A5KCf5itfKOU" + } + }, + { + "cell_type": "code", + "source": [ + "%cd /content/epymarl_continuous\n", + "import sys\n", + "sys.path.append('/content/epymarl_continuous/src')\n", + "import os\n", + "import yaml\n", + "import copy\n", + "import json\n", + "from types import SimpleNamespace as SN\n", + "from collections.abc import Mapping\n", + "import numpy as np\n", + "import torch\n", + "import matplotlib.pyplot as plt\n", + "from runners import REGISTRY as r_REGISTRY\n", + "from src.modules.agents.rnn_agent import RNNAgent\n", + "from src.components.action_selectors import ContinuousSelector\n", + "\n", + "from PIL import Image, ImageSequence, ImageDraw\n", + "import IPython.display\n", + "from collections import defaultdict\n", + "from pettingzoo.utils.wrappers.base import BaseWrapper\n", + "\n", + "from pettingzoo.mpe import simple_speaker_listener_v4, simple_reference_v3, simple_world_comm_v3, simple_v3" + ], + "metadata": { + "id": "56Fd6fIR-vZ9", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "c62b3cf0-161b-4a0c-a4b9-0746ffaa5c26" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/epymarl_continuous\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### View Training Results" + ], + "metadata": { + "id": "-gRwsGbt-IOm" + } + }, + { + "cell_type": "code", + "source": [ + "# @title Display reward over time\n", + "env_name = 'pz-mpe-simple-v3'\n", + "alg_name = 'mappo'\n", + "\n", + "results_path = '/content/epymarl_continuous/results'\n", + "sacred_path = 
f'{results_path}/sacred/{alg_name}/{env_name}'\n", + "results_dir = max([os.path.join(sacred_path, d) for d in os.listdir(sacred_path)], key=os.path.getmtime).split('/')[-1]\n", + "models_path = f'{results_path}/models'\n", + "models_path = max([os.path.join(models_path, d) for d in os.listdir(models_path)], key=os.path.getmtime) # Newest run\n", + "models_path = max([os.path.join(models_path, d) for d in os.listdir(models_path)], key=os.path.getmtime) # Last update\n", + "with open(f'{sacred_path}/{results_dir}/metrics.json', 'r') as f:\n", + " metrics = json.load(f)\n", + "metrics.keys()\n", + "plt.plot(metrics['return_mean']['values'])" + ], + "metadata": { + "id": "WMF79VXrK6lu", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 448 + }, + "cellView": "form", + "outputId": "11de2fab-15c9-420a-9f01-3b4ef777282a" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[]" + ] + }, + "metadata": {}, + "execution_count": 7 + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Look up env data\n", + "# Get config dicts\n", + "cfg_path = '/content/epymarl_continuous/src/config'\n", + "alg_name = 'mappo'\n", + "\n", + "def recursive_dict_update(d, u):\n", + " for k, v in u.items():\n", + " if isinstance(v, Mapping):\n", + " d[k] = recursive_dict_update(d.get(k, {}), v)\n", + " else:\n", + " d[k] = v\n", + " return d\n", + "with open(os.path.join(cfg_path,\"default.yaml\",),\"r\",) as f:\n", + " cfg_dict = yaml.load(f, Loader=yaml.FullLoader)\n", + "with open(os.path.join(cfg_path,\"envs\",\"mpe.yaml\",),\"r\",) as f:\n", + " env_dict = yaml.load(f, Loader=yaml.FullLoader)\n", + "with open(os.path.join(cfg_path,\"algs\",f\"{alg_name}.yaml\",),\"r\",) as f:\n", + " alg_dict = yaml.load(f, Loader=yaml.FullLoader)\n", + "config_dict = recursive_dict_update(cfg_dict, env_dict)\n", + "config_dict = recursive_dict_update(cfg_dict, alg_dict)\n", + "#\n", + "config_dict[\"env_args\"][\"seed\"] = 0\n", + "config_dict[\"env_args\"][\"key\"] = env_name\n", + "config_dict[\"env_args\"][\"time_limit\"]=25\n", + "#\n", + "args = SN(**config_dict)\n", + "args.device = \"cpu\"\n", + "args.unique_token = 'TESTTESTTEST'\n", + "#\n", + "runner = r_REGISTRY[args.runner](args=args,logger=None) # Some algs () use parallel. batch_size_run increases parallelism. ippo, coma, ia2c, maa2c, mappo\n", + "env_info = runner.get_env_info()\n", + "args.n_agents = env_info[\"n_agents\"]\n", + "args.n_actions = env_info[\"n_actions\"]\n", + "args.state_shape = env_info[\"state_shape\"]" + ], + "metadata": { + "id": "c9igXBbxUVNg", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "b2d82b20-9dfd-43a7-ad96-87e5b7bb3f0c" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "!!! 
USING MPE ENVIRONMENT\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Display a sample run\n", + "args.use_rnn = False # Since the default is 'true' for MAPPO but not MAPPO_NS\n", + "agent = RNNAgent(env_info['obs_shape'], args)\n", + "agent.load_state_dict(torch.load(f'{models_path}/agent.th', map_location=lambda storage, loc: storage, weights_only=True))\n", + "env_cfg = {'max_cycles': 75}\n", + "target_env = {\n", + " \"pz-mpe-simple-speaker-listener-v4\": simple_speaker_listener_v4,\n", + " \"pz-mpe-simple-reference-v3\": simple_reference_v3,\n", + " \"pz-mpe-simple-world-comm-v3\": simple_world_comm_v3,\n", + " \"pz-mpe-simple-v3\": simple_v3,\n", + "}[env_name]\n", + "env = target_env.env(**env_cfg)\n", + "while isinstance(env, BaseWrapper): # Discard all wrappers\n", + " env = env.env\n", + "env.reset()\n", + "try:\n", + " env.env.render_mode = 'rgb_array' # We want a video of what our agent is doing.\n", + "except Exception:\n", + " env.render_mode = 'rgb_array' # For e.g. 
Pettingzoo, with no base env below.\n", + "agent_ids = {agent: torch.nn.functional.one_hot(torch.tensor(i), num_classes=len(env.agents)) for i, agent in enumerate(env.agents)}\n", + "env.reset()\n", + "rewards = defaultdict(lambda: 0)\n", + "random_act = False\n", + "images = []\n", + "\n", + "# Single agent trained with obs_agent_id=False: feed the raw observation (the padding and agent-id one-hot below are disabled).\n", + "def act(a, obs):\n", + " #obs = np.pad(obs, (0, env_info['obs_shape'] + len(env.agents) - len(obs)))\n", + " #obs[-len(env.agents):] = agent_ids[a]\n", + " n = env.action_space(a).n\n", + " action_logits, _ = agent(torch.tensor(obs), torch.tensor([]))\n", + " return action_logits[:n].argmax().numpy()\n", + "\n", + "while (True):\n", + " selected_agent = env.agent_selection\n", + " if (env.terminations[selected_agent] or env.truncations[selected_agent]):\n", + " break # Terminated or truncated.\n", + " obs = env.observe(selected_agent)\n", + " if (random_act): # Something to compare against, as a baseline.\n", + " action = env.action_space(selected_agent).sample()\n", + " else:\n", + " action = act(selected_agent, obs)\n", + " env.step(action)\n", + " rewards[selected_agent] += env.rewards[selected_agent]\n", + " images.append(env.render())\n", + "print(rewards)\n", + "# Display the video. 
Takes a few minutes with large images.\n", + "new_size = 200\n", + "gif_list = [Image.fromarray(x).resize((new_size,new_size)) for x in images]\n", + "gif_list[0].save(\"./test.gif\", save_all=True, append_images=gif_list[1:])\n", + "IPython.display.Image(filename='./test.gif')" + ], + "metadata": { + "id": "CO9yb4FQoXE1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 292 + }, + "cellView": "form", + "outputId": "e018d8db-9423-40f5-95a5-b95399c713f4" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + ":4: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " agent.load_state_dict(torch.load(f'{models_path}/agent.th', map_location=lambda storage, loc: storage))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "defaultdict( at 0x7b2c45fd7a60>, {'agent_0': -11.212522623872562})\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "image/gif": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Train a model in a continuous setting" + ], + "metadata": { + "id": "vycSQ9s9-Lbd" + } + }, + { + "cell_type": "code", + "source": [ + "!python src/main.py --config=mappo_c --env-config=mpe_continuous with env_args.key=\"pz-mpe-simple-v3\" env_args.time_limit=25 env_args.max_cycles=25 save_model=True save_model_interval=20000 t_max=20000 batch_size_run=1 buffer_size=100 batch_size=100 obs_agent_id=False runner_log_interval=1000 learner_log_interval=1000 log_interval=1000 use_rnn=False" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LWsKBcNsYGK4", + "outputId": "fedf6ee1-e4d1-4321-e5db-65697c22a3e1" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[DEBUG 07:05:09] git.cmd Popen(['git', 'version'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:05:09] git.cmd Popen(['git', 'version'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:05:09] git.cmd Popen(['git', 'diff', '--cached', '--abbrev=40', '--full-index', '--raw'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:05:09] git.cmd Popen(['git', 'diff', '--abbrev=40', '--full-index', '--raw'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:05:09] git.cmd 
Popen(['git', 'cat-file', '--batch-check'], cwd=/content/epymarl_continuous, stdin=, shell=False, universal_newlines=False)\n", + "[DEBUG 07:05:09] git.cmd Popen(['git', 'diff', '--cached', '--abbrev=40', '--full-index', '--raw'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:05:09] git.cmd Popen(['git', 'diff', '--abbrev=40', '--full-index', '--raw'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:05:09] git.cmd Popen(['git', 'cat-file', '--batch-check'], cwd=/content/epymarl_continuous, stdin=, shell=False, universal_newlines=False)\n", + "[DEBUG 07:05:09] git.cmd Popen(['git', 'diff', '--cached', '--abbrev=40', '--full-index', '--raw'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:05:09] git.cmd Popen(['git', 'diff', '--abbrev=40', '--full-index', '--raw'], cwd=/content/epymarl_continuous, stdin=None, shell=False, universal_newlines=False)\n", + "[DEBUG 07:05:09] git.cmd Popen(['git', 'cat-file', '--batch-check'], cwd=/content/epymarl_continuous, stdin=, shell=False, universal_newlines=False)\n", + "[INFO 07:05:09] root Saving to FileStorageObserver in results/sacred.\n", + "[DEBUG 07:05:09] pymarl Using capture mode \"fd\"\n", + "[INFO 07:05:09] pymarl Running command 'my_main'\n", + "[INFO 07:05:09] pymarl Started run with ID \"1\"\n", + "[DEBUG 07:05:09] pymarl Starting Heartbeat\n", + "[DEBUG 07:05:09] my_main Started\n", + "[WARNING 07:05:09] my_main CUDA flag use_cuda was switched OFF automatically because no CUDA devices are available!\n", + "[INFO 07:05:09] my_main Experiment Parameters:\n", + "!!! 
USING MPE ENVIRONMENT\n", + "[INFO 07:05:09] my_main \n", + "\n", + "{ 'action_selector': 'continuous',\n", + " 'add_value_last_step': True,\n", + " 'agent': 'rnn',\n", + " 'agent_output_type': 'continuous_u_std',\n", + " 'batch_size': 100,\n", + " 'batch_size_run': 1,\n", + " 'buffer_cpu_only': True,\n", + " 'buffer_size': 100,\n", + " 'checkpoint_path': '',\n", + " 'common_reward': True,\n", + " 'critic_type': 'cv_critic',\n", + " 'entropy_coef': 0.001,\n", + " 'env': 'gymma',\n", + " 'env_args': { 'continuous_actions': True,\n", + " 'key': 'pz-mpe-simple-v3',\n", + " 'max_cycles': 25,\n", + " 'pretrained_wrapper': None,\n", + " 'seed': 801163060,\n", + " 'time_limit': 25},\n", + " 'epochs': 4,\n", + " 'eps_clip': 0.2,\n", + " 'evaluate': False,\n", + " 'gamma': 0.99,\n", + " 'grad_norm_clip': 10,\n", + " 'hidden_dim': 128,\n", + " 'hypergroup': None,\n", + " 'label': 'default_label',\n", + " 'learner': 'ppo_c_learner',\n", + " 'learner_log_interval': 1000,\n", + " 'load_step': 0,\n", + " 'local_results_path': 'results',\n", + " 'log_interval': 1000,\n", + " 'lr': 0.0003,\n", + " 'mac': 'basic_mac',\n", + " 'mask_before_softmax': False,\n", + " 'name': 'mappo_c',\n", + " 'obs_agent_id': False,\n", + " 'obs_individual_obs': False,\n", + " 'obs_last_action': False,\n", + " 'optim_alpha': 0.99,\n", + " 'optim_eps': 1e-05,\n", + " 'q_nstep': 5,\n", + " 'render': False,\n", + " 'repeat_id': 1,\n", + " 'reward_scalarisation': 'sum',\n", + " 'runner': 'parallel',\n", + " 'runner_log_interval': 1000,\n", + " 'save_model': True,\n", + " 'save_model_interval': 20000,\n", + " 'save_replay': False,\n", + " 'seed': 801163060,\n", + " 'standardise_returns': False,\n", + " 'standardise_rewards': True,\n", + " 't_max': 20000,\n", + " 'target_update_interval_or_tau': 0.01,\n", + " 'test_greedy': True,\n", + " 'test_interval': 50000,\n", + " 'test_nepisode': 100,\n", + " 'use_cuda': False,\n", + " 'use_rnn': False,\n", + " 'use_tensorboard': False,\n", + " 'use_wandb': False,\n", 
+ " 'wandb_mode': 'offline',\n", + " 'wandb_project': None,\n", + " 'wandb_save_model': False,\n", + " 'wandb_team': None}\n", + "\n", + "error: XDG_RUNTIME_DIR not set in the environment.\n", + "!!! Using continuous action space\n", + "ARGS.agent_output_type equals continuous_u_std\n", + "ARGS.N_ACTIONS equals 10\n", + "BUILDING AGENTS! ARGS.N_ACTIONS = 10\n", + "Action min/max = tensor([0., 0., 0., 0., 0.])/tensor([1., 1., 1., 1., 1.])\n", + "[INFO 07:05:11] my_main Beginning training for 20000 timesteps\n", + "/usr/local/lib/python3.11/dist-packages/gymnasium/utils/passive_env_checker.py:245: UserWarning: \u001b[33mWARN: The reward returned by `step()` must be a float, int, np.integer or np.floating, actual type: \u001b[0m\n", + " logger.warn(\n", + "[INFO 07:05:11] my_main t_env: 25 / 20000\n", + "[INFO 07:05:11] my_main Estimated time left: 0 seconds. Time passed: 0 seconds\n", + "[INFO 07:05:17] my_main Saving models to results/models/mappo_c_seed801163060_pz-mpe-simple-v3_2025-01-20 07:05:09.806787/25\n", + "[INFO 07:05:19] my_main Recent Stats | t_env: 1000 | Episode: 40\n", + "ep_length_mean: 25.0000\treturn_mean: -97.4163\treturn_std: 0.0000\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:05:21] my_main Recent Stats | t_env: 2000 | Episode: 80\n", + "ep_length_mean: 25.0000\treturn_mean: -71.4287\treturn_std: 20.1024\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:05:28] my_main Recent Stats | t_env: 3000 | Episode: 120\n", + "advantage_mean: -0.0095\tagent_grad_norm: 23.9457\tcritic_grad_norm: 4.7066\tcritic_loss: 20.2512\n", + "ep_length_mean: 25.0000\tpg_loss: -0.7599\tq_taken_mean: -0.1117\treturn_mean: -60.6184\n", + "return_std: 25.0593\ttarget_mean: -0.1212\ttd_error_abs: 3.4306\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:05:40] my_main Recent Stats | t_env: 4000 | 
Episode: 160\n", + "advantage_mean: 0.1101\tagent_grad_norm: 12.8170\tcritic_grad_norm: 5.5825\tcritic_loss: 12.4351\n", + "ep_length_mean: 25.0000\tpg_loss: -0.9735\tq_taken_mean: -0.2281\treturn_mean: -55.4595\n", + "return_std: 27.0492\ttarget_mean: -0.1180\ttd_error_abs: 2.5364\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:05:52] my_main Recent Stats | t_env: 5000 | Episode: 200\n", + "advantage_mean: 0.0740\tagent_grad_norm: 8.9635\tcritic_grad_norm: 4.0214\tcritic_loss: 9.1564\n", + "ep_length_mean: 25.0000\tpg_loss: -0.6552\tq_taken_mean: -0.1237\treturn_mean: -50.9551\n", + "return_std: 26.0949\ttarget_mean: -0.0496\ttd_error_abs: 2.0248\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:06:02] my_main Recent Stats | t_env: 6000 | Episode: 240\n", + "advantage_mean: 0.0540\tagent_grad_norm: 7.2455\tcritic_grad_norm: 3.2324\tcritic_loss: 7.4218\n", + "ep_length_mean: 25.0000\tpg_loss: -0.4888\tq_taken_mean: 0.0144\treturn_mean: -37.4115\n", + "return_std: 30.4888\ttarget_mean: 0.0684\ttd_error_abs: 1.7751\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:06:16] my_main Recent Stats | t_env: 7000 | Episode: 280\n", + "advantage_mean: 0.0408\tagent_grad_norm: 5.9704\tcritic_grad_norm: 2.7585\tcritic_loss: 6.5333\n", + "ep_length_mean: 25.0000\tpg_loss: -0.3831\tq_taken_mean: 0.0559\treturn_mean: -35.0463\n", + "return_std: 28.2351\ttarget_mean: 0.0968\ttd_error_abs: 1.6686\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:06:29] my_main Recent Stats | t_env: 8000 | Episode: 320\n", + "advantage_mean: 0.0414\tagent_grad_norm: 1.6520\tcritic_grad_norm: 1.9828\tcritic_loss: 3.0511\n", + "ep_length_mean: 25.0000\tpg_loss: -0.2296\tq_taken_mean: 0.2686\treturn_mean: -34.1176\n", + "return_std: 27.1870\ttarget_mean: 
0.3099\ttd_error_abs: 1.2362\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:06:41] my_main Recent Stats | t_env: 9000 | Episode: 360\n", + "advantage_mean: -0.0051\tagent_grad_norm: 1.8440\tcritic_grad_norm: 0.8005\tcritic_loss: 2.7219\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0039\tq_taken_mean: 0.8437\treturn_mean: -30.5352\n", + "return_std: 25.0523\ttarget_mean: 0.8387\ttd_error_abs: 1.1675\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:06:52] my_main Recent Stats | t_env: 10000 | Episode: 400\n", + "advantage_mean: -0.0098\tagent_grad_norm: 3.0485\tcritic_grad_norm: 0.7740\tcritic_loss: 2.6864\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0236\tq_taken_mean: 1.9520\treturn_mean: -28.5112\n", + "return_std: 24.8412\ttarget_mean: 1.9422\ttd_error_abs: 1.2034\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:07:05] my_main Recent Stats | t_env: 11000 | Episode: 440\n", + "advantage_mean: -0.0140\tagent_grad_norm: 3.4330\tcritic_grad_norm: 0.7819\tcritic_loss: 2.6649\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0405\tq_taken_mean: 3.4180\treturn_mean: -25.7437\n", + "return_std: 22.8684\ttarget_mean: 3.4040\ttd_error_abs: 1.2188\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:07:14] my_main Recent Stats | t_env: 12000 | Episode: 480\n", + "advantage_mean: -0.0128\tagent_grad_norm: 3.9679\tcritic_grad_norm: 0.7167\tcritic_loss: 2.3873\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0333\tq_taken_mean: 5.4551\treturn_mean: -21.8037\n", + "return_std: 20.2891\ttarget_mean: 5.4424\ttd_error_abs: 1.1596\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:07:25] my_main Recent Stats | t_env: 13000 | Episode: 520\n", + "advantage_mean: -0.0145\tagent_grad_norm: 4.1446\tcritic_grad_norm: 
0.6684\tcritic_loss: 2.0717\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0400\tq_taken_mean: 7.6995\treturn_mean: -17.4344\n", + "return_std: 17.0198\ttarget_mean: 7.6850\ttd_error_abs: 1.0622\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:07:35] my_main Recent Stats | t_env: 14000 | Episode: 560\n", + "advantage_mean: -0.0129\tagent_grad_norm: 4.0729\tcritic_grad_norm: 0.6556\tcritic_loss: 1.6912\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0343\tq_taken_mean: 9.9183\treturn_mean: -14.9538\n", + "return_std: 14.6946\ttarget_mean: 9.9055\ttd_error_abs: 0.9435\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:07:47] my_main Recent Stats | t_env: 15000 | Episode: 600\n", + "advantage_mean: -0.0094\tagent_grad_norm: 3.3410\tcritic_grad_norm: 0.5496\tcritic_loss: 1.3534\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0200\tq_taken_mean: 11.8002\treturn_mean: -12.2838\n", + "return_std: 12.2316\ttarget_mean: 11.7909\ttd_error_abs: 0.8274\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:08:00] my_main Recent Stats | t_env: 16000 | Episode: 640\n", + "advantage_mean: -0.0036\tagent_grad_norm: 2.8306\tcritic_grad_norm: 0.4950\tcritic_loss: 1.0917\n", + "ep_length_mean: 25.0000\tpg_loss: -0.0034\tq_taken_mean: 13.4693\treturn_mean: -10.5366\n", + "return_std: 11.0997\ttarget_mean: 13.4657\ttd_error_abs: 0.7254\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:08:15] my_main Recent Stats | t_env: 17000 | Episode: 680\n", + "advantage_mean: -0.0062\tagent_grad_norm: 2.4501\tcritic_grad_norm: 0.5308\tcritic_loss: 0.9181\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0108\tq_taken_mean: 14.7738\treturn_mean: -9.7938\n", + "return_std: 10.0751\ttarget_mean: 14.7676\ttd_error_abs: 0.6425\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: 
-36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:08:24] my_main Recent Stats | t_env: 18000 | Episode: 720\n", + "advantage_mean: -0.0072\tagent_grad_norm: 2.3061\tcritic_grad_norm: 0.5755\tcritic_loss: 0.7976\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0196\tq_taken_mean: 15.8772\treturn_mean: -9.1509\n", + "return_std: 8.8361\ttarget_mean: 15.8700\ttd_error_abs: 0.5875\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:08:35] my_main Recent Stats | t_env: 19000 | Episode: 760\n", + "advantage_mean: -0.0118\tagent_grad_norm: 2.0626\tcritic_grad_norm: 0.6216\tcritic_loss: 0.6859\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0435\tq_taken_mean: 16.8370\treturn_mean: -9.4669\n", + "return_std: 8.6544\ttarget_mean: 16.8252\ttd_error_abs: 0.5461\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:08:44] my_main Recent Stats | t_env: 20000 | Episode: 800\n", + "advantage_mean: -0.0124\tagent_grad_norm: 1.7033\tcritic_grad_norm: 0.6542\tcritic_loss: 0.6621\n", + "ep_length_mean: 25.0000\tpg_loss: 0.0461\tq_taken_mean: 17.6163\treturn_mean: -9.8666\n", + "return_std: 9.3582\ttarget_mean: 17.6039\ttd_error_abs: 0.5283\ttest_ep_length_mean: 25.0000\n", + "test_return_mean: -36.2057\ttest_return_std: 29.6607\t\n", + "[INFO 07:08:45] my_main Saving models to results/models/mappo_c_seed801163060_pz-mpe-simple-v3_2025-01-20 07:05:09.806787/20025\n", + "[INFO 07:08:45] my_main Finished Training\n", + "Exiting Main\n", + "Stopping all threads\n", + "Thread Thread-1 is alive! 
Is daemon: False\n", + "Thread joined\n", + "Exiting script\n", + "[DEBUG 07:08:46] my_main Finished after 0:03:36.\n", + "[INFO 07:08:46] pymarl Completed after 0:03:36\n", + "[DEBUG 07:08:46] pymarl Stopping Heartbeat\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### View training results" + ], + "metadata": { + "id": "4I9MK2CN-Qgd" + } + }, + { + "cell_type": "code", + "source": [ + "# @title Display reward over time\n", + "env_name = 'pz-mpe-simple-v3'\n", + "alg_name = 'mappo_c'\n", + "\n", + "results_path = '/content/epymarl_continuous/results'\n", + "sacred_path = f'{results_path}/sacred/{alg_name}/pz-mpe-simple-v3'\n", + "results_dir = max([os.path.join(sacred_path, d) for d in os.listdir(sacred_path)], key=os.path.getmtime).split('/')[-1]\n", + "\n", + "models_path = f'{results_path}/models'\n", + "models_path = max([os.path.join(models_path, d) for d in os.listdir(models_path)], key=os.path.getmtime) # Newest run\n", + "models_path = max([os.path.join(models_path, d) for d in os.listdir(models_path)], key=os.path.getmtime) # Last update\n", + "\n", + "with open(f'{sacred_path}/{results_dir}/metrics.json', 'r') as f:\n", + " metrics = json.load(f)\n", + "metrics.keys()\n", + "plt.plot(metrics['return_mean']['values'])" + ], + "metadata": { + "id": "PThcSORUSxIm", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 448 + }, + "cellView": "form", + "outputId": "a925b54d-6ddc-4095-fcef-bb5915dbdff8" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[]" + ] + }, + "metadata": {}, + "execution_count": 11 + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Look up env data\n", + "# Get config dicts\n", + "cfg_path = '/content/epymarl_continuous/src/config'\n", + "alg_name = 'mappo'\n", + "\n", + "def recursive_dict_update(d, u):\n", + " for k, v in u.items():\n", + " if isinstance(v, Mapping):\n", + " d[k] = recursive_dict_update(d.get(k, {}), v)\n", + " else:\n", + " d[k] = v\n", + " return d\n", + "with open(os.path.join(cfg_path,\"default.yaml\",),\"r\",) as f:\n", + " cfg_dict = yaml.load(f, Loader=yaml.FullLoader)\n", + "with open(os.path.join(cfg_path,\"envs\",\"mpe.yaml\",),\"r\",) as f:\n", + " env_dict = yaml.load(f, Loader=yaml.FullLoader)\n", + "with open(os.path.join(cfg_path,\"algs\",f\"{alg_name}.yaml\",),\"r\",) as f:\n", + " alg_dict = yaml.load(f, Loader=yaml.FullLoader)\n", + "config_dict = recursive_dict_update(cfg_dict, env_dict)\n", + "config_dict = recursive_dict_update(cfg_dict, alg_dict)\n", + "#\n", + "config_dict[\"env_args\"][\"seed\"] = 0\n", + "config_dict[\"env_args\"][\"key\"] = env_name\n", + "config_dict[\"env_args\"][\"time_limit\"]=25\n", + "#\n", + "args = SN(**config_dict)\n", + "args.device = \"cpu\"\n", + "args.unique_token = 'TESTTESTTEST'\n", + "#\n", + "runner = r_REGISTRY[args.runner](args=args,logger=None) # Some algs () use parallel. batch_size_run increases parallelism. ippo, coma, ia2c, maa2c, mappo\n", + "env_info = runner.get_env_info()\n", + "args.n_agents = env_info[\"n_agents\"]\n", + "args.n_actions = env_info[\"n_actions\"]\n", + "args.state_shape = env_info[\"state_shape\"]" + ], + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Fc21TOLrKjqN", + "outputId": "4b44da63-713e-4369-8efe-c4ceba76f7fd" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "!!! 
USING MPE ENVIRONMENT\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Display a sample run\n", + "args_temp = copy.copy(args)\n", + "args_temp.use_rnn = False # Since the default is apparently 'true' for MAPPO but not MAPPO_NS\n", + "args_temp.n_actions *= 2\n", + "agent = RNNAgent(env_info['obs_shape'], args_temp)\n", + "agent.load_state_dict(torch.load(f'{models_path}/agent.th', map_location=lambda storage, loc: storage))\n", + "selector = ContinuousSelector(args_temp)\n", + "\n", + "env_cfg = {'max_cycles': 75, 'continuous_actions': True}\n", + "target_env = simple_v3\n", + "\n", + "env = target_env.env(**env_cfg)\n", + "while isinstance(env, BaseWrapper): # Discard all wrappers\n", + " env = env.env\n", + "env.reset()\n", + "try:\n", + " env.env.render_mode = 'rgb_array' # We want a video of what our agent is doing.\n", + "except Exception:\n", + " env.render_mode = 'rgb_array' # For e.g. Pettingzoo, with no base env below.\n", + "\n", + "env.reset()\n", + "rewards = defaultdict(lambda: 0)\n", + "random_act = False\n", + "images = []\n", + "\n", + "# We have one agent, so pad the observation with zeros and add a one-hot to the end.\n", + "def act(a, obs, selector):\n", + " #obs = np.pad(obs, (0, env_info['obs_shape'] + len(env.agents) - len(obs)))\n", + " #obs[-len(env.agents):] = agent_ids[a]\n", + " agent_out, _ = agent(torch.tensor(obs), torch.tensor([]))\n", + " agent_out = agent_out.unsqueeze(0).unsqueeze(0)\n", + " action = selector.select_action(agent_out, None, None)\n", + " return action.squeeze()\n", + "\n", + "while (True):\n", + " selected_agent = env.agent_selection\n", + " if (env.terminations[selected_agent] or env.truncations[selected_agent]):\n", + " break # Terminated or truncated.\n", + " obs = env.observe(selected_agent)\n", + " if (random_act): # Something to compare against, as a baseline.\n", + " action = env.action_space(selected_agent).sample()\n", + " else:\n", + " action = act(selected_agent, obs, 
selector)\n", + " env.step(action)\n", + " rewards[selected_agent] += env.rewards[selected_agent]\n", + " images.append(env.render())\n", + "print(rewards)\n", + "\n", + "# Display the video. Takes a few minutes with large images.\n", + "new_size = 200\n", + "gif_list = [Image.fromarray(x).resize((new_size,new_size)) for x in images]\n", + "gif_list[0].save(\"./test.gif\", save_all=True, append_images=gif_list[1:])\n", + "IPython.display.Image(filename='./test.gif')" + ], + "metadata": { + "id": "AfFEXmYzTBvI", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 292 + }, + "outputId": "bc73fd68-5d57-42bc-b1c1-58ee8cebc7a0", + "cellView": "form" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + ":6: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " agent.load_state_dict(torch.load(f'{models_path}/agent.th', map_location=lambda storage, loc: storage))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "defaultdict( at 0x7b2c3b28e200>, {'agent_0': -15.912209230865944})\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "image/gif": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "ZoRd5y2_i7su" + }, + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index de960f82..969163df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ pygame pyparsing pytest python-dateutil -PyYAML==5.3.1 +PyYAML requests sacred seaborn @@ -34,3 +34,4 @@ urllib3 websocket-client whichcraft wrapt +sacred \ No newline at end of file diff --git a/src/components/action_selectors.py b/src/components/action_selectors.py index 9115b84a..241b0bb0 100644 --- a/src/components/action_selectors.py +++ b/src/components/action_selectors.py @@ -1,14 +1,13 @@ import torch as th +import torch.nn.functional as F from torch.distributions import Categorical from .epsilon_schedules import DecayThenFlatSchedule REGISTRY = {} -class MultinomialActionSelector(): - +class MultinomialActionSelector(): # Multinomial distribution of action probabilities def __init__(self, args): self.args = args - self.schedule = DecayThenFlatSchedule(args.epsilon_start, args.epsilon_finish, args.epsilon_anneal_time, decay="linear") self.epsilon = self.schedule.eval(0) @@ -17,62 +16,65 @@ def __init__(self, args): def 
select_action(self, agent_inputs, avail_actions, t_env, test_mode=False): masked_policies = agent_inputs.clone() masked_policies[avail_actions == 0.0] = 0.0 - self.epsilon = self.schedule.eval(t_env) - if test_mode and self.test_greedy: picked_actions = masked_policies.max(dim=2)[1] else: picked_actions = Categorical(masked_policies).sample().long() - return picked_actions - REGISTRY["multinomial"] = MultinomialActionSelector - -class EpsilonGreedyActionSelector(): - +class EpsilonGreedyActionSelector(): # epsilon greedy action selection def __init__(self, args): self.args = args - - self.schedule = DecayThenFlatSchedule(args.epsilon_start, args.epsilon_finish, args.epsilon_anneal_time, - decay="linear") + self.schedule = DecayThenFlatSchedule(args.epsilon_start, args.epsilon_finish, args.epsilon_anneal_time, decay="linear") self.epsilon = self.schedule.eval(0) - def select_action(self, agent_inputs, avail_actions, t_env, test_mode=False): - # Assuming agent_inputs is a batch of Q-Values for each agent bav self.epsilon = self.schedule.eval(t_env) - if test_mode: # Greedy action selection only self.epsilon = self.args.evaluation_epsilon - # mask actions that are excluded from selection masked_q_values = agent_inputs.clone() masked_q_values[avail_actions == 0.0] = -float("inf") # should never be selected! 
- random_numbers = th.rand_like(agent_inputs[:, :, 0]) pick_random = (random_numbers < self.epsilon).long() random_actions = Categorical(avail_actions.float()).sample().long() - picked_actions = pick_random * random_actions + (1 - pick_random) * masked_q_values.max(dim=2)[1] return picked_actions - REGISTRY["epsilon_greedy"] = EpsilonGreedyActionSelector - -class SoftPoliciesSelector(): - +class SoftPoliciesSelector(): # Categorical distribution, softmaxed action logits def __init__(self, args): self.args = args - def select_action(self, agent_inputs, avail_actions, t_env, test_mode=False): m = Categorical(agent_inputs) picked_actions = m.sample().long() return picked_actions +REGISTRY["soft_policies"] = SoftPoliciesSelector -REGISTRY["soft_policies"] = SoftPoliciesSelector \ No newline at end of file +class ContinuousSelector(): # Means and standard deviations of normal distributions. [:k], [k:] + def __init__(self, args): + self.args = args + def select_action(self,agent_inputs,avail_actions,t_env,test_mode=False): + with th.no_grad(): + #print("SELECT ACTION called!") + #print(avail_actions) + k = agent_inputs.shape[-1]//2 + #print(f"!!!! {k}") + #print(agent_inputs) + #print(agent_inputs.shape) + u, var = agent_inputs[:,:,:k], agent_inputs[:,:,k:] + u, var = F.tanh(u), th.sqrt(F.softplus(var)) + action_dist = th.distributions.Normal(u, var) + action = action_dist.sample() + # For now, clip actions to [0,1] manually. + action = th.clip(action,min=0,max=1) + #print(action) + return action + +REGISTRY["continuous"] = ContinuousSelector \ No newline at end of file diff --git a/src/components/episode_buffer.py b/src/components/episode_buffer.py index ca75eab5..269d36c1 100644 --- a/src/components/episode_buffer.py +++ b/src/components/episode_buffer.py @@ -86,7 +86,8 @@ def to(self, device): def update(self, data, bs=slice(None), ts=slice(None), mark_filled=True): slices = self._parse_slices((bs, ts)) - for k, v in data.items(): + #print(f"!!! 
Updating buffer with data: {data}") + for k, v in data.items(): # Maps each field to save to its value. if k in self.data.transition_data: target = self.data.transition_data if mark_filled: @@ -245,5 +246,4 @@ def __repr__(self): return "ReplayBuffer. {}/{} episodes. Keys:{} Groups:{}".format(self.episodes_in_buffer, self.buffer_size, self.scheme.keys(), - self.groups.keys()) - + self.groups.keys()) \ No newline at end of file diff --git a/src/config/algs/mappo_c.yaml b/src/config/algs/mappo_c.yaml new file mode 100644 index 00000000..00ee92eb --- /dev/null +++ b/src/config/algs/mappo_c.yaml @@ -0,0 +1,34 @@ +# --- MAPPO specific parameters --- + +action_selector: "continuous" +mask_before_softmax: False + +runner: "parallel" + +buffer_size: 10 +batch_size_run: 10 +batch_size: 10 + +# update the target network every {} training steps +target_update_interval_or_tau: 0.01 + +lr: 0.0003 +hidden_dim: 128 + +obs_agent_id: False +obs_last_action: False +obs_individual_obs: False + +agent_output_type: "continuous_u_std" # Continuous means and stds +learner: "ppo_c_learner" +entropy_coef: 0.001 +use_rnn: False +standardise_returns: False +standardise_rewards: True +q_nstep: 5 # 1 corresponds to normal r + gammaV +critic_type: "cv_critic" +epochs: 4 +eps_clip: 0.2 +name: "mappo_c" + +t_max: 20050000 diff --git a/src/config/envs/mpe.yaml b/src/config/envs/mpe.yaml new file mode 100644 index 00000000..2ee6fafb --- /dev/null +++ b/src/config/envs/mpe.yaml @@ -0,0 +1,16 @@ +env: "gymma" + +env_args: + key: null + time_limit: 100 + pretrained_wrapper: null + max_cycles: 100 + continuous_actions: False + +test_greedy: True +test_nepisode: 100 +test_interval: 50000 +log_interval: 50000 +runner_log_interval: 10000 +learner_log_interval: 10000 +t_max: 2050000 \ No newline at end of file diff --git a/src/config/envs/mpe_continuous.yaml b/src/config/envs/mpe_continuous.yaml new file mode 100644 index 00000000..20be82d5 --- /dev/null +++ b/src/config/envs/mpe_continuous.yaml @@ -0,0 
+1,16 @@ +env: "gymma" + +env_args: + key: null + time_limit: 100 + pretrained_wrapper: null + max_cycles: 100 + continuous_actions: True + +test_greedy: True +test_nepisode: 100 +test_interval: 50000 +log_interval: 50000 +runner_log_interval: 10000 +learner_log_interval: 10000 +t_max: 2050000 \ No newline at end of file diff --git a/src/controllers/basic_controller.py b/src/controllers/basic_controller.py index a64d3bef..34825556 100644 --- a/src/controllers/basic_controller.py +++ b/src/controllers/basic_controller.py @@ -19,24 +19,26 @@ def __init__(self, scheme, groups, args): def select_actions(self, ep_batch, t_ep, t_env, bs=slice(None), test_mode=False): # Only select actions for the selected batch elements in bs avail_actions = ep_batch["avail_actions"][:, t_ep] + # agent_outputs is log probabilities here. agent_outputs = self.forward(ep_batch, t_ep, test_mode=test_mode) chosen_actions = self.action_selector.select_action(agent_outputs[bs], avail_actions[bs], t_env, test_mode=test_mode) return chosen_actions def forward(self, ep_batch, t, test_mode=False): + # Passes agent_inputs = self._build_inputs(ep_batch, t) avail_actions = ep_batch["avail_actions"][:, t] agent_outs, self.hidden_states = self.agent(agent_inputs, self.hidden_states) - # Softmax the agent outputs if they're policy logits if self.agent_output_type == "pi_logits": - if getattr(self.args, "mask_before_softmax", True): # Make the logits for unavailable actions very negative to minimise their affect on the softmax reshaped_avail_actions = avail_actions.reshape(ep_batch.batch_size * self.n_agents, -1) agent_outs[reshaped_avail_actions == 0] = -1e10 agent_outs = th.nn.functional.softmax(agent_outs, dim=-1) - + elif self.agent_output_type == "continuous_u_std": + # Runs before select_actions; SA has it from here. 
+ pass return agent_outs.view(ep_batch.batch_size, self.n_agents, -1) def init_hidden(self, batch_size): @@ -58,6 +60,7 @@ def load_models(self, path): self.agent.load_state_dict(th.load("{}/agent.th".format(path), map_location=lambda storage, loc: storage)) def _build_agents(self, input_shape): + print(f"BUILDING AGENTS! ARGS.N_ACTIONS = {self.args.n_actions}") self.agent = agent_REGISTRY[self.args.agent](input_shape, self.args) def _build_inputs(self, batch, t): @@ -83,5 +86,4 @@ def _get_input_shape(self, scheme): input_shape += scheme["actions_onehot"]["vshape"][0] if self.args.obs_agent_id: input_shape += self.n_agents - - return input_shape + return input_shape \ No newline at end of file diff --git a/src/envs/__init__.py b/src/envs/__init__.py index 55fec42a..c35ef307 100644 --- a/src/envs/__init__.py +++ b/src/envs/__init__.py @@ -3,7 +3,7 @@ from .multiagentenv import MultiAgentEnv from .gymma import GymmaWrapper -from .smaclite_wrapper import SMACliteWrapper +#from .smaclite_wrapper import SMACliteWrapper # Appears to be broken at the moment. if sys.platform == "linux": diff --git a/src/envs/gymma.py b/src/envs/gymma.py index aeba61fa..aa4cac86 100644 --- a/src/envs/gymma.py +++ b/src/envs/gymma.py @@ -8,22 +8,9 @@ from .multiagentenv import MultiAgentEnv from .wrappers import FlattenObservation +from .pz_wrapper import PettingZooWrapper # noqa import envs.pretrained as pretrained # noqa -try: - from .pz_wrapper import PettingZooWrapper # noqa -except ImportError: - warnings.warn( - "PettingZoo is not installed, so these environments will not be available! To install, run `pip install pettingzoo`" - ) - -try: - from .vmas_wrapper import VMASWrapper # noqa -except ImportError: - warnings.warn( - "VMAS is not installed, so these environments will not be available! 
To install, run `pip install 'vmas[gymnasium]'`" - ) - class GymmaWrapper(MultiAgentEnv): def __init__( @@ -48,7 +35,17 @@ def __init__( self._obs = None self._info = None - self.longest_action_space = max(self._env.action_space, key=lambda x: x.n) + try: + self.longest_action_space = max(self._env.action_space, key=lambda x: x.n) + self.cont_space = False + self.continuous_action_space = False + except Exception as e: + print('!!! Using continuous action space') + self.cont_space = True + self.longest_action_space = max(self._env.action_space, key=lambda x: x.shape) + self.action_space_min = min(self._env.action_space, key=lambda x: x.low).low + self.action_space_max = max(self._env.action_space, key=lambda x: x.high).high + self.continuous_action_space = True self.longest_observation_space = max( self._env.observation_space, key=lambda x: x.shape ) @@ -83,7 +80,11 @@ def _pad_observation(self, obs): def step(self, actions): """Returns obss, reward, terminated, truncated, info""" - actions = [int(a) for a in actions] + #print(f"!!!!! 
processing actions: {actions}") + if (self.cont_space): + actions = [np.array(a) for a in actions] + else: + actions = [int(a) for a in actions] obs, reward, done, truncated, self._info = self._env.step(actions) self._obs = self._pad_observation(obs) @@ -129,7 +130,10 @@ def get_avail_actions(self): def get_avail_agent_actions(self, agent_id): """Returns the available actions for agent_id""" valid = flatdim(self._env.action_space[agent_id]) * [1] - invalid = [0] * (self.longest_action_space.n - len(valid)) + if (self.cont_space): + invalid = [0] * (self.longest_action_space.shape[0] - len(valid)) + else: + invalid = [0] * (self.longest_action_space.n - len(valid)) return valid + invalid def get_total_actions(self): @@ -137,6 +141,19 @@ def get_total_actions(self): # TODO: This is only suitable for a discrete 1 dimensional action space for each agent return flatdim(self.longest_action_space) + def get_env_info(self): + env_info = { + "state_shape": self.get_state_size(), + "obs_shape": self.get_obs_size(), + "n_actions": self.get_total_actions(), + "n_agents": self.n_agents, + "episode_limit": self.episode_limit, + } + if (self.continuous_action_space): + env_info['action_min'] = self.action_space_min + env_info['action_max'] = self.action_space_max + return env_info + def reset(self, seed=None, options=None): """Returns initial observations and info""" obs, info = self._env.reset(seed=seed, options=options) diff --git a/src/envs/pz_wrapper.py b/src/envs/pz_wrapper.py index 6fac1df3..1ac2343a 100644 --- a/src/envs/pz_wrapper.py +++ b/src/envs/pz_wrapper.py @@ -22,10 +22,10 @@ def __init__(self, lib_name, env_name, **kwargs): self.last_obs = None self.action_space = Tuple( - tuple([self._env.action_spaces[k] for k in self._env.agents]) + tuple([self._env.action_space(k) for k in self._env.agents]) ) self.observation_space = Tuple( - tuple([self._env.observation_spaces[k] for k in self._env.agents]) + tuple([self._env.observation_space(k) for k in self._env.agents]) ) 
import copy
from typing import TYPE_CHECKING

import torch as th
import torch.nn.functional as F
from torch.optim import Adam

if TYPE_CHECKING:  # only needed for the `train` annotation
    from components.episode_buffer import EpisodeBatch


class PPOLearner_C:
    """PPO learner for continuous action spaces.

    The multi-agent controller (``mac``) is expected to emit, for every agent,
    a vector whose first half parameterises the means and whose second half
    parameterises the variances of an independent Normal distribution per
    action dimension (see ``construct_cont_dist``).

    ``args`` must provide ``action_min`` / ``action_max`` (per-dimension action
    bounds) in addition to the usual PPO hyper-parameters read below.
    """

    def __init__(self, mac, scheme, logger, args):
        """Build optimisers, critics and running statistics.

        Args:
            mac: controller being trained; a deep copy is kept as ``old_mac``.
            scheme: data scheme forwarded to the critic constructor.
            logger: object exposing ``log_stat(key, value, t_env)``.
            args: experiment configuration namespace.
        """
        # Project-local modules are imported lazily so that importing this
        # module does not require the whole project on ``sys.path``.
        from components.standarize_stream import RunningMeanStd
        from modules.critics import REGISTRY as critic_registry

        self.args = args
        self.n_agents = args.n_agents
        self.n_actions = args.n_actions
        # Per-dimension action bounds used to constrain the policy distribution.
        self.action_min = th.as_tensor(args.action_min, dtype=th.float32)
        self.action_max = th.as_tensor(args.action_max, dtype=th.float32)
        print(f"Action min/max = {self.action_min}/{self.action_max}")
        self.logger = logger

        self.mac = mac
        self.old_mac = copy.deepcopy(mac)
        self.agent_params = list(mac.parameters())
        self.agent_optimiser = Adam(params=self.agent_params, lr=args.lr)

        self.critic = critic_registry[args.critic_type](scheme, args)
        self.target_critic = copy.deepcopy(self.critic)

        self.critic_params = list(self.critic.parameters())
        self.critic_optimiser = Adam(params=self.critic_params, lr=args.lr)

        self.last_target_update_step = 0
        self.critic_training_steps = 0
        # Guarantees that the very first call to `train` logs statistics.
        self.log_stats_t = -self.args.learner_log_interval - 1

        device = "cuda" if args.use_cuda else "cpu"
        if self.args.standardise_returns:
            self.ret_ms = RunningMeanStd(shape=(self.n_agents,), device=device)
        if self.args.standardise_rewards:
            rew_shape = (1,) if self.args.common_reward else (self.n_agents,)
            self.rew_ms = RunningMeanStd(shape=rew_shape, device=device)

    #############################################################
    ## ----- Retrieves log-probabilities of taken actions ---- ##
    #############################################################
    def get_discrete_log_probs(self, batch, mask, actions, mac):
        """Return log-probabilities of the discrete ``actions`` under ``mac``.

        Args:
            batch: episode batch exposing ``batch_size`` and ``max_seq_length``.
            mask: tensor indexing the leading dims of the controller output;
                entries equal to 0 have their probabilities forced to 1.
            actions: long tensor of action indices with a trailing dim of 1.
            mac: controller to evaluate (current or old policy).

        Returns:
            Tensor of log-probabilities with the trailing action dim removed.
        """
        mac_out = []
        mac.init_hidden(batch.batch_size)  # only matters for recurrent agents
        for t in range(batch.max_seq_length - 1):
            mac_out.append(mac.forward(batch, t=t))
        p_i = th.stack(mac_out, dim=1)  # concatenate over time
        # Masked steps get probability 1 so that their log-probability is 0.
        p_i[mask == 0] = 1.0
        p_i_taken = th.gather(p_i, dim=3, index=actions).squeeze(3)
        return th.log(p_i_taken + 1e-10)

    def construct_cont_dist(self, o):
        """Turn raw controller outputs into a bounded Normal distribution.

        The last dimension of ``o`` is split in half: the first half is passed
        through ``tanh`` and clamped to ``[action_min, action_max]`` to give
        the means; the second half is passed through ``softplus`` and a square
        root to give standard deviations, clamped to ``[1e-3, range / 2]``.

        Args:
            o: controller output whose last dimension has size
                ``2 * action_dim``.

        Returns:
            ``torch.distributions.Normal`` over the action dimensions.
        """
        n_dims = o.shape[-1] // 2
        raw_means, raw_vars = o[..., :n_dims], o[..., n_dims:]

        # Keep the bounds on the same device/dtype as the network output so
        # clamping neither fails on GPU nor silently promotes the dtype.
        low = self.action_min.to(device=o.device, dtype=o.dtype)
        high = self.action_max.to(device=o.device, dtype=o.dtype)

        means = th.clamp(th.tanh(raw_means), min=low, max=high)
        stds = th.sqrt(F.softplus(raw_vars))
        # Std-dev is limited to half the action range (heuristic), with a
        # small positive floor so the distribution never degenerates.
        std_floor = th.full_like(high, 1e-3)
        std_ceil = (high - low) / 2
        stds = th.clamp(stds, min=std_floor, max=std_ceil)
        return th.distributions.Normal(means, stds)

    def get_continuous_log_probs(self, batch, mask, actions, mac):
        """Return per-step joint log-probabilities of ``actions`` under ``mac``.

        Action dimensions are treated as independent, so the joint
        log-probability (and entropy) is the sum over the last axis.

        Args:
            batch: episode batch exposing ``batch_size`` and ``max_seq_length``.
            mask: unused; kept for signature parity with the discrete variant.
            actions: continuous actions matching the distribution's shape.
            mac: controller to evaluate (``self.mac`` or ``self.old_mac``).

        Returns:
            ``(log_pi, entropy)``. Both have the shape of the controller
            output stacked over time, minus its last axis. ``entropy`` is the
            scalar ``0`` unless ``mac`` is the controller being trained.
        """
        # NOTE(review): assumes mac.forward returns (batch, n_agents, 2 * dim)
        # per step, as in the author's debug notes — confirm against the MAC.
        mac_out = []
        mac.init_hidden(batch.batch_size)  # only matters for recurrent agents
        for t in range(batch.max_seq_length - 1):
            mac_out.append(mac.forward(batch, t=t))
        mac_out = th.stack(mac_out, dim=1)  # concatenate over time

        dists = self.construct_cont_dist(mac_out)
        log_pi = dists.log_prob(actions).sum(dim=-1)
        entropy = dists.entropy().sum(dim=-1) if mac is self.mac else 0
        return log_pi, entropy

    #############################################################
    ## --------------------- Runs Training ------------------- ##
    #############################################################
    def train(self, batch: "EpisodeBatch", t_env: int, episode_num: int):
        """Run ``args.epochs`` PPO updates on ``batch`` and log statistics.

        Args:
            batch: episode batch with ``reward``, ``actions``, ``terminated``
                and ``filled`` entries.
            t_env: current environment step, used for logging cadence.
            episode_num: unused; kept for the learner interface.

        Raises:
            RuntimeError: if the policy loss becomes NaN or infinite.
        """
        rewards = batch["reward"][:, :-1]
        actions = batch["actions"][:, :-1]
        terminated = batch["terminated"][:, :-1].float()
        mask = batch["filled"][:, :-1].float()
        # Steps after a terminal transition must not contribute to the loss.
        mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1])

        if self.args.standardise_rewards:
            self.rew_ms.update(rewards)
            rewards = (rewards - self.rew_ms.mean) / th.sqrt(self.rew_ms.var)

        if self.args.common_reward:
            assert (
                rewards.size(2) == 1
            ), "Expected singular agent dimension for common rewards"
            # reshape rewards to be of shape (batch_size, episode_length, n_agents)
            rewards = rewards.expand(-1, -1, self.n_agents)

        mask = mask.repeat(1, 1, self.n_agents)
        critic_mask = mask.clone()

        # Behaviour-policy log-probabilities are constants of the objective.
        with th.no_grad():
            old_log_pi, _ = self.get_continuous_log_probs(
                batch, mask, actions, self.old_mac
            )

        for _ in range(self.args.epochs):
            # The critic update does not influence action probabilities, so it
            # is performed before evaluating the current policy.
            advantages, critic_train_stats = self.train_critic_sequential(
                self.critic, self.target_critic, batch, rewards, critic_mask
            )
            advantages = advantages.detach()

            log_pi, entropy = self.get_continuous_log_probs(
                batch, mask, actions, self.mac
            )

            ratios = th.exp(log_pi - old_log_pi)
            surr1 = ratios * advantages
            surr2 = (
                th.clamp(ratios, 1 - self.args.eps_clip, 1 + self.args.eps_clip)
                * advantages
            )

            pg_loss = (
                -(
                    (th.min(surr1, surr2) + self.args.entropy_coef * entropy) * mask
                ).sum()
                / mask.sum()
            )

            if not th.isfinite(pg_loss):
                # Fail loudly with the diagnostics needed to locate the source.
                raise RuntimeError(
                    f"Non-finite policy loss ({pg_loss.item()}): "
                    f"log_pi in [{log_pi.min().item()}, {log_pi.max().item()}], "
                    f"ratios in [{ratios.min().item()}, {ratios.max().item()}], "
                    f"surr1 in [{surr1.min().item()}, {surr1.max().item()}], "
                    f"surr2 in [{surr2.min().item()}, {surr2.max().item()}], "
                    f"entropy in [{entropy.min().item()}, {entropy.max().item()}]"
                )

            # Optimise agents
            self.agent_optimiser.zero_grad()
            pg_loss.backward()
            grad_norm = th.nn.utils.clip_grad_norm_(
                self.agent_params, self.args.grad_norm_clip
            )
            self.agent_optimiser.step()

        self.old_mac.load_state(self.mac)

        self.critic_training_steps += 1
        if (
            self.args.target_update_interval_or_tau > 1
            and (self.critic_training_steps - self.last_target_update_step)
            / self.args.target_update_interval_or_tau
            >= 1.0
        ):
            self._update_targets_hard()
            self.last_target_update_step = self.critic_training_steps
        elif self.args.target_update_interval_or_tau <= 1.0:
            self._update_targets_soft(self.args.target_update_interval_or_tau)

        if t_env - self.log_stats_t >= self.args.learner_log_interval:
            ts_logged = len(critic_train_stats["critic_loss"])
            for key in [
                "critic_loss",
                "critic_grad_norm",
                "td_error_abs",
                "q_taken_mean",
                "target_mean",
            ]:
                self.logger.log_stat(
                    key, sum(critic_train_stats[key]) / ts_logged, t_env
                )

            self.logger.log_stat(
                "advantage_mean",
                (advantages * mask).sum().item() / mask.sum().item(),
                t_env,
            )
            self.logger.log_stat("pg_loss", pg_loss.item(), t_env)
            self.logger.log_stat("agent_grad_norm", grad_norm.item(), t_env)
            # "pi_max" is not logged: it has no direct analogue for
            # continuous action distributions.
            self.log_stats_t = t_env

    def train_critic_sequential(self, critic, target_critic, batch, rewards, mask):
        """Take one critic gradient step and return the masked TD error.

        Args:
            critic: value network being optimised.
            target_critic: network providing bootstrap targets.
            batch: episode batch passed to both critics.
            rewards: per-step rewards used to build n-step returns.
            mask: validity mask applied to the TD error.

        Returns:
            ``(masked_td_error, running_log)`` where ``running_log`` maps the
            statistic names to single-element lists.
        """
        with th.no_grad():
            target_vals = target_critic(batch)
            target_vals = target_vals.squeeze(3)

        if self.args.standardise_returns:
            # Undo the standardisation before bootstrapping.
            target_vals = target_vals * th.sqrt(self.ret_ms.var) + self.ret_ms.mean

        target_returns = self.nstep_returns(
            rewards, mask, target_vals, self.args.q_nstep
        )
        if self.args.standardise_returns:
            self.ret_ms.update(target_returns)
            target_returns = (target_returns - self.ret_ms.mean) / th.sqrt(
                self.ret_ms.var
            )

        running_log = {
            "critic_loss": [],
            "critic_grad_norm": [],
            "td_error_abs": [],
            "target_mean": [],
            "q_taken_mean": [],
        }

        v = critic(batch)[:, :-1].squeeze(3)
        td_error = target_returns.detach() - v
        masked_td_error = td_error * mask
        loss = (masked_td_error**2).sum() / mask.sum()

        self.critic_optimiser.zero_grad()
        loss.backward()
        grad_norm = th.nn.utils.clip_grad_norm_(
            self.critic_params, self.args.grad_norm_clip
        )
        self.critic_optimiser.step()

        running_log["critic_loss"].append(loss.item())
        running_log["critic_grad_norm"].append(grad_norm.item())
        mask_elems = mask.sum().item()
        running_log["td_error_abs"].append(
            (masked_td_error.abs().sum().item() / mask_elems)
        )
        running_log["q_taken_mean"].append((v * mask).sum().item() / mask_elems)
        running_log["target_mean"].append(
            (target_returns * mask).sum().item() / mask_elems
        )

        return masked_td_error, running_log

    def nstep_returns(self, rewards, mask, values, nsteps):
        """Compute masked n-step bootstrapped returns for every start step.

        Args:
            rewards: tensor indexed as ``rewards[:, t]``.
            mask: validity mask indexed like ``rewards``.
            values: value estimates with one more time step than ``rewards``.
            nsteps: number of reward steps before bootstrapping from ``values``.

        Returns:
            Tensor shaped like ``values[:, :-1]``.
        """
        nstep_values = th.zeros_like(values[:, :-1])
        for t_start in range(rewards.size(1)):
            nstep_return_t = th.zeros_like(values[:, 0])
            for step in range(nsteps + 1):
                t = t_start + step
                if t >= rewards.size(1):
                    break
                elif step == nsteps:
                    # Bootstrap from the value estimate after `nsteps` rewards.
                    nstep_return_t += (
                        self.args.gamma ** (step) * values[:, t] * mask[:, t]
                    )
                elif t == rewards.size(1) - 1 and self.args.add_value_last_step:
                    # Last recorded step: optionally bootstrap from the final value.
                    nstep_return_t += (
                        self.args.gamma ** (step) * rewards[:, t] * mask[:, t]
                    )
                    nstep_return_t += self.args.gamma ** (step + 1) * values[:, t + 1]
                else:
                    nstep_return_t += (
                        self.args.gamma ** (step) * rewards[:, t] * mask[:, t]
                    )
            nstep_values[:, t_start, :] = nstep_return_t
        return nstep_values

    def _update_targets(self):
        """Alias of ``_update_targets_hard``."""
        self._update_targets_hard()

    def _update_targets_hard(self):
        """Copy the critic weights into the target critic."""
        self.target_critic.load_state_dict(self.critic.state_dict())

    def _update_targets_soft(self, tau):
        """Polyak-average the critic into the target critic with rate ``tau``."""
        for target_param, param in zip(
            self.target_critic.parameters(), self.critic.parameters()
        ):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def cuda(self):
        """Move controllers, critics and action bounds to the GPU."""
        self.old_mac.cuda()
        self.mac.cuda()
        self.critic.cuda()
        self.target_critic.cuda()
        self.action_min = self.action_min.cuda()
        self.action_max = self.action_max.cuda()

    def save_models(self, path):
        """Save controller, critic and both optimiser states under ``path``."""
        self.mac.save_models(path)
        th.save(self.critic.state_dict(), "{}/critic.th".format(path))
        th.save(self.agent_optimiser.state_dict(), "{}/agent_opt.th".format(path))
        th.save(self.critic_optimiser.state_dict(), "{}/critic_opt.th".format(path))

    def load_models(self, path):
        """Restore the state written by ``save_models`` from ``path``."""
        self.mac.load_models(path)
        self.critic.load_state_dict(
            th.load(
                "{}/critic.th".format(path), map_location=lambda storage, loc: storage
            )
        )
        # Not quite right but I don't want to save target networks
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.agent_optimiser.load_state_dict(
            th.load(
                "{}/agent_opt.th".format(path),
                map_location=lambda storage, loc: storage,
            )
        )
        self.critic_optimiser.load_state_dict(
            th.load(
                "{}/critic_opt.th".format(path),
                map_location=lambda storage, loc: storage,
            )
        )
+ actions_shape = {"vshape": env_info["n_actions"], "group": "agents", "dtype": th.float} + # Continuous spaces require mean and standard deviation + args.n_actions = env_info["n_actions"] * 2 + print(f"ARGS.N_ACTIONS equals {args.n_actions}") + # Inform the training module about the space shape + args.action_min = env_info["action_min"] + args.action_max = env_info["action_max"] + else: + actions_shape = {"vshape": (1,), "group": "agents", "dtype": th.long} scheme = { "state": {"vshape": env_info["state_shape"]}, "obs": {"vshape": env_info["obs_shape"], "group": "agents"}, - "actions": {"vshape": (1,), "group": "agents", "dtype": th.long}, + "actions": actions_shape, "avail_actions": { "vshape": (env_info["n_actions"],), "group": "agents", diff --git a/src/runners/parallel_runner.py b/src/runners/parallel_runner.py index 5962c60f..dc3c55eb 100644 --- a/src/runners/parallel_runner.py +++ b/src/runners/parallel_runner.py @@ -2,6 +2,7 @@ from multiprocessing import Pipe, Process import numpy as np +import torch from components.episode_buffer import EpisodeBatch from envs import REGISTRY as env_REGISTRY @@ -23,10 +24,14 @@ def __init__(self, args, logger): # registering both smac and smacv2 causes a pysc2 error # --> dynamically register the needed env + self.continuous_actions = False # By default if self.args.env == "sc2": register_smac() elif self.args.env == "sc2v2": register_smacv2() + elif self.args.env == "gymma" and 'pz-mpe' in self.args.env_args['key']: + print("!!! USING MPE ENVIRONMENT") + self.continuous_actions = self.args.env_args['continuous_actions'] env_fn = env_REGISTRY[self.args.env] env_args = [self.args.env_args.copy() for _ in range(self.batch_size)] @@ -137,7 +142,14 @@ def run(self, test_mode=False): cpu_actions = actions.to("cpu").numpy() # Update the actions taken - actions_chosen = {"actions": actions.unsqueeze(1)} + ''' + START OF MODIFIED SECTION + ''' + #print(f"!!! 
PRINTING ACTIONS CHOSEN: {actions}") # tensor([[4]]) in discrete + if (actions.type() == torch.LongTensor): # Add a dimension to continuous actions + actions_chosen = {"actions": actions.unsqueeze(1)} + else: + actions_chosen = {"actions": actions} self.batch.update( actions_chosen, bs=envs_not_terminated, ts=self.t, mark_filled=False ) @@ -154,6 +166,10 @@ def run(self, test_mode=False): if idx == 0 and test_mode and self.args.render: parent_conn.send(("render", None)) + ''' + END OF MODIFIED SECTION + ''' + # Update envs_not_terminated envs_not_terminated = [ b_idx for b_idx, termed in enumerate(terminated) if not termed @@ -352,4 +368,4 @@ def __getstate__(self): def __setstate__(self, ob): import pickle - self.x = pickle.loads(ob) + self.x = pickle.loads(ob) \ No newline at end of file