
Commit fecb769

added reward/entropy/discount conditioning

1 parent 595734b commit fecb769

File tree

10 files changed, +385 -32 lines changed

pufferlib/config/ocean/drive.ini

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ init_steps = 0 # Determines which step of the trajectory to initialize the agent
 control_all_agents = False # this should be set to false unless you specifically want to override and control expert-marked vehicles
 num_policy_controlled_agents = -1 # note: if you set this you likely need to set num_agents to a smaller number
 deterministic_agent_selection = False # if this is true it overrides vehicles marked as expert to be policy controlled
+condition_type = "none" # Options: "none", "reward", "entropy", "discount", "all"

 [train]
 total_timesteps = 2_000_000_000
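The new condition_type key is the single user-facing switch over the three mechanisms. The config layer that translates it into the use_rc/use_ec/use_dc kwargs unpacked in binding.c is not part of this diff; a minimal sketch of the assumed mapping (helper name and placement are hypothetical):

    #include <cstring>

    // Hypothetical mapping from the ini string to the three per-mechanism
    // flags; "all" enables reward, entropy, and discount conditioning at once.
    struct ConditionFlags { bool use_rc; bool use_ec; bool use_dc; };

    static ConditionFlags parse_condition_type(const char* s) {
        ConditionFlags f = {false, false, false};
        bool all = std::strcmp(s, "all") == 0;
        f.use_rc = all || std::strcmp(s, "reward") == 0;
        f.use_ec = all || std::strcmp(s, "entropy") == 0;
        f.use_dc = all || std::strcmp(s, "discount") == 0;
        return f;
    }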

pufferlib/extensions/cuda/pufferlib.cu

Lines changed: 12 additions & 6 deletions
@@ -20,8 +20,8 @@ __host__ __device__ void puff_advantage_row_cuda(float* values, float* rewards,
 }

 void vtrace_check_cuda(torch::Tensor values, torch::Tensor rewards,
-        torch::Tensor dones, torch::Tensor importance, torch::Tensor advantages,
-        int num_steps, int horizon) {
+        torch::Tensor dones, torch::Tensor importance, torch::Tensor advantages,
+        torch::Tensor gammas, int num_steps, int horizon) {

     // Validate input tensors
     torch::Device device = values.device();
@@ -35,24 +35,30 @@ void vtrace_check_cuda(torch::Tensor values, torch::Tensor rewards,
             t.contiguous();
         }
     }
+    // Validate gammas tensor
+    TORCH_CHECK(gammas.dim() == 1, "Gammas must be 1D");
+    TORCH_CHECK(gammas.size(0) == num_steps, "Gammas size must match num_steps");
+    TORCH_CHECK(gammas.dtype() == torch::kFloat32, "Gammas must be float32");
+    TORCH_CHECK(gammas.is_cuda(), "Gammas must be on GPU");
+    TORCH_CHECK(gammas.is_contiguous(), "Gammas must be contiguous");
 }

 // [num_steps, horizon]
 __global__ void puff_advantage_kernel(float* values, float* rewards,
-        float* dones, float* importance, float* advantages, float gamma,
+        float* dones, float* importance, float* advantages, float* gammas,
         float lambda, float rho_clip, float c_clip, int num_steps, int horizon) {
     int row = blockIdx.x*blockDim.x + threadIdx.x;
     if (row >= num_steps) {
         return;
     }
     int offset = row*horizon;
     puff_advantage_row_cuda(values + offset, rewards + offset, dones + offset,
-        importance + offset, advantages + offset, gamma, lambda, rho_clip, c_clip, horizon);
+        importance + offset, advantages + offset, gammas[row], lambda, rho_clip, c_clip, horizon);
 }

 void compute_puff_advantage_cuda(torch::Tensor values, torch::Tensor rewards,
     torch::Tensor dones, torch::Tensor importance, torch::Tensor advantages,
-    double gamma, double lambda, double rho_clip, double c_clip) {
+    torch::Tensor gammas, double lambda, double rho_clip, double c_clip) {
     int num_steps = values.size(0);
     int horizon = values.size(1);
-    vtrace_check_cuda(values, rewards, dones, importance, advantages, num_steps, horizon);
+    vtrace_check_cuda(values, rewards, dones, importance, advantages, gammas, num_steps, horizon);
@@ -67,7 +73,7 @@ void compute_puff_advantage_cuda(torch::Tensor values, torch::Tensor rewards,
         dones.data_ptr<float>(),
         importance.data_ptr<float>(),
         advantages.data_ptr<float>(),
-        gamma,
+        gammas.data_ptr<float>(),
         lambda,
         rho_clip,
         c_clip,
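Taken together, the op now consumes one discount per trajectory row instead of a single scalar gamma. A minimal caller sketch, assuming libtorch and linkage against this extension; buffer shapes follow the validation checks above, and the sampled gamma range is purely illustrative:

    #include <torch/torch.h>

    // Declared in this extension; see the definition above.
    void compute_puff_advantage_cuda(torch::Tensor values, torch::Tensor rewards,
        torch::Tensor dones, torch::Tensor importance, torch::Tensor advantages,
        torch::Tensor gammas, double lambda, double rho_clip, double c_clip);

    void example_cuda_call() {
        int64_t num_steps = 8, horizon = 128;
        auto opts = torch::dtype(torch::kFloat32).device(torch::kCUDA);
        auto values     = torch::zeros({num_steps, horizon}, opts);
        auto rewards    = torch::zeros({num_steps, horizon}, opts);
        auto dones      = torch::zeros({num_steps, horizon}, opts);
        auto importance = torch::ones({num_steps, horizon}, opts);
        auto advantages = torch::zeros({num_steps, horizon}, opts);
        // One gamma per row: float32, contiguous, on GPU -- exactly what
        // vtrace_check_cuda enforces. The 0.9..0.999 range is illustrative.
        auto gammas = torch::empty({num_steps}, opts).uniform_(0.9, 0.999);
        compute_puff_advantage_cuda(values, rewards, dones, importance, advantages,
            gammas, /*lambda=*/0.95, /*rho_clip=*/1.0, /*c_clip=*/1.0);
    }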

pufferlib/extensions/pufferlib.cpp

Lines changed: 14 additions & 8 deletions
@@ -42,7 +42,7 @@ void puff_advantage_row(float* values, float* rewards, float* dones,

 void vtrace_check(torch::Tensor values, torch::Tensor rewards,
     torch::Tensor dones, torch::Tensor importance, torch::Tensor advantages,
-    int num_steps, int horizon) {
+    torch::Tensor gammas, int num_steps, int horizon) {

     // Validate input tensors
     torch::Device device = values.device();
@@ -56,36 +56,42 @@ void vtrace_check(torch::Tensor values, torch::Tensor rewards,
             t.contiguous();
         }
     }
+    // Validate gammas tensor
+    TORCH_CHECK(gammas.dim() == 1, "Gammas must be 1D");
+    TORCH_CHECK(gammas.size(0) == num_steps, "Gammas size must match num_steps");
+    TORCH_CHECK(gammas.dtype() == torch::kFloat32, "Gammas must be float32");
+    TORCH_CHECK(gammas.is_contiguous(), "Gammas must be contiguous");
 }

 // [num_steps, horizon]
 void puff_advantage(float* values, float* rewards, float* dones, float* importance,
-    float* advantages, float gamma, float lambda, float rho_clip, float c_clip,
+    float* advantages, float* gammas, float lambda, float rho_clip, float c_clip,
     int num_steps, const int horizon){
-    for (int offset = 0; offset < num_steps*horizon; offset+=horizon) {
+    for (int row = 0; row < num_steps; row++) {
+        int offset = row * horizon;
         puff_advantage_row(values + offset, rewards + offset,
             dones + offset, importance + offset, advantages + offset,
-            gamma, lambda, rho_clip, c_clip, horizon
+            gammas[row], lambda, rho_clip, c_clip, horizon
         );
     }
 }

 void compute_puff_advantage_cpu(torch::Tensor values, torch::Tensor rewards,
     torch::Tensor dones, torch::Tensor importance, torch::Tensor advantages,
-    double gamma, double lambda, double rho_clip, double c_clip) {
+    torch::Tensor gammas, double lambda, double rho_clip, double c_clip) {
     int num_steps = values.size(0);
     int horizon = values.size(1);
-    vtrace_check(values, rewards, dones, importance, advantages, num_steps, horizon);
+    vtrace_check(values, rewards, dones, importance, advantages, gammas, num_steps, horizon);
     puff_advantage(values.data_ptr<float>(), rewards.data_ptr<float>(),
         dones.data_ptr<float>(), importance.data_ptr<float>(), advantages.data_ptr<float>(),
-        gamma, lambda, rho_clip, c_clip, num_steps, horizon
+        gammas.data_ptr<float>(), lambda, rho_clip, c_clip, num_steps, horizon
     );
 }

 TORCH_LIBRARY(pufferlib, m) {
-    m.def("compute_puff_advantage(Tensor(a!) values, Tensor(b!) rewards, Tensor(c!) dones, Tensor(d!) importance, Tensor(e!) advantages, float gamma, float lambda, float rho_clip, float c_clip) -> ()");
+    m.def("compute_puff_advantage(Tensor(a!) values, Tensor(b!) rewards, Tensor(c!) dones, Tensor(d!) importance, Tensor(e!) advantages, Tensor gammas, float lambda, float rho_clip, float c_clip) -> ()");
 }

 TORCH_LIBRARY_IMPL(pufferlib, CPU, m) {
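Because the registered schema changed, every caller must now pass the extra Tensor argument. A hedged sketch of invoking the new schema through the dispatcher from C++ (Python callers would go through torch.ops.pufferlib.compute_puff_advantage instead; the wrapper name below is hypothetical):

    #include <ATen/core/dispatch/Dispatcher.h>
    #include <torch/torch.h>

    // Hypothetical wrapper: resolves the re-registered schema once, then calls it.
    void call_compute_puff_advantage(torch::Tensor values, torch::Tensor rewards,
            torch::Tensor dones, torch::Tensor importance, torch::Tensor advantages,
            torch::Tensor gammas, double lambda, double rho_clip, double c_clip) {
        static auto op = c10::Dispatcher::singleton()
            .findSchemaOrThrow("pufferlib::compute_puff_advantage", "")
            .typed<void(at::Tensor, at::Tensor, at::Tensor, at::Tensor,
                        at::Tensor, at::Tensor, double, double, double)>();
        op.call(values, rewards, dones, importance, advantages, gammas,
                lambda, rho_clip, c_clip);
    }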

pufferlib/ocean/drive/binding.c

Lines changed: 16 additions & 0 deletions
@@ -182,6 +182,22 @@ static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
     env->control_all_agents = unpack(kwargs, "control_all_agents");
     env->deterministic_agent_selection = unpack(kwargs, "deterministic_agent_selection");
     env->control_non_vehicles = (int)unpack(kwargs, "control_non_vehicles");
+
+    // Conditioning parameters
+    env->use_rc = (bool)unpack(kwargs, "use_rc");
+    env->use_ec = (bool)unpack(kwargs, "use_ec");
+    env->use_dc = (bool)unpack(kwargs, "use_dc");
+    env->collision_weight_lb = (float)unpack(kwargs, "collision_weight_lb");
+    env->collision_weight_ub = (float)unpack(kwargs, "collision_weight_ub");
+    env->offroad_weight_lb = (float)unpack(kwargs, "offroad_weight_lb");
+    env->offroad_weight_ub = (float)unpack(kwargs, "offroad_weight_ub");
+    env->goal_weight_lb = (float)unpack(kwargs, "goal_weight_lb");
+    env->goal_weight_ub = (float)unpack(kwargs, "goal_weight_ub");
+    env->entropy_weight_lb = (float)unpack(kwargs, "entropy_weight_lb");
+    env->entropy_weight_ub = (float)unpack(kwargs, "entropy_weight_ub");
+    env->discount_weight_lb = (float)unpack(kwargs, "discount_weight_lb");
+    env->discount_weight_ub = (float)unpack(kwargs, "discount_weight_ub");
+
     int map_id = unpack(kwargs, "map_id");
     int max_agents = unpack(kwargs, "max_agents");
     int init_steps = unpack(kwargs, "init_steps");
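Each conditioning mechanism contributes a lower/upper bound pair, which makes my_init() quite repetitive. A hypothetical helper (not in this commit) that factors out the pattern, assuming unpack() is the kwargs reader already used above:

    #include <Python.h>

    // Hypothetical: reads a lb/ub pair via the existing unpack() kwargs reader.
    static void unpack_bounds(PyObject* kwargs, const char* lb_key,
            const char* ub_key, float* lb, float* ub) {
        *lb = (float)unpack(kwargs, lb_key);
        *ub = (float)unpack(kwargs, ub_key);
    }

    // Usage:
    // unpack_bounds(kwargs, "collision_weight_lb", "collision_weight_ub",
    //     &env->collision_weight_lb, &env->collision_weight_ub);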

pufferlib/ocean/drive/drive.h

Lines changed: 116 additions & 5 deletions
@@ -119,6 +119,12 @@ struct Log {
     float active_agent_count;
     float expert_static_car_count;
     float static_car_count;
+    // Conditioning metrics
+    float avg_collision_weight;
+    float avg_offroad_weight;
+    float avg_goal_weight;
+    float avg_entropy_weight;
+    float avg_discount_weight;
 };

 typedef struct Entity Entity;
@@ -287,6 +293,27 @@ struct Drive {
     char* ini_file;
     int scenario_length;
     int control_non_vehicles;
+    // Reward conditioning
+    bool use_rc;
+    float collision_weight_lb;
+    float collision_weight_ub;
+    float offroad_weight_lb;
+    float offroad_weight_ub;
+    float goal_weight_lb;
+    float goal_weight_ub;
+    float* collision_weights;
+    float* offroad_weights;
+    float* goal_weights;
+    // Entropy conditioning
+    bool use_ec;
+    float entropy_weight_lb;
+    float entropy_weight_ub;
+    float* entropy_weights;
+    // Discount conditioning
+    bool use_dc;
+    float discount_weight_lb;
+    float discount_weight_ub;
+    float* discount_weights;
 };

 typedef struct {
@@ -1565,6 +1592,18 @@ void init(Drive* env){
     set_start_position(env);
     init_goal_positions(env);
     env->logs = (Log*)calloc(env->active_agent_count, sizeof(Log));
+
+    if (env->use_rc) {
+        env->collision_weights = (float*)calloc(env->active_agent_count, sizeof(float));
+        env->offroad_weights = (float*)calloc(env->active_agent_count, sizeof(float));
+        env->goal_weights = (float*)calloc(env->active_agent_count, sizeof(float));
+    }
+    if (env->use_ec) {
+        env->entropy_weights = (float*)calloc(env->active_agent_count, sizeof(float));
+    }
+    if (env->use_dc) {
+        env->discount_weights = (float*)calloc(env->active_agent_count, sizeof(float));
+    }
 }

 void c_close(Drive* env){
@@ -1594,6 +1633,18 @@ void c_close(Drive* env){
     freeTopologyGraph(env->topology_graph);
     // free(env->map_name);
     free(env->ini_file);
+
+    if (env->use_rc) {
+        free(env->collision_weights);
+        free(env->offroad_weights);
+        free(env->goal_weights);
+    }
+    if (env->use_ec) {
+        free(env->entropy_weights);
+    }
+    if (env->use_dc) {
+        free(env->discount_weights);
+    }
 }

 void allocate(Drive* env){
@@ -1606,13 +1657,38 @@ void allocate(Drive* env){
     env->actions = (float*)calloc(env->active_agent_count*2, sizeof(float));
     env->rewards = (float*)calloc(env->active_agent_count, sizeof(float));
     env->terminals = (unsigned char*)calloc(env->active_agent_count, sizeof(unsigned char));
+
+    if (env->use_rc) {
+        env->collision_weights = (float*)calloc(env->active_agent_count, sizeof(float));
+        env->offroad_weights = (float*)calloc(env->active_agent_count, sizeof(float));
+        env->goal_weights = (float*)calloc(env->active_agent_count, sizeof(float));
+    }
+    if (env->use_ec) {
+        env->entropy_weights = (float*)calloc(env->active_agent_count, sizeof(float));
+    }
+    if (env->use_dc) {
+        env->discount_weights = (float*)calloc(env->active_agent_count, sizeof(float));
+    }
 }

 void free_allocated(Drive* env){
     free(env->observations);
     free(env->actions);
     free(env->rewards);
     free(env->terminals);
+
+    if (env->use_rc) {
+        free(env->collision_weights);
+        free(env->offroad_weights);
+        free(env->goal_weights);
+    }
+    if (env->use_ec) {
+        free(env->entropy_weights);
+    }
+    if (env->use_dc) {
+        free(env->discount_weights);
+    }
+
     c_close(env);
 }

@@ -1704,10 +1780,6 @@ void compute_observations(Drive* env) {
         float* obs = &observations[i][0];
         Entity* ego_entity = &env->entities[env->active_agent_indices[i]];
         if(ego_entity->type > 3) break;
-        if(ego_entity->respawn_timestep != -1) {
-            obs[6] = 1;
-            //continue;
-        }
         float cos_heading = ego_entity->heading_x;
         float sin_heading = ego_entity->heading_y;
         float ego_speed = sqrtf(ego_entity->vx*ego_entity->vx + ego_entity->vy*ego_entity->vy);
@@ -1726,9 +1798,26 @@ void compute_observations(Drive* env) {
         obs[3] = ego_entity->width / MAX_VEH_WIDTH;
         obs[4] = ego_entity->length / MAX_VEH_LEN;
         obs[5] = (ego_entity->collision_state > 0) ? 1.0f : 0.0f;
+        if(ego_entity->respawn_timestep != -1) {
+            obs[6] = 1;
+            //continue;
+        }
+
+        // Add conditioning weights to observations
+        int obs_idx = 7;
+        if (env->use_rc) {
+            obs[obs_idx++] = env->collision_weights[i];
+            obs[obs_idx++] = env->offroad_weights[i];
+            obs[obs_idx++] = env->goal_weights[i];
+        }
+        if (env->use_ec) {
+            obs[obs_idx++] = env->entropy_weights[i];
+        }
+        if (env->use_dc) {
+            obs[obs_idx++] = env->discount_weights[i];
+        }

         // Relative Pos of other cars
-        int obs_idx = 7; // Start after goal distances
         int cars_seen = 0;
         for(int j = 0; j < MAX_AGENTS; j++) {
             int index = -1;
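Since the ego block now ends with a variable number of conditioning scalars, the observation width depends on which flags are set, and the relative-position features simply start at whatever obs_idx the conditioning block left off. A hypothetical sizing helper (the actual observation-space arithmetic lives in the bindings, not shown in this diff) makes the offsets explicit:

    // Hypothetical: indices 0-6 are the fixed ego features written above;
    // reward conditioning appends 3 scalars, entropy and discount 1 each.
    static int ego_obs_size(bool use_rc, bool use_ec, bool use_dc) {
        int n = 7;
        if (use_rc) n += 3;   // collision/offroad/goal weights
        if (use_ec) n += 1;   // entropy weight
        if (use_dc) n += 1;   // discount weight
        return n;
    }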
@@ -1969,6 +2058,28 @@ void compute_new_goal(Drive* env, int agent_idx) {
 void c_reset(Drive* env){
     env->timestep = env->init_steps;
     set_start_position(env);
+
+    // Initialize conditioning weights
+    if (env->use_rc) {
+        for(int i = 0; i < env->active_agent_count; i++) {
+            env->collision_weights[i] = ((float)rand() / RAND_MAX) * (env->collision_weight_ub - env->collision_weight_lb) + env->collision_weight_lb;
+            env->offroad_weights[i] = ((float)rand() / RAND_MAX) * (env->offroad_weight_ub - env->offroad_weight_lb) + env->offroad_weight_lb;
+            env->goal_weights[i] = ((float)rand() / RAND_MAX) * (env->goal_weight_ub - env->goal_weight_lb) + env->goal_weight_lb;
+        }
+    }
+
+    if (env->use_ec) {
+        for(int i = 0; i < env->active_agent_count; i++) {
+            env->entropy_weights[i] = ((float)rand() / RAND_MAX) * (env->entropy_weight_ub - env->entropy_weight_lb) + env->entropy_weight_lb;
+        }
+    }
+
+    if (env->use_dc) {
+        for(int i = 0; i < env->active_agent_count; i++) {
+            env->discount_weights[i] = ((float)rand() / RAND_MAX) * (env->discount_weight_ub - env->discount_weight_lb) + env->discount_weight_lb;
+        }
+    }
+
     for(int x = 0; x < env->active_agent_count; x++){
         env->logs[x] = (Log){0};
         int agent_idx = env->active_agent_indices[x];
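All three loops draw each weight uniformly from its [lb, ub] interval with the same rand()-based expression; a one-line helper (hypothetical, not part of this commit) states the intent once:

    #include <stdlib.h>

    // Uniform draw from [lb, ub], matching the rand()-based scheme in c_reset().
    static inline float rand_uniform(float lb, float ub) {
        return ((float)rand() / (float)RAND_MAX) * (ub - lb) + lb;
    }

    // e.g. env->collision_weights[i] =
    //     rand_uniform(env->collision_weight_lb, env->collision_weight_ub);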
