diff --git a/README.md b/README.md index cb5e8ad4..05b04c2b 100644 --- a/README.md +++ b/README.md @@ -275,6 +275,15 @@ dynamic benchmarks. between the two executions. **Note**: this is a beta feature and will need some adaptation for your own agent. +## Variables +Here's a list of relevant environment variables that are used by AgentLab: +- `OPENAI_API_KEY` which is used by default for OpenAI LLMs. +- `AZURE_OPENAI_API_KEY`, used by default for AzureOpenAI LLMs. +- `AZURE_OPENAI_ENDPOINT` to specify your Azure endpoint. +- `OPENAI_API_VERSION` for the Azure API. +- `OPENROUTER_API_KEY` for the OpenRouter API. +- `AGENTLAB_EXP_ROOT`, desired path for your experiments to be stored, defaults to `~/agentlab-results`. +- `AGENTXRAY_SHARE_GRADIO`, which prompts AgentXRay to open a public tunnel on launch. ## Misc diff --git a/pyproject.toml b/pyproject.toml index 2a1e06c3..1292836a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,10 +13,11 @@ authors = [ {name = "Alex Lacoste", email = "alex.lacoste@servicenow.com"}, {name = "Tom Marty", email = "tom.marty@polymtl.ca"}, {name = "Massimo Caccia", email = "massimo.caccia1@servicenow.com"}, - {name = "Thibault Le Sellier de Chezelles", email = "thibault.de.chezelles@gmail.com"} + {name = "Thibault Le Sellier de Chezelles", email = "thibault.de.chezelles@gmail.com"}, + {name = "Aman Jaiswal", email = "aman.jaiswal@servicenow.com"}, ] readme = "README.md" -requires-python = ">3.7" +requires-python = ">3.10" license = {text = "Apache-2.0"} classifiers = [ "Development Status :: 2 - Pre-Alpha", diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index b42d39ee..371e5426 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -74,3 +74,5 @@ Leo Boisvert,GenericAgent-openai_o1-mini-2024-09-12,workarena_l1,0.4.1,2025-02-0 M: src/agentlab/analyze/agent_xray.py M: src/agentlab/llm/llm_configs.py",0.13.3,1d2d7160e5b7ec9954ecb48988f71eb56288dd29," Leo 
Boisvert,GenericAgent-anthropic_claude-3.7-sonnet,workarena_l1,0.4.1,2025-02-25_02-32-09,d4f900c2-1de1-4e4b-a3ab-495ff2675fff,0.515,0.028,0,330/330,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.3,1.44.0,v0.4.0,c9d2ef9648435ef1119950ecb1a0734497ccc33b,,0.13.3,1d2d7160e5b7ec9954ecb48988f71eb56288dd29, +agentlabtraces,GenericAgent-meta-llama_llama-4-maverick,workarena_l1,0.4.1,2025-04-14_17-15-56,a6dc4022-2bb7-4b46-8b37-f62c010defc1,0.27,0.024,0,330/330,None,Linux (#135-Ubuntu SMP Fri Sep 27 13:53:58 UTC 2024),3.12.7,1.39.0,v0.4.0,5eb2ecb5e5b293170230bcbed8b17fe192af214a,,0.13.3,70dac253628c476aff1af6a975f27f8563453ad2, +agentlabtraces,GenericAgent-meta-llama_llama-4-maverick,workarena_l2_agent_curriculum_eval,0.4.1,2025-04-22_15-38-44,d62fed39-caac-4ef3-92ac-b29897c69f88,0.085,0.018,1,235/235,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,v0.4.0,43bafbcfbe398fca39e4ffdc57b2f226d2c6d3e1,,0.13.3,70dac253628c476aff1af6a975f27f8563453ad2, diff --git a/src/agentlab/agents/generic_agent/__init__.py b/src/agentlab/agents/generic_agent/__init__.py index 92c9996d..ad9b0348 100644 --- a/src/agentlab/agents/generic_agent/__init__.py +++ b/src/agentlab/agents/generic_agent/__init__.py @@ -10,6 +10,7 @@ AGENT_3_5, AGENT_8B, AGENT_CUSTOM, + AGENT_LLAMA4_17B_INSTRUCT, AGENT_LLAMA3_70B, AGENT_LLAMA31_70B, RANDOM_SEARCH_AGENT, @@ -31,6 +32,7 @@ "AGENT_4o_VISION", "AGENT_o3_MINI", "AGENT_o1_MINI", + "AGENT_LLAMA4_17B_INSTRUCT", "AGENT_LLAMA3_70B", "AGENT_LLAMA31_70B", "AGENT_8B", diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py index 789f350a..914e3249 100644 --- a/src/agentlab/agents/generic_agent/agent_configs.py +++ b/src/agentlab/agents/generic_agent/agent_configs.py @@ -10,6 +10,7 @@ from .generic_agent import GenericAgentArgs from .generic_agent_prompt import GenericPromptFlags +from .tmlr_config import BASE_FLAGS FLAGS_CUSTOM = GenericPromptFlags( obs=dp.ObsFlags( @@ -296,7 
+297,10 @@ chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/anthropic/claude-3.5-sonnet:beta"], flags=FLAGS_GPT_4o_VISION, ) - +AGENT_LLAMA4_17B_INSTRUCT = GenericAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/meta-llama/llama-4-maverick"], + flags=BASE_FLAGS, +) DEFAULT_RS_FLAGS = GenericPromptFlags( flag_group="default_rs", diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 8cd7961c..6154007e 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -550,7 +550,7 @@ def tag_screenshot_with_action(screenshot: Image, action: str) -> Image: try: coords = action[action.index("(") + 1 : action.index(")")].split(",") coords = [c.strip() for c in coords] - if len(coords) != 2: + if len(coords) not in [2, 3]: raise ValueError(f"Invalid coordinate format: {coords}") if coords[0].startswith("x="): coords[0] = coords[0][2:] diff --git a/src/agentlab/analyze/covariate_std_err.py b/src/agentlab/analyze/covariate_std_err.py new file mode 100644 index 00000000..a989da30 --- /dev/null +++ b/src/agentlab/analyze/covariate_std_err.py @@ -0,0 +1,458 @@ +from typing import Callable, Sequence, Tuple + +import numpy as np +import statsmodels.api as sm +from sklearn.model_selection import KFold +from statsmodels.genmod.families import Binomial + + +def aggregate_std_err( + run_rewards: Sequence[np.ndarray], + baseline_rewards: np.ndarray, + std_err_fn: Callable[[np.ndarray, np.ndarray], Tuple[float, float]] = None, +) -> Tuple[float, float]: + """ + Args: + run_rewards: list of length k, each an array of shape (n,) + baseline_rewards: array of shape (n, m) holding m covariate-baselines + std_err_fn: function that, given (rewards: np.ndarray of shape (n,), + baselines: np.ndarray of shape (n, m)), + returns (mean: float, se: float) + Returns: + (overall_mean, overall_se) + """ + + if std_err_fn is None: + std_err_fn = std_err_ancova + + # 1) call the low-level routine on each run + stats = 
[std_err_fn(r, baseline_rewards) for r in run_rewards] + means = np.array([mu for mu, _ in stats]) + ses = np.array([sigma for _, sigma in stats]) + + k = len(means) + # 2) overall mean of the per-run means + overall_mean = means.mean() + + # 3) decompose variance: between-runs + within-runs + var_between = means.var(ddof=1) / k + var_within = (ses**2).mean() / k + + overall_se = np.sqrt(var_between + var_within) + return overall_mean, overall_se + + +def std_err_clt(rewards: np.array) -> tuple[float, float]: + """ + Computes the mean and standard error of the rewards. + + Parameters: + - rewards: array-like of shape (n,) + Observed rewards for each sample. + + Returns: + - reward_mean: float + Mean of the rewards. + - se: float + Standard error of the mean. + """ + rewards = np.asarray(rewards, dtype=float) + n = rewards.size + if n == 0: + raise ValueError("The input array is empty.") + + reward_mean = rewards.mean() + se = np.std(rewards, ddof=1) / np.sqrt(n) + + return reward_mean, se + + +def std_err_bootstrap(rewards: np.array, n_boot: int = 1000) -> tuple[float, float]: + """ + Computes the mean and standard error of the rewards using bootstrap. + + Parameters: + - rewards: array-like of shape (n,) + Observed rewards for each sample. + - n_boot: int, default=1000 + Number of bootstrap samples. + + Returns: + - reward_mean: float + Mean of the rewards. + - se: float + Standard error of the mean. + """ + rewards = np.asarray(rewards, dtype=float) + n = rewards.size + if n == 0: + raise ValueError("The input array is empty.") + + boot_means = [] + rng = np.random.default_rng() + for _ in range(n_boot): + idx = rng.integers(0, n, n) + boot_means.append(rewards[idx].mean()) + + reward_mean = np.mean(boot_means) + se = np.std(boot_means, ddof=1) + + return reward_mean, se + + +def _replace_nans_by_average(baselines): + """ + Impute NaNs in each column of the baselines matrix with that column's mean. 
+ + Parameters: + - baselines: array-like of shape (n, k) + Baseline estimates per sample and baseline index. + + Returns: + - imputed: np.ndarray of shape (n, k) + Baselines with NaNs replaced by their column means. + """ + baselines = np.asarray(baselines, dtype=float) + # Compute column means ignoring NaNs + col_means = np.nanmean(baselines, axis=0) + # Broadcast and fill NaNs + imputed = np.where(np.isnan(baselines), col_means, baselines) + return imputed + + +def _select_best_baseline(rewards, baselines): + """ + Select the best baseline based on the total absolute error. + """ + errors = np.abs(baselines - rewards[:, None]).sum(axis=0) + j_star = int(np.argmin(errors)) + return baselines[:, j_star] + + +def std_err_diff_baselines(rewards, baselines): + """ + Find the best baseline and compute the adjusted mean and SE. + + Parameters: + - rewards: array-like of shape (n,) + Observed rewards (may contain NaN). + - baselines: array-like of shape (n, k) + k baseline estimates per sample (may contain NaN). + + Returns: + - adjusted_reward_mean: float + Mean of valid rewards. + - adjusted_se: float + SE of the adjusted mean (differences) using the selected baseline. + Note: the selected baseline column (NaNs filled) is only used internally + to form the differences; it is not returned. 
+ """ + rewards = np.asarray(rewards, dtype=float) + baselines = _replace_nans_by_average(baselines) + + if rewards.shape[0] != baselines.shape[0]: + raise ValueError("rewards and baselines must have the same length.") + + # Identify valid reward samples + valid = ~np.isnan(rewards) + reward_valid = rewards[valid] + if reward_valid.size == 0: + return np.nan, np.nan + + selected_baseline_valid = _select_best_baseline(reward_valid, baselines[valid]) + diffs = reward_valid - selected_baseline_valid + adjusted_se = np.std(diffs, ddof=1) / np.sqrt(diffs.size) + + # Adjusted mean reward is the raw mean of valid rewards + adjusted_reward_mean = reward_valid.mean() + + return adjusted_reward_mean, adjusted_se + + +def _clean_input(rewards, baselines): + rewards = np.asarray(rewards) + baselines = np.asarray(baselines) + baselines = _replace_nans_by_average(baselines) + if rewards.shape[0] != baselines.shape[0]: + raise ValueError("rewards and baselines must have the same length.") + if rewards.ndim != 1: + raise ValueError("rewards must be a 1D array.") + if baselines.ndim != 2: + raise ValueError("baselines must be a 2D array.") + + # remove nan rows + valid = ~np.isnan(rewards) + rewards = rewards[valid] + baselines = baselines[valid] + if rewards.size == 0: + raise ValueError("No valid rewards after filtering.") + if baselines.shape[0] != rewards.shape[0]: + raise ValueError("rewards and baselines must have the same length after filtering.") + + return rewards, baselines + + +def std_err_ancova(rewards, baselines): + """ + Parameters: + - rewards: array-like of shape (n,) + Observed rewards per sample + - baselines: array-like of shape (n, k) + k baseline estimates per sample + + Returns: + - adjusted_mean: float + Mean reward adjusted to the average baseline levels + - standard_error: float + Standard error of the adjusted mean + """ + rewards, baselines = _clean_input(rewards, baselines) + + # Center the baselines + baseline_means = baselines.mean(axis=0) + 
centered_baselines = baselines - baseline_means + + # Build design matrix with intercept + design_matrix = sm.add_constant(centered_baselines) + + # Fit the model + results = sm.OLS(rewards, design_matrix).fit() + + # Extract the adjusted mean (intercept) and its SE + adjusted_mean = results.params[0] + standard_error = results.bse[0] + + # print rsquared + print(f"R-squared: {results.rsquared:.4f}") + + return adjusted_mean, standard_error + + +def std_err_glm_cv_regularized( + rewards, baselines, lambda_grid=None, n_splits=5, n_boot=200, random_state=None +): + """ + Fit a logistic GLM with L2 regularization, selecting the penalty strength via k-fold CV, + and estimate SE of the adjusted mean via bootstrap. + + Parameters + ---------- + rewards : array-like, shape (n,) + Observed binary outcomes (0 or 1). + baselines : array-like, shape (n, k) + k baseline estimates per sample. + lambda_grid : list or array-like of floats, optional + Candidate L2 penalty strengths (alpha values) for CV. Defaults to logspace(-4, 3, 10). + n_splits : int, default=5 + Number of folds for cross-validation. + n_boot : int, default=200 + Number of bootstrap replicates for SE estimation. + random_state : int or None, default=None + Seed for reproducibility in CV splits and bootstrap. + + Returns + ------- + adjusted_mean : float + Mean predicted probability across all samples under the final model. + adjusted_se : float + Bootstrap-based SE of the adjusted mean. + + The selected regularization strength (alpha) is printed but not returned. 
+ """ + + if lambda_grid is None: + lambda_grid = np.logspace(-4, 3, 10) + + # Prepare data + y = np.asarray(rewards) + B = np.asarray(baselines) + Bc = B - B.mean(axis=0) # center covariates + X = sm.add_constant(Bc) # design matrix + + # Cross-validate to pick lambda + kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state) + best_lambda = None + best_score = np.inf + for lam in lambda_grid: + fold_scores = [] + for train_idx, val_idx in kf.split(X): + model = sm.GLM(y[train_idx], X[train_idx], family=Binomial()) + res = model.fit_regularized(alpha=lam, L1_wt=0) + p_val = res.predict(X[val_idx]) + p_val = np.clip(p_val, 1e-6, 1 - 1e-6) + # Negative log-likelihood per sample + nll = -np.mean(y[val_idx] * np.log(p_val) + (1 - y[val_idx]) * np.log(1 - p_val)) + fold_scores.append(nll) + avg_score = np.mean(fold_scores) + if avg_score < best_score: + best_score, best_lambda = avg_score, lam + + print(f"Best lambda std_err_glm_cv_regularized: {best_lambda:.4f} with NLL: {best_score:.4f}") + + # Fit final model on full data + final_model = sm.GLM(y, X, family=Binomial()).fit_regularized(alpha=best_lambda, L1_wt=0) + p_hat = final_model.predict(X) + adjusted_mean = np.mean(p_hat) + + # Bootstrap SE estimation + rng = np.random.RandomState(random_state) + boot_means = [] + n = len(y) + for _ in range(n_boot): + idx = rng.randint(0, n, n) + res_boot = sm.GLM(y[idx], X[idx], family=Binomial()).fit_regularized( + alpha=best_lambda, L1_wt=0 + ) + p_boot = res_boot.predict(X) + boot_means.append(np.mean(p_boot)) + adjusted_se = np.std(boot_means, ddof=1) + + return adjusted_mean, adjusted_se + + +def std_err_glm_crossfit_bootstrap( + rewards, baselines, lambda_grid=None, K=5, B=200, random_state=None +): + """ + 1) Tune lambda once on the full data via K-fold CV + 2) Bootstrap: for each replicate, cross-fit GLM with the fixed lambda and compute the mean + 3) Report the point estimate (cross-fitted mean on original data) and bootstrap SE + + Parameters + 
---------- + rewards : array-like, shape (n,) + Binary outcomes (0 or 1). + baselines : array-like, shape (n, k) + Baseline covariates per observation. + lambda_grid : array-like, optional + Candidate L2 penalties for CV. Defaults to logspace(-4, 3, 10). + K : int, default=5 + Number of folds for CV and cross-fit. + B : int, default=200 + Number of bootstrap replicates. + random_state : int or None + Seed for reproducibility. + + Returns + ------- + mu_hat : float + Cross-fitted mean predicted probability on the original data. + se : float + Bootstrap-based standard error of the mean. + + The selected regularization strength is printed but not returned. + """ + y = np.asarray(rewards) + X = sm.add_constant(np.asarray(baselines) - np.asarray(baselines).mean(axis=0)) + n = len(y) + + # 1) Tune lambda on full data + if lambda_grid is None: + lambda_grid = np.logspace(-4, 3, 10) + kf_inner = KFold(n_splits=K, shuffle=True, random_state=random_state) + best_lambda, best_score = None, np.inf + for lam in lambda_grid: + scores = [] + for train_idx, val_idx in kf_inner.split(X): + model = sm.GLM(y[train_idx], X[train_idx], family=sm.families.Binomial()) + res = model.fit_regularized(alpha=lam, L1_wt=0) + p_val = np.clip(res.predict(X[val_idx]), 1e-6, 1 - 1e-6) + nll = -np.mean(y[val_idx] * np.log(p_val) + (1 - y[val_idx]) * np.log(1 - p_val)) + scores.append(nll) + if np.mean(scores) < best_score: + best_score, best_lambda = np.mean(scores), lam + + # 2) Cross-fitted point estimate on original data + kf = KFold(n_splits=K, shuffle=True, random_state=random_state) + p_full = np.zeros(n) + for train_idx, val_idx in kf.split(X): + model = sm.GLM(y[train_idx], X[train_idx], family=sm.families.Binomial()) + res = model.fit_regularized(alpha=best_lambda, L1_wt=0) + p_full[val_idx] = res.predict(X[val_idx]) + mu_hat = p_full.mean() + + # 3) Bootstrap SE via cross-fitted means + rng = np.random.RandomState(random_state) + mu_boot = np.zeros(B) + for b in range(B): + idx = rng.randint(0, n, 
size=n) + yb, Xb = y[idx], X[idx] + p_b = np.zeros(n) + for train_idx, val_idx in kf.split(Xb): + model = sm.GLM(yb[train_idx], Xb[train_idx], family=sm.families.Binomial()) + res = model.fit_regularized(alpha=best_lambda, L1_wt=0) + p_b[val_idx] = res.predict(Xb[val_idx]) + mu_boot[b] = p_b.mean() + + print( + f"Best lambda std_err_glm_crossfit_bootstrap: {best_lambda:.4f} with NLL: {best_score:.4f}" + ) + + se = mu_boot.std(ddof=1) + return mu_hat, se + + +def crossfit_se_min_nll(rewards, baselines, lambda_grid=None, K=5, random_state=None): + """ + Cross-fit predictions for each lambda, compute out-of-sample NLL, + select lambda with min NLL, then compute SE = std(y - p)/sqrt(n). + + Parameters + ---------- + rewards : array-like, shape (n,) + Binary outcomes (0 or 1). + baselines : array-like, shape (n, k) + Baseline covariates per observation. + lambda_grid : array-like, optional + Candidate L2 penalties for CV. Defaults to logspace(-4, 3, 10). + K : int, default=5 + Number of folds for cross-fit. + random_state : int or None + Seed for reproducibility. + + Returns + ------- + adjusted_mean : float + Mean of the cross-fitted probabilities for the selected lambda. + best_se : float + Standard error = std(y - best_p) / sqrt(n), with best_p the cross-fitted + probabilities for the selected lambda. + + The selected lambda and its NLL are printed; the per-lambda NLL values + (nll_dict) and best_p are not returned. 
+ """ + y = np.asarray(rewards) + B = np.asarray(baselines) + # center covariates + Bc = B - B.mean(axis=0) + X = sm.add_constant(Bc) + n = len(y) + kf = KFold(n_splits=K, shuffle=True, random_state=random_state) + + nll_dict = {} + p_dict = {} + + if lambda_grid is None: + lambda_grid = np.logspace(-4, 3, 10) + + for lam in lambda_grid: + p = np.zeros(n) + for train_idx, val_idx in kf.split(X): + model = sm.GLM(y[train_idx], X[train_idx], family=sm.families.Binomial()) + res = model.fit_regularized(alpha=lam, L1_wt=0) + p[val_idx] = res.predict(X[val_idx]) + + p_clipped = np.clip(p, 1e-6, 1 - 1e-6) + nll = -np.mean(y * np.log(p_clipped) + (1 - y) * np.log(1 - p_clipped)) + nll_dict[lam] = nll + p_dict[lam] = p + + best_lambda = min(nll_dict, key=nll_dict.get) + best_p = p_dict[best_lambda] + best_se = np.std(y - best_p, ddof=0) / np.sqrt(n) + print( + f"Best lambda crossfit_se_min_nll: {best_lambda:.4f} with NLL: {nll_dict[best_lambda]:.4f}" + ) + + adjusted_mean = best_p.mean() + return adjusted_mean, best_se diff --git a/src/agentlab/analyze/covariate_toy_experiment/__init__.py b/src/agentlab/analyze/covariate_toy_experiment/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/agentlab/analyze/covariate_toy_experiment/covariate_toy_experiment.ipynb b/src/agentlab/analyze/covariate_toy_experiment/covariate_toy_experiment.ipynb new file mode 100644 index 00000000..1a743f59 --- /dev/null +++ b/src/agentlab/analyze/covariate_toy_experiment/covariate_toy_experiment.ipynb @@ -0,0 +1,416 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "id": "2189355d", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from agentlab.analyze.covariate_toy_experiment.mock_data import (\n", + " Task,\n", + " Agent,\n", + " agent_on_benchmark,\n", + " plot_task_difficulty,\n", + " plot_gaussian,\n", + " _augment_with_average,\n", + ")\n", + "\n", + "from 
agentlab.analyze.covariate_std_err import (\n", + " std_err_clt,\n", + " std_err_bootstrap,\n", + " std_err_ancova,\n", + " std_err_glm_cv_regularized,\n", + " crossfit_se_min_nll,\n", + " aggregate_success,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f62a8bb6", + "metadata": {}, + "outputs": [], + "source": [ + "def _make_baselines(\n", + " benchmark, k_baselines, task_types, consistancy, rng: np.random.RandomState = np.random\n", + "):\n", + "\n", + " baseline_agents = [\n", + " Agent(\n", + " rng.beta(10, 2),\n", + " benchmark,\n", + " type=rng.choice(task_types),\n", + " consistancy=consistancy,\n", + " rng=rng,\n", + " )\n", + " for _ in range(k_baselines)\n", + " ]\n", + "\n", + " rewards_baselines = []\n", + " for i, baseline in enumerate(baseline_agents):\n", + " rewards = agent_on_benchmark(baseline, benchmark, n_samples_per_task=1, rng=rng)\n", + " rewards_baselines.append(rewards.reshape(-1))\n", + " # print(f\"Baseline {i}: {str(baseline)} \")\n", + " rewards_baselines = np.array(rewards_baselines).T\n", + "\n", + " return rewards_baselines, baseline_agents" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "950d662c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Main agent: Agent(competence=0.980, type=0, success_rate=0.586, fit_ratio=0.330)\n", + "R-squared: 0.6769\n", + "Best lambda std_err_glm_cv_regularized: 0.1292 with NLL: 0.5717\n", + "Best lambda crossfit_se_min_nll: 0.1292 with NLL: 0.5792\n" + ] + }, + { + "data": { + "text/plain": [ + "(0.0, 30.0)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "n_task = 100\n", + "k_baselines = 50\n", + "task_types = [0, 0, 0, 0, 1, 1, 2, 3, 4]\n", + "main_agent_type = 0\n", + "consistancy = 4\n", + "\n", + "rng = np.random.RandomState()\n", + "benchmark = [\n", + " Task(difficulty, type=rng.choice(task_types)) for difficulty in rng.beta(0.3, 0.3, n_task)\n", + "]\n", + "\n", + "main_agent = Agent(0.98, benchmark, type=main_agent_type, consistancy=consistancy, rng=rng)\n", + "print(f\"Main agent: {main_agent}\")\n", + "\n", + "rewards = agent_on_benchmark(main_agent, benchmark, n_samples_per_task=1, rng=rng).reshape(-1)\n", + "rewards_baselines, baseline_agents = _make_baselines(\n", + " benchmark, k_baselines, task_types=task_types, consistancy=consistancy, rng=rng\n", + ")\n", + "\n", + "oversampled_rewards = agent_on_benchmark(main_agent, benchmark, n_samples_per_task=1000, rng=rng)\n", + "\n", + "\n", + "# -- helper for empirical uncertainty --\n", + "def empirical_uncertainty(oversampled_rewards):\n", + " avg = np.mean(oversampled_rewards, axis=0)\n", + " return np.mean(avg), np.std(avg)\n", + "\n", + "\n", + "# list of (label, func, args)\n", + "methods = [\n", + " (\"Vanilla std_err\", std_err_clt, (rewards,)),\n", + " (\"Bootstrap std_err\", std_err_bootstrap, (rewards,)),\n", + " (\"Empirical uncertainty\", empirical_uncertainty, (oversampled_rewards,)),\n", + " # (\"Oversampled std_err\", std_err_clt, (oversampled_rewards.reshape(-1),)),\n", + " (\"Ancova std_err\", std_err_ancova, (rewards, rewards_baselines)),\n", + " (\"GLM CV-reg std_err\", std_err_glm_cv_regularized, (rewards, rewards_baselines)),\n", + " (\"GLM crossfit_se_min_nll\", crossfit_se_min_nll, (rewards, rewards_baselines)),\n", + "]\n", + "\n", + "# compute, plot and collect results\n", + "results = []\n", + "for name, func, args in methods:\n", + " mu, se = func(*args)\n", + " plot_gaussian(mu, se, label=name)\n", + " results.append((name, mu, se))\n", + 
"\n", + "\n", + "plt.legend(loc=\"upper right\")\n", + "plt.ylim(0, 30)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "448a398c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Method Mean SE Ratio\n", + "Vanilla std_err 0.580 0.050 1.000\n", + "Bootstrap std_err 0.582 0.048 1.026\n", + "GLM crossfit_se_min_nll 0.544 0.044 1.123\n", + "Ancova std_err 0.580 0.040 1.238\n", + "Empirical uncertainty 0.588 0.039 1.277\n", + "GLM CV-reg std_err 0.548 0.023 2.181\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# make a bar plot for the results, with error bars and x axis = name\n", + "plt.figure()\n", + "\n", + "# sort results from high to low stderr\n", + "results = sorted(results, key=lambda x: x[2], reverse=True)\n", + "\n", + "# print table\n", + "print(f\"{'Method':<30} {'Mean':>8} {'SE':>8} {'Ratio':>8}\")\n", + "for name, mu, se in results:\n", + " print(f\"{name:<30} {mu:8.3f} {se:8.3f} {results[0][2] / se:8.3f}\")\n", + "\n", + "plt.bar(\n", + " [name for name, _, _ in results],\n", + " [mu for _, mu, _ in results],\n", + " yerr=[se for _, _, se in results],\n", + " capsize=5,\n", + ")\n", + "\n", + "plt.xticks(rotation=20, ha=\"right\", fontsize=8)\n", + "plt.ylabel(\"Mean reward\")\n", + "plt.title(\"Mean reward with std_err\")\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8bdd8277", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "R-squared: 0.5972\n", + "Best lambda std_err_glm_cv_regularized: 0.1292 with NLL: 0.5993\n", + "Best lambda crossfit_se_min_nll: 0.1292 with NLL: 0.6254\n", + "R-squared: 0.6678\n", + "Best lambda std_err_glm_cv_regularized: 0.1292 with NLL: 0.4980\n", + "Best lambda crossfit_se_min_nll: 0.1292 with NLL: 0.5068\n", + "R-squared: 0.5609\n", + "Best lambda std_err_glm_cv_regularized: 0.1292 with NLL: 0.5652\n", + "Best lambda crossfit_se_min_nll: 0.7743 with NLL: 0.5657\n", + "R-squared: 0.6464\n", + "Best lambda std_err_glm_cv_regularized: 0.7743 with NLL: 0.6104\n", + "Best lambda crossfit_se_min_nll: 0.7743 with NLL: 0.6060\n", + "R-squared: 0.6035\n", + "Best lambda std_err_glm_cv_regularized: 0.1292 with NLL: 0.5624\n", + "Best lambda crossfit_se_min_nll: 0.1292 with NLL: 0.5626\n", + "R-squared: 0.7575\n", + "Best lambda std_err_glm_cv_regularized: 0.1292 with NLL: 0.5062\n", + "Best lambda crossfit_se_min_nll: 0.1292 with NLL: 0.4981\n", + 
"R-squared: 0.6813\n", + "Best lambda std_err_glm_cv_regularized: 0.1292 with NLL: 0.5217\n", + "Best lambda crossfit_se_min_nll: 0.1292 with NLL: 0.5189\n", + "R-squared: 0.7169\n", + "Best lambda std_err_glm_cv_regularized: 0.0215 with NLL: 0.5005\n", + "Best lambda crossfit_se_min_nll: 0.1292 with NLL: 0.5048\n", + "R-squared: 0.6852\n", + "Best lambda std_err_glm_cv_regularized: 0.1292 with NLL: 0.5280\n", + "Best lambda crossfit_se_min_nll: 0.1292 with NLL: 0.5457\n", + "R-squared: 0.6109\n", + "Best lambda std_err_glm_cv_regularized: 0.1292 with NLL: 0.5513\n", + "Best lambda crossfit_se_min_nll: 0.1292 with NLL: 0.5522\n" + ] + } + ], + "source": [ + "from collections import defaultdict\n", + "\n", + "\n", + "results = defaultdict(list)\n", + "\n", + "for i in range(10):\n", + " rewards = agent_on_benchmark(main_agent, benchmark, n_samples_per_task=1, rng=rng).reshape(-1)\n", + " oversampled_rewards = agent_on_benchmark(\n", + " main_agent, benchmark, n_samples_per_task=1000, rng=rng\n", + " )\n", + "\n", + " methods = [\n", + " (\"Empirical uncertainty\", empirical_uncertainty, (oversampled_rewards,)),\n", + " (\"Ancova std_err\", std_err_ancova, (rewards, rewards_baselines)),\n", + " (\"GLM CV-reg std_err\", std_err_glm_cv_regularized, (rewards, rewards_baselines)),\n", + " (\"GLM crossfit_se_min_nll\", crossfit_se_min_nll, (rewards, rewards_baselines)),\n", + " ]\n", + "\n", + " for name, func, args in methods:\n", + " mu, se = func(*args)\n", + " results[name].append((mu, se))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f7382f8b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Method Avg Mean Std Mean Avg SE\n", + "Empirical uncertainty 0.585 0.001 0.039\n", + "Ancova std_err 0.579 0.025 0.041\n", + "GLM CV-reg std_err 0.548 0.017 0.021\n", + "GLM crossfit_se_min_nll 0.538 0.017 0.042\n" + ] + } + ], + "source": [ + "print(f\"{'Method':<30} {'Avg Mean':>8} {\"Std Mean\":>8} 
{'Avg SE':>8}\")\n", + "for name, mu_se in results.items():\n", + " mu, se = zip(*mu_se)\n", + " print(f\"{name:<30} {np.mean(mu):8.3f} {np.std(mu):8.3f} {np.mean(se):8.3f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bab8645", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "R-squared: 0.7398\n", + "R-squared: 0.5876\n", + "Agg mean: 0.5800000000000001, Agg se: 0.02891222719359173\n", + "Agent Agent(competence=0.980, type=0, success_rate=0.586, fit_ratio=0.330)\n" + ] + }, + { + "ename": "TypeError", + "evalue": "std_err_ancova() got an unexpected keyword argument 'baseline_rewards'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[11]\u001b[39m\u001b[32m, line 16\u001b[39m\n\u001b[32m 13\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m a, r \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(agents, rewards):\n\u001b[32m 15\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mAgent \u001b[39m\u001b[38;5;132;01m{\u001b[39;00ma\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m16\u001b[39m mean, se = \u001b[43mstd_err_ancova\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrewards\u001b[49m\u001b[43m=\u001b[49m\u001b[43mr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbaseline_rewards\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrewards_baselines\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 17\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m Mean: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmean\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, SE: 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mse\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[31mTypeError\u001b[39m: std_err_ancova() got an unexpected keyword argument 'baseline_rewards'" + ] + } + ], + "source": [ + "alt_agent = Agent(0.92, benchmark, type=main_agent_type, consistancy=consistancy, rng=rng)\n", + "\n", + "agents = [main_agent, alt_agent]\n", + "rewards = [\n", + " agent_on_benchmark(a, benchmark, n_samples_per_task=1, rng=rng).reshape(-1) for a in agents\n", + "]\n", + "agg_mean, agg_se = aggregate_success(\n", + " rewards, baseline_rewards=rewards_baselines, std_err_fn=std_err_ancova\n", + ")\n", + "\n", + "\n", + "print(f\"Agg mean: {agg_mean}, Agg se: {agg_se}\")\n", + "for a, r in zip(agents, rewards):\n", + "\n", + " print(f\"Agent {a}\")\n", + " mean, se = std_err_ancova(r, rewards_baselines)\n", + " print(f\" Mean: {mean}, SE: {se}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c4ee531a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10, 5))\n", + "plot_task_difficulty([task.difficulty for task in benchmark])\n", + "\n", + "plt.figure(figsize=(5, 10))\n", + "rewards_baselines_ = _augment_with_average(rewards_baselines)\n", + "plt.imshow(rewards_baselines_, aspect=\"auto\", cmap=\"jet\")\n", + "plt.colorbar(label=\"Reward\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ui-assist", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/agentlab/analyze/covariate_toy_experiment/mock_data.py b/src/agentlab/analyze/covariate_toy_experiment/mock_data.py new file mode 100644 index 00000000..38d74324 --- /dev/null +++ b/src/agentlab/analyze/covariate_toy_experiment/mock_data.py @@ -0,0 +1,124 @@ +from dataclasses import dataclass +from uuid import uuid4 + +import matplotlib.pyplot as plt +import numpy as np + + +def sigmoid(x): + return 1 / (1 + np.exp(-x)) + + +@dataclass +class Task: + difficulty: float + type: int = None + uuid: str = None + + def __post_init__(self): + self.uuid = str(uuid4()) + + +class Agent: + + def __init__( + self, + competence: float, + benchmark: list[Task], + type: int = None, + consistancy: float = 10, + rng: np.random.RandomState = np.random, + ): + self.competence = competence + self.type = type + self.task_success_rate = {} + fit_count = 0 + for task in benchmark: + + agent_task_competence = competence + if task.type is not None and type is not None: + if task.type != type: + agent_task_competence = competence * 0.5 + else: + fit_count += 1 + + # task_competence = agent_task_competence * (1.001 - task.difficulty) + task_success_rate = 
sigmoid(consistancy * (agent_task_competence - task.difficulty)) + + self.task_success_rate[task.uuid] = task_success_rate + self.fit_ratio = fit_count / len(benchmark) + + def get_task_success_rate(self, task: Task): + return self.task_success_rate[task.uuid] + + def get_success_rate(self): + return np.mean(list(self.task_success_rate.values())) + + def __str__(self): + + return f"Agent(competence={self.competence:.3f}, type={self.type}, success_rate={self.get_success_rate():.3f}, fit_ratio={self.fit_ratio:.3f})" + + +def agent_on_benchmark( + agent: Agent, + benchmark: list[Task], + n_samples_per_task=None, + rng: np.random.RandomState = np.random, +): + + all_rewards = [] + for task in benchmark: + task_success_rate = agent.get_task_success_rate(task) + + # sample n_samples_per_task from bernoulli distribution + rewards = rng.binomial(1, task_success_rate, n_samples_per_task) + + all_rewards.append(rewards) + return np.array(all_rewards) + + +def plot_task_difficulty(difficulties): + """ + Plot the difficulty of each task in the benchmark. + """ + + plt.hist(difficulties, bins=20) + plt.xlabel("Task Difficulty") + plt.ylabel("Frequency") + plt.title("Distribution of Task Difficulty") + + +def plot_gaussian(mu, sigma, label=None): + """ + Plot a Gaussian distribution with mean mu and standard deviation sigma. 
+ """ + x = np.linspace(0, 1, 1000) + plt.plot( + x, 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-0.5 * ((x - mu) / sigma) ** 2), label=label + ) + + +def _augment_with_average(matrix: np.ndarray) -> np.ndarray: + """Return a new array with row averages as the last column, + column averages as the last row, and the overall average in the bottom-right.""" + row_avg = matrix.mean(axis=1) + col_avg = matrix.mean(axis=0) + overall_avg = matrix.mean() + + # sort columns by their average + sorted_indices = np.argsort(col_avg) + matrix = matrix[:, sorted_indices] + col_avg = col_avg[sorted_indices] + + # sort rows by their average + sorted_indices = np.argsort(row_avg) + matrix = matrix[sorted_indices, :] + row_avg = row_avg[sorted_indices] + + aug = np.zeros((matrix.shape[0] + 1, matrix.shape[1] + 1)) + aug[:-1, :-1] = matrix + aug[:-1, -1] = row_avg + aug[-1, :-1] = col_avg + aug[-1, -1] = overall_avg + + return aug diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index ba0e15cc..5125801d 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -155,6 +155,14 @@ max_new_tokens=28_000, temperature=1e-1, ), + "openrouter/meta-llama/llama-4-maverick": OpenRouterModelArgs( + model_name="meta-llama/llama-4-maverick", + max_total_tokens=128_000, + max_input_tokens=100_000, + max_new_tokens=28_000, + temperature=1e-1, + vision_support=True, + ), "openrouter/meta-llama/llama-3.1-8b-instruct:free": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-8b-instruct:free", max_total_tokens=128_000,