6 changes: 3 additions & 3 deletions .github/config/requirements.txt
@@ -5,6 +5,6 @@ sphinx-copybutton # Adds a copy button for code blocks
nvidia-sphinx-theme # Our NVIDIA theme
sphinxcontrib-mermaid # For mermaid diagrams
myst-parser # For our markdown docs
sphinx-design
sphinxcontrib-mermaid
swagger-plugin-for-sphinx
sphinx-design # For our design elements
sphinxcontrib-mermaid # For mermaid diagrams
swagger-plugin-for-sphinx # For Swagger API documentation
49 changes: 46 additions & 3 deletions CONTRIBUTING.md
@@ -10,6 +10,15 @@ We welcome contributions to the NeMo Evaluator projects! This document provides
- [UV](https://github.com/astral-sh/uv) for fast Python package management
- Git for version control

### Project Structure

This is a monorepo containing two packages:

- **`packages/nemo-evaluator`** - Core evaluation library (required for building docs)
- **`packages/nemo-evaluator-launcher`** - CLI and orchestration layer

Each package has its own virtual environment managed by UV. Choose your setup based on what you're contributing to.
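
A rough sketch of the layout (illustrative; only the top-level packages are shown):

```bash
# The two packages live side by side under packages/
ls packages/
# nemo-evaluator  nemo-evaluator-launcher
```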

### Setup

1. **Install UV**
@@ -22,23 +31,38 @@ We welcome contributions to the NeMo Evaluator projects! This document provides

```bash
git clone <repository-url>
cd Evaluator
```

3. **Set up development environment**

For example for **nemo-evaluator-launcher**:
Choose the setup that matches your contribution:

**For Launcher Development:**

```bash
cd nemo_evaluator_launcher
cd packages/nemo-evaluator-launcher
uv sync --all-extras
uv run pre-commit install
```

4. **Install pre-commit hooks**
**For Core Library Development:**

```bash
cd packages/nemo-evaluator
uv sync --all-extras
uv run pre-commit install
```

**For Documentation:**

```bash
cd packages/nemo-evaluator
uv sync --group docs
```

Then build the documentation by running `make docs-html` from the repository root.
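
Putting the documentation steps together, a minimal end-to-end sketch (assuming the `docs` dependency group and the `docs-html` make target described above):

```bash
# Install the docs dependencies for the core library
cd packages/nemo-evaluator
uv sync --group docs

# Build the HTML documentation from the repository root
cd ../..
make docs-html
```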

### Development Tools

The project uses the following tools for development:
@@ -110,6 +134,25 @@ uv run pytest --disable-network
3. **Clear assertions**: Use descriptive assertion messages
4. **Mock external dependencies**: Use `pytest` fixtures and mocking for external services

## Validating Documentation Snippets (Optional)

Documentation builds without executing code snippets, but you may want to validate that snippets are syntactically correct and have valid imports.

To validate snippets that import from both packages:

```bash
# Set up launcher environment with core library
cd packages/nemo-evaluator-launcher
uv sync --all-extras
uv pip install -e ../nemo-evaluator/

# Activate environment and validate a snippet
source .venv/bin/activate
python -m py_compile docs/evaluation/_snippets/api-examples/basic_evaluate.py
```

**Note**: Most snippets require actual model endpoints to run, so validation only checks syntax and imports, not execution.
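
To check every snippet at once, a small loop along these lines can be used (a sketch; it assumes snippets are plain `.py` files under `docs/evaluation/_snippets/`):

```bash
# Compile-check all snippets; xargs exits non-zero if any file fails to compile
find docs/evaluation/_snippets -name '*.py' -print0 \
  | xargs -0 -n1 python -m py_compile
```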

## Pull Request Guidelines

### Before Submitting
124 changes: 124 additions & 0 deletions docs-archive/conf.py
@@ -0,0 +1,124 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

import os
import sys

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = "NeMo Eval"
copyright = "2025, NVIDIA Corporation"
author = "NVIDIA Corporation"
release = "0.1.0"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
"myst_parser", # For our markdown docs
"autodoc2", # Generates API docs
"sphinx.ext.viewcode", # For adding a link to view source code in docs
"sphinx.ext.doctest", # Allows testing in docstrings
"sphinx.ext.napoleon", # For google style docstrings
"sphinx_copybutton", # For copy button in code blocks
"sphinxcontrib.mermaid", # For mermaid diagrams
]

templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# -- Options for MyST Parser (Markdown) --------------------------------------
# MyST Parser settings
myst_enable_extensions = [
"dollarmath", # Enables dollar math for inline math
"amsmath", # Enables LaTeX math for display mode
"colon_fence", # Enables code blocks using ::: delimiters instead of ```
"deflist", # Supports definition lists with term: definition format
"fieldlist", # Enables field lists for metadata like :author: Name
"tasklist", # Adds support for GitHub-style task lists with [ ] and [x]
"html_image", # Enables HTML image tags
]
myst_heading_anchors = 5 # Generates anchor links for headings up to level 5
myst_auto_link_extensions = [] # Disable automatic link conversion
myst_url_schemes = ["http", "https", "mailto"] # Only convert these URL schemes

# -- Options for Autodoc2 ---------------------------------------------------
sys.path.insert(0, os.path.abspath(".."))

autodoc2_packages = [
"../packages/nemo-evaluator/src/nemo_evaluator", # Path to your package relative to conf.py
]
autodoc2_render_plugin = "myst" # Use MyST for rendering docstrings
autodoc2_output_dir = "apidocs" # Output directory for autodoc2 (relative to docs/)
# This is a workaround that uses the parser located in autodoc2_docstrings_parser.py to allow autodoc2 to
# render google style docstrings.
# Related Issue: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33
autodoc2_docstring_parser_regexes = [
(r".*", "docs.autodoc2_docstrings_parser"),
]

# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = "nvidia_sphinx_theme"
html_theme_options = {
"icon_links": [
{
"name": "GitHub",
"url": "https://github.com/NVIDIA-NeMo/Eval",
"icon": "fa-brands fa-github",
}
],
"switcher": {
"json_url": "../versions1.json",
"version_match": release,
},
"extra_head": {
"""
<script src="https://assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js" ></script>
"""
},
"extra_footer": {
"""
<script type="text/javascript">if (typeof _satellite !== "undefined") {_satellite.pageBottom();}</script>
"""
},
}
html_extra_path = ["project.json", "versions1.json"]

# -- Warning suppression and cross-reference handling ----------------------
nitpicky = False
suppress_warnings = [
"ref.python", # Suppress ambiguous cross-reference warnings
"toc.not_included", # Suppress toctree warnings for myst-based docs
"myst.header", # Suppress header level warnings
"myst.directive_unknown", # Suppress unknown directive warnings
"myst.xref_missing", # Suppress missing cross-reference warnings
"ref.doc", # Suppress document reference warnings
"misc.highlighting_failure", # Suppress Pygments highlighting warnings
]

# GitHub links are getting rate-limited by GitHub Actions
linkcheck_ignore = [
".*github\\.com.*",
".*githubusercontent\\.com.*",
".*catalog\\.ngc\\.nvidia\\.com.*", # Temporary: NGC catalog links that may not be publicly accessible
".*platform\\.openai\\.com.*", # To diagnose: OpenAI platform links that may require authentication
]
File renamed without changes.
162 changes: 162 additions & 0 deletions docs-archive/index.md
@@ -0,0 +1,162 @@
# NeMo Evaluator Documentation

NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models. It enables you to run hundreds of benchmarks across popular evaluation harnesses against any OpenAI-compatible model API. Evaluations execute in open-source Docker containers for auditable and trustworthy results. The platform's containerized architecture allows for the rapid integration of public benchmarks and private datasets.

[Tutorial](./docs/nemo-evaluator-launcher/tutorial.md) | [Supported Benchmarks](#supported-benchmarks-and-evaluation-harnesses) | [Configuration Examples](https://github.com/NVIDIA-NeMo/Eval/blob/main/packages/nemo-evaluator-launcher/examples) | [Contribution Guide](https://github.com/NVIDIA-NeMo/Eval/blob/main/CONTRIBUTING.md)

## Key Pillars

NeMo Evaluator is built on four core principles to provide a reliable and versatile evaluation experience:

- **Reproducibility by Default**: All configurations, random seeds, and software provenance are captured automatically for auditable and repeatable evaluations.
- **Scale Anywhere**: Run evaluations from a local machine to a Slurm cluster or cloud-native backends like Lepton AI without changing your workflow.
- **State-of-the-Art Benchmarking**: Access a comprehensive suite of over 100 benchmarks from 18 popular open-source evaluation harnesses. See the full list of [Supported benchmarks and evaluation harnesses](#supported-benchmarks-and-evaluation-harnesses).
- **Extensible and Customizable**: Integrate new evaluation harnesses, add custom benchmarks with proprietary data, and define custom result exporters for existing MLOps tooling.

## How It Works: Launcher and Core Engine

The platform consists of two main components:

- **`nemo-evaluator` ([The Evaluation Core Engine](./docs/nemo-evaluator/index.md))**: A Python library that manages the interaction between an evaluation harness and the model being tested.
- **`nemo-evaluator-launcher` ([The CLI and Orchestration](./docs/nemo-evaluator-launcher/index.md))**: The primary user interface and orchestration layer. It handles configuration, selects the execution environment, and launches the appropriate container to run the evaluation.

Most users typically interact with `nemo-evaluator-launcher`, which serves as a universal gateway to different benchmarks and harnesses. However, it is also possible to interact directly with `nemo-evaluator` by following this [guide](./docs/nemo-evaluator/workflows/using-containers.md).

```{mermaid}
graph TD
A[User] --> B{NeMo Evaluator Launcher};
B -- " " --> C{Local};
B -- " " --> D{Slurm};
B -- " " --> E{Lepton};
subgraph Execution Environment
C -- "Launches Container" --> F[Evaluation Container];
D -- "Launches Container" --> F;
E -- "Launches Container" --> F;
end
subgraph F[Evaluation Container]
G[Nemo Evaluator] -- " Runs " --> H[Evaluation Harness]
end
H -- "Sends Requests To" --> I[🤖 Model Endpoint];
I -- "Returns Responses" --> H;
```

## Quickstart

Get your first evaluation result in minutes. This guide uses your local machine to run a small benchmark against an OpenAI API-compatible endpoint.

## 1. Install the Launcher

The launcher is the only package required to get started.

```bash
pip install nemo-evaluator-launcher
```

## 2. Set Up Your Model Endpoint

NeMo Evaluator works with any model that exposes an OpenAI-compatible endpoint. For this quickstart, we will use a hosted OpenAI-compatible API.

**What is an OpenAI-compatible endpoint?** A server that exposes /v1/chat/completions and /v1/completions endpoints, matching the OpenAI API specification.
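
For example, a minimal chat-completions request has this shape (a sketch with placeholder `BASE_URL`, `API_KEY`, and model name; substitute your provider's values):

```bash
curl -s "${BASE_URL}/v1/chat/completions" \
  -H "Authorization: Bearer ${API_KEY}" \
  -H "Content-Type: application/json" \
  -d '{
        "model": "my-model",
        "messages": [{"role": "user", "content": "Hello!"}]
      }'
```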

**Options for model endpoints:**

- **Hosted endpoints** (fastest): Use ready-to-use hosted models from providers like [build.nvidia.com](https://build.nvidia.com) that expose OpenAI-compatible APIs with no hosting required.
- **Self-hosted options**: Host your own models using tools like NVIDIA NIM, vLLM, or TensorRT-LLM for full control over your evaluation environment.

For detailed setup instructions including self-hosted configurations, see the [tutorial guide](./docs/nemo-evaluator-launcher/tutorial.md).

**Getting an NGC API Key for build.nvidia.com:**

To use out-of-the-box build.nvidia.com APIs, you need an API key:

1. Register an account at [build.nvidia.com](https://build.nvidia.com).
2. In the Setup menu under Keys/Secrets, generate an API key.
3. Set the environment variable by executing `export NGC_API_KEY=<YOUR_API_KEY>`.

## 3. Run Your First Evaluation

Run a small evaluation on your local machine. The launcher automatically pulls the correct container and executes the benchmark. The benchmarks to run are configured directly in the YAML file.

**Configuration Examples**: Explore ready-to-use configuration files in [`packages/nemo-evaluator-launcher/examples/`](./packages/nemo-evaluator-launcher/examples/) for local, Lepton, and Slurm deployments with various model hosting options (vLLM, NIM, hosted endpoints).

Once you have an example configuration file, either by cloning this repository or by downloading one directly (such as `local_nvidia_nemotron_nano_9b_v2.yaml`), you can run the following command:


```bash
nemo-evaluator-launcher run --config-dir packages/nemo-evaluator-launcher/examples --config-name local_nvidia_nemotron_nano_9b_v2 --override execution.output_dir=<YOUR_OUTPUT_LOCAL_DIR>
```

After running this command, you will see a `job_id`, which can be used to track the job and its results. All logs will be available in your `<YOUR_OUTPUT_LOCAL_DIR>`.

## 4. Check Your Results

Results, logs, and run configurations are saved locally. Inspect the status of the evaluation job by using the corresponding `job_id`:

```bash
nemo-evaluator-launcher status <job_id_or_invocation_id>
```

/// note | About invocation and job IDs
You can use a shortened ID in the `status` command, for example `abcd` instead of the full `abcdef0123456`, or `ab.0` instead of `abcdef0123456.0`, as long as the prefix does not collide with another ID. This is syntactic sugar that makes the command slightly easier to use, as shown in the example below.
///
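
For example (IDs are illustrative):

```bash
# Both commands refer to the same invocation, as long as the prefix "abcd" is unambiguous
nemo-evaluator-launcher status abcdef0123456
nemo-evaluator-launcher status abcd
```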

## Next Steps

- List all supported benchmarks:

```bash
nemo-evaluator-launcher ls tasks
```

- Explore the [Supported Benchmarks](#supported-benchmarks-and-evaluation-harnesses) to see all available harnesses and benchmarks.
- Scale up your evaluations using the [Slurm Executor](./docs/nemo-evaluator-launcher/executors/slurm.md) or [Lepton Executor](./docs/nemo-evaluator-launcher/executors/lepton.md).
- Learn to evaluate self-hosted models in the extended [Tutorial guide](./docs/nemo-evaluator-launcher/tutorial.md) for nemo-evaluator-launcher.
- Customize your workflow with [Custom Exporters](./docs/nemo-evaluator-launcher/exporters/overview.md) or by evaluating with [proprietary data](./docs/nemo-evaluator/extending/framework-definition-file.md).

## Supported Benchmarks and Evaluation Harnesses

NeMo Evaluator Launcher provides pre-built evaluation containers for different evaluation harnesses through the NVIDIA NGC catalog. Each harness supports a variety of benchmarks, which can then be called via `nemo-evaluator`. This table provides a list of benchmark names per harness. A more detailed list of task names can be found in the [list of NGC containers](./docs/nemo-evaluator/index.md#ngc-containers).

| Container | Description | NGC Catalog | Latest Tag | Supported benchmarks |
|-----------|-------------|-------------|------------| ------------|
| **agentic_eval** | Agentic AI evaluation framework | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/agentic_eval) | `25.08.1` | Agentic Eval Topic Adherence, Agentic Eval Tool Call, Agentic Eval Goal and Answer Accuracy |
| **bfcl** | Function calling | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/bfcl) | `25.08.1` | BFCL v2 and v3 |
| **bigcode-evaluation-harness** | Code generation evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/bigcode-evaluation-harness) | `25.08.1` | MBPP, MBPP-Plus, HumanEval, HumanEval+, Multiple (cpp, cs, d, go, java, jl, js, lua, php, pl, py, r, rb, rkt, rs, scala, sh, swift, ts) |
| **garak** | Safety and vulnerability testing | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/garak) | `25.08.1` | Garak |
| **helm** | Holistic evaluation framework | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/helm) | `25.08.1` | MedHelm |
| **hle** | Academic knowledge and problem solving | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/hle) | `25.08.1` | HLE |
| **ifbench** | Instruction following | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/ifbench) | `25.08.1` | IFBench |
| **livecodebench** | Coding | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/livecodebench) | `25.08.1` | LiveCodeBench (v1-v6, 0724_0125, 0824_0225) |
| **lm-evaluation-harness** | Language model benchmarks | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/lm-evaluation-harness) | `25.08.1` | ARC Challenge (also multilingual), GSM8K, HumanEval, HumanEval+, MBPP, MINERVA MMMLU-Pro, RACE, TruthfulQA, AGIEval, BBH, BBQ, CSQA, Frames, Global MMMLU, GPQA-D, HellaSwag (also multilingual), IFEval, MGSM, MMMLU, MMMLU-Pro, MMMLU-ProX (de, es, fr, it, ja), MMLU-Redux, MUSR, OpenbookQA, Piqa, Social IQa, TruthfulQA, WikiLingua, WinoGrande|
| **mmath** | Multilingual math reasoning | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/mmath) | `25.08.1` | EN, ZH, AR, ES, FR, JA, KO, PT, TH, VI |
| **mtbench** | Multi-turn conversation evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/mtbench) | `25.08.1` | MT-Bench |
| **rag_retriever_eval** | RAG system evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/rag_retriever_eval) | `25.08.1` | RAG, Retriever |
| **safety-harness** | Safety and bias evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/safety-harness) | `25.08.1` | Aegis v2, BBQ, WildGuard |
| **scicode** | Coding for scientific research | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/scicode) | `25.08.1` | SciCode |
| **simple-evals** | Common evaluation tasks | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/simple-evals) | `25.08.1` | GPQA-D, MATH-500, AIME 24 & 25, HumanEval, MGSM, MMMLU, MMMLU-Pro, MMMLU-lite (AR, BN, DE, EN, ES, FR, HI, ID, IT, JA, KO, MY, PT, SW, YO, ZH), SimpleQA |
| **tooltalk** | Tool usage evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/tooltalk) | `25.08.1` | ToolTalk |
| **vlmevalkit** | Vision-language model evaluation | [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/vlmevalkit) | `25.08.1` | AI2D, ChartQA, OCRBench, SlideVQA |

## Contribution Guide

We welcome community contributions. Please see our [Contribution Guide](https://github.com/NVIDIA-NeMo/Eval/blob/main/CONTRIBUTING.md) for instructions on submitting pull requests, reporting issues, and suggesting features.

::::{toctree}
:hidden:
Home <self>
::::

<!-- ::::{toctree}
:hidden:
:caption: About NeMo Curator
:maxdepth: 1

:::: -->

::::{toctree}
:hidden:
:caption: Libraries
:maxdepth: 1
NeMo Evaluator Launcher <nemo-evaluator-launcher/index.md>
NeMo Evaluator <nemo-evaluator/index.md>
::::
File renamed without changes.
1 change: 1 addition & 0 deletions docs-archive/project.json
@@ -0,0 +1 @@
{"name": "nemo-eval", "version": "0.1.0"}
7 changes: 7 additions & 0 deletions docs-archive/versions1.json
@@ -0,0 +1,7 @@
[
{
"preferred": true,
"version": "0.1.0",
"url": "../0.1.0"
}
]
1 change: 1 addition & 0 deletions docs/_extensions/__init__.py
@@ -0,0 +1 @@
# Custom Sphinx extensions for NeMo Evaluator documentation