From fc700c6108dfabe295aa7610775f9ca00e7d9a2f Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 24 Oct 2025 11:40:55 -0700 Subject: [PATCH 01/18] surface truncating & padding options Signed-off-by: Alexandros Koumparoulis --- .../components/datasets/llm/formatting_utils.py | 11 +++++++++-- .../llm/test_column_mapped_text_instruction.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/nemo_automodel/components/datasets/llm/formatting_utils.py b/nemo_automodel/components/datasets/llm/formatting_utils.py index a26f21906..bbb835a6a 100644 --- a/nemo_automodel/components/datasets/llm/formatting_utils.py +++ b/nemo_automodel/components/datasets/llm/formatting_utils.py @@ -14,7 +14,7 @@ import logging import re -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import TYPE_CHECKING, Dict, List, Optional, Union logger = logging.getLogger(__name__) @@ -125,6 +125,8 @@ def format_prompt_completion( eos_token_id: int, pad_token_id: int, seq_length: Optional[int] = None, + padding: Union[str, bool] = None, + truncation: Union[str, bool] = None, answer_only_loss_mask: bool = True, ) -> Dict[str, List[int]]: """ @@ -150,7 +152,7 @@ def format_prompt_completion( else: len_prompt_ids = 0 # Tokenize full text - input_ids = tokenizer(full_text)["input_ids"] + input_ids = tokenizer(full_text, padding=padding, truncation=truncation, max_length=seq_length)["input_ids"] # Create assistant_masks: 0 for prompt tokens, 1 for answer tokens assistant_masks = [0] * len_prompt_ids + [1] * (len(input_ids) - len_prompt_ids) @@ -171,6 +173,8 @@ def format_chat_template( eos_token_id: int, pad_token_id: int, seq_length: Optional[int] = None, + padding: Union[str, bool] = None, + truncation: Union[str, bool] = None, tools: Optional[List[Dict]] = None, ) -> Dict[str, List[int]]: """ @@ -199,6 +203,9 @@ def format_chat_template( tokenize=True, return_dict=True, return_assistant_tokens_mask=template_has_generation_kwd, + padding=padding, + truncation=truncation, + max_length=seq_length, ) # Choose the last conversation as answer other history are context by finding the last masked token diff --git a/tests/unit_tests/datasets/llm/test_column_mapped_text_instruction.py b/tests/unit_tests/datasets/llm/test_column_mapped_text_instruction.py index f35c13392..8ce6fa772 100644 --- a/tests/unit_tests/datasets/llm/test_column_mapped_text_instruction.py +++ b/tests/unit_tests/datasets/llm/test_column_mapped_text_instruction.py @@ -67,7 +67,7 @@ def __init__(self): self.bos_token_id = 2 self._counter = 3 # Start token IDs from 3 to avoid conflicts - def __call__(self, text: str, add_special_tokens: bool = True): # noqa: D401 + def __call__(self, text: str, add_special_tokens: bool = True, padding=None, truncation=None, max_length=None): # noqa: D401 """Mimic the Hugging Face tokenizer ``__call__`` API. The real tokenizer would convert *text* into a list of integer token IDs. 
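A minimal usage sketch of the options surfaced in PATCH 01 above; the keyword usage mirrors the tests added later in the series, while the tokenizer path is a placeholder assumption (any local tokenizer without a chat template exercises the prompt-completion path). Both options are forwarded, together with max_length=seq_length, to the underlying Hugging Face tokenizer call:

    from transformers import AutoTokenizer

    from nemo_automodel.components.datasets.llm.formatting_utils import format_prompt_completion

    tok = AutoTokenizer.from_pretrained("path/to/local_tokenizer")  # placeholder path
    sample = format_prompt_completion(
        tokenizer=tok,
        prompt="What is the capital of France? ",
        answer="Paris.",
        eos_token_id=tok.eos_token_id,
        pad_token_id=tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id,
        seq_length=64,
        padding="max_length",  # forwarded to tokenizer(full_text, ...)
        truncation=True,       # forwarded alongside max_length=seq_length
    )
    # sample holds aligned "input_ids", "labels", and "attention_mask" lists;
    # prompt tokens are masked to -100 in labels when answer_only_loss_mask=True.
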
From 02b8a2696e5967168c85815368a9d64a80a399b3 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Sun, 26 Oct 2025 22:59:23 -0700 Subject: [PATCH 02/18] add padding/truncation options Signed-off-by: Alexandros Koumparoulis --- .../llm/column_mapped_text_instruction_dataset.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py index c83100550..c2a2957b4 100644 --- a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py +++ b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py @@ -165,6 +165,8 @@ def __init__( name: Optional[str] = None, answer_only_loss_mask: bool = True, seq_length: Optional[int] = None, + padding: Union[str, bool] = None, + truncation: Union[str, bool] = None, start_of_turn_token: Optional[str] = None, limit_dataset_samples: Optional[int] = None, ) -> None: @@ -226,6 +228,8 @@ def __init__( self.answer_only_loss_mask = answer_only_loss_mask self.start_of_turn_token = start_of_turn_token self.seq_length = seq_length + self.padding = padding + self.truncation = truncation def __len__(self) -> int: # noqa: D401 """ @@ -293,6 +297,8 @@ def _apply_tokenizer(self, sample: Dict[str, str]) -> Dict[str, List[int]]: eos_token_id, pad_token_id, seq_length=self.seq_length, + padding=self.padding, + truncation=self.truncation, ) else: prompt = " ".join(filter(lambda x: x is not None, (context, question, ""))) @@ -304,5 +310,7 @@ def _apply_tokenizer(self, sample: Dict[str, str]) -> Dict[str, List[int]]: eos_token_id, pad_token_id, seq_length=self.seq_length, + padding=self.padding, + truncation=self.truncation, answer_only_loss_mask=self.answer_only_loss_mask, ) From 53c2cfe6075db710922f725d847a3c860157fce7 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 00:28:53 -0700 Subject: [PATCH 03/18] add test Signed-off-by: Alexandros Koumparoulis --- .../llm/test_formatting_utils_options.py | 201 ++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 tests/unit_tests/datasets/llm/test_formatting_utils_options.py diff --git a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py new file mode 100644 index 000000000..6b35ae50b --- /dev/null +++ b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py @@ -0,0 +1,201 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import os +from pathlib import Path +from typing import Iterable, List, Tuple + +import pytest +from transformers import AutoTokenizer + +from nemo_automodel.components.datasets.llm.formatting_utils import ( + _add_pad_token, + format_chat_template, + format_prompt_completion, +) + + +def _read_tokenizer_dirs_from_env() -> List[Path]: + raw = os.environ.get("NEMO_TOKENIZER_DIRS", "").strip() + if not raw: + return [] + parts: Iterable[str] = (p.strip() for p in raw.split(",")) + paths: List[Path] = [Path(p) for p in parts if p] + return [p for p in paths if p.exists() and p.is_dir()] + + +_TOKENIZER_DIRS: List[Path] = _read_tokenizer_dirs_from_env() + + +def _skip_if_no_dirs(): + if not _TOKENIZER_DIRS: + pytest.skip( + "Set NEMO_TOKENIZER_DIRS to a comma-separated list of local tokenizer dirs to run these tests.", + allow_module_level=True, + ) + + +# @pytest.mark.parametrize("tokenizer_dir", _TOKENIZER_DIRS, ids=lambda p: p.name if isinstance(p, Path) else str(p)) +@pytest.mark.parametrize( + "seq_length,padding,truncation", + [ + (None, "do_not_pad", None), + (64, "max_length", True), + ], +) +def test_format_prompt_completion_options(seq_length, padding, truncation): + # _skip_if_no_dirs() + os.environ["TRANSFORMERS_OFFLINE"] = "1" + os.environ["HF_HUB_OFFLINE"] = "1" + # qwen3_4b_instruct_2407 + tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/hf_gemma3_2l/") + # Only applicable when tokenizer lacks chat template + # if getattr(tok, "chat_template", None): + # pytest.skip(f"Tokenizer defines chat_template; skipping prompt-completion tests.") + + eos_token_id = getattr(tok, "eos_token_id", 0) + pad_token_id = _add_pad_token(tok) or eos_token_id + + # If using padding="max_length", seq_length must be an int + if padding == "max_length" and not isinstance(seq_length, int): + pytest.skip("padding='max_length' requires seq_length to be set.") + + context = "France is a country in Europe." + question = "What is the capital of France?" + answer = "Paris." 
+ prompt = f"{context} {question} " + + out = format_prompt_completion( + tokenizer=tok, + prompt=prompt, + answer=answer, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + seq_length=seq_length, + padding=padding, + truncation=truncation, + answer_only_loss_mask=True, + ) + + # Basic structure + assert set(["input_ids", "labels", "attention_mask"]).issubset(out.keys()) + assert len(out["input_ids"]) == len(out["labels"]) == len(out["attention_mask"]) > 0 + + # seq_length enforcement (either by HF padding or our packager) + if isinstance(seq_length, int): + assert len(out["input_ids"]) == seq_length + assert len(out["labels"]) == seq_length + # Trailing padding label must be masked + assert out["labels"][-1] == -100 + + # EOS should be present in labels (supervised area) but not as last input_id + if getattr(tok, "eos_token_id", None) is not None and not truncation == True: + assert tok.eos_token_id in out["labels"], "EOS must appear in labels" + # find last non-pad input position and ensure it's not EOS + last_non_pad = len(out["input_ids"]) - 1 + while last_non_pad >= 0 and out["input_ids"][last_non_pad] == pad_token_id: + last_non_pad -= 1 + assert last_non_pad >= 0 + assert out["input_ids"][last_non_pad] != tok.eos_token_id + + # There should be masked (prompt) and supervised (answer) tokens + assert any(l == -100 for l in out["labels"]) # masked prompt + assert any(l != -100 for l in out["labels"]) # supervised answer + + # Attention mask should have zeros only in padded tail (if any) + if isinstance(seq_length, int): + # From the end, once we see a 0, the rest must be 0 + seen_zero = False + for v in reversed(out["attention_mask"]): + if v == 0: + seen_zero = True + else: + if seen_zero: + pytest.fail("Non-zero attention_mask value after padded zeros.") + + +@pytest.mark.parametrize( + "seq_length,padding,truncation", + [ + (None, "do_not_pad", None), + (64, "max_length", True), + ], +) +def test_format_chat_template_options(tokenizer_dir: Path, seq_length, padding, truncation): + _skip_if_no_dirs() + os.environ["TRANSFORMERS_OFFLINE"] = "1" + os.environ["HF_HUB_OFFLINE"] = "1" + + + tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/qwen3_4b_instruct_2407/") + # Only applicable when tokenizer DOES define a chat template + if not getattr(tok, "chat_template", None): + pytest.skip(f"Tokenizer {tokenizer_dir.name} has no chat_template; skipping chat-template tests.") + + eos_token_id = getattr(tok, "eos_token_id", 0) + pad_token_id = _add_pad_token(tok) or eos_token_id + + if padding == "max_length" and not isinstance(seq_length, int): + pytest.skip("padding='max_length' requires seq_length to be set.") + + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + {"role": "assistant", "content": "Paris."}, + ] + + out = format_chat_template( + tokenizer=tok, + formatted_text=messages, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + seq_length=seq_length, + padding=padding, + truncation=truncation, + ) + + # Basic structure + assert set(["input_ids", "labels", "attention_mask"]).issubset(out.keys()) + assert len(out["input_ids"]) == len(out["labels"]) == len(out["attention_mask"]) > 0 + + # seq_length enforcement + if isinstance(seq_length, int): + assert len(out["input_ids"]) == seq_length + assert len(out["labels"]) == seq_length + assert out["labels"][-1] == -100 + + # For chat templates, EOS should not be the last input id (unless it's all pad) + if getattr(tok, 
"eos_token_id", None) is not None: + last_non_pad = len(out["input_ids"]) - 1 + while last_non_pad >= 0 and out["input_ids"][last_non_pad] == pad_token_id: + last_non_pad -= 1 + if last_non_pad >= 0: + assert out["input_ids"][last_non_pad] != tok.eos_token_id + + # There must be at least some supervised tokens in labels + assert any(l != -100 for l in out["labels"]) # assistant tokens + + # Attention mask padded tail zeros, if padded + if isinstance(seq_length, int): + seen_zero = False + for v in reversed(out["attention_mask"]): + if v == 0: + seen_zero = True + else: + if seen_zero: + pytest.fail("Non-zero attention_mask value after padded zeros.") + + From 36fca1ab39ba0a2755ef3c7afc07b6763f067106 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 00:43:27 -0700 Subject: [PATCH 04/18] fix Signed-off-by: Alexandros Koumparoulis --- .../datasets/llm/formatting_utils.py | 8 ++- .../llm/test_formatting_utils_options.py | 53 ++++++------------- 2 files changed, 23 insertions(+), 38 deletions(-) diff --git a/nemo_automodel/components/datasets/llm/formatting_utils.py b/nemo_automodel/components/datasets/llm/formatting_utils.py index bbb835a6a..2990aebfd 100644 --- a/nemo_automodel/components/datasets/llm/formatting_utils.py +++ b/nemo_automodel/components/datasets/llm/formatting_utils.py @@ -66,6 +66,7 @@ def _package_tokenized_example( eos_token_id, pad_token_id, seq_length, + truncation = None, ): """ Package a tokenized example with proper masking and padding. @@ -77,7 +78,7 @@ def _package_tokenized_example( eos_token_id: The end-of-sequence token id. pad_token_id: The padding token id. seq_length: Optional sequence length for padding. - + truncation: Optional truncation strategy. Returns: A dictionary with input_ids, labels, and attention_mask. 
""" @@ -86,6 +87,8 @@ def _package_tokenized_example( if not _has_chat_template(tokenizer) and eos_token_id != input_ids[-1]: input_ids += [eos_token_id] assistant_masks += [1] + if not _has_chat_template(tokenizer) and pad_token_id is not None: + assistant_masks += [pad_token_id] labels = input_ids.copy() input_ids = input_ids[:-1] @@ -95,7 +98,7 @@ def _package_tokenized_example( labels[:] = [label if bool(m) else -100 for label, m in zip(labels, assistant_masks)] # remove BOS labels = labels[1:] - if not _has_chat_template(tokenizer): + if not _has_chat_template(tokenizer) and truncation is None: assert labels[-1] == eos_token_id, f"labels[-1]={labels[-1]} != eos_token_id={eos_token_id}" assert input_ids[-1] != eos_token_id, f"input_ids[-1]={input_ids[-1]} == eos_token_id={eos_token_id}" assert len(input_ids) == len(labels), f"len(input_ids)={len(input_ids)} != len(labels)={len(labels)}" @@ -164,6 +167,7 @@ def format_prompt_completion( eos_token_id=eos_token_id, pad_token_id=pad_token_id, seq_length=seq_length, + truncation=truncation, ) diff --git a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py index 6b35ae50b..504fd7d57 100644 --- a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py +++ b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py @@ -28,46 +28,25 @@ ) -def _read_tokenizer_dirs_from_env() -> List[Path]: - raw = os.environ.get("NEMO_TOKENIZER_DIRS", "").strip() - if not raw: - return [] - parts: Iterable[str] = (p.strip() for p in raw.split(",")) - paths: List[Path] = [Path(p) for p in parts if p] - return [p for p in paths if p.exists() and p.is_dir()] - - -_TOKENIZER_DIRS: List[Path] = _read_tokenizer_dirs_from_env() - - -def _skip_if_no_dirs(): - if not _TOKENIZER_DIRS: - pytest.skip( - "Set NEMO_TOKENIZER_DIRS to a comma-separated list of local tokenizer dirs to run these tests.", - allow_module_level=True, - ) - - -# @pytest.mark.parametrize("tokenizer_dir", _TOKENIZER_DIRS, ids=lambda p: p.name if isinstance(p, Path) else str(p)) @pytest.mark.parametrize( "seq_length,padding,truncation", [ (None, "do_not_pad", None), - (64, "max_length", True), + (4, "max_length", True), ], ) def test_format_prompt_completion_options(seq_length, padding, truncation): - # _skip_if_no_dirs() os.environ["TRANSFORMERS_OFFLINE"] = "1" os.environ["HF_HUB_OFFLINE"] = "1" - # qwen3_4b_instruct_2407 - tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/hf_gemma3_2l/") + + tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/hf_mixtral_2l//") # Only applicable when tokenizer lacks chat template - # if getattr(tok, "chat_template", None): - # pytest.skip(f"Tokenizer defines chat_template; skipping prompt-completion tests.") + assert getattr(tok, "chat_template", None) is None eos_token_id = getattr(tok, "eos_token_id", 0) pad_token_id = _add_pad_token(tok) or eos_token_id + if padding != "do_not_pad": + tok.pad_token = tok.eos_token # If using padding="max_length", seq_length must be an int if padding == "max_length" and not isinstance(seq_length, int): @@ -95,11 +74,11 @@ def test_format_prompt_completion_options(seq_length, padding, truncation): assert len(out["input_ids"]) == len(out["labels"]) == len(out["attention_mask"]) > 0 # seq_length enforcement (either by HF padding or our packager) - if isinstance(seq_length, int): + if isinstance(seq_length, int) and padding != "do_not_pad": assert len(out["input_ids"]) == seq_length assert len(out["labels"]) == seq_length # 
Trailing padding label must be masked - assert out["labels"][-1] == -100 + assert out["labels"][-1] == -100, (out, pad_token_id) # EOS should be present in labels (supervised area) but not as last input_id if getattr(tok, "eos_token_id", None) is not None and not truncation == True: @@ -113,7 +92,8 @@ def test_format_prompt_completion_options(seq_length, padding, truncation): # There should be masked (prompt) and supervised (answer) tokens assert any(l == -100 for l in out["labels"]) # masked prompt - assert any(l != -100 for l in out["labels"]) # supervised answer + if not truncation == True: + assert any(l != -100 for l in out["labels"]) # supervised answer # Attention mask should have zeros only in padded tail (if any) if isinstance(seq_length, int): @@ -131,11 +111,11 @@ def test_format_prompt_completion_options(seq_length, padding, truncation): "seq_length,padding,truncation", [ (None, "do_not_pad", None), - (64, "max_length", True), + (4, "max_length", True), ], ) -def test_format_chat_template_options(tokenizer_dir: Path, seq_length, padding, truncation): - _skip_if_no_dirs() +def test_format_chat_template_options(seq_length, padding, truncation): + os.environ["TRANSFORMERS_OFFLINE"] = "1" os.environ["HF_HUB_OFFLINE"] = "1" @@ -143,7 +123,7 @@ def test_format_chat_template_options(tokenizer_dir: Path, seq_length, padding, tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/qwen3_4b_instruct_2407/") # Only applicable when tokenizer DOES define a chat template if not getattr(tok, "chat_template", None): - pytest.skip(f"Tokenizer {tokenizer_dir.name} has no chat_template; skipping chat-template tests.") + pytest.skip(f"Tokenizer qwen3_4b_instruct_2407 has no chat_template; skipping chat-template tests.") eos_token_id = getattr(tok, "eos_token_id", 0) pad_token_id = _add_pad_token(tok) or eos_token_id @@ -175,7 +155,8 @@ def test_format_chat_template_options(tokenizer_dir: Path, seq_length, padding, if isinstance(seq_length, int): assert len(out["input_ids"]) == seq_length assert len(out["labels"]) == seq_length - assert out["labels"][-1] == -100 + if truncation == False: + assert out["labels"][-1] == -100 # For chat templates, EOS should not be the last input id (unless it's all pad) if getattr(tok, "eos_token_id", None) is not None: @@ -189,7 +170,7 @@ def test_format_chat_template_options(tokenizer_dir: Path, seq_length, padding, assert any(l != -100 for l in out["labels"]) # assistant tokens # Attention mask padded tail zeros, if padded - if isinstance(seq_length, int): + if isinstance(seq_length, int) and truncation == False: seen_zero = False for v in reversed(out["attention_mask"]): if v == 0: From 46ec09ecaf93e4617427c954ede95056635d752a Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 00:44:26 -0700 Subject: [PATCH 05/18] fix Signed-off-by: Alexandros Koumparoulis --- .../datasets/llm/column_mapped_text_instruction_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py index c2a2957b4..594d095da 100644 --- a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py +++ b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py @@ -259,6 +259,8 @@ def __getitem__(self, idx): # noqa: D401 row = self.dataset[idx] mapped = {dest: row[src] for dest, src in self.column_mapping.items() if src in row} mapped = 
self._apply_tokenizer(mapped) + if not any(l != -100 for l in mapped["labels"]): + return self.__getitem__((idx + 1) % len(self.dataset)) assert _check_all_values_equal_length(mapped), "All values must be of the same length" return mapped From d40291b0df71794bb3dac0461c8d0af681f96f97 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 10:25:18 -0700 Subject: [PATCH 06/18] lint Signed-off-by: Alexandros Koumparoulis --- .../datasets/llm/column_mapped_text_instruction_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py index 594d095da..6f0fad178 100644 --- a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py +++ b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py @@ -259,7 +259,7 @@ def __getitem__(self, idx): # noqa: D401 row = self.dataset[idx] mapped = {dest: row[src] for dest, src in self.column_mapping.items() if src in row} mapped = self._apply_tokenizer(mapped) - if not any(l != -100 for l in mapped["labels"]): + if not any(label != -100 for label in mapped["labels"]): return self.__getitem__((idx + 1) % len(self.dataset)) assert _check_all_values_equal_length(mapped), "All values must be of the same length" return mapped From 0f6aa3cd95d4bdc389f013e8df8cb807a400ae29 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 10:26:01 -0700 Subject: [PATCH 07/18] fix Signed-off-by: Alexandros Koumparoulis --- .../unit_tests/datasets/llm/test_formatting_utils_options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py index 504fd7d57..1d7a0f1c4 100644 --- a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py +++ b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py @@ -39,7 +39,7 @@ def test_format_prompt_completion_options(seq_length, padding, truncation): os.environ["TRANSFORMERS_OFFLINE"] = "1" os.environ["HF_HUB_OFFLINE"] = "1" - tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/hf_mixtral_2l//") + tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/hf_mixtral_2l") # Only applicable when tokenizer lacks chat template assert getattr(tok, "chat_template", None) is None @@ -120,7 +120,7 @@ def test_format_chat_template_options(seq_length, padding, truncation): os.environ["HF_HUB_OFFLINE"] = "1" - tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/qwen3_4b_instruct_2407/") + tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/qwen3_4b_instruct_2407") # Only applicable when tokenizer DOES define a chat template if not getattr(tok, "chat_template", None): pytest.skip(f"Tokenizer qwen3_4b_instruct_2407 has no chat_template; skipping chat-template tests.") From 00a7d4bf938c1624c067031a5f8480083cd37b01 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 10:33:15 -0700 Subject: [PATCH 08/18] lint Signed-off-by: Alexandros Koumparoulis --- nemo_automodel/components/datasets/llm/formatting_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_automodel/components/datasets/llm/formatting_utils.py b/nemo_automodel/components/datasets/llm/formatting_utils.py index 2990aebfd..49d70db3d 100644 --- 
a/nemo_automodel/components/datasets/llm/formatting_utils.py +++ b/nemo_automodel/components/datasets/llm/formatting_utils.py @@ -66,7 +66,7 @@ def _package_tokenized_example( eos_token_id, pad_token_id, seq_length, - truncation = None, + truncation=None, ): """ Package a tokenized example with proper masking and padding. From 6f87ad75a67d5d83168ba034e572f640182d3e2a Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 11:16:07 -0700 Subject: [PATCH 09/18] fix Signed-off-by: Alexandros Koumparoulis --- tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py b/tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py index b17d27d41..80ce69325 100644 --- a/tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py +++ b/tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py @@ -58,7 +58,7 @@ def _id_for_token(self, tok: str) -> int: self._cursor += 1 return self._vocab[tok] - def __call__(self, text: str, *, add_special_tokens: bool = True): # type: ignore[override] + def __call__(self, text: str, *, add_special_tokens: bool = True, padding=None, truncation=None, max_length=None): # type: ignore[override] ids: List[int] = [] if add_special_tokens: ids.append(self.bos_token_id) From e15be3d7ef661cdcf6bcdae2c6e518559a365eae Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 11:18:00 -0700 Subject: [PATCH 10/18] fix Signed-off-by: Alexandros Koumparoulis --- .../datasets/llm/test_formatting_utils_options.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py index 1d7a0f1c4..94a3f68bd 100644 --- a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py +++ b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py @@ -38,8 +38,9 @@ def test_format_prompt_completion_options(seq_length, padding, truncation): os.environ["TRANSFORMERS_OFFLINE"] = "1" os.environ["HF_HUB_OFFLINE"] = "1" - - tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/hf_mixtral_2l") + TOKENIZER_DIR = "/home/TestData/automodel/hf_mixtral_2l" + assert os.path.exists(TOKENIZER_DIR), "Tokenizer directory does not exist" + tok = AutoTokenizer.from_pretrained(TOKENIZER_DIR) # Only applicable when tokenizer lacks chat template assert getattr(tok, "chat_template", None) is None @@ -118,9 +119,9 @@ def test_format_chat_template_options(seq_length, padding, truncation): os.environ["TRANSFORMERS_OFFLINE"] = "1" os.environ["HF_HUB_OFFLINE"] = "1" - - - tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/qwen3_4b_instruct_2407") + TOKENIZER_DIR = "/home/TestData/automodel/qwen3_4b_instruct_2407" + assert os.path.exists(TOKENIZER_DIR), "Tokenizer directory does not exist" + tok = AutoTokenizer.from_pretrained(TOKENIZER_DIR) # Only applicable when tokenizer DOES define a chat template if not getattr(tok, "chat_template", None): pytest.skip(f"Tokenizer qwen3_4b_instruct_2407 has no chat_template; skipping chat-template tests.") From e18c29668616466497f502fe6f53f31e875899e5 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 14:00:23 -0700 Subject: [PATCH 11/18] fix Signed-off-by: Alexandros Koumparoulis --- .../unit_tests/datasets/llm/test_formatting_utils_options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py
index 94a3f68bd..f9229b150 100644
--- a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py
+++ b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py
@@ -38,7 +38,7 @@ def test_format_prompt_completion_options(seq_length, padding, truncation):
     os.environ["TRANSFORMERS_OFFLINE"] = "1"
     os.environ["HF_HUB_OFFLINE"] = "1"
-    TOKENIZER_DIR = "/home/TestData/automodel/hf_mixtral_2l"
+    TOKENIZER_DIR = f"{os.environ['TEST_DATA_DIR']}/hf_mixtral_2l"
     assert os.path.exists(TOKENIZER_DIR), "Tokenizer directory does not exist"
     tok = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
     # Only applicable when tokenizer lacks chat template
@@ -119,7 +119,7 @@ def test_format_chat_template_options(seq_length, padding, truncation):
 
     os.environ["TRANSFORMERS_OFFLINE"] = "1"
     os.environ["HF_HUB_OFFLINE"] = "1"
-    TOKENIZER_DIR = "/home/TestData/automodel/qwen3_4b_instruct_2407"
+    TOKENIZER_DIR = f"{os.environ['TEST_DATA_DIR']}/qwen3_4b_instruct_2407"
     assert os.path.exists(TOKENIZER_DIR), "Tokenizer directory does not exist"
     tok = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
     # Only applicable when tokenizer DOES define a chat template

From d2576acb5137d158be710be738fb4fc6198d7054 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Mon, 27 Oct 2025 18:24:23 -0700
Subject: [PATCH 12/18] move file

Signed-off-by: Alexandros Koumparoulis
---
 .../hf_transformer}/test_formatting_utils_options.py             | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/{unit_tests/datasets/llm => functional_tests/hf_transformer}/test_formatting_utils_options.py (100%)

diff --git a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py b/tests/functional_tests/hf_transformer/test_formatting_utils_options.py
similarity index 100%
rename from tests/unit_tests/datasets/llm/test_formatting_utils_options.py
rename to tests/functional_tests/hf_transformer/test_formatting_utils_options.py

From 828d477885f084a06217b18b24852825340320e6 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Mon, 27 Oct 2025 21:51:24 -0700
Subject: [PATCH 13/18] fix

Signed-off-by: Alexandros Koumparoulis
---
 .../components/datasets/llm/formatting_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo_automodel/components/datasets/llm/formatting_utils.py b/nemo_automodel/components/datasets/llm/formatting_utils.py
index 49d70db3d..dc5c66540 100644
--- a/nemo_automodel/components/datasets/llm/formatting_utils.py
+++ b/nemo_automodel/components/datasets/llm/formatting_utils.py
@@ -128,8 +128,8 @@ def format_prompt_completion(
     eos_token_id: int,
     pad_token_id: int,
     seq_length: Optional[int] = None,
-    padding: Union[str, bool] = None,
-    truncation: Union[str, bool] = None,
+    padding: Union[str, bool] = "do_not_pad",
+    truncation: Union[str, bool] = "do_not_truncate",
     answer_only_loss_mask: bool = True,
 ) -> Dict[str, List[int]]:
     """
@@ -177,8 +177,8 @@ def format_chat_template(
     eos_token_id: int,
     pad_token_id: int,
     seq_length: Optional[int] = None,
-    padding: Union[str, bool] = None,
-    truncation: Union[str, bool] = None,
+    padding: Union[str, bool] = "do_not_pad",
+    truncation: Union[str, bool] = "do_not_truncate",
     tools: Optional[List[Dict]] = None,
 ) -> Dict[str, List[int]]:
     """

From 15f1cdc30ff7cf34dc634182adb605258d3d8de9 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Tue, 28 Oct 2025 20:36:43 -0700
Subject: [PATCH 14/18] add test
Signed-off-by: Alexandros Koumparoulis --- ..._column_mapped_text_instruction_dataset.py | 217 ++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 tests/functional_tests/datasets/llm/test_column_mapped_text_instruction_dataset.py diff --git a/tests/functional_tests/datasets/llm/test_column_mapped_text_instruction_dataset.py b/tests/functional_tests/datasets/llm/test_column_mapped_text_instruction_dataset.py new file mode 100644 index 000000000..44853785e --- /dev/null +++ b/tests/functional_tests/datasets/llm/test_column_mapped_text_instruction_dataset.py @@ -0,0 +1,217 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from pathlib import Path + +import pytest +from transformers import AutoTokenizer + +from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset import ( + ColumnMappedTextInstructionDataset, +) + + +def _write_jsonl(tmp_path: Path) -> Path: + """Create a small JSONL dataset for testing.""" + rows = [ + { + "context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary.", + "question": "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?", + "answers": "Saint Bernadette Soubirous", + }, + { + "context": "Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised.", + "question": "What is in front of the Notre Dame Main Building?", + "answers": "a copper statue of Christ", + }, + { + "context": "Next to the Main Building is the Basilica of the Sacred Heart.", + "question": "The Basilica of the Sacred heart at Notre Dame is beside to which structure?", + "answers": "the Main Building", + }, + { + "context": "Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.", + "question": "What is the Grotto at Notre Dame?", + "answers": "a Marian place of prayer and reflection", + }, + { + "context": "Atop the Main Building's gold dome is a golden statue of the Virgin Mary.", + "question": "What sits on top of the Main Building at Notre Dame?", + "answers": "a golden statue of the Virgin Mary", + }, + ] + p = tmp_path / "sample.jsonl" + with p.open("w") as f: + for r in rows: + f.write(json.dumps(r) + "\n") + return p + + +def _maybe_tokenizer_dir_candidates() -> list[Path]: + """Return likely tokenizer directories present in CI test data mounts.""" + candidates: list[Path] = [] + # Known bundle with no chat template used elsewhere in the repo + test_data_dir = os.environ.get("TEST_DATA_DIR") + if test_data_dir: + candidates.append(Path(test_data_dir) / "hf_mixtral_2l") + # Explicit tokenizers used by existing unit tests + base = Path("/home/TestData/akoumparouli/tokenizers/") + names = [ + "gpt-oss-20b", + "llama_3.2_1b", + "qwen3_30b_a3b_instruct_2507", + ] + for n in names: + candidates.append(base / n) + return [p for p in candidates if p.exists()] + + +def _load_tokenizer(path: 
Path): + os.environ.setdefault("TRANSFORMERS_OFFLINE", "1") + os.environ.setdefault("HF_HUB_OFFLINE", "1") + return AutoTokenizer.from_pretrained(str(path)) + + +def _first_sample(ds: ColumnMappedTextInstructionDataset): + it = iter(ds) + return next(it) + + +@pytest.mark.parametrize( + "seq_length,padding,truncation", + [ + (None, "do_not_pad", None), + (16, "max_length", True), + (16, "do_not_pad", True), + (16, True, None), # padding=True -> longest; with single example behaves like no-op pre-packaging + ], +) +def test_dataset_non_chat_padding_truncation_options(tmp_path: Path, seq_length, padding, truncation): + """Validate shapes and masking for non-chat tokenizers across padding/truncation options.""" + data_file = _write_jsonl(tmp_path) + + # Find a tokenizer without chat template + for d in _maybe_tokenizer_dir_candidates(): + tok = _load_tokenizer(d) + if getattr(tok, "chat_template", None) is None: + break + else: + pytest.skip("No non-chat tokenizer available in test data mounts") + + column_mapping = {"context": "context", "question": "question", "answer": "answers"} + + ds = ColumnMappedTextInstructionDataset( + path_or_dataset_id=str(data_file), + column_mapping=column_mapping, + tokenizer=tok, + seq_length=seq_length, + padding=padding, + truncation=truncation, + # answer_only_loss_mask default True + ) + + sample = _first_sample(ds) + assert set(["input_ids", "labels", "attention_mask"]).issubset(sample.keys()) + assert len(sample["input_ids"]) == len(sample["labels"]) == len(sample["attention_mask"]) > 0 + + if isinstance(seq_length, int): + if truncation is True: + assert len(sample["input_ids"]) == seq_length + assert len(sample["labels"]) == seq_length + # Trailing padding in labels must be masked + assert sample["labels"][-1] == -100 + assert sample["attention_mask"][-1] in (0, 1) # depending on pack length, end can be 0 + elif not truncation is True: + assert len(sample["input_ids"]) != seq_length + assert len(sample["labels"]) != seq_length + +@pytest.mark.parametrize( + "seq_length,padding,truncation", + [ + (None, "do_not_pad", None), + (128, "max_length", True), + (16, "do_not_pad", True), + (16, True, None), + ], +) +def test_dataset_chat_padding_truncation_options(tmp_path: Path, seq_length, padding, truncation): + """Validate shapes and masking for chat-template tokenizers across padding/truncation options.""" + data_file = _write_jsonl(tmp_path) + + # Find a tokenizer with chat template + chat_tok = None + for d in _maybe_tokenizer_dir_candidates(): + tok = _load_tokenizer(d) + if getattr(tok, "chat_template", None) is not None and callable(getattr(tok, "apply_chat_template", None)): + chat_tok = tok + break + if chat_tok is None: + pytest.skip("No chat-template tokenizer available in test data mounts") + + # 3-column mapping + column_mapping = {"context": "context", "question": "question", "answer": "answers"} + + ds = ColumnMappedTextInstructionDataset( + path_or_dataset_id=str(data_file), + column_mapping=column_mapping, + tokenizer=chat_tok, + seq_length=seq_length, + padding=padding, + truncation=truncation, + start_of_turn_token="<|assistant|>", # required when answer_only_loss_mask=True and chat template present + ) + + sample = _first_sample(ds) + assert set(["input_ids", "labels", "attention_mask"]).issubset(sample.keys()) + assert len(sample["input_ids"]) == len(sample["labels"]) == len(sample["attention_mask"]) > 0 + + if isinstance(seq_length, int): + if truncation is True or padding == "max_length": + assert len(sample["input_ids"]) == seq_length 
+            assert len(sample["labels"]) == seq_length
+        elif not truncation is True:
+            assert sample["labels"][-1] != -100
+
+
+def test_dataset_two_column_mapping_non_chat(tmp_path: Path):
+    """Ensure 2-column mapping (context+answer) works with non-chat tokenizer."""
+    data_file = _write_jsonl(tmp_path)
+
+    # Choose a non-chat tokenizer
+    for d in _maybe_tokenizer_dir_candidates():
+        tok = _load_tokenizer(d)
+        if getattr(tok, "chat_template", None) is None:
+            break
+    else:
+        pytest.skip("No non-chat tokenizer available in test data mounts")
+
+    # Use only context and answers columns
+    column_mapping = {"context": "context", "answer": "answers"}
+
+    ds = ColumnMappedTextInstructionDataset(
+        path_or_dataset_id=str(data_file),
+        column_mapping=column_mapping,
+        tokenizer=tok,
+        seq_length=32,
+        padding="max_length",
+        truncation=True,
+    )
+
+    sample = _first_sample(ds)
+    assert len(sample["input_ids"]) == 32
+    assert len(sample["labels"]) == 32
+    assert len(sample["attention_mask"]) == 32
+

From 33a677c1f67722dc1cd8125d1487140aeec5c23c Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Tue, 28 Oct 2025 20:43:02 -0700
Subject: [PATCH 15/18] update default values & pad_token

Signed-off-by: Alexandros Koumparoulis
---
 .../llm/column_mapped_text_instruction_dataset.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
index 6f0fad178..deb2a67c6 100644
--- a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
+++ b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
@@ -165,8 +165,8 @@ def __init__(
         name: Optional[str] = None,
         answer_only_loss_mask: bool = True,
         seq_length: Optional[int] = None,
-        padding: Union[str, bool] = None,
-        truncation: Union[str, bool] = None,
+        padding: Union[str, bool] = "do_not_pad",
+        truncation: Union[str, bool] = "do_not_truncate",
         start_of_turn_token: Optional[str] = None,
         limit_dataset_samples: Optional[int] = None,
     ) -> None:
@@ -195,6 +195,12 @@ def __init__(
 
         assert tokenizer is not None, "Tokenizer is required"
         self.tokenizer = tokenizer
+        if getattr(self.tokenizer, 'pad_token', None) is None:
+            if hasattr(self.tokenizer, 'eos_token'):
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+            else:
+                logger.warning("Setting tokenizer pad_token to ' '. tokenizer does not have `eos_token`.")
+                self.tokenizer.pad_token = ' '
 
         self.dataset = _load_dataset(path_or_dataset_id, split=split, streaming=False, name=name)

From f155222d6243e2edf2b4ca3be8f67e67d37aa67b Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Tue, 28 Oct 2025 21:09:34 -0700
Subject: [PATCH 16/18] also print exception

Signed-off-by: Alexandros Koumparoulis
---
 nemo_automodel/components/config/loader.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo_automodel/components/config/loader.py b/nemo_automodel/components/config/loader.py
index 01c4196d0..a3b4cb28b 100644
--- a/nemo_automodel/components/config/loader.py
+++ b/nemo_automodel/components/config/loader.py
@@ -246,11 +246,13 @@ def instantiate(self, *args, **kwargs):
                 "Instantiation failed for `{}`\n"
                 "Accepted signature : {}\n"
                 "Positional args : {}\n"
-                "Keyword args : {}\n".format(
+                "Keyword args : {}\n"
+                "Exception : {}\n".format(
                     func.__name__,
                     sig,
                     args,
                     pprint.pformat(config_kwargs, compact=True, indent=4),
+                    e,
                 ),
                 file=sys.stderr,
             )

From 27b86c679d8714ac2c38c337b4fc5a8096908328 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Tue, 28 Oct 2025 21:19:05 -0700
Subject: [PATCH 17/18] fmt

Signed-off-by: Alexandros Koumparoulis
---
 .../llm/column_mapped_text_instruction_dataset.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
index deb2a67c6..8a9d82dbb 100644
--- a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
+++ b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
@@ -28,6 +28,8 @@
     format_prompt_completion,
 )
 
+logger = logging.getLogger(__name__)
+
 # Supported cases:
 # Format:
 # - Context + question + answer
@@ -195,12 +197,12 @@ def __init__(
 
         assert tokenizer is not None, "Tokenizer is required"
         self.tokenizer = tokenizer
-        if getattr(self.tokenizer, 'pad_token', None) is None:
-            if hasattr(self.tokenizer, 'eos_token'):
+        if getattr(self.tokenizer, "pad_token", None) is None:
+            if hasattr(self.tokenizer, "eos_token"):
                 self.tokenizer.pad_token = self.tokenizer.eos_token
             else:
                 logger.warning("Setting tokenizer pad_token to ' '. tokenizer does not have `eos_token`.")
-                self.tokenizer.pad_token = ' '
+                self.tokenizer.pad_token = " "
 
         self.dataset = _load_dataset(path_or_dataset_id, split=split, streaming=False, name=name)

From 13e625b363df76e1b510582b335a06679fc8ce3e Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Tue, 28 Oct 2025 22:19:34 -0700
Subject: [PATCH 18/18] add truncation & padding options

Signed-off-by: Alexandros Koumparoulis
---
 nemo_automodel/components/datasets/llm/chat_dataset.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/nemo_automodel/components/datasets/llm/chat_dataset.py b/nemo_automodel/components/datasets/llm/chat_dataset.py
index 84ff8e12c..51f36e8ae 100644
--- a/nemo_automodel/components/datasets/llm/chat_dataset.py
+++ b/nemo_automodel/components/datasets/llm/chat_dataset.py
@@ -133,6 +133,8 @@ def __init__(
         split: Optional[str] = None,
         name: Optional[str] = None,
         seq_length: Optional[int] = None,
+        padding: Union[str, bool] = "do_not_pad",
+        truncation: Union[str, bool] = "do_not_truncate",
         start_of_turn_token: Optional[str] = None,
         chat_template: Optional[str] = None,
     ) -> None:
@@ -149,6 +151,8 @@ def __init__(
 
         self.tokenizer = tokenizer
         self.seq_length = seq_length
+        self.padding = padding
+        self.truncation = truncation
         self.start_of_turn_token = start_of_turn_token
 
         self.dataset = _load_openai_messages(path_or_dataset_id, split=split, name=name)
@@ -178,6 +182,8 @@ def __getitem__(self, idx: int) -> Dict[str, List[int]]:
             eos_token_id,
             self.pad_token_id,
             seq_length=self.seq_length,
+            padding=self.padding,
+            truncation=self.truncation,
             tools=tools,
         )
         return sample
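A closing usage sketch of the dataset-level wiring; the JSONL layout, column mapping, and option values mirror the functional test added in PATCH 14, and the tokenizer path is again a placeholder assumption. ColumnMappedTextInstructionDataset forwards padding/truncation to the formatting helpers above, and PATCH 18 gives ChatDataset the same two keyword arguments with identical "do_not_pad"/"do_not_truncate" defaults:

    from transformers import AutoTokenizer

    from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset import (
        ColumnMappedTextInstructionDataset,
    )

    tok = AutoTokenizer.from_pretrained("path/to/local_tokenizer")  # placeholder path
    ds = ColumnMappedTextInstructionDataset(
        path_or_dataset_id="sample.jsonl",  # rows with context/question/answers keys
        column_mapping={"context": "context", "question": "question", "answer": "answers"},
        tokenizer=tok,
        seq_length=32,
        padding="max_length",
        truncation=True,
    )
    sample = ds[0]
    assert len(sample["input_ids"]) == len(sample["labels"]) == len(sample["attention_mask"])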