From fc700c6108dfabe295aa7610775f9ca00e7d9a2f Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 24 Oct 2025 11:40:55 -0700 Subject: [PATCH 01/18] surface truncating & padding options Signed-off-by: Alexandros Koumparoulis --- .../components/datasets/llm/formatting_utils.py | 11 +++++++++-- .../llm/test_column_mapped_text_instruction.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/nemo_automodel/components/datasets/llm/formatting_utils.py b/nemo_automodel/components/datasets/llm/formatting_utils.py index a26f21906..bbb835a6a 100644 --- a/nemo_automodel/components/datasets/llm/formatting_utils.py +++ b/nemo_automodel/components/datasets/llm/formatting_utils.py @@ -14,7 +14,7 @@ import logging import re -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import TYPE_CHECKING, Dict, List, Optional, Union logger = logging.getLogger(__name__) @@ -125,6 +125,8 @@ def format_prompt_completion( eos_token_id: int, pad_token_id: int, seq_length: Optional[int] = None, + padding: Union[str, bool] = None, + truncation: Union[str, bool] = None, answer_only_loss_mask: bool = True, ) -> Dict[str, List[int]]: """ @@ -150,7 +152,7 @@ def format_prompt_completion( else: len_prompt_ids = 0 # Tokenize full text - input_ids = tokenizer(full_text)["input_ids"] + input_ids = tokenizer(full_text, padding=padding, truncation=truncation, max_length=seq_length)["input_ids"] # Create assistant_masks: 0 for prompt tokens, 1 for answer tokens assistant_masks = [0] * len_prompt_ids + [1] * (len(input_ids) - len_prompt_ids) @@ -171,6 +173,8 @@ def format_chat_template( eos_token_id: int, pad_token_id: int, seq_length: Optional[int] = None, + padding: Union[str, bool] = None, + truncation: Union[str, bool] = None, tools: Optional[List[Dict]] = None, ) -> Dict[str, List[int]]: """ @@ -199,6 +203,9 @@ def format_chat_template( tokenize=True, return_dict=True, return_assistant_tokens_mask=template_has_generation_kwd, + padding=padding, + truncation=truncation, + max_length=seq_length, ) # Choose the last conversation as answer other history are context by finding the last masked token diff --git a/tests/unit_tests/datasets/llm/test_column_mapped_text_instruction.py b/tests/unit_tests/datasets/llm/test_column_mapped_text_instruction.py index f35c13392..8ce6fa772 100644 --- a/tests/unit_tests/datasets/llm/test_column_mapped_text_instruction.py +++ b/tests/unit_tests/datasets/llm/test_column_mapped_text_instruction.py @@ -67,7 +67,7 @@ def __init__(self): self.bos_token_id = 2 self._counter = 3 # Start token IDs from 3 to avoid conflicts - def __call__(self, text: str, add_special_tokens: bool = True): # noqa: D401 + def __call__(self, text: str, add_special_tokens: bool = True, padding=None, truncation=None, max_length=None): # noqa: D401 """Mimic the Hugging Face tokenizer ``__call__`` API. The real tokenizer would convert *text* into a list of integer token IDs. 
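A minimal usage sketch of the options surfaced in PATCH 01 above; the keyword usage mirrors the tests added later in the series, while the tokenizer path is a placeholder assumption (any local tokenizer without a chat template exercises the prompt-completion path). Both options are forwarded, together with max_length=seq_length, to the underlying Hugging Face tokenizer call:

    from transformers import AutoTokenizer

    from nemo_automodel.components.datasets.llm.formatting_utils import format_prompt_completion

    tok = AutoTokenizer.from_pretrained("path/to/local_tokenizer")  # placeholder path
    sample = format_prompt_completion(
        tokenizer=tok,
        prompt="What is the capital of France? ",
        answer="Paris.",
        eos_token_id=tok.eos_token_id,
        pad_token_id=tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id,
        seq_length=64,
        padding="max_length",  # forwarded to tokenizer(full_text, ...)
        truncation=True,       # forwarded alongside max_length=seq_length
    )
    # sample holds aligned "input_ids", "labels", and "attention_mask" lists;
    # prompt tokens are masked to -100 in labels when answer_only_loss_mask=True.
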
From 02b8a2696e5967168c85815368a9d64a80a399b3 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Sun, 26 Oct 2025 22:59:23 -0700 Subject: [PATCH 02/18] add padding/truncation options Signed-off-by: Alexandros Koumparoulis --- .../llm/column_mapped_text_instruction_dataset.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py index c83100550..c2a2957b4 100644 --- a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py +++ b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py @@ -165,6 +165,8 @@ def __init__( name: Optional[str] = None, answer_only_loss_mask: bool = True, seq_length: Optional[int] = None, + padding: Union[str, bool] = None, + truncation: Union[str, bool] = None, start_of_turn_token: Optional[str] = None, limit_dataset_samples: Optional[int] = None, ) -> None: @@ -226,6 +228,8 @@ def __init__( self.answer_only_loss_mask = answer_only_loss_mask self.start_of_turn_token = start_of_turn_token self.seq_length = seq_length + self.padding = padding + self.truncation = truncation def __len__(self) -> int: # noqa: D401 """ @@ -293,6 +297,8 @@ def _apply_tokenizer(self, sample: Dict[str, str]) -> Dict[str, List[int]]: eos_token_id, pad_token_id, seq_length=self.seq_length, + padding=self.padding, + truncation=self.truncation, ) else: prompt = " ".join(filter(lambda x: x is not None, (context, question, ""))) @@ -304,5 +310,7 @@ def _apply_tokenizer(self, sample: Dict[str, str]) -> Dict[str, List[int]]: eos_token_id, pad_token_id, seq_length=self.seq_length, + padding=self.padding, + truncation=self.truncation, answer_only_loss_mask=self.answer_only_loss_mask, ) From 53c2cfe6075db710922f725d847a3c860157fce7 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 00:28:53 -0700 Subject: [PATCH 03/18] add test Signed-off-by: Alexandros Koumparoulis --- .../llm/test_formatting_utils_options.py | 201 ++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 tests/unit_tests/datasets/llm/test_formatting_utils_options.py diff --git a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py new file mode 100644 index 000000000..6b35ae50b --- /dev/null +++ b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py @@ -0,0 +1,201 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import os +from pathlib import Path +from typing import Iterable, List, Tuple + +import pytest +from transformers import AutoTokenizer + +from nemo_automodel.components.datasets.llm.formatting_utils import ( + _add_pad_token, + format_chat_template, + format_prompt_completion, +) + + +def _read_tokenizer_dirs_from_env() -> List[Path]: + raw = os.environ.get("NEMO_TOKENIZER_DIRS", "").strip() + if not raw: + return [] + parts: Iterable[str] = (p.strip() for p in raw.split(",")) + paths: List[Path] = [Path(p) for p in parts if p] + return [p for p in paths if p.exists() and p.is_dir()] + + +_TOKENIZER_DIRS: List[Path] = _read_tokenizer_dirs_from_env() + + +def _skip_if_no_dirs(): + if not _TOKENIZER_DIRS: + pytest.skip( + "Set NEMO_TOKENIZER_DIRS to a comma-separated list of local tokenizer dirs to run these tests.", + allow_module_level=True, + ) + + +# @pytest.mark.parametrize("tokenizer_dir", _TOKENIZER_DIRS, ids=lambda p: p.name if isinstance(p, Path) else str(p)) +@pytest.mark.parametrize( + "seq_length,padding,truncation", + [ + (None, "do_not_pad", None), + (64, "max_length", True), + ], +) +def test_format_prompt_completion_options(seq_length, padding, truncation): + # _skip_if_no_dirs() + os.environ["TRANSFORMERS_OFFLINE"] = "1" + os.environ["HF_HUB_OFFLINE"] = "1" + # qwen3_4b_instruct_2407 + tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/hf_gemma3_2l/") + # Only applicable when tokenizer lacks chat template + # if getattr(tok, "chat_template", None): + # pytest.skip(f"Tokenizer defines chat_template; skipping prompt-completion tests.") + + eos_token_id = getattr(tok, "eos_token_id", 0) + pad_token_id = _add_pad_token(tok) or eos_token_id + + # If using padding="max_length", seq_length must be an int + if padding == "max_length" and not isinstance(seq_length, int): + pytest.skip("padding='max_length' requires seq_length to be set.") + + context = "France is a country in Europe." + question = "What is the capital of France?" + answer = "Paris." 
+ prompt = f"{context} {question} " + + out = format_prompt_completion( + tokenizer=tok, + prompt=prompt, + answer=answer, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + seq_length=seq_length, + padding=padding, + truncation=truncation, + answer_only_loss_mask=True, + ) + + # Basic structure + assert set(["input_ids", "labels", "attention_mask"]).issubset(out.keys()) + assert len(out["input_ids"]) == len(out["labels"]) == len(out["attention_mask"]) > 0 + + # seq_length enforcement (either by HF padding or our packager) + if isinstance(seq_length, int): + assert len(out["input_ids"]) == seq_length + assert len(out["labels"]) == seq_length + # Trailing padding label must be masked + assert out["labels"][-1] == -100 + + # EOS should be present in labels (supervised area) but not as last input_id + if getattr(tok, "eos_token_id", None) is not None and not truncation == True: + assert tok.eos_token_id in out["labels"], "EOS must appear in labels" + # find last non-pad input position and ensure it's not EOS + last_non_pad = len(out["input_ids"]) - 1 + while last_non_pad >= 0 and out["input_ids"][last_non_pad] == pad_token_id: + last_non_pad -= 1 + assert last_non_pad >= 0 + assert out["input_ids"][last_non_pad] != tok.eos_token_id + + # There should be masked (prompt) and supervised (answer) tokens + assert any(l == -100 for l in out["labels"]) # masked prompt + assert any(l != -100 for l in out["labels"]) # supervised answer + + # Attention mask should have zeros only in padded tail (if any) + if isinstance(seq_length, int): + # From the end, once we see a 0, the rest must be 0 + seen_zero = False + for v in reversed(out["attention_mask"]): + if v == 0: + seen_zero = True + else: + if seen_zero: + pytest.fail("Non-zero attention_mask value after padded zeros.") + + +@pytest.mark.parametrize( + "seq_length,padding,truncation", + [ + (None, "do_not_pad", None), + (64, "max_length", True), + ], +) +def test_format_chat_template_options(tokenizer_dir: Path, seq_length, padding, truncation): + _skip_if_no_dirs() + os.environ["TRANSFORMERS_OFFLINE"] = "1" + os.environ["HF_HUB_OFFLINE"] = "1" + + + tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/qwen3_4b_instruct_2407/") + # Only applicable when tokenizer DOES define a chat template + if not getattr(tok, "chat_template", None): + pytest.skip(f"Tokenizer {tokenizer_dir.name} has no chat_template; skipping chat-template tests.") + + eos_token_id = getattr(tok, "eos_token_id", 0) + pad_token_id = _add_pad_token(tok) or eos_token_id + + if padding == "max_length" and not isinstance(seq_length, int): + pytest.skip("padding='max_length' requires seq_length to be set.") + + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + {"role": "assistant", "content": "Paris."}, + ] + + out = format_chat_template( + tokenizer=tok, + formatted_text=messages, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + seq_length=seq_length, + padding=padding, + truncation=truncation, + ) + + # Basic structure + assert set(["input_ids", "labels", "attention_mask"]).issubset(out.keys()) + assert len(out["input_ids"]) == len(out["labels"]) == len(out["attention_mask"]) > 0 + + # seq_length enforcement + if isinstance(seq_length, int): + assert len(out["input_ids"]) == seq_length + assert len(out["labels"]) == seq_length + assert out["labels"][-1] == -100 + + # For chat templates, EOS should not be the last input id (unless it's all pad) + if getattr(tok, 
"eos_token_id", None) is not None: + last_non_pad = len(out["input_ids"]) - 1 + while last_non_pad >= 0 and out["input_ids"][last_non_pad] == pad_token_id: + last_non_pad -= 1 + if last_non_pad >= 0: + assert out["input_ids"][last_non_pad] != tok.eos_token_id + + # There must be at least some supervised tokens in labels + assert any(l != -100 for l in out["labels"]) # assistant tokens + + # Attention mask padded tail zeros, if padded + if isinstance(seq_length, int): + seen_zero = False + for v in reversed(out["attention_mask"]): + if v == 0: + seen_zero = True + else: + if seen_zero: + pytest.fail("Non-zero attention_mask value after padded zeros.") + + From 36fca1ab39ba0a2755ef3c7afc07b6763f067106 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 00:43:27 -0700 Subject: [PATCH 04/18] fix Signed-off-by: Alexandros Koumparoulis --- .../datasets/llm/formatting_utils.py | 8 ++- .../llm/test_formatting_utils_options.py | 53 ++++++------------- 2 files changed, 23 insertions(+), 38 deletions(-) diff --git a/nemo_automodel/components/datasets/llm/formatting_utils.py b/nemo_automodel/components/datasets/llm/formatting_utils.py index bbb835a6a..2990aebfd 100644 --- a/nemo_automodel/components/datasets/llm/formatting_utils.py +++ b/nemo_automodel/components/datasets/llm/formatting_utils.py @@ -66,6 +66,7 @@ def _package_tokenized_example( eos_token_id, pad_token_id, seq_length, + truncation = None, ): """ Package a tokenized example with proper masking and padding. @@ -77,7 +78,7 @@ def _package_tokenized_example( eos_token_id: The end-of-sequence token id. pad_token_id: The padding token id. seq_length: Optional sequence length for padding. - + truncation: Optional truncation strategy. Returns: A dictionary with input_ids, labels, and attention_mask. 
""" @@ -86,6 +87,8 @@ def _package_tokenized_example( if not _has_chat_template(tokenizer) and eos_token_id != input_ids[-1]: input_ids += [eos_token_id] assistant_masks += [1] + if not _has_chat_template(tokenizer) and pad_token_id is not None: + assistant_masks += [pad_token_id] labels = input_ids.copy() input_ids = input_ids[:-1] @@ -95,7 +98,7 @@ def _package_tokenized_example( labels[:] = [label if bool(m) else -100 for label, m in zip(labels, assistant_masks)] # remove BOS labels = labels[1:] - if not _has_chat_template(tokenizer): + if not _has_chat_template(tokenizer) and truncation is None: assert labels[-1] == eos_token_id, f"labels[-1]={labels[-1]} != eos_token_id={eos_token_id}" assert input_ids[-1] != eos_token_id, f"input_ids[-1]={input_ids[-1]} == eos_token_id={eos_token_id}" assert len(input_ids) == len(labels), f"len(input_ids)={len(input_ids)} != len(labels)={len(labels)}" @@ -164,6 +167,7 @@ def format_prompt_completion( eos_token_id=eos_token_id, pad_token_id=pad_token_id, seq_length=seq_length, + truncation=truncation, ) diff --git a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py index 6b35ae50b..504fd7d57 100644 --- a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py +++ b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py @@ -28,46 +28,25 @@ ) -def _read_tokenizer_dirs_from_env() -> List[Path]: - raw = os.environ.get("NEMO_TOKENIZER_DIRS", "").strip() - if not raw: - return [] - parts: Iterable[str] = (p.strip() for p in raw.split(",")) - paths: List[Path] = [Path(p) for p in parts if p] - return [p for p in paths if p.exists() and p.is_dir()] - - -_TOKENIZER_DIRS: List[Path] = _read_tokenizer_dirs_from_env() - - -def _skip_if_no_dirs(): - if not _TOKENIZER_DIRS: - pytest.skip( - "Set NEMO_TOKENIZER_DIRS to a comma-separated list of local tokenizer dirs to run these tests.", - allow_module_level=True, - ) - - -# @pytest.mark.parametrize("tokenizer_dir", _TOKENIZER_DIRS, ids=lambda p: p.name if isinstance(p, Path) else str(p)) @pytest.mark.parametrize( "seq_length,padding,truncation", [ (None, "do_not_pad", None), - (64, "max_length", True), + (4, "max_length", True), ], ) def test_format_prompt_completion_options(seq_length, padding, truncation): - # _skip_if_no_dirs() os.environ["TRANSFORMERS_OFFLINE"] = "1" os.environ["HF_HUB_OFFLINE"] = "1" - # qwen3_4b_instruct_2407 - tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/hf_gemma3_2l/") + + tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/hf_mixtral_2l//") # Only applicable when tokenizer lacks chat template - # if getattr(tok, "chat_template", None): - # pytest.skip(f"Tokenizer defines chat_template; skipping prompt-completion tests.") + assert getattr(tok, "chat_template", None) is None eos_token_id = getattr(tok, "eos_token_id", 0) pad_token_id = _add_pad_token(tok) or eos_token_id + if padding != "do_not_pad": + tok.pad_token = tok.eos_token # If using padding="max_length", seq_length must be an int if padding == "max_length" and not isinstance(seq_length, int): @@ -95,11 +74,11 @@ def test_format_prompt_completion_options(seq_length, padding, truncation): assert len(out["input_ids"]) == len(out["labels"]) == len(out["attention_mask"]) > 0 # seq_length enforcement (either by HF padding or our packager) - if isinstance(seq_length, int): + if isinstance(seq_length, int) and padding != "do_not_pad": assert len(out["input_ids"]) == seq_length assert len(out["labels"]) == seq_length # 
Trailing padding label must be masked - assert out["labels"][-1] == -100 + assert out["labels"][-1] == -100, (out, pad_token_id) # EOS should be present in labels (supervised area) but not as last input_id if getattr(tok, "eos_token_id", None) is not None and not truncation == True: @@ -113,7 +92,8 @@ def test_format_prompt_completion_options(seq_length, padding, truncation): # There should be masked (prompt) and supervised (answer) tokens assert any(l == -100 for l in out["labels"]) # masked prompt - assert any(l != -100 for l in out["labels"]) # supervised answer + if not truncation == True: + assert any(l != -100 for l in out["labels"]) # supervised answer # Attention mask should have zeros only in padded tail (if any) if isinstance(seq_length, int): @@ -131,11 +111,11 @@ def test_format_prompt_completion_options(seq_length, padding, truncation): "seq_length,padding,truncation", [ (None, "do_not_pad", None), - (64, "max_length", True), + (4, "max_length", True), ], ) -def test_format_chat_template_options(tokenizer_dir: Path, seq_length, padding, truncation): - _skip_if_no_dirs() +def test_format_chat_template_options(seq_length, padding, truncation): + os.environ["TRANSFORMERS_OFFLINE"] = "1" os.environ["HF_HUB_OFFLINE"] = "1" @@ -143,7 +123,7 @@ def test_format_chat_template_options(tokenizer_dir: Path, seq_length, padding, tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/qwen3_4b_instruct_2407/") # Only applicable when tokenizer DOES define a chat template if not getattr(tok, "chat_template", None): - pytest.skip(f"Tokenizer {tokenizer_dir.name} has no chat_template; skipping chat-template tests.") + pytest.skip(f"Tokenizer qwen3_4b_instruct_2407 has no chat_template; skipping chat-template tests.") eos_token_id = getattr(tok, "eos_token_id", 0) pad_token_id = _add_pad_token(tok) or eos_token_id @@ -175,7 +155,8 @@ def test_format_chat_template_options(tokenizer_dir: Path, seq_length, padding, if isinstance(seq_length, int): assert len(out["input_ids"]) == seq_length assert len(out["labels"]) == seq_length - assert out["labels"][-1] == -100 + if truncation == False: + assert out["labels"][-1] == -100 # For chat templates, EOS should not be the last input id (unless it's all pad) if getattr(tok, "eos_token_id", None) is not None: @@ -189,7 +170,7 @@ def test_format_chat_template_options(tokenizer_dir: Path, seq_length, padding, assert any(l != -100 for l in out["labels"]) # assistant tokens # Attention mask padded tail zeros, if padded - if isinstance(seq_length, int): + if isinstance(seq_length, int) and truncation == False: seen_zero = False for v in reversed(out["attention_mask"]): if v == 0: From 46ec09ecaf93e4617427c954ede95056635d752a Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 00:44:26 -0700 Subject: [PATCH 05/18] fix Signed-off-by: Alexandros Koumparoulis --- .../datasets/llm/column_mapped_text_instruction_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py index c2a2957b4..594d095da 100644 --- a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py +++ b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py @@ -259,6 +259,8 @@ def __getitem__(self, idx): # noqa: D401 row = self.dataset[idx] mapped = {dest: row[src] for dest, src in self.column_mapping.items() if src in row} mapped = 
self._apply_tokenizer(mapped) + if not any(l != -100 for l in mapped["labels"]): + return self.__getitem__((idx + 1) % len(self.dataset)) assert _check_all_values_equal_length(mapped), "All values must be of the same length" return mapped From d40291b0df71794bb3dac0461c8d0af681f96f97 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 10:25:18 -0700 Subject: [PATCH 06/18] lint Signed-off-by: Alexandros Koumparoulis --- .../datasets/llm/column_mapped_text_instruction_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py index 594d095da..6f0fad178 100644 --- a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py +++ b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py @@ -259,7 +259,7 @@ def __getitem__(self, idx): # noqa: D401 row = self.dataset[idx] mapped = {dest: row[src] for dest, src in self.column_mapping.items() if src in row} mapped = self._apply_tokenizer(mapped) - if not any(l != -100 for l in mapped["labels"]): + if not any(label != -100 for label in mapped["labels"]): return self.__getitem__((idx + 1) % len(self.dataset)) assert _check_all_values_equal_length(mapped), "All values must be of the same length" return mapped From 0f6aa3cd95d4bdc389f013e8df8cb807a400ae29 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 10:26:01 -0700 Subject: [PATCH 07/18] fix Signed-off-by: Alexandros Koumparoulis --- .../unit_tests/datasets/llm/test_formatting_utils_options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py index 504fd7d57..1d7a0f1c4 100644 --- a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py +++ b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py @@ -39,7 +39,7 @@ def test_format_prompt_completion_options(seq_length, padding, truncation): os.environ["TRANSFORMERS_OFFLINE"] = "1" os.environ["HF_HUB_OFFLINE"] = "1" - tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/hf_mixtral_2l//") + tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/hf_mixtral_2l") # Only applicable when tokenizer lacks chat template assert getattr(tok, "chat_template", None) is None @@ -120,7 +120,7 @@ def test_format_chat_template_options(seq_length, padding, truncation): os.environ["HF_HUB_OFFLINE"] = "1" - tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/qwen3_4b_instruct_2407/") + tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/qwen3_4b_instruct_2407") # Only applicable when tokenizer DOES define a chat template if not getattr(tok, "chat_template", None): pytest.skip(f"Tokenizer qwen3_4b_instruct_2407 has no chat_template; skipping chat-template tests.") From 00a7d4bf938c1624c067031a5f8480083cd37b01 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 10:33:15 -0700 Subject: [PATCH 08/18] lint Signed-off-by: Alexandros Koumparoulis --- nemo_automodel/components/datasets/llm/formatting_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_automodel/components/datasets/llm/formatting_utils.py b/nemo_automodel/components/datasets/llm/formatting_utils.py index 2990aebfd..49d70db3d 100644 --- 
a/nemo_automodel/components/datasets/llm/formatting_utils.py +++ b/nemo_automodel/components/datasets/llm/formatting_utils.py @@ -66,7 +66,7 @@ def _package_tokenized_example( eos_token_id, pad_token_id, seq_length, - truncation = None, + truncation=None, ): """ Package a tokenized example with proper masking and padding. From 6f87ad75a67d5d83168ba034e572f640182d3e2a Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 11:16:07 -0700 Subject: [PATCH 09/18] fix Signed-off-by: Alexandros Koumparoulis --- tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py b/tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py index b17d27d41..80ce69325 100644 --- a/tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py +++ b/tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py @@ -58,7 +58,7 @@ def _id_for_token(self, tok: str) -> int: self._cursor += 1 return self._vocab[tok] - def __call__(self, text: str, *, add_special_tokens: bool = True): # type: ignore[override] + def __call__(self, text: str, *, add_special_tokens: bool = True, padding=None, truncation=None, max_length=None): # type: ignore[override] ids: List[int] = [] if add_special_tokens: ids.append(self.bos_token_id) From e15be3d7ef661cdcf6bcdae2c6e518559a365eae Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 11:18:00 -0700 Subject: [PATCH 10/18] fix Signed-off-by: Alexandros Koumparoulis --- .../datasets/llm/test_formatting_utils_options.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py index 1d7a0f1c4..94a3f68bd 100644 --- a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py +++ b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py @@ -38,8 +38,9 @@ def test_format_prompt_completion_options(seq_length, padding, truncation): os.environ["TRANSFORMERS_OFFLINE"] = "1" os.environ["HF_HUB_OFFLINE"] = "1" - - tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/hf_mixtral_2l") + TOKENIZER_DIR = "/home/TestData/automodel/hf_mixtral_2l" + assert os.path.exists(TOKENIZER_DIR), "Tokenizer directory does not exist" + tok = AutoTokenizer.from_pretrained(TOKENIZER_DIR) # Only applicable when tokenizer lacks chat template assert getattr(tok, "chat_template", None) is None @@ -118,9 +119,9 @@ def test_format_chat_template_options(seq_length, padding, truncation): os.environ["TRANSFORMERS_OFFLINE"] = "1" os.environ["HF_HUB_OFFLINE"] = "1" - - - tok = AutoTokenizer.from_pretrained("/home/TestData/automodel/qwen3_4b_instruct_2407") + TOKENIZER_DIR = "/home/TestData/automodel/qwen3_4b_instruct_2407" + assert os.path.exists(TOKENIZER_DIR), "Tokenizer directory does not exist" + tok = AutoTokenizer.from_pretrained(TOKENIZER_DIR) # Only applicable when tokenizer DOES define a chat template if not getattr(tok, "chat_template", None): pytest.skip(f"Tokenizer qwen3_4b_instruct_2407 has no chat_template; skipping chat-template tests.") From e18c29668616466497f502fe6f53f31e875899e5 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 27 Oct 2025 14:00:23 -0700 Subject: [PATCH 11/18] fix Signed-off-by: Alexandros Koumparoulis --- .../unit_tests/datasets/llm/test_formatting_utils_options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py
index 94a3f68bd..f9229b150 100644
--- a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py
+++ b/tests/unit_tests/datasets/llm/test_formatting_utils_options.py
@@ -38,7 +38,7 @@ def test_format_prompt_completion_options(seq_length, padding, truncation):
     os.environ["TRANSFORMERS_OFFLINE"] = "1"
     os.environ["HF_HUB_OFFLINE"] = "1"
-    TOKENIZER_DIR = "/home/TestData/automodel/hf_mixtral_2l"
+    TOKENIZER_DIR = f"{os.environ['TEST_DATA_DIR']}/hf_mixtral_2l"
     assert os.path.exists(TOKENIZER_DIR), "Tokenizer directory does not exist"
     tok = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
     # Only applicable when tokenizer lacks chat template
@@ -119,7 +119,7 @@ def test_format_chat_template_options(seq_length, padding, truncation):
 
     os.environ["TRANSFORMERS_OFFLINE"] = "1"
     os.environ["HF_HUB_OFFLINE"] = "1"
-    TOKENIZER_DIR = "/home/TestData/automodel/qwen3_4b_instruct_2407"
+    TOKENIZER_DIR = f"{os.environ['TEST_DATA_DIR']}/qwen3_4b_instruct_2407"
     assert os.path.exists(TOKENIZER_DIR), "Tokenizer directory does not exist"
     tok = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
     # Only applicable when tokenizer DOES define a chat template

From d2576acb5137d158be710be738fb4fc6198d7054 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Mon, 27 Oct 2025 18:24:23 -0700
Subject: [PATCH 12/18] move file

Signed-off-by: Alexandros Koumparoulis
---
 .../hf_transformer}/test_formatting_utils_options.py             | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/{unit_tests/datasets/llm => functional_tests/hf_transformer}/test_formatting_utils_options.py (100%)

diff --git a/tests/unit_tests/datasets/llm/test_formatting_utils_options.py b/tests/functional_tests/hf_transformer/test_formatting_utils_options.py
similarity index 100%
rename from tests/unit_tests/datasets/llm/test_formatting_utils_options.py
rename to tests/functional_tests/hf_transformer/test_formatting_utils_options.py

From 828d477885f084a06217b18b24852825340320e6 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Mon, 27 Oct 2025 21:51:24 -0700
Subject: [PATCH 13/18] fix

Signed-off-by: Alexandros Koumparoulis
---
 .../components/datasets/llm/formatting_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo_automodel/components/datasets/llm/formatting_utils.py b/nemo_automodel/components/datasets/llm/formatting_utils.py
index 49d70db3d..dc5c66540 100644
--- a/nemo_automodel/components/datasets/llm/formatting_utils.py
+++ b/nemo_automodel/components/datasets/llm/formatting_utils.py
@@ -128,8 +128,8 @@ def format_prompt_completion(
     eos_token_id: int,
     pad_token_id: int,
     seq_length: Optional[int] = None,
-    padding: Union[str, bool] = None,
-    truncation: Union[str, bool] = None,
+    padding: Union[str, bool] = "do_not_pad",
+    truncation: Union[str, bool] = "do_not_truncate",
     answer_only_loss_mask: bool = True,
 ) -> Dict[str, List[int]]:
     """
@@ -177,8 +177,8 @@ def format_chat_template(
     eos_token_id: int,
     pad_token_id: int,
     seq_length: Optional[int] = None,
-    padding: Union[str, bool] = None,
-    truncation: Union[str, bool] = None,
+    padding: Union[str, bool] = "do_not_pad",
+    truncation: Union[str, bool] = "do_not_truncate",
     tools: Optional[List[Dict]] = None,
 ) -> Dict[str, List[int]]:
     """

From 15f1cdc30ff7cf34dc634182adb605258d3d8de9 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Tue, 28 Oct 2025 20:36:43 -0700
Subject: [PATCH 14/18] add test
Signed-off-by: Alexandros Koumparoulis --- ..._column_mapped_text_instruction_dataset.py | 217 ++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 tests/functional_tests/datasets/llm/test_column_mapped_text_instruction_dataset.py diff --git a/tests/functional_tests/datasets/llm/test_column_mapped_text_instruction_dataset.py b/tests/functional_tests/datasets/llm/test_column_mapped_text_instruction_dataset.py new file mode 100644 index 000000000..44853785e --- /dev/null +++ b/tests/functional_tests/datasets/llm/test_column_mapped_text_instruction_dataset.py @@ -0,0 +1,217 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from pathlib import Path + +import pytest +from transformers import AutoTokenizer + +from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset import ( + ColumnMappedTextInstructionDataset, +) + + +def _write_jsonl(tmp_path: Path) -> Path: + """Create a small JSONL dataset for testing.""" + rows = [ + { + "context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary.", + "question": "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?", + "answers": "Saint Bernadette Soubirous", + }, + { + "context": "Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised.", + "question": "What is in front of the Notre Dame Main Building?", + "answers": "a copper statue of Christ", + }, + { + "context": "Next to the Main Building is the Basilica of the Sacred Heart.", + "question": "The Basilica of the Sacred heart at Notre Dame is beside to which structure?", + "answers": "the Main Building", + }, + { + "context": "Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.", + "question": "What is the Grotto at Notre Dame?", + "answers": "a Marian place of prayer and reflection", + }, + { + "context": "Atop the Main Building's gold dome is a golden statue of the Virgin Mary.", + "question": "What sits on top of the Main Building at Notre Dame?", + "answers": "a golden statue of the Virgin Mary", + }, + ] + p = tmp_path / "sample.jsonl" + with p.open("w") as f: + for r in rows: + f.write(json.dumps(r) + "\n") + return p + + +def _maybe_tokenizer_dir_candidates() -> list[Path]: + """Return likely tokenizer directories present in CI test data mounts.""" + candidates: list[Path] = [] + # Known bundle with no chat template used elsewhere in the repo + test_data_dir = os.environ.get("TEST_DATA_DIR") + if test_data_dir: + candidates.append(Path(test_data_dir) / "hf_mixtral_2l") + # Explicit tokenizers used by existing unit tests + base = Path("/home/TestData/akoumparouli/tokenizers/") + names = [ + "gpt-oss-20b", + "llama_3.2_1b", + "qwen3_30b_a3b_instruct_2507", + ] + for n in names: + candidates.append(base / n) + return [p for p in candidates if p.exists()] + + +def _load_tokenizer(path: 
Path): + os.environ.setdefault("TRANSFORMERS_OFFLINE", "1") + os.environ.setdefault("HF_HUB_OFFLINE", "1") + return AutoTokenizer.from_pretrained(str(path)) + + +def _first_sample(ds: ColumnMappedTextInstructionDataset): + it = iter(ds) + return next(it) + + +@pytest.mark.parametrize( + "seq_length,padding,truncation", + [ + (None, "do_not_pad", None), + (16, "max_length", True), + (16, "do_not_pad", True), + (16, True, None), # padding=True -> longest; with single example behaves like no-op pre-packaging + ], +) +def test_dataset_non_chat_padding_truncation_options(tmp_path: Path, seq_length, padding, truncation): + """Validate shapes and masking for non-chat tokenizers across padding/truncation options.""" + data_file = _write_jsonl(tmp_path) + + # Find a tokenizer without chat template + for d in _maybe_tokenizer_dir_candidates(): + tok = _load_tokenizer(d) + if getattr(tok, "chat_template", None) is None: + break + else: + pytest.skip("No non-chat tokenizer available in test data mounts") + + column_mapping = {"context": "context", "question": "question", "answer": "answers"} + + ds = ColumnMappedTextInstructionDataset( + path_or_dataset_id=str(data_file), + column_mapping=column_mapping, + tokenizer=tok, + seq_length=seq_length, + padding=padding, + truncation=truncation, + # answer_only_loss_mask default True + ) + + sample = _first_sample(ds) + assert set(["input_ids", "labels", "attention_mask"]).issubset(sample.keys()) + assert len(sample["input_ids"]) == len(sample["labels"]) == len(sample["attention_mask"]) > 0 + + if isinstance(seq_length, int): + if truncation is True: + assert len(sample["input_ids"]) == seq_length + assert len(sample["labels"]) == seq_length + # Trailing padding in labels must be masked + assert sample["labels"][-1] == -100 + assert sample["attention_mask"][-1] in (0, 1) # depending on pack length, end can be 0 + elif not truncation is True: + assert len(sample["input_ids"]) != seq_length + assert len(sample["labels"]) != seq_length + +@pytest.mark.parametrize( + "seq_length,padding,truncation", + [ + (None, "do_not_pad", None), + (128, "max_length", True), + (16, "do_not_pad", True), + (16, True, None), + ], +) +def test_dataset_chat_padding_truncation_options(tmp_path: Path, seq_length, padding, truncation): + """Validate shapes and masking for chat-template tokenizers across padding/truncation options.""" + data_file = _write_jsonl(tmp_path) + + # Find a tokenizer with chat template + chat_tok = None + for d in _maybe_tokenizer_dir_candidates(): + tok = _load_tokenizer(d) + if getattr(tok, "chat_template", None) is not None and callable(getattr(tok, "apply_chat_template", None)): + chat_tok = tok + break + if chat_tok is None: + pytest.skip("No chat-template tokenizer available in test data mounts") + + # 3-column mapping + column_mapping = {"context": "context", "question": "question", "answer": "answers"} + + ds = ColumnMappedTextInstructionDataset( + path_or_dataset_id=str(data_file), + column_mapping=column_mapping, + tokenizer=chat_tok, + seq_length=seq_length, + padding=padding, + truncation=truncation, + start_of_turn_token="<|assistant|>", # required when answer_only_loss_mask=True and chat template present + ) + + sample = _first_sample(ds) + assert set(["input_ids", "labels", "attention_mask"]).issubset(sample.keys()) + assert len(sample["input_ids"]) == len(sample["labels"]) == len(sample["attention_mask"]) > 0 + + if isinstance(seq_length, int): + if truncation is True or padding == "max_length": + assert len(sample["input_ids"]) == seq_length 
+            assert len(sample["labels"]) == seq_length
+        elif not truncation is True:
+            assert sample["labels"][-1] != -100
+
+
+def test_dataset_two_column_mapping_non_chat(tmp_path: Path):
+    """Ensure 2-column mapping (context+answer) works with non-chat tokenizer."""
+    data_file = _write_jsonl(tmp_path)
+
+    # Choose a non-chat tokenizer
+    for d in _maybe_tokenizer_dir_candidates():
+        tok = _load_tokenizer(d)
+        if getattr(tok, "chat_template", None) is None:
+            break
+    else:
+        pytest.skip("No non-chat tokenizer available in test data mounts")
+
+    # Use only context and answers columns
+    column_mapping = {"context": "context", "answer": "answers"}
+
+    ds = ColumnMappedTextInstructionDataset(
+        path_or_dataset_id=str(data_file),
+        column_mapping=column_mapping,
+        tokenizer=tok,
+        seq_length=32,
+        padding="max_length",
+        truncation=True,
+    )
+
+    sample = _first_sample(ds)
+    assert len(sample["input_ids"]) == 32
+    assert len(sample["labels"]) == 32
+    assert len(sample["attention_mask"]) == 32
+

From 33a677c1f67722dc1cd8125d1487140aeec5c23c Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Tue, 28 Oct 2025 20:43:02 -0700
Subject: [PATCH 15/18] update default values & pad_token

Signed-off-by: Alexandros Koumparoulis
---
 .../llm/column_mapped_text_instruction_dataset.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
index 6f0fad178..deb2a67c6 100644
--- a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
+++ b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
@@ -165,8 +165,8 @@ def __init__(
         name: Optional[str] = None,
         answer_only_loss_mask: bool = True,
         seq_length: Optional[int] = None,
-        padding: Union[str, bool] = None,
-        truncation: Union[str, bool] = None,
+        padding: Union[str, bool] = "do_not_pad",
+        truncation: Union[str, bool] = "do_not_truncate",
         start_of_turn_token: Optional[str] = None,
         limit_dataset_samples: Optional[int] = None,
     ) -> None:
@@ -195,6 +195,12 @@ def __init__(
 
         assert tokenizer is not None, "Tokenizer is required"
         self.tokenizer = tokenizer
+        if getattr(self.tokenizer, 'pad_token', None) is None:
+            if hasattr(self.tokenizer, 'eos_token'):
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+            else:
+                logger.warning("Setting tokenizer pad_token to ' '. tokenizer does not have `eos_token`.")
+                self.tokenizer.pad_token = ' '
 
         self.dataset = _load_dataset(path_or_dataset_id, split=split, streaming=False, name=name)

From f155222d6243e2edf2b4ca3be8f67e67d37aa67b Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Tue, 28 Oct 2025 21:09:34 -0700
Subject: [PATCH 16/18] also print exception

Signed-off-by: Alexandros Koumparoulis
---
 nemo_automodel/components/config/loader.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo_automodel/components/config/loader.py b/nemo_automodel/components/config/loader.py
index 01c4196d0..a3b4cb28b 100644
--- a/nemo_automodel/components/config/loader.py
+++ b/nemo_automodel/components/config/loader.py
@@ -246,11 +246,13 @@ def instantiate(self, *args, **kwargs):
                 "Instantiation failed for `{}`\n"
                 "Accepted signature : {}\n"
                 "Positional args : {}\n"
-                "Keyword args : {}\n".format(
+                "Keyword args : {}\n"
+                "Exception : {}\n".format(
                     func.__name__,
                     sig,
                     args,
                     pprint.pformat(config_kwargs, compact=True, indent=4),
+                    e,
                 ),
                 file=sys.stderr,
             )

From 27b86c679d8714ac2c38c337b4fc5a8096908328 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Tue, 28 Oct 2025 21:19:05 -0700
Subject: [PATCH 17/18] fmt

Signed-off-by: Alexandros Koumparoulis
---
 .../llm/column_mapped_text_instruction_dataset.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
index deb2a67c6..8a9d82dbb 100644
--- a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
+++ b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
@@ -28,6 +28,8 @@
     format_prompt_completion,
 )
 
+logger = logging.getLogger(__name__)
+
 # Supported cases:
 # Format:
 # - Context + question + answer
@@ -195,12 +197,12 @@ def __init__(
 
         assert tokenizer is not None, "Tokenizer is required"
         self.tokenizer = tokenizer
-        if getattr(self.tokenizer, 'pad_token', None) is None:
-            if hasattr(self.tokenizer, 'eos_token'):
+        if getattr(self.tokenizer, "pad_token", None) is None:
+            if hasattr(self.tokenizer, "eos_token"):
                 self.tokenizer.pad_token = self.tokenizer.eos_token
             else:
                 logger.warning("Setting tokenizer pad_token to ' '. tokenizer does not have `eos_token`.")
-                self.tokenizer.pad_token = ' '
+                self.tokenizer.pad_token = " "
 
         self.dataset = _load_dataset(path_or_dataset_id, split=split, streaming=False, name=name)

From 13e625b363df76e1b510582b335a06679fc8ce3e Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis
Date: Tue, 28 Oct 2025 22:19:34 -0700
Subject: [PATCH 18/18] add truncation & padding options

Signed-off-by: Alexandros Koumparoulis
---
 nemo_automodel/components/datasets/llm/chat_dataset.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/nemo_automodel/components/datasets/llm/chat_dataset.py b/nemo_automodel/components/datasets/llm/chat_dataset.py
index 84ff8e12c..51f36e8ae 100644
--- a/nemo_automodel/components/datasets/llm/chat_dataset.py
+++ b/nemo_automodel/components/datasets/llm/chat_dataset.py
@@ -133,6 +133,8 @@ def __init__(
         split: Optional[str] = None,
         name: Optional[str] = None,
         seq_length: Optional[int] = None,
+        padding: Union[str, bool] = "do_not_pad",
+        truncation: Union[str, bool] = "do_not_truncate",
         start_of_turn_token: Optional[str] = None,
         chat_template: Optional[str] = None,
     ) -> None:
@@ -149,6 +151,8 @@ def __init__(
 
         self.tokenizer = tokenizer
         self.seq_length = seq_length
+        self.padding = padding
+        self.truncation = truncation
         self.start_of_turn_token = start_of_turn_token
 
         self.dataset = _load_openai_messages(path_or_dataset_id, split=split, name=name)
@@ -178,6 +182,8 @@ def __getitem__(self, idx: int) -> Dict[str, List[int]]:
             eos_token_id,
             self.pad_token_id,
             seq_length=self.seq_length,
+            padding=self.padding,
+            truncation=self.truncation,
             tools=tools,
         )
         return sample
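A closing usage sketch of the dataset-level wiring; the JSONL layout, column mapping, and option values mirror the functional test added in PATCH 14, and the tokenizer path is again a placeholder assumption. ColumnMappedTextInstructionDataset forwards padding/truncation to the formatting helpers above, and PATCH 18 gives ChatDataset the same two keyword arguments with identical "do_not_pad"/"do_not_truncate" defaults:

    from transformers import AutoTokenizer

    from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset import (
        ColumnMappedTextInstructionDataset,
    )

    tok = AutoTokenizer.from_pretrained("path/to/local_tokenizer")  # placeholder path
    ds = ColumnMappedTextInstructionDataset(
        path_or_dataset_id="sample.jsonl",  # rows with context/question/answers keys
        column_mapping={"context": "context", "question": "question", "answer": "answers"},
        tokenizer=tok,
        seq_length=32,
        padding="max_length",
        truncation=True,
    )
    sample = ds[0]
    assert len(sample["input_ids"]) == len(sample["labels"]) == len(sample["attention_mask"])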