diff --git a/nemo_automodel/components/config/loader.py b/nemo_automodel/components/config/loader.py
index 01c4196d0..a3b4cb28b 100644
--- a/nemo_automodel/components/config/loader.py
+++ b/nemo_automodel/components/config/loader.py
@@ -246,11 +246,13 @@ def instantiate(self, *args, **kwargs):
                 "Instantiation failed for `{}`\n"
                 "Accepted signature : {}\n"
                 "Positional args    : {}\n"
-                "Keyword args       : {}\n".format(
+                "Keyword args       : {}\n"
+                "Exception          : {}\n".format(
                     func.__name__,
                     sig,
                     args,
                     pprint.pformat(config_kwargs, compact=True, indent=4),
+                    e,
                 ),
                 file=sys.stderr,
             )
diff --git a/nemo_automodel/components/datasets/llm/chat_dataset.py b/nemo_automodel/components/datasets/llm/chat_dataset.py
index 84ff8e12c..51f36e8ae 100644
--- a/nemo_automodel/components/datasets/llm/chat_dataset.py
+++ b/nemo_automodel/components/datasets/llm/chat_dataset.py
@@ -133,6 +133,8 @@ def __init__(
         split: Optional[str] = None,
         name: Optional[str] = None,
         seq_length: Optional[int] = None,
+        padding: Union[str, bool] = "do_not_pad",
+        truncation: Union[str, bool] = "do_not_truncate",
         start_of_turn_token: Optional[str] = None,
         chat_template: Optional[str] = None,
     ) -> None:
@@ -149,6 +151,8 @@ def __init__(
 
         self.tokenizer = tokenizer
         self.seq_length = seq_length
+        self.padding = padding
+        self.truncation = truncation
         self.start_of_turn_token = start_of_turn_token
 
         self.dataset = _load_openai_messages(path_or_dataset_id, split=split, name=name)
@@ -178,6 +182,8 @@ def __getitem__(self, idx: int) -> Dict[str, List[int]]:
             eos_token_id,
             self.pad_token_id,
             seq_length=self.seq_length,
+            padding=self.padding,
+            truncation=self.truncation,
             tools=tools,
         )
         return sample
diff --git a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
index c83100550..8a9d82dbb 100644
--- a/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
+++ b/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
@@ -28,6 +28,8 @@
     format_prompt_completion,
 )
 
+logger = logging.getLogger(__name__)
+
 # Supported cases:
 # Format:
 #   - Context + question + answer
@@ -165,6 +167,8 @@ def __init__(
         name: Optional[str] = None,
         answer_only_loss_mask: bool = True,
         seq_length: Optional[int] = None,
+        padding: Union[str, bool] = "do_not_pad",
+        truncation: Union[str, bool] = "do_not_truncate",
         start_of_turn_token: Optional[str] = None,
         limit_dataset_samples: Optional[int] = None,
     ) -> None:
@@ -193,6 +197,12 @@ def __init__(
 
         assert tokenizer is not None, "Tokenizer is required"
         self.tokenizer = tokenizer
+        if getattr(self.tokenizer, "pad_token", None) is None:
+            if hasattr(self.tokenizer, "eos_token"):
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+            else:
+                logger.warning("Setting tokenizer pad_token to ' ' because the tokenizer does not have an `eos_token`.")
+                self.tokenizer.pad_token = " "
 
         self.dataset = _load_dataset(path_or_dataset_id, split=split, streaming=False, name=name)
 
@@ -226,6 +236,8 @@ def __init__(
         self.answer_only_loss_mask = answer_only_loss_mask
         self.start_of_turn_token = start_of_turn_token
         self.seq_length = seq_length
+        self.padding = padding
+        self.truncation = truncation
 
     def __len__(self) -> int:  # noqa: D401
         """
@@ -255,6 +267,8 @@ def __getitem__(self, idx):  # noqa: D401
         row = self.dataset[idx]
         mapped = {dest: row[src] for dest, src in self.column_mapping.items() if src in row}
         mapped = self._apply_tokenizer(mapped)
+        if not any(label != -100 for label in mapped["labels"]):
+            return self.__getitem__((idx + 1) % len(self.dataset))
         assert _check_all_values_equal_length(mapped), "All values must be of the same length"
         return mapped
 
@@ -293,6 +307,8 @@ def _apply_tokenizer(self, sample: Dict[str, str]) -> Dict[str, List[int]]:
                 eos_token_id,
                 pad_token_id,
                 seq_length=self.seq_length,
+                padding=self.padding,
+                truncation=self.truncation,
             )
         else:
             prompt = " ".join(filter(lambda x: x is not None, (context, question, "")))
@@ -304,5 +320,7 @@ def _apply_tokenizer(self, sample: Dict[str, str]) -> Dict[str, List[int]]:
                 eos_token_id,
                 pad_token_id,
                 seq_length=self.seq_length,
+                padding=self.padding,
+                truncation=self.truncation,
                 answer_only_loss_mask=self.answer_only_loss_mask,
             )
diff --git a/nemo_automodel/components/datasets/llm/formatting_utils.py b/nemo_automodel/components/datasets/llm/formatting_utils.py
index a26f21906..dc5c66540 100644
--- a/nemo_automodel/components/datasets/llm/formatting_utils.py
+++ b/nemo_automodel/components/datasets/llm/formatting_utils.py
@@ -14,7 +14,7 @@
 
 import logging
 import re
-from typing import TYPE_CHECKING, Dict, List, Optional
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 logger = logging.getLogger(__name__)
 
@@ -66,6 +66,7 @@ def _package_tokenized_example(
     eos_token_id,
     pad_token_id,
     seq_length,
+    truncation=None,
 ):
     """
     Package a tokenized example with proper masking and padding.
@@ -77,7 +78,7 @@ def _package_tokenized_example(
         eos_token_id: The end-of-sequence token id.
         pad_token_id: The padding token id.
         seq_length: Optional sequence length for padding.
-
+        truncation: Optional truncation strategy.
     Returns:
         A dictionary with input_ids, labels, and attention_mask.
""" @@ -86,6 +87,8 @@ def _package_tokenized_example( if not _has_chat_template(tokenizer) and eos_token_id != input_ids[-1]: input_ids += [eos_token_id] assistant_masks += [1] + if not _has_chat_template(tokenizer) and pad_token_id is not None: + assistant_masks += [pad_token_id] labels = input_ids.copy() input_ids = input_ids[:-1] @@ -95,7 +98,7 @@ def _package_tokenized_example( labels[:] = [label if bool(m) else -100 for label, m in zip(labels, assistant_masks)] # remove BOS labels = labels[1:] - if not _has_chat_template(tokenizer): + if not _has_chat_template(tokenizer) and truncation is None: assert labels[-1] == eos_token_id, f"labels[-1]={labels[-1]} != eos_token_id={eos_token_id}" assert input_ids[-1] != eos_token_id, f"input_ids[-1]={input_ids[-1]} == eos_token_id={eos_token_id}" assert len(input_ids) == len(labels), f"len(input_ids)={len(input_ids)} != len(labels)={len(labels)}" @@ -125,6 +128,8 @@ def format_prompt_completion( eos_token_id: int, pad_token_id: int, seq_length: Optional[int] = None, + padding: Union[str, bool] = "do_not_pad", + truncation: Union[str, bool] = "do_not_truncate", answer_only_loss_mask: bool = True, ) -> Dict[str, List[int]]: """ @@ -150,7 +155,7 @@ def format_prompt_completion( else: len_prompt_ids = 0 # Tokenize full text - input_ids = tokenizer(full_text)["input_ids"] + input_ids = tokenizer(full_text, padding=padding, truncation=truncation, max_length=seq_length)["input_ids"] # Create assistant_masks: 0 for prompt tokens, 1 for answer tokens assistant_masks = [0] * len_prompt_ids + [1] * (len(input_ids) - len_prompt_ids) @@ -162,6 +167,7 @@ def format_prompt_completion( eos_token_id=eos_token_id, pad_token_id=pad_token_id, seq_length=seq_length, + truncation=truncation, ) @@ -171,6 +177,8 @@ def format_chat_template( eos_token_id: int, pad_token_id: int, seq_length: Optional[int] = None, + padding: Union[str, bool] = "do_not_pad", + truncation: Union[str, bool] = "do_not_truncate", tools: Optional[List[Dict]] = None, ) -> Dict[str, List[int]]: """ @@ -199,6 +207,9 @@ def format_chat_template( tokenize=True, return_dict=True, return_assistant_tokens_mask=template_has_generation_kwd, + padding=padding, + truncation=truncation, + max_length=seq_length, ) # Choose the last conversation as answer other history are context by finding the last masked token diff --git a/tests/functional_tests/datasets/llm/test_column_mapped_text_instruction_dataset.py b/tests/functional_tests/datasets/llm/test_column_mapped_text_instruction_dataset.py new file mode 100644 index 000000000..44853785e --- /dev/null +++ b/tests/functional_tests/datasets/llm/test_column_mapped_text_instruction_dataset.py @@ -0,0 +1,217 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import json
+import os
+from pathlib import Path
+
+import pytest
+from transformers import AutoTokenizer
+
+from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset import (
+    ColumnMappedTextInstructionDataset,
+)
+
+
+def _write_jsonl(tmp_path: Path) -> Path:
+    """Create a small JSONL dataset for testing."""
+    rows = [
+        {
+            "context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary.",
+            "question": "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?",
+            "answers": "Saint Bernadette Soubirous",
+        },
+        {
+            "context": "Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised.",
+            "question": "What is in front of the Notre Dame Main Building?",
+            "answers": "a copper statue of Christ",
+        },
+        {
+            "context": "Next to the Main Building is the Basilica of the Sacred Heart.",
+            "question": "The Basilica of the Sacred heart at Notre Dame is beside to which structure?",
+            "answers": "the Main Building",
+        },
+        {
+            "context": "Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.",
+            "question": "What is the Grotto at Notre Dame?",
+            "answers": "a Marian place of prayer and reflection",
+        },
+        {
+            "context": "Atop the Main Building's gold dome is a golden statue of the Virgin Mary.",
+            "question": "What sits on top of the Main Building at Notre Dame?",
+            "answers": "a golden statue of the Virgin Mary",
+        },
+    ]
+    p = tmp_path / "sample.jsonl"
+    with p.open("w") as f:
+        for r in rows:
+            f.write(json.dumps(r) + "\n")
+    return p
+
+
+def _maybe_tokenizer_dir_candidates() -> list[Path]:
+    """Return likely tokenizer directories present in CI test data mounts."""
+    candidates: list[Path] = []
+    # Known bundle with no chat template used elsewhere in the repo
+    test_data_dir = os.environ.get("TEST_DATA_DIR")
+    if test_data_dir:
+        candidates.append(Path(test_data_dir) / "hf_mixtral_2l")
+    # Explicit tokenizers used by existing unit tests
+    base = Path("/home/TestData/akoumparouli/tokenizers/")
+    names = [
+        "gpt-oss-20b",
+        "llama_3.2_1b",
+        "qwen3_30b_a3b_instruct_2507",
+    ]
+    for n in names:
+        candidates.append(base / n)
+    return [p for p in candidates if p.exists()]
+
+
+def _load_tokenizer(path: Path):
+    os.environ.setdefault("TRANSFORMERS_OFFLINE", "1")
+    os.environ.setdefault("HF_HUB_OFFLINE", "1")
+    return AutoTokenizer.from_pretrained(str(path))
+
+
+def _first_sample(ds: ColumnMappedTextInstructionDataset):
+    it = iter(ds)
+    return next(it)
+
+
+@pytest.mark.parametrize(
+    "seq_length,padding,truncation",
+    [
+        (None, "do_not_pad", None),
+        (16, "max_length", True),
+        (16, "do_not_pad", True),
+        (16, True, None),  # padding=True -> longest; with single example behaves like no-op pre-packaging
+    ],
+)
+def test_dataset_non_chat_padding_truncation_options(tmp_path: Path, seq_length, padding, truncation):
+    """Validate shapes and masking for non-chat tokenizers across padding/truncation options."""
+    data_file = _write_jsonl(tmp_path)
+
+    # Find a tokenizer without chat template
+    for d in _maybe_tokenizer_dir_candidates():
+        tok = _load_tokenizer(d)
+        if getattr(tok, "chat_template", None) is None:
+            break
+    else:
+        pytest.skip("No non-chat tokenizer available in test data mounts")
+
+    column_mapping = {"context": "context", "question": "question", "answer": "answers"}
+
+    ds = ColumnMappedTextInstructionDataset(
+        path_or_dataset_id=str(data_file),
+        column_mapping=column_mapping,
+        tokenizer=tok,
+        seq_length=seq_length,
+        padding=padding,
+        truncation=truncation,
+        # answer_only_loss_mask default True
+    )
+
+    sample = _first_sample(ds)
+    assert set(["input_ids", "labels", "attention_mask"]).issubset(sample.keys())
+    assert len(sample["input_ids"]) == len(sample["labels"]) == len(sample["attention_mask"]) > 0
+
+    if isinstance(seq_length, int):
+        if truncation is True:
+            assert len(sample["input_ids"]) == seq_length
+            assert len(sample["labels"]) == seq_length
+            # Trailing padding in labels must be masked
+            assert sample["labels"][-1] == -100
+            assert sample["attention_mask"][-1] in (0, 1)  # depending on pack length, end can be 0
+        elif truncation is not True:
+            assert len(sample["input_ids"]) != seq_length
+            assert len(sample["labels"]) != seq_length
+
+@pytest.mark.parametrize(
+    "seq_length,padding,truncation",
+    [
+        (None, "do_not_pad", None),
+        (128, "max_length", True),
+        (16, "do_not_pad", True),
+        (16, True, None),
+    ],
+)
+def test_dataset_chat_padding_truncation_options(tmp_path: Path, seq_length, padding, truncation):
+    """Validate shapes and masking for chat-template tokenizers across padding/truncation options."""
+    data_file = _write_jsonl(tmp_path)
+
+    # Find a tokenizer with chat template
+    chat_tok = None
+    for d in _maybe_tokenizer_dir_candidates():
+        tok = _load_tokenizer(d)
+        if getattr(tok, "chat_template", None) is not None and callable(getattr(tok, "apply_chat_template", None)):
+            chat_tok = tok
+            break
+    if chat_tok is None:
+        pytest.skip("No chat-template tokenizer available in test data mounts")
+
+    # 3-column mapping
+    column_mapping = {"context": "context", "question": "question", "answer": "answers"}
+
+    ds = ColumnMappedTextInstructionDataset(
+        path_or_dataset_id=str(data_file),
+        column_mapping=column_mapping,
+        tokenizer=chat_tok,
+        seq_length=seq_length,
+        padding=padding,
+        truncation=truncation,
+        start_of_turn_token="<|assistant|>",  # required when answer_only_loss_mask=True and chat template present
+    )
+
+    sample = _first_sample(ds)
+    assert set(["input_ids", "labels", "attention_mask"]).issubset(sample.keys())
+    assert len(sample["input_ids"]) == len(sample["labels"]) == len(sample["attention_mask"]) > 0
+
+    if isinstance(seq_length, int):
+        if truncation is True or padding == "max_length":
+            assert len(sample["input_ids"]) == seq_length
+            assert len(sample["labels"]) == seq_length
+        elif truncation is not True:
+            assert sample["labels"][-1] != -100
+
+
+def test_dataset_two_column_mapping_non_chat(tmp_path: Path):
+    """Ensure 2-column mapping (context+answer) works with non-chat tokenizer."""
+    data_file = _write_jsonl(tmp_path)
+
+    # Choose a non-chat tokenizer
+    for d in _maybe_tokenizer_dir_candidates():
+        tok = _load_tokenizer(d)
+        if getattr(tok, "chat_template", None) is None:
+            break
+    else:
+        pytest.skip("No non-chat tokenizer available in test data mounts")
+
+    # Use only context and answers columns
+    column_mapping = {"context": "context", "answer": "answers"}
+
+    ds = ColumnMappedTextInstructionDataset(
+        path_or_dataset_id=str(data_file),
+        column_mapping=column_mapping,
+        tokenizer=tok,
+        seq_length=32,
+        padding="max_length",
+        truncation=True,
+    )
+
+    sample = _first_sample(ds)
+    assert len(sample["input_ids"]) == 32
+    assert len(sample["labels"]) == 32
+    assert len(sample["attention_mask"]) == 32
+
diff --git a/tests/functional_tests/hf_transformer/test_formatting_utils_options.py b/tests/functional_tests/hf_transformer/test_formatting_utils_options.py
new file mode 100644
index 000000000..f9229b150
--- /dev/null
+++ b/tests/functional_tests/hf_transformer/test_formatting_utils_options.py
@@ -0,0 +1,183 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import Iterable, List, Tuple
+
+import pytest
+from transformers import AutoTokenizer
+
+from nemo_automodel.components.datasets.llm.formatting_utils import (
+    _add_pad_token,
+    format_chat_template,
+    format_prompt_completion,
+)
+
+
+@pytest.mark.parametrize(
+    "seq_length,padding,truncation",
+    [
+        (None, "do_not_pad", None),
+        (4, "max_length", True),
+    ],
+)
+def test_format_prompt_completion_options(seq_length, padding, truncation):
+    os.environ["TRANSFORMERS_OFFLINE"] = "1"
+    os.environ["HF_HUB_OFFLINE"] = "1"
+    TOKENIZER_DIR = f"{os.environ['TEST_DATA_DIR']}/hf_mixtral_2l"
+    assert os.path.exists(TOKENIZER_DIR), "Tokenizer directory does not exist"
+    tok = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
+    # Only applicable when tokenizer lacks chat template
+    assert getattr(tok, "chat_template", None) is None
+
+    eos_token_id = getattr(tok, "eos_token_id", 0)
+    pad_token_id = _add_pad_token(tok) or eos_token_id
+    if padding != "do_not_pad":
+        tok.pad_token = tok.eos_token
+
+    # If using padding="max_length", seq_length must be an int
+    if padding == "max_length" and not isinstance(seq_length, int):
+        pytest.skip("padding='max_length' requires seq_length to be set.")
+
+    context = "France is a country in Europe."
+    question = "What is the capital of France?"
+    answer = "Paris."
+    prompt = f"{context} {question} "
+
+    out = format_prompt_completion(
+        tokenizer=tok,
+        prompt=prompt,
+        answer=answer,
+        eos_token_id=eos_token_id,
+        pad_token_id=pad_token_id,
+        seq_length=seq_length,
+        padding=padding,
+        truncation=truncation,
+        answer_only_loss_mask=True,
+    )
+
+    # Basic structure
+    assert set(["input_ids", "labels", "attention_mask"]).issubset(out.keys())
+    assert len(out["input_ids"]) == len(out["labels"]) == len(out["attention_mask"]) > 0
+
+    # seq_length enforcement (either by HF padding or our packager)
+    if isinstance(seq_length, int) and padding != "do_not_pad":
+        assert len(out["input_ids"]) == seq_length
+        assert len(out["labels"]) == seq_length
+        # Trailing padding label must be masked
+        assert out["labels"][-1] == -100, (out, pad_token_id)
+
+    # EOS should be present in labels (supervised area) but not as last input_id
+    if getattr(tok, "eos_token_id", None) is not None and truncation is not True:
+        assert tok.eos_token_id in out["labels"], "EOS must appear in labels"
+        # find last non-pad input position and ensure it's not EOS
+        last_non_pad = len(out["input_ids"]) - 1
+        while last_non_pad >= 0 and out["input_ids"][last_non_pad] == pad_token_id:
+            last_non_pad -= 1
+        assert last_non_pad >= 0
+        assert out["input_ids"][last_non_pad] != tok.eos_token_id
+
+    # There should be masked (prompt) and supervised (answer) tokens
+    assert any(l == -100 for l in out["labels"])  # masked prompt
+    if truncation is not True:
+        assert any(l != -100 for l in out["labels"])  # supervised answer
+
+    # Attention mask should have zeros only in padded tail (if any)
+    if isinstance(seq_length, int):
+        # From the end, once we see a 0, the rest must be 0
+        seen_zero = False
+        for v in reversed(out["attention_mask"]):
+            if v == 0:
+                seen_zero = True
+            else:
+                if seen_zero:
+                    pytest.fail("Non-zero attention_mask value after padded zeros.")
+
+
+@pytest.mark.parametrize(
+    "seq_length,padding,truncation",
+    [
+        (None, "do_not_pad", None),
+        (4, "max_length", True),
+    ],
+)
+def test_format_chat_template_options(seq_length, padding, truncation):
+
+    os.environ["TRANSFORMERS_OFFLINE"] = "1"
+    os.environ["HF_HUB_OFFLINE"] = "1"
+    TOKENIZER_DIR = f"{os.environ['TEST_DATA_DIR']}/qwen3_4b_instruct_2407"
+    assert os.path.exists(TOKENIZER_DIR), "Tokenizer directory does not exist"
+    tok = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
+    # Only applicable when tokenizer DOES define a chat template
+    if not getattr(tok, "chat_template", None):
+        pytest.skip("Tokenizer qwen3_4b_instruct_2407 has no chat_template; skipping chat-template tests.")
+
+    eos_token_id = getattr(tok, "eos_token_id", 0)
+    pad_token_id = _add_pad_token(tok) or eos_token_id
+
+    if padding == "max_length" and not isinstance(seq_length, int):
+        pytest.skip("padding='max_length' requires seq_length to be set.")
+
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is the capital of France?"},
+        {"role": "assistant", "content": "Paris."},
+    ]
+
+    out = format_chat_template(
+        tokenizer=tok,
+        formatted_text=messages,
+        eos_token_id=eos_token_id,
+        pad_token_id=pad_token_id,
+        seq_length=seq_length,
+        padding=padding,
+        truncation=truncation,
+    )
+
+    # Basic structure
+    assert set(["input_ids", "labels", "attention_mask"]).issubset(out.keys())
+    assert len(out["input_ids"]) == len(out["labels"]) == len(out["attention_mask"]) > 0
+
+    # seq_length enforcement
+    if isinstance(seq_length, int):
+        assert len(out["input_ids"]) == seq_length
+        assert len(out["labels"]) == seq_length
+        if truncation is False:
+            assert out["labels"][-1] == -100
+
+    # For chat templates, EOS should not be the last input id (unless it's all pad)
+    if getattr(tok, "eos_token_id", None) is not None:
+        last_non_pad = len(out["input_ids"]) - 1
+        while last_non_pad >= 0 and out["input_ids"][last_non_pad] == pad_token_id:
+            last_non_pad -= 1
+        if last_non_pad >= 0:
+            assert out["input_ids"][last_non_pad] != tok.eos_token_id
+
+    # There must be at least some supervised tokens in labels
+    assert any(l != -100 for l in out["labels"])  # assistant tokens
+
+    # Attention mask padded tail zeros, if padded
+    if isinstance(seq_length, int) and truncation is False:
+        seen_zero = False
+        for v in reversed(out["attention_mask"]):
+            if v == 0:
+                seen_zero = True
+            else:
+                if seen_zero:
+                    pytest.fail("Non-zero attention_mask value after padded zeros.")
+
diff --git a/tests/unit_tests/datasets/llm/test_column_mapped_text_instruction.py b/tests/unit_tests/datasets/llm/test_column_mapped_text_instruction.py
index f35c13392..8ce6fa772 100644
--- a/tests/unit_tests/datasets/llm/test_column_mapped_text_instruction.py
+++ b/tests/unit_tests/datasets/llm/test_column_mapped_text_instruction.py
@@ -67,7 +67,7 @@ def __init__(self):
         self.bos_token_id = 2
         self._counter = 3  # Start token IDs from 3 to avoid conflicts
 
-    def __call__(self, text: str, add_special_tokens: bool = True):  # noqa: D401
+    def __call__(self, text: str, add_special_tokens: bool = True, padding=None, truncation=None, max_length=None):  # noqa: D401
         """Mimic the Hugging Face tokenizer ``__call__`` API.
 
         The real tokenizer would convert *text* into a list of integer token IDs.
diff --git a/tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py b/tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py
index b17d27d41..80ce69325 100644
--- a/tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py
+++ b/tests/unit_tests/datasets/llm/test_tokenizer_apply_functions.py
@@ -58,7 +58,7 @@ def _id_for_token(self, tok: str) -> int:
             self._cursor += 1
         return self._vocab[tok]
 
-    def __call__(self, text: str, *, add_special_tokens: bool = True):  # type: ignore[override]
+    def __call__(self, text: str, *, add_special_tokens: bool = True, padding=None, truncation=None, max_length=None):  # type: ignore[override]
         ids: List[int] = []
         if add_special_tokens:
             ids.append(self.bos_token_id)
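A minimal usage sketch of the padding/truncation options introduced above, passed through ColumnMappedTextInstructionDataset. The tokenizer path and JSONL file below are placeholder assumptions; the keyword arguments mirror the signature added in this diff.

from transformers import AutoTokenizer

from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset import (
    ColumnMappedTextInstructionDataset,
)

# Placeholder tokenizer and data path -- substitute whatever is available locally.
tokenizer = AutoTokenizer.from_pretrained("/path/to/tokenizer")

ds = ColumnMappedTextInstructionDataset(
    path_or_dataset_id="/path/to/data.jsonl",
    column_mapping={"context": "context", "question": "question", "answer": "answers"},
    tokenizer=tokenizer,
    seq_length=128,
    padding="max_length",   # pad every sample up to seq_length
    truncation=True,        # truncate samples longer than seq_length
)

sample = ds[0]  # dict with input_ids, labels, attention_mask, each of length 128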