diff --git a/genai-perf/genai_perf/tokenizer.py b/genai-perf/genai_perf/tokenizer.py
index abcfc195..9a4e214f 100644
--- a/genai-perf/genai_perf/tokenizer.py
+++ b/genai-perf/genai_perf/tokenizer.py
@@ -23,6 +23,7 @@
 
 from genai_perf.config.input.config_command import ConfigCommand
 from genai_perf.exceptions import GenAIPerfException
+from genai_perf.utils import supports_kwarg
 
 
 class Tokenizer:
@@ -56,6 +57,16 @@ def set_tokenizer(self, name: str, trust_remote_code: bool, revision: str) -> No
             tokenizer = AutoTokenizer.from_pretrained(
                 name, trust_remote_code=trust_remote_code, revision=revision
             )
+
+            if supports_kwarg(tokenizer, "encode", "allow_special_tokens"):
+                # The encode method of some tokenizers (such as Kimi's) accepts
+                # 'allow_special_tokens' rather than the usual 'add_special_tokens'
+                # keyword. In that case, override the default call/encode/decode
+                # arguments so special tokens are handled the same way as usual.
+                self._call_args = {"allow_special_tokens": False}
+                self._encode_args = {"allow_special_tokens": False}
+                self._decode_args = {"skip_special_tokens": True}
+
         except Exception as e:
             raise GenAIPerfException(e)
         self._tokenizer = tokenizer
diff --git a/genai-perf/genai_perf/utils.py b/genai-perf/genai_perf/utils.py
index af141cbc..82ccd477 100644
--- a/genai-perf/genai_perf/utils.py
+++ b/genai-perf/genai_perf/utils.py
@@ -150,3 +150,12 @@ def split_and_strip_whitespace(input_string: str) -> List[str]:
     Split a string by comma and strip whitespace from each item
     """
     return [item.strip() for item in input_string.split(",")]
+
+def supports_kwarg(obj, method_name: str, kwarg: str) -> bool:
+    """Check whether the given object has a method with the specified name
+    that accepts a keyword argument with the specified name."""
+    import inspect
+    method = getattr(obj, method_name, None)
+    if not method:
+        return False
+    return kwarg in inspect.signature(method).parameters
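
For reference, a minimal standalone sketch of how the new supports_kwarg helper behaves;
the DummyTokenizer class below is hypothetical and exists only to illustrate the check, it
is not part of this patch:

    import inspect

    def supports_kwarg(obj, method_name: str, kwarg: str) -> bool:
        # Same logic as the helper added to genai_perf/utils.py above.
        method = getattr(obj, method_name, None)
        if not method:
            return False
        return kwarg in inspect.signature(method).parameters

    class DummyTokenizer:
        # Hypothetical stand-in for a tokenizer whose encode() takes
        # 'allow_special_tokens' instead of 'add_special_tokens'.
        def encode(self, text, allow_special_tokens=False):
            return [0]

    print(supports_kwarg(DummyTokenizer(), "encode", "allow_special_tokens"))  # True
    print(supports_kwarg(DummyTokenizer(), "encode", "add_special_tokens"))    # False
    print(supports_kwarg(DummyTokenizer(), "decode", "skip_special_tokens"))   # False: no decode method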