diff --git a/Examples/transformers-cli/Sources/transformers-cli/Transformers.swift b/Examples/transformers-cli/Sources/transformers-cli/Transformers.swift
index 77732dc..63c4945 100644
--- a/Examples/transformers-cli/Sources/transformers-cli/Transformers.swift
+++ b/Examples/transformers-cli/Sources/transformers-cli/Transformers.swift
@@ -49,6 +49,9 @@ struct TransformersCLI: AsyncParsableCommand {
     @Option(help: "Repetition penalty to discourage repeating tokens (typical: 1.0-2.0, 1.0 = no penalty)")
     var repetitionPenalty: Float?
 
+    @Option(help: "Path to a local folder containing tokenizer_config.json and tokenizer.json")
+    var tokenizerFolder: String?
+
     func generate(
         model: LanguageModel,
         config: GenerationConfig,
@@ -104,7 +107,18 @@ struct TransformersCLI: AsyncParsableCommand {
         let url = URL(filePath: modelPath)
         let compiledURL = try compile(at: url)
         print("Loading model \(compiledURL)")
-        let model = try LanguageModel.loadCompiled(url: compiledURL, computeUnits: computeUnits.asMLComputeUnits)
+        let model: LanguageModel
+        if let tokenizerFolder {
+            let tokenizerURL = URL(filePath: tokenizerFolder, directoryHint: .isDirectory)
+            let tokenizer = try await AutoTokenizer.from(modelFolder: tokenizerURL)
+            model = try LanguageModel.loadCompiled(
+                url: compiledURL,
+                computeUnits: computeUnits.asMLComputeUnits,
+                tokenizer: tokenizer
+            )
+        } else {
+            model = try LanguageModel.loadCompiled(url: compiledURL, computeUnits: computeUnits.asMLComputeUnits)
+        }
 
         var config = model.defaultGenerationConfig
         config.doSample = doSample
diff --git a/README.md b/README.md
index a405265..f7f106d 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,36 @@ example converting and running Mistral 7B using CoreML [here](https://github.com
 
 The [modernization of Core ML](https://github.com/huggingface/swift-transformers/pull/257) and corresponding examples were primarily contributed by @joshnewnham, @1duo, @alejandro-isaza, @aseemw. Thank you 🙏
 
+### Offline CoreML tokenizers
+
+When you bundle a compiled CoreML model and tokenizer files with your app, you can skip any network requests by injecting
+the tokenizer when constructing `LanguageModel`:
+
+```swift
+let compiledURL: URL = ... // path to .mlmodelc
+let tokenizerFolder: URL = ... // folder containing tokenizer_config.json and tokenizer.json
+
+// Construct the tokenizer from local files (inside an async context)
+let tokenizer = try await AutoTokenizer.from(modelFolder: tokenizerFolder)
+let model = try LanguageModel.loadCompiled(
+    url: compiledURL,
+    tokenizer: tokenizer
+)
+```
+
+Make sure the tokenizer assets come from the same Hugging Face repo as the original checkpoint, or are otherwise
+compatible with the model you use. For the Mistral example in `Examples/Mistral7B/`, you can fetch the tokenizer like this:
+
+```bash
+huggingface-cli download \
+    mistralai/Mistral-7B-Instruct-v0.3 \
+    tokenizer.json tokenizer_config.json \
+    --local-dir Examples/Mistral7B/local-tokenizer
+```
+
+If the repo is gated, authenticate with `huggingface-cli login` first. Both the initializer and `loadCompiled` reuse the
+tokenizer you pass in and never reach out to the Hugging Face Hub.
+
 ## Usage via SwiftPM
 
 To use `swift-transformers` with SwiftPM, you can add this to your `Package.swift`:
@@ -139,5 +169,3 @@ To format your code, run `swift format -i --recursive .`.
 ## License
 
 [Apache 2](LICENSE).
-
-
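Taken together, the CLI and README changes above rely on two public entry points: `AutoTokenizer.from(modelFolder:)` and the new `tokenizer:` parameter on `LanguageModel.loadCompiled`. Here is a minimal sketch of the same flow inside an app target; the module imports and the bundled resource names (`Mistral7B.mlmodelc`, `local-tokenizer`) are illustrative assumptions, not part of this diff:

```swift
import CoreML
import Foundation
import Models
import Tokenizers

/// Loads a bundled model fully offline. Resource names are placeholders
/// for whatever assets ship with the app.
func loadOfflineModel() async throws -> LanguageModel {
    guard
        let compiledURL = Bundle.main.url(forResource: "Mistral7B", withExtension: "mlmodelc"),
        let tokenizerFolder = Bundle.main.url(forResource: "local-tokenizer", withExtension: nil)
    else {
        fatalError("Bundle is missing the compiled model or tokenizer assets")
    }

    // Build the tokenizer from the bundled files, then inject it so the
    // model never contacts the Hugging Face Hub.
    let tokenizer = try await AutoTokenizer.from(modelFolder: tokenizerFolder)
    return try LanguageModel.loadCompiled(
        url: compiledURL,
        computeUnits: .cpuAndGPU,
        tokenizer: tokenizer
    )
}
```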
diff --git a/Sources/Models/LanguageModel.swift b/Sources/Models/LanguageModel.swift
index 8948e7a..057d750 100644
--- a/Sources/Models/LanguageModel.swift
+++ b/Sources/Models/LanguageModel.swift
@@ -33,12 +33,26 @@ public class LanguageModel {
 
     /// Creates a new language model instance from a CoreML model.
     ///
-    /// - Parameter model: The CoreML model to wrap
+    /// - Parameters:
+    ///   - model: The CoreML model to wrap
+    ///   - tokenizer: Optional preconstructed tokenizer to reuse
    /// - Important: Triggers a fatal error if the model doesn't have the expected input shape information
-    public required init(model: MLModel) {
+    public required init(
+        model: MLModel,
+        tokenizer: Tokenizer? = nil
+    ) {
         self.model = model
+        _tokenizer = tokenizer
         (minContextLength, maxContextLength) = Self.contextRange(from: model)
-        configuration = LanguageModelConfigurationFromHub(modelName: modelName)
+        if tokenizer == nil {
+            self.configuration = LanguageModelConfigurationFromHub(modelName: modelName)
+        } else {
+            self.configuration = nil
+        }
+    }
+
+    public convenience required init(model: MLModel) {
+        self.init(model: model, tokenizer: nil)
     }
 
     public func resetState() async {}
@@ -142,15 +156,28 @@ public extension LanguageModel {
     /// - Parameters:
     ///   - url: The URL of the compiled CoreML model file (.mlmodelc)
     ///   - computeUnits: The compute units to use for model inference
+    ///   - tokenizer: Optional tokenizer instance to reuse
     /// - Returns: A configured `LanguageModel` instance
     /// - Throws: An error if the model cannot be loaded from the specified URL
-    static func loadCompiled(url: URL, computeUnits: MLComputeUnits = .cpuAndGPU) throws -> LanguageModel {
+    static func loadCompiled(
+        url: URL,
+        computeUnits: MLComputeUnits = .cpuAndGPU,
+        tokenizer: Tokenizer? = nil
+    ) throws -> LanguageModel {
         let config = MLModelConfiguration()
         config.computeUnits = computeUnits
         let model = try MLModel(contentsOf: url, configuration: config)
         return switch kvCacheAvailability(for: model) {
-        case .statefulKVCache: LanguageModelWithStatefulKVCache(model: model)
-        default: LanguageModel(model: model)
+        case .statefulKVCache:
+            LanguageModelWithStatefulKVCache(
+                model: model,
+                tokenizer: tokenizer
+            )
+        default:
+            LanguageModel(
+                model: model,
+                tokenizer: tokenizer
+            )
         }
     }
 }
@@ -304,7 +331,8 @@ public extension LanguageModel {
     /// - Throws: An error if the configuration cannot be loaded
     var modelConfig: Config? {
         get async throws {
-            try await configuration!.modelConfig
+            guard let configuration else { return nil }
+            return try await configuration.modelConfig
         }
     }
 
@@ -314,7 +342,8 @@ public extension LanguageModel {
     /// - Throws: An error if the configuration cannot be loaded
     var tokenizerConfig: Config? {
         get async throws {
-            try await configuration!.tokenizerConfig
+            guard let configuration else { return nil }
+            return try await configuration.tokenizerConfig
         }
     }
 
@@ -324,7 +353,10 @@ public extension LanguageModel {
     /// - Throws: An error if the tokenizer data cannot be loaded
     var tokenizerData: Config {
         get async throws {
-            try await configuration!.tokenizerData
+            guard let configuration else {
+                throw TokenizerError.missingConfig
+            }
+            return try await configuration.tokenizerData
         }
     }
 
@@ -459,8 +491,11 @@ public class LanguageModelWithStatefulKVCache: LanguageModel {
 
     var state: MLState?
 
-    public required init(model: MLModel) {
-        super.init(model: model)
+    public required init(
+        model: MLModel,
+        tokenizer: Tokenizer? = nil
+    ) {
+        super.init(model: model, tokenizer: tokenizer)
         // To support pre-filling and extend, the input must support
         // flexible shapes.
         guard maxContextLength - minContextLength > 1 else {
@@ -531,11 +566,15 @@ public class LanguageModelWithStatefulKVCache: LanguageModel {
 public enum TokenizerError: LocalizedError {
     /// The tokenizer configuration file could not be found.
     case tokenizerConfigNotFound
+    /// The language model configuration required to load tokenizer data is missing.
+    case missingConfig
 
     public var errorDescription: String? {
         switch self {
         case .tokenizerConfigNotFound:
             String(localized: "Tokenizer configuration could not be found. The model may be missing required tokenizer files.", comment: "Error when tokenizer configuration is missing")
+        case .missingConfig:
+            String(localized: "Language model configuration was not set, so tokenizer assets could not be loaded.", comment: "Error when configuration needed for tokenizer data is missing")
         }
     }
 }
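The guards above define the offline contract: with an injected tokenizer, `configuration` stays `nil`, the optional accessors return `nil` instead of force-unwrapping, and only the non-optional `tokenizerData` throws. A hedged sketch of that behavior, assuming the model was constructed with an injected tokenizer (the helper name is hypothetical):

```swift
import Models

/// Demonstrates the accessor behavior for a model built with an injected
/// tokenizer, where `configuration` is nil.
func verifyOfflineAccessors(model: LanguageModel) async throws {
    // Optional accessors now return nil instead of crashing on `configuration!`.
    let modelConfig = try await model.modelConfig // nil when the tokenizer was injected
    let tokenizerConfig = try await model.tokenizerConfig // nil as well
    assert(modelConfig == nil && tokenizerConfig == nil)

    // `tokenizerData` has no optional form, so it throws instead.
    do {
        _ = try await model.tokenizerData
    } catch TokenizerError.missingConfig {
        // Expected: tokenizer assets were supplied directly, not fetched from the Hub.
    }
}
```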
diff --git a/Tests/TokenizersTests/Resources/tokenizer.json b/Tests/TokenizersTests/Resources/tokenizer.json
new file mode 100644
index 0000000..65b3e6d
--- /dev/null
+++ b/Tests/TokenizersTests/Resources/tokenizer.json
@@ -0,0 +1,51 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "<unk>"
+    },
+    {
+      "id": 1,
+      "content": "<s>"
+    },
+    {
+      "id": 2,
+      "content": "</s>"
+    },
+    {
+      "id": 3,
+      "content": "<pad>"
+    }
+  ],
+  "model": {
+    "type": "BPE",
+    "vocab": {
+      "<unk>": 0,
+      "<s>": 1,
+      "</s>": 2,
+      "<pad>": 3,
+      "offline": 4,
+      "path": 5,
+      "_": 6
+    },
+    "merges": [
+      "off line",
+      "li ne",
+      "pa th",
+      "_ of",
+      "_ pa"
+    ],
+    "continuing_subword_prefix": "",
+    "end_of_word_suffix": "",
+    "unk_token": "<unk>"
+  },
+  "normalizer": {
+    "type": "Lowercase"
+  },
+  "pre_tokenizer": {
+    "type": "Whitespace"
+  }
+}
\ No newline at end of file
diff --git a/Tests/TokenizersTests/Resources/tokenizer_config.json b/Tests/TokenizersTests/Resources/tokenizer_config.json
new file mode 100644
index 0000000..a72a238
--- /dev/null
+++ b/Tests/TokenizersTests/Resources/tokenizer_config.json
@@ -0,0 +1,9 @@
+{
+  "tokenizer_class": "GPT2Tokenizer",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "unk_token": "<unk>",
+  "pad_token": "<pad>",
+  "model_max_length": 128,
+  "do_lower_case": false
+}
\ No newline at end of file
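The two fixtures above form a complete local tokenizer folder: `tokenizer_config.json` names the tokenizer class and special tokens, and `tokenizer.json` carries the BPE vocabulary. A sketch of loading such a folder outside the test harness follows; the `Hub` and `Tokenizers` module names are assumptions based on the package layout, and the error type is a local placeholder:

```swift
import Foundation
import Hub
import Tokenizers

struct MissingTokenizerConfigError: Error {}

/// Builds a tokenizer from any local folder holding tokenizer_config.json
/// and tokenizer.json. Both files are read from disk; no network access.
func offlineTokenizer(from folder: URL) async throws -> Tokenizer {
    let configuration = LanguageModelConfigurationFromHub(modelFolder: folder)
    guard let tokenizerConfig = try await configuration.tokenizerConfig else {
        throw MissingTokenizerConfigError()
    }
    let tokenizerData = try await configuration.tokenizerData
    return try AutoTokenizer.from(
        tokenizerConfig: tokenizerConfig,
        tokenizerData: tokenizerData
    )
}
```

Run against the Resources folder above, `encode(text: "offline path")` should produce a non-empty token sequence, which is exactly what the test in the next file asserts.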
diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift
index b731194..b81189a 100644
--- a/Tests/TokenizersTests/TokenizerTests.swift
+++ b/Tests/TokenizersTests/TokenizerTests.swift
@@ -177,6 +177,39 @@ struct TokenizerTests {
         #expect(tokenizer.encode(text: "<|im_start|>user<|im_sep|>Who are you?<|im_end|><|im_start|>assistant<|im_sep|>") == [100264, 882, 100266, 15546, 527, 499, 30, 100265, 100264, 78191, 100266])
     }
 
+    @Test
+    func tokenizerFromLocalFolder() async throws {
+        let bundle = Bundle.module
+        guard
+            let tokenizerConfigURL = bundle.url(
+                forResource: "tokenizer_config",
+                withExtension: "json"
+            ),
+            bundle.url(
+                forResource: "tokenizer",
+                withExtension: "json"
+            ) != nil
+        else {
+            Issue.record("Missing offline tokenizer fixtures")
+            return
+        }
+
+        let configuration = LanguageModelConfigurationFromHub(modelFolder: tokenizerConfigURL.deletingLastPathComponent())
+
+        let tokenizerConfigOpt = try await configuration.tokenizerConfig
+        #expect(tokenizerConfigOpt != nil)
+        let tokenizerConfig = tokenizerConfigOpt!
+        let tokenizerData = try await configuration.tokenizerData
+
+        let tokenizer = try AutoTokenizer.from(
+            tokenizerConfig: tokenizerConfig,
+            tokenizerData: tokenizerData
+        )
+
+        let encoded = tokenizer.encode(text: "offline path")
+        #expect(!encoded.isEmpty)
+    }
+
     /// https://github.com/huggingface/swift-transformers/issues/96
     @Test
     func legacyLlamaBehaviour() async throws {