Skip to content

Commit

Permalink
switch swift-tokenizers to main, remove some workarounds (#26)
Browse files Browse the repository at this point in the history
* switch swift-tokenizers to main, remove some workarounds

- swift-tokenizers is getting a lot of updates and fixes, let's track main for now
- remove some workarounds that are no longer needed

- huggingface/swift-transformers#63
  • Loading branch information
davidkoski committed Mar 14, 2024
1 parent ac273a1 commit 0af2efe
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 46 deletions.
43 changes: 1 addition & 42 deletions Libraries/LLM/Tokenizer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -67,54 +67,13 @@ public func loadTokenizer(configuration: ModelConfiguration) async throws -> Tok
tokenizerConfig = Config(dictionary)
}

// workaround: some merges can't be split on space in BPETokenizer
if let tokenizerClass = tokenizerConfig.tokenizerClass?.stringValue {
switch tokenizerClass {
case "T5Tokenizer":
break
default:
tokenizerData = discardUnhandledMerges(tokenizerData: tokenizerData)
}
}

let impl = try PreTrainedTokenizer(
tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)

return Tokenizer(tokenizer: impl, tokenizerConfig: tokenizerConfig)
}

/// Returns `tokenizerData` with every `model.merges` entry that does not split on a
/// space into exactly two pieces removed (BPETokenizer requires that form).
///
/// - Parameter tokenizerData: tokenizer configuration whose `model` section may
///   carry a `merges` array of strings.
/// - Returns: a new `Config` holding the filtered merges when at least one entry was
///   dropped; otherwise `tokenizerData` itself, unchanged. The input is also returned
///   as-is when there is no `model` or its `merges` value is not a `[String]`.
public func discardUnhandledMerges(tokenizerData: Config) -> Config {
    // see https://github.com/ml-explore/mlx-swift-examples/issues/1
    // and https://github.com/huggingface/swift-transformers/issues/51

    guard let model = tokenizerData.model,
        let allMerges = model.dictionary["merges"] as? [String]
    else {
        return tokenizerData
    }

    // keep only the merges that split on a space into two parts
    var keptMerges = [String]()
    for merge in allMerges where merge.split(separator: " ").count == 2 {
        keptMerges.append(merge)
    }

    // nothing was discarded -- hand back the input untouched
    if keptMerges.count == allMerges.count {
        return tokenizerData
    }

    // rebuild the nested dictionaries with the filtered merges swapped in
    var patchedModel = model.dictionary
    patchedModel["merges"] = keptMerges

    var patchedData = tokenizerData.dictionary
    patchedData["model"] = patchedModel

    return Config(patchedData)
}

/// overrides for TokenizerModel/knownTokenizers
///
/// Maps a tokenizer class name (key) to the name that should be used in its
/// place (value). Each key must appear exactly once: a Swift dictionary literal
/// with duplicate keys traps at runtime.
let replacementTokenizers = [
    "CodeLlamaTokenizer": "LlamaTokenizer",
    "GemmaTokenizer": "PreTrainedTokenizer",
    "Qwen2Tokenizer": "PreTrainedTokenizer"
]
4 changes: 2 additions & 2 deletions mlx-swift-examples.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -2220,8 +2220,8 @@
isa = XCRemoteSwiftPackageReference;
repositoryURL = "https://github.com/huggingface/swift-transformers";
requirement = {
kind = upToNextMajorVersion;
minimumVersion = 0.1.2;
branch = main;
kind = branch;
};
};
C392736E2B60699100368D5D /* XCRemoteSwiftPackageReference "swift-argument-parser" */ = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@
"kind" : "remoteSourceControl",
"location" : "https://github.com/huggingface/swift-transformers",
"state" : {
"revision" : "564442fba36b0b694d730a62d0593e5f54043b55",
"version" : "0.1.2"
"branch" : "main",
"revision" : "24605a8c0cc974bec5b94a6752eb687bae77db31"
}
}
],
Expand Down

0 comments on commit 0af2efe

Please sign in to comment.