You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I am running the phi-2 on iOS using the code from LLMEval.
I have ported over an implementation of the CodeGen Tokenizer into swift as a standalone file:
import Foundation
/// One BPE merge rule: an ordered pair of symbols that may be fused into a single symbol.
struct BPEMerge: Hashable {
    let first: String
    let second: String
}

/// Swift port of the Hugging Face CodeGen (GPT-2 style) byte-level BPE tokenizer.
///
/// Loads `CodeGen-vocab.json` (token → id) and `CodeGen-merges.txt` (ranked merge
/// rules) from the main bundle. Text is first byte-encoded through the GPT-2
/// byte↔unicode table, then merged by rank, then looked up in the vocabulary.
class CodeGenTokenizer {
    /// token string → vocabulary id
    let encoder: [String: Int]
    /// vocabulary id → token string
    let decoder: [Int: String]
    let errors: String
    /// raw byte → printable unicode stand-in (GPT-2 byte-level encoding)
    let byteEncoder: [UInt8: String]
    /// inverse of `byteEncoder`, used when decoding back to UTF-8 bytes
    let byteDecoder: [String: UInt8]
    /// merge rule → rank; lower rank merges first
    let bpeRanks: [BPEMerge: Int]
    /// memoized results of `bpe(token:)`
    var cache: [String: String] = [:]
    let addPrefixSpace: Bool
    /// GPT-2 pre-tokenization pattern (contractions, letter runs, digit runs, punctuation, whitespace)
    let pat: NSRegularExpression
    let unkToken: String
    let bosToken: String
    let eosToken: String
    let padToken: String?
    let addBosToken: Bool

    /// Loads vocab and merges from the main bundle; traps if either resource is missing,
    /// since the tokenizer is unusable without them.
    init() {
        self.errors = "replace"
        self.byteEncoder = CodeGenTokenizer.bytesToUnicode()
        self.byteDecoder = self.byteEncoder.invertedDict()
        self.addPrefixSpace = false
        self.unkToken = "<|endoftext|>"
        self.bosToken = "<|endoftext|>"
        self.eosToken = "<|endoftext|>"
        self.padToken = nil
        self.addBosToken = false

        if let vocabPath = Bundle.main.path(forResource: "CodeGen-vocab", ofType: "json"),
           let vocabData = try? Data(contentsOf: URL(fileURLWithPath: vocabPath)),
           let vocabJSON = try? JSONSerialization.jsonObject(with: vocabData, options: []) as? [String: Int] {
            self.encoder = vocabJSON
        } else {
            fatalError("Failed to load vocab file")
        }
        self.decoder = self.encoder.invertedDict()

        if let mergesPath = Bundle.main.path(forResource: "CodeGen-merges", ofType: "txt"),
           let mergesData = try? Data(contentsOf: URL(fileURLWithPath: mergesPath)),
           let mergesString = String(data: mergesData, encoding: .utf8) {
            // Swift's `split` already omits empty subsequences, so (unlike Python's
            // `split('\n')[1:-1]`) we must only drop the "#version" header line.
            // Slicing off the last element here would silently discard the final
            // real merge rule.
            let lines = mergesString.split(separator: "\n")
            if lines.count > 1 {
                let bpeMerges: [(Int, BPEMerge)] = lines.dropFirst().enumerated().compactMap { index, line in
                    // Merge rules are two symbols separated by a single space.
                    let parts = line.split(separator: " ")
                    guard parts.count == 2 else { return nil }  // skip malformed lines
                    return (index, BPEMerge(first: String(parts[0]), second: String(parts[1])))
                }
                self.bpeRanks = Dictionary(uniqueKeysWithValues: bpeMerges.map { ($0.1, $0.0) })
            } else {
                print("Merges file does not have enough lines. Skipping BPE merges.")
                self.bpeRanks = [:]
            }
        } else {
            fatalError("Failed to load merges file")
        }

        self.pat = try! NSRegularExpression(pattern: #"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"#)
    }

    /// Id of the unknown token. Safe to force-unwrap: `<|endoftext|>` is part of
    /// the CodeGen vocabulary, and init traps if the vocab failed to load.
    var unknownTokenId: Int {
        return self.encoder[self.unkToken]!
    }

    /// Id of the end-of-sequence token (same `<|endoftext|>` entry).
    var eosTokenId: Int {
        return self.encoder[self.eosToken]!
    }

    var vocabSize: Int {
        return self.encoder.count
    }

    func getVocab() -> [String: Int] {
        return self.encoder
    }

    /// Applies BPE merges to a byte-encoded token and returns its sub-word
    /// symbols joined by single spaces (the caller splits on " ").
    ///
    /// FIX: the symbols must be joined with `" "`, not `""` — joining with the
    /// empty string destroys the sub-word boundaries, `tokenize` then fails to
    /// split them back apart, vocabulary lookups miss, and generation becomes
    /// incoherent (the reported symptom).
    func bpe(token: String) -> String {
        if let cached = self.cache[token] {
            return cached
        }
        // Start with one symbol per character of the byte-encoded token.
        var word = token.map { String($0) }
        var pairs = CodeGenTokenizer.getPairs(word: word)
        if pairs.isEmpty {
            return token  // single-character token: nothing to merge
        }
        while true {
            // Merge the lowest-ranked adjacent pair; stop when no remaining
            // pair appears in the merge table.
            guard let bigram = pairs.min(by: {
                self.bpeRanks[$0, default: Int.max] < self.bpeRanks[$1, default: Int.max]
            }), self.bpeRanks.keys.contains(bigram) else {
                break
            }
            let (first, second) = (bigram.first, bigram.second)
            var newWord: [String] = []
            var i = 0
            while i < word.count {
                // ArraySlice preserves the parent's indices, so `j` is an
                // absolute index into `word`.
                if let j = word[i..<word.count].firstIndex(of: first) {
                    newWord.append(contentsOf: word[i..<j])
                    i = j
                    if word[i] == first && i < word.count - 1 && word[i + 1] == second {
                        newWord.append(first + second)
                        i += 2
                    } else {
                        newWord.append(word[i])
                        i += 1
                    }
                } else {
                    newWord.append(contentsOf: word[i..<word.count])
                    break
                }
            }
            word = newWord
            if word.count == 1 {
                break
            }
            pairs = CodeGenTokenizer.getPairs(word: word)
        }
        let res = word.joined(separator: " ")
        self.cache[token] = res
        return res
    }

    /// Returns the UTF-8 bytes of `str`.
    func encodeUtf8(_ str: String) -> [UInt8] {
        return Array(str.utf8)
    }

    /// Splits `text` with the GPT-2 pre-tokenization regex, byte-encodes each
    /// piece, applies BPE, and returns the flat list of sub-word token strings.
    func tokenize(text: String) -> [String] {
        var bpeTokens: [String] = []
        let matches = self.pat.matches(in: text, range: NSRange(text.startIndex..., in: text))
        for match in matches {
            let token = String(text[Range(match.range, in: text)!])
            // Map every UTF-8 byte through the byte-level encoder table.
            let encodedToken = token.utf8.map { self.byteEncoder[UInt8($0), default: ""] }.joined()
            // FIX: `bpe` separates symbols with spaces, so split on " ", not "".
            let pieces = self.bpe(token: encodedToken).split(separator: " ").map(String.init)
            bpeTokens.append(contentsOf: pieces)
        }
        return bpeTokens
    }

    /// Encodes `text` to vocabulary ids, mapping unknown tokens to the unk id.
    func encode(text: String) -> [Int] {
        let tokens = self.tokenize(text: text)
        return tokens.map { self.encoder[$0, default: self.encoder[self.unkToken]!] }
    }

    /// Decodes token ids back to text.
    ///
    /// - Parameters:
    ///   - skipSpecialTokens: drop bos/eos/pad tokens from the output.
    ///   - cleanUpTokenizationSpaces: apply the standard HF cleanup (remove
    ///     spaces before punctuation and contractions).
    ///   - truncateBeforePattern: cut the output at the first regex match.
    func decode(tokenIds: [Int], skipSpecialTokens: Bool = false, cleanUpTokenizationSpaces: Bool? = nil, truncateBeforePattern: [String]? = nil) -> String {
        var tokens: [String] = []
        for tokenId in tokenIds {
            if let token = self.decoder[tokenId] {
                tokens.append(token)
            } else {
                tokens.append(self.unkToken)
            }
        }
        if skipSpecialTokens {
            tokens = tokens.filter { $0 != self.bosToken && $0 != self.eosToken && $0 != self.padToken }
        }
        var decodedText = self.convertTokensToString(tokens: tokens)
        if let cleanUpTokenizationSpaces = cleanUpTokenizationSpaces, cleanUpTokenizationSpaces {
            // Standard Hugging Face clean_up_tokenization replacements.
            for (pattern, replacement) in [(" .", "."), (" ?", "?"), (" !", "!"), (" ,", ","), (" ' ", "'"), (" n't", "n't"), (" 'm", "'m"), (" 's", "'s"), (" 've", "'ve"), (" 're", "'re")] {
                decodedText = decodedText.replacingOccurrences(of: pattern, with: replacement)
            }
        }
        if let truncateBeforePattern = truncateBeforePattern {
            for pattern in truncateBeforePattern {
                if let range = decodedText.range(of: pattern, options: .regularExpression) {
                    decodedText = String(decodedText[..<range.lowerBound])
                    break
                }
            }
        }
        return decodedText
    }

    /// Joins token strings and maps each byte-level stand-in character back to
    /// its original byte, then reassembles UTF-8 text.
    /// NOTE(review): unmappable characters fall back to byte 0 (NUL), matching
    /// the original port — confirm this is the desired error behavior.
    func convertTokensToString(tokens: [String]) -> String {
        let text = tokens.joined()
        let byteArray = text.map { self.byteDecoder[String($0), default: 0] }
        return String(bytes: byteArray, encoding: .utf8) ?? ""
    }

    /// GPT-2 byte→unicode table: printable bytes map to themselves; the
    /// remaining bytes map, in order, to U+0100 + n so every byte has a visible,
    /// non-whitespace stand-in (e.g. space 0x20 → "Ġ" U+0120).
    static func bytesToUnicode() -> [UInt8: String] {
        var bs: [UInt8] = Array(UInt8(33)...UInt8(126)) + Array(UInt8(161)...UInt8(172)) + Array(UInt8(174)...UInt8(255))
        var cs: [String] = bs.map { String(Unicode.Scalar($0)) }
        var n = 0
        for b in 0..<(1 << 8) {
            if !bs.contains(UInt8(b)) {
                bs.append(UInt8(b))
                cs.append(String(Unicode.Scalar(0x100 + n)!))
                n += 1
            }
        }
        return Dictionary(uniqueKeysWithValues: zip(bs, cs))
    }

    /// Set of all adjacent symbol pairs in `word`.
    static func getPairs(word: [String]) -> Set<BPEMerge> {
        var pairs = Set<BPEMerge>()
        for (prev, next) in zip(word.dropLast(), word.dropFirst()) {
            pairs.insert(BPEMerge(first: prev, second: next))
        }
        return pairs
    }
}

extension Dictionary where Value: Hashable {
    typealias InvertedDictionary<Value: Hashable, Key: Hashable> = [Value: Key]

    /// Returns the dictionary with keys and values swapped.
    /// Traps if values are not unique (acceptable here: vocab ids and the
    /// byte table are both bijective).
    func invertedDict() -> InvertedDictionary<Value, Key> {
        return InvertedDictionary<Value, Key>(uniqueKeysWithValues: self.map { ($1, $0) })
    }
}
I am now having an issue generating a response from phi-2. Even though the tokenizer seems to be correct, the response from phi-2 is incoherent.
For reference here is a snippet of the code which generates tokens:
/// Runs token generation for `prompt` and returns the final decoded text.
/// Streams intermediate text into `self.output` on the main actor so the UI
/// updates as tokens arrive; sets `running` around the generation loop.
/// NOTE(review): formatting reconstructed from a whitespace-mangled paste;
/// tokens and string literals are unchanged.
func runModelAsync(fromText prompt: String) async throws -> String {
    print("runModelAsync")
    do {
        let (model, _) = try await loadModel()
        await MainActor.run {
            running = true
            self.output = ""
        }
        // augment the prompt as needed
        let prompt = modelConfiguration.prepare(prompt: prompt)
        let promptTokens = MLXArray(tokenizer.encode(text: prompt))
        print("Prompt: \(prompt)")
        print(promptTokens)
        // each time you generate you will get something new
        MLXRandom.seed(UInt64(Date.timeIntervalSinceReferenceDate * 1000))
        var outputTokens = [Int]()
        for token in TokenIterator(prompt: promptTokens, model: model, temp: temperature) {
            let tokenId = token.item(Int.self)
            // NOTE(review): unkToken == eosToken == "<|endoftext|>" in this
            // tokenizer, so both checks compare against the same id — confirm
            // the unknown-token break is intentional.
            if tokenId == tokenizer.unknownTokenId {
                print("Break unknown token")
                break
            }
            if tokenId == tokenizer.eosTokenId {
                print("Break eos token")
                break
            }
            outputTokens.append(tokenId)
            // Re-decodes the entire output each step (O(n²) over the whole
            // generation) so multi-token UTF-8 sequences render correctly.
            let text = tokenizer.decode(tokenIds: outputTokens)
            print("Generating \(text)")
            // update the output -- this will make the view show the text as it generates
            await MainActor.run {
                self.output = text
            }
            if outputTokens.count == maxTokens {
                print("Break maxTokens")
                break
            }
        }
        await MainActor.run {
            running = false
        }
    } catch {
        await MainActor.run {
            running = false
            output = "Failed: \(error)"
        }
    }
    return self.output
}
I noticed mlx-community/phi-2-hf-4bit-mlx has some custom Python code which should be run when using the Python version of MLX. Since this custom code does not run on Swift, does the Phi class in LLMEval implement a port of that custom code?
I noticed mlx-community/phi-2-hf-4bit-mlx has some custom Python code which should be run when using the Python version of MLX. Since this custom code does not run on Swift, does the Phi class in LLMEval implement a port of that custom code?
No, the code has some simple prompt augmentation, but it doesn't use the config:
For the general issue of debugging the tokenizer, since we have a working python version it is probably easiest to compare to that. I wrote up some thoughts here, see if they help:
I am running the phi-2 on iOS using the code from LLMEval.
I have ported over an implementation of the CodeGen Tokenizer into swift as a standalone file:
I am now having an issue generating a response from phi-2. Even though the tokenizer seems to be correct, the response from phi-2 is incoherent.
For reference here is a snippet of the code which generates tokens:
Here are the print logs:
Can anyone point me in the right direction to fixing this?
The text was updated successfully, but these errors were encountered: