dotnet · tarekgh · Feb 26, 2024 · Feb 17, 2024 · Feb 17, 2024 · Feb 19, 2024
diff --git a/src/Microsoft.ML.Tokenizers/Model/Cache.cs b/src/Microsoft.ML.Tokenizers/Model/Cache.cs
@@ -4,6 +4,7 @@
 
 using System;
 using System.Collections.Generic;
+using System.Linq;
 using System.Text;
 using System.Threading;
 
@@ -95,5 +96,40 @@ internal void Set(TKey k, TValue v)
             }
             finally { _cacheLock.ExitWriteLock(); }
         }
+
+        /// <summary>Copies the entries currently in the map into a new array while holding the read lock.</summary>
+        internal KeyValuePair<TKey, TValue>[] ToArray()
+        {
+            KeyValuePair<TKey, TValue>[] snapshot;
+            _cacheLock.EnterReadLock();
+            try { snapshot = Map.ToArray(); }
+            finally { _cacheLock.ExitReadLock(); }
+            return snapshot;
+        }
+
+        /// <summary>Returns the cached value for <paramref name="key"/>, or caches and returns <paramref name="value"/>.</summary>
+        /// <remarks>Nothing is stored unless Capacity exceeds Map.Count; <paramref name="value"/> is still returned.</remarks>
+        internal TValue GetOrAdd(TKey key, TValue value)
+        {
+            // Plain read lock for hits: upgradeable mode admits only one thread at a time.
+            _cacheLock.EnterReadLock();
+            try
+            {
+                if (Map.TryGetValue(key, out TValue? hit)) { return hit; }
+            }
+            finally { _cacheLock.ExitReadLock(); }
+
+            _cacheLock.EnterWriteLock();
+            try
+            {
+                // Re-check: another thread may have added the key between the two locks.
+                if (Map.TryGetValue(key, out TValue? existing)) { return existing; }
+
+                if (Capacity > Map.Count) { Map[key] = value; }
+
+                return value;
+            }
+            finally { _cacheLock.ExitWriteLock(); }
+        }
     }
 }
diff --git a/src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs b/src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs
@@ -20,7 +20,7 @@ public sealed class EnglishRoberta : Model
         private readonly HighestOccurrenceMapping _vocabIdToHighestOccurrence;
         private readonly IReadOnlyDictionary<string, int> _vocab;
         private readonly SortedDictionary<int, string> _vocabReverse;
-        private readonly Dictionary<(string, string), int> _mergeRanks;
+        private readonly Cache<(string, string), int> _mergeRanks;
         private readonly IReadOnlyDictionary<char, char> _byteToUnicode;
         private readonly IReadOnlyDictionary<char, char> _unicodeToByte;
         private readonly string[] _charToString;
@@ -205,6 +205,11 @@ public override string[] Save(string path, string? prefix = null)
         /// <returns>The list of tokens generated from the sequence tokenization.</returns>
         public override IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialToken = false)
         {
+            if (string.IsNullOrEmpty(sequence))
+            {
+                return Bpe.EmptyTokensList;
+            }
+
             char[] token = ArrayPool<char>.Shared.Rent(sequence.Length);
             int[] indexMapping = ArrayPool<int>.Shared.Rent(sequence.Length);
 
@@ -258,6 +263,11 @@ public override IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialTok
 
         private int TokenizeToIds(string sequence, IList<int>? accumulatedIds)
         {
+            if (string.IsNullOrEmpty(sequence))
+            {
+                return 0;
+            }
+
             if (_cache.TryGet(sequence, out List<Token>? hit))
             {
                 if (accumulatedIds is not null)
@@ -271,34 +281,17 @@ private int TokenizeToIds(string sequence, IList<int>? accumulatedIds)
                 return hit.Count;
             }
 
-            Span<char> token = stackalloc char[100];
-            Span<int> indexMapping = stackalloc int[100];
-
-            if (sequence.Length > 100)
-            {
-                token = new char[sequence.Length].AsSpan();
-                indexMapping = new int[sequence.Length].AsSpan();
-            }
-
-            int newTokenIndex = 0;
-            for (int i = 0; i < sequence.Length; i++)
+            // If the cache doesn't have the sequence, then tokenize it and add it to the cache
+            IReadOnlyList<Token> tokens = Tokenize(sequence);
+            if (accumulatedIds is not null)
             {
-                if (_byteToUnicode.TryGetValue(sequence[i], out var value))
+                foreach (var t in tokens)
                 {
-                    token[newTokenIndex] = value;
-                    indexMapping[newTokenIndex] = i;
-                    newTokenIndex++;
+                    accumulatedIds.Add(t.Id);
                 }
             }
 
-            if (newTokenIndex == 0)
-            {
-                return 0;
-            }
-
-            List<Token> result = EncodeToTokens(token.Slice(0, newTokenIndex), indexMapping);
-            _cache.Set(sequence, result);
-            return result.Count;
+            return tokens.Count;
         }
 
         /// <summary>
@@ -477,9 +470,9 @@ private Dictionary<string, int> GetVocabulary(Stream vocabularyStream)
             return vocab;
         }
 
-        private Dictionary<(string, string), int> GetMergeRanks(Stream mergeStream)
+        private Cache<(string, string), int> GetMergeRanks(Stream mergeStream)
         {
-            var mergeRanks = new Dictionary<(string, string), int>();
+            var mergeRanks = new Cache<(string, string), int>(60_000);
             try
             {
                 using StreamReader reader = new StreamReader(mergeStream);
@@ -500,7 +493,7 @@ private Dictionary<string, int> GetVocabulary(Stream vocabularyStream)
                         throw new Exception($"Invalid format of merge file: \"{line}\"");
                     }
 
-                    mergeRanks.Add((line.Substring(0, index), line.Substring(index + 1)), rank++);
+                    mergeRanks.Set((line.Substring(0, index), line.Substring(index + 1)), rank++);
                 }
             }
             catch (Exception e)
@@ -538,26 +531,19 @@ private static int GetByteToUnicode(out IReadOnlyDictionary<char, char> byteToUn
         }
 
         /// <summary>
-        /// Encode a token into BPE-ed Ids. E.g., "playing" into ["play", "ing"].
+        /// Encode a token into BPE-ed sub-tokens. E.g., "playing" into ["play", "ing"].
         /// </summary>
-        /// <param name="token">The token to encode.</param>
-        /// <param name="ids">The list of Ids to encode the token into.</param>
-        /// <returns>The number of encoded ids.</returns>
-        private int EncodeToIds(Span<char> token, IList<int>? ids)
+        private List<Token> EncodeToTokens(Span<char> token, Span<int> indexMapping)
         {
             if (token.Length == 0)
             {
-                return 0;
+                return Bpe.EmptyTokensList;
             }
 
             if (token.Length == 1)
             {
-                if (ids is not null)
-                {
-                    ids.Add(_vocab[_charToString[token[0]]]);
-                }
-
-                return 1;
+                string tokenValue = _charToString[token[0]];
+                return new List<Token> { new Token(_vocab[tokenValue], tokenValue, (indexMapping[0], indexMapping[0] + 1)) };
             }
 
             List<string> word = new(token.Length);
@@ -586,7 +572,7 @@ private int EncodeToIds(Span<char> token, IList<int>? ids)
 
                 // get the most frequent bi-gram pair
                 var (first, second) = pairs.ArgMin(pair => _mergeRanks.GetOrAdd(pair, int.MaxValue));
-                if (!_mergeRanks.ContainsKey((first, second)))
+                if (!_mergeRanks.TryGet((first, second), out int _))
                 {
                     break;
                 }
@@ -605,6 +591,7 @@ private int EncodeToIds(Span<char> token, IList<int>? ids)
                         {
                             newWord.Add(word[k]);
                         }
+
                         break;
                     }
                     else
@@ -614,104 +601,7 @@ private int EncodeToIds(Span<char> token, IList<int>? ids)
                         {
                             newWord.Add(word[k]);
                         }
-                        i = j;
-                    }
-
-                    // check the next element is {second} or not
-                    if (i < word.Count - 1 && word[i + 1] == second)
-                    {
-                        newWord.Add(first + second);
-                        i += 2;
-                    }
-                    else
-                    {
-                        newWord.Add(word[i]);
-                        i += 1;
-                    }
-                }
-
-                List<string> temp = word;
-                word = newWord;
-                newWord = temp;
-                newWord.Clear();
-
-                // otherwise, continue merging
-                WordToPairs(word, pairs);
-            }
-
-            if (ids is not null)
-            {
-                foreach (string w in word)
-                {
-                    ids.Add(_vocab[w]);
-                }
-            }
-
-            return word.Count;
-        }
-
-        /// <summary>
-        /// Encode a token into BPE-ed sub-tokens. E.g., "playing" into ["play", "ing"].
-        /// </summary>
-        private List<Token> EncodeToTokens(Span<char> token, Span<int> indexMapping)
-        {
-            if (token.Length == 0)
-            {
-                return Bpe.EmptyTokensList;
-            }
-
-            if (token.Length == 1)
-            {
-                string tokenValue = _charToString[token[0]];
-                return new List<Token> { new Token(_vocab[tokenValue], tokenValue, (indexMapping[0], indexMapping[0] + 1)) };
-            }
-
-            List<string> word = new(token.Length);
-            foreach (char c in token)
-            {
-                Debug.Assert(c < _charToString.Length);
-                word.Add(_charToString[c]);
-            }
-
-            HashSet<(string, string)> pairs = new();
-
-            WordToPairs(word, pairs);
-
-            var newWord = new List<string>();
-
-            Debug.Assert(pairs.Count != 0, "Pairs should not be empty.");
-
-            while (true)
-            {
-                /* while conditions */
-                // if only one element left, merge is finished (with the whole word merged)
-                if (word.Count == 1)
-                {
-                    break;
-                }
 
-                // get the most frequent bi-gram pair
-                var (first, second) = pairs.ArgMin(pair => _mergeRanks.GetOrAdd(pair, int.MaxValue));
-                if (!_mergeRanks.ContainsKey((first, second)))
-                {
-                    break;
-                }
-                /* end while conditions */
-
-                // search and merge all (first, second) pairs in {word}
-                var i = 0;
-                while (i < word.Count)
-                {
-                    // find the next occurrence of {first} and add the elements before into {newWord}
-                    var j = word.IndexOf(first, i);
-                    if (j == -1)
-                    {
-                        newWord.AddRange(word.Skip(i));
-                        break;
-                    }
-                    else
-                    {
-                        newWord.AddRange(word.Skip(i).Take(j - i));
                         i = j;
                     }
 

diff --git a/src/Microsoft.ML.Tokenizers/Tokenizer.cs b/src/Microsoft.ML.Tokenizers/Tokenizer.cs
@@ -336,7 +336,7 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo
                                                                 { "code-search-babbage-code-001", ModelEncoding.R50kBase },
                                                                 { "code-search-ada-code-001", ModelEncoding.R50kBase },
 
-                                                                //open source
+                                                                // open source
                                                                 { "gpt2", ModelEncoding.GPT2 }
                                                             };
 

diff --git a/test/Microsoft.ML.Tokenizers.Tests/EnglishRobertaTests.cs b/test/Microsoft.ML.Tokenizers.Tests/EnglishRobertaTests.cs
@@ -106,6 +106,13 @@ public async void TokenizationTest()
                 using Stream translationStream = File.OpenRead(translationFile);
                 tokenizer = new Tokenizer(new EnglishRoberta(vocabStream, mergeStream, translationStream), RobertaPreTokenizer.Instance);
                 TestTokenizer(tokenizer);
+
+                // Ensure caching works regardless of which method is called first.
+                for (CallingOrder order = CallingOrder.Encode; order <= CallingOrder.CountTokens; order++)
+                {
+                    tokenizer = new Tokenizer(new EnglishRoberta(vocabFile, mergeFile, translationFile), RobertaPreTokenizer.Instance);
+                    TestTokenizer(tokenizer, order);
+                }
             }
             finally
             {
@@ -122,17 +129,46 @@ public async void TokenizationTest()
             }
         }
 
-        private void TestTokenizer(Tokenizer tokenizer)
+        private enum CallingOrder // Selects which tokenizer method TestTokenizer calls first; values stay contiguous for the order++ loop.
+        {
+            Encode = 0,      // Encode, then EncodeToIds, then CountTokens
+            EncodeToIds = 1, // EncodeToIds, then Encode, then CountTokens
+            CountTokens = 2  // CountTokens, then EncodeToIds, then Encode
+        }
+
+        // Calling EncodeToIds after Encode causes EncodeToIds to use the data cached by the previous Encode call.
+        // Passing a different callingOrder exercises the other orders, so each method is also tested as the first caller.
+        private void TestTokenizer(Tokenizer tokenizer, CallingOrder callingOrder = CallingOrder.Encode)
         {
             Assert.NotNull(tokenizer.Model);
             Assert.True(tokenizer.Model is EnglishRoberta);
             Assert.True(tokenizer.PreTokenizer is RobertaPreTokenizer);
 
             foreach (object[] p in BertaData)
             {
-                TokenizerResult encoding = tokenizer.Encode((string)p[0]);
-                IReadOnlyList<int> ids = tokenizer.EncodeToIds((string)p[0]);
-                int idsCount = tokenizer.CountTokens((string)p[0]);
+                IReadOnlyList<int> ids;
+                TokenizerResult encoding;
+                int idsCount;
+
+                if (callingOrder == CallingOrder.Encode)
+                {
+                    encoding = tokenizer.Encode((string)p[0]);
+                    ids = tokenizer.EncodeToIds((string)p[0]);
+                    idsCount = tokenizer.CountTokens((string)p[0]);
+                }
+                else if (callingOrder == CallingOrder.EncodeToIds)
+                {
+                    ids = tokenizer.EncodeToIds((string)p[0]);
+                    encoding = tokenizer.Encode((string)p[0]);
+                    idsCount = tokenizer.CountTokens((string)p[0]);
+                }
+                else // CountTokens
+                {
+                    idsCount = tokenizer.CountTokens((string)p[0]);
+                    ids = tokenizer.EncodeToIds((string)p[0]);
+                    encoding = tokenizer.Encode((string)p[0]);
+                }
+
                 Assert.Equal(p[1], encoding.Ids);
                 Assert.Equal(p[1], ids);
                 Assert.Equal(((int[])p[1]).Length, idsCount);