Address the feedback on the tokenizer's library #7024

Merged Feb 26, 2024 (23 commits)

Commits (the diff below shows changes from 5 of the 23 commits):
f6e32f5  Fix cache when calling EncodeToIds (tarekgh, Feb 17, 2024)
0553922  Make EnglishRoberta _mergeRanks thread safe (tarekgh, Feb 17, 2024)
a4cb1f5  Delete Trainer (tarekgh, Feb 19, 2024)
6a13025  Remove the setters on the Bpe properties (tarekgh, Feb 19, 2024)
3278aff  Remove Roberta and Tiktoken special casing in the Tokenizer and suppo… (tarekgh, Feb 19, 2024)
b5f7fa2  Support text-embedding-3-small/large embedding (tarekgh, Feb 19, 2024)
a11f4e0  Remove redundant TokenToId abstraction and keep the one with the extr… (tarekgh, Feb 19, 2024)
865068a  Enable creating Tiktoken asynchronously or directly using the tokeniz… (tarekgh, Feb 20, 2024)
4077de0  Add cancellationToken support in CreateAsync APIs (tarekgh, Feb 21, 2024)
5aaf849  Rename sequence to text and Tokenize to Encode (tarekgh, Feb 21, 2024)
b5e0927  Rename skipSpecialTokens to considerSpecialTokens (tarekgh, Feb 21, 2024)
5e26b3e  Rename TokenizerResult to EncodingResult (tarekgh, Feb 21, 2024)
985de8a  Make Token publicly immutable (tarekgh, Feb 21, 2024)
b551e7d  Change offset tuples from (Index, End) to (Index, Length) (tarekgh, Feb 21, 2024)
7ea7f70  Rename NormalizedString method's parameters (tarekgh, Feb 21, 2024)
b0c8244  Rename Model's methods to start with verb (tarekgh, Feb 21, 2024)
450418a  Convert Model.GetVocab() method to a Vocab property (tarekgh, Feb 21, 2024)
6f53de8  Some method's parameters and variable renaming (tarekgh, Feb 22, 2024)
62334c6  Remove Vocab and VocabSize from the abstraction (tarekgh, Feb 22, 2024)
d48b32d  Cleanup normalization support (tarekgh, Feb 22, 2024)
191ab03  Minor Bpe cleanup (tarekgh, Feb 22, 2024)
b9b0f58  Resolve rebase change (tarekgh, Feb 23, 2024)
1ad157f  Address the feedback (tarekgh, Feb 25, 2024)
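
Taken together, the rename commits reshape the public surface. A hedged before/after sketch of calling code, inferred only from the commit messages above (identifiers and exact signatures are assumptions, not verified against the merged API):

using Microsoft.ML.Tokenizers;

// Hedged sketch; assumes a Bpe model wrapped in a Tokenizer, and the
// file paths are illustrative.
Bpe model = new Bpe("vocab.json", "merges.txt");
Tokenizer tokenizer = new Tokenizer(model);

// "Rename sequence to text and Tokenize to Encode" together with
// "Rename TokenizerResult to EncodingResult":
EncodingResult encoding = tokenizer.Encode("Hello world");

// "Change offset tuples from (Index, End) to (Index, Length)": each token's
// offset now reports its start index and its length within the input text.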
184 changes: 89 additions & 95 deletions src/Microsoft.ML.Tokenizers/Model/BPE.cs
@@ -54,87 +54,114 @@ public string? UnknownToken
 }

 /// <summary>
-/// An optional prefix to use on any sub-word that exist only behind another one
+/// A prefix to be used for every subword that is not a beginning-of-word
 /// </summary>
-public string? ContinuingSubwordPrefix { get; set; }
+public string? ContinuingSubwordPrefix { get; private set; }

 /// <summary>
 /// An optional suffix to characterize and end-of-word sub-word
 /// </summary>
-public string? EndOfWordSuffix { get; set; }
+public string? EndOfWordSuffix { get; private set; }

 /// <summary>
 /// Gets or sets whether allowing multiple unknown tokens get fused
 /// </summary>
-public bool FuseUnknownTokens { get; set; }
+public bool FuseUnknownTokens { get; private set; }


 /// <summary>
-/// Construct a new Bpe model object with no tokenization vocabulary. This constructor is useful only in the training scenario.
+/// Construct a new Bpe model object to use for sentence tokenization.
 /// </summary>
-public Bpe()
+/// <param name="vocabFile">The JSON file path containing the dictionary of string keys and their ids.</param>
+/// <param name="mergesFile">The file path containing the tokens's pairs list.</param>
+/// <param name="unknownToken">The unknown token to be used by the model.</param>
+/// <param name="continuingSubwordPrefix">The prefix to attach to sub-word units that don’t represent a beginning of word.</param>
+/// <param name="endOfWordSuffix">The suffix to attach to sub-word units that represent an end of word.</param>
+/// <param name="fuseUnknownTokens">Indicate whether allowing multiple unknown tokens get fused.</param>
Comment (Member):
I'm having trouble understanding what this means.

Reply (Member Author):
When encoding text with the Bpe model, any token the model doesn't recognize is replaced with the unknown token. Most users use [Unk] as the unknown token, so it is possible to get multiple [Unk] tokens next to each other in the result. Setting fuseUnknownTokens to true causes such an [Unk] sequence to collapse into a single [Unk]. The "fuse" term is used by Hugging Face, and users of Bpe are familiar with it. If you have a better explanation we can use here, I'll be happy to use it :-)

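To make the fusing behavior concrete, a hedged sketch (the file names, the unknown-token choice, and the input string are illustrative assumptions, not taken from the PR):

// Illustrative only: assumes vocab.json/merges.txt exist and that the
// characters in "§§§" are absent from the vocabulary.
Bpe unfused = new Bpe("vocab.json", "merges.txt", unknownToken: "[Unk]");
Bpe fused = new Bpe("vocab.json", "merges.txt", unknownToken: "[Unk]", fuseUnknownTokens: true);

// Conceptually:
//   unfused encodes "§§§" as [Unk] [Unk] [Unk]  (one unknown per unrecognized piece)
//   fused encodes "§§§" as a single [Unk]       (the run of unknowns collapses)
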
+public Bpe(string vocabFile, string? mergesFile, string? unknownToken = null, string? continuingSubwordPrefix = null, string? endOfWordSuffix = null, bool fuseUnknownTokens = false) :
+    this(vocabFile is null ? throw new ArgumentNullException(nameof(vocabFile)) : File.Open(vocabFile, FileMode.Open, FileAccess.Read),
+        mergesFile is null ? null : File.Open(mergesFile, FileMode.Open, FileAccess.Read), unknownToken, continuingSubwordPrefix, endOfWordSuffix, fuseUnknownTokens, disposeStreams: true)
 {
-    Vocab = new();
-    VocabReverse = new();
-    Merges = new();
-
-    UnknownToken = "[Unk]";
 }

 /// <summary>
-/// Construct a new Bpe model object to use for sentence tokenization and tokenizer training.
+/// Construct a new Bpe model object to use for sentence tokenization.
 /// </summary>
-/// <param name="vocabFile">The JSON file path containing the dictionary of string keys and their ids.</param>
-/// <param name="mergesFile">The file path containing the tokens's pairs list.</param>
+/// <param name="vocabStream">The JSON stream containing the dictionary of string keys and their ids.</param>
+/// <param name="mergesStream">The stream containing the tokens's pairs list.</param>
 /// <param name="unknownToken">The unknown token to be used by the model.</param>
 /// <param name="continuingSubwordPrefix">The prefix to attach to sub-word units that don’t represent a beginning of word.</param>
 /// <param name="endOfWordSuffix">The suffix to attach to sub-word units that represent an end of word.</param>
-public Bpe(string vocabFile, string? mergesFile, string? unknownToken = null, string? continuingSubwordPrefix = null, string? endOfWordSuffix = null)
+/// <param name="fuseUnknownTokens">Indicate whether allowing multiple unknown tokens get fused.</param>
+public Bpe(Stream vocabStream, Stream? mergesStream, string? unknownToken = null, string? continuingSubwordPrefix = null, string? endOfWordSuffix = null, bool fuseUnknownTokens = false) :
+    this(vocabStream, mergesStream, unknownToken, continuingSubwordPrefix, endOfWordSuffix, fuseUnknownTokens, disposeStreams: false)
 {
-    ContinuingSubwordPrefix = continuingSubwordPrefix;
-    EndOfWordSuffix = endOfWordSuffix;
-
-    (Dictionary<string, int>? vocab1, Vec<(string, string)> merges) = ReadFile(vocabFile, mergesFile);
-    Vocab = vocab1 ?? new Dictionary<string, int>();
-    Cache = new Cache<string, Word>();
-
-    VocabReverse = new();
-
-    foreach (KeyValuePair<string, int> kvp in Vocab)
-    {
-        VocabReverse.Add(kvp.Value, kvp.Key);
-    }
+}

-    if (unknownToken is null && VocabReverse.TryGetValue(0, out string? unkToken))
+private Bpe(Stream vocabStream, Stream? mergesStream, string? unknownToken, string? continuingSubwordPrefix, string? endOfWordSuffix, bool fuseUnknownTokens, bool disposeStreams)
 {
+    try
     {
-        unknownToken = unkToken;
-    }
+        if (vocabStream is null)
+        {
+            throw new ArgumentNullException(nameof(vocabStream));
+        }

-    UnknownToken = unknownToken;
+        FuseUnknownTokens = fuseUnknownTokens;
+        ContinuingSubwordPrefix = continuingSubwordPrefix;
+        EndOfWordSuffix = endOfWordSuffix;

-    int prefixLen = ContinuingSubwordPrefix is null ? 0 : ContinuingSubwordPrefix.Length;
+        (Dictionary<string, int>? vocab1, Vec<(string, string)> merges) = ReadModelData(vocabStream, mergesStream);
+        Vocab = vocab1 ?? new Dictionary<string, int>();
+        Cache = new Cache<string, Word>();

-    Merges = new();
-    for (int i = 0; i < merges.Count; i++)
-    {
-        (string a, string b) mergeValues = merges[i];
+        VocabReverse = new();

-        if (!Vocab.TryGetValue(mergeValues.a, out int aId))
+        foreach (KeyValuePair<string, int> kvp in Vocab)
         {
-            throw new InvalidOperationException($"Trying to merge a token {mergeValues.a} which not exist in the vocabulary.");
+            VocabReverse.Add(kvp.Value, kvp.Key);
         }

-        if (!Vocab.TryGetValue(mergeValues.b, out int bId))
+        if (unknownToken is null && VocabReverse.TryGetValue(0, out string? unkToken))
         {
-            throw new InvalidOperationException($"Trying to merge a token {mergeValues.b} which not exist in the vocabulary.");
+            unknownToken = unkToken;
         }

-        string newToken = $"{mergeValues.a}{mergeValues.b.Substring(prefixLen)}";
-        if (!Vocab.TryGetValue(newToken, out int newId))
+        UnknownToken = unknownToken;

+        int prefixLen = ContinuingSubwordPrefix is null ? 0 : ContinuingSubwordPrefix.Length;

+        Merges = new();
+        for (int i = 0; i < merges.Count; i++)
         {
-            throw new InvalidOperationException($"Trying to merge a token {newToken} which not exist in the vocabulary.");
-        }
+            (string a, string b) mergeValues = merges[i];

+            if (!Vocab.TryGetValue(mergeValues.a, out int aId))
+            {
+                throw new InvalidOperationException($"Trying to merge a token '{mergeValues.a}' which not exist in the vocabulary.");
+            }

+            if (!Vocab.TryGetValue(mergeValues.b, out int bId))
+            {
+                throw new InvalidOperationException($"Trying to merge a token '{mergeValues.b}' which not exist in the vocabulary.");
+            }

-        Merges.Add(new Pair<int>(aId, bId), (i, newId));
+            string newToken = $"{mergeValues.a}{mergeValues.b.Substring(prefixLen)}";
+            if (!Vocab.TryGetValue(newToken, out int newId))
+            {
+                throw new InvalidOperationException($"Trying to merge a token '{newToken}' which not exist in the vocabulary.");
+            }

+            Merges.Add(new Pair<int>(aId, bId), (i, newId));
+        }
     }
+    finally
+    {
+        if (disposeStreams)
+        {
+            vocabStream.Dispose();
+            mergesStream?.Dispose();
+        }
+    }
 }
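
For reference, a minimal usage sketch of the two public constructors above (file paths are illustrative assumptions):

using System.IO;
using Microsoft.ML.Tokenizers;

// File-path overload: the model opens the files and disposes the streams
// itself (disposeStreams: true in the chained call).
Bpe fromFiles = new Bpe("vocab.json", "merges.txt", unknownToken: "[Unk]");

// Stream overload: the caller owns the streams (disposeStreams: false),
// so they are disposed here once construction finishes.
using (FileStream vocab = File.OpenRead("vocab.json"))
using (FileStream merges = File.OpenRead("merges.txt"))
{
    Bpe fromStreams = new Bpe(vocab, merges, unknownToken: "[Unk]");
}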

Expand Down Expand Up @@ -195,8 +222,9 @@ public override IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialTok
 /// </summary>
 /// <param name="id">The Id to map to the token.</param>
 /// <param name="skipSpecialTokens">Indicate if want to skip the special tokens during the decoding.</param>
+/// <param name="filterUnsupportedChars">Indicate if want to filter the unsupported characters during the decoding.</param>
 /// <returns>The mapped token of the Id.</returns>
-public override string? IdToToken(int id, bool skipSpecialTokens = false)
+public override string? IdToToken(int id, bool skipSpecialTokens = false, bool filterUnsupportedChars = true)
 {
     if (VocabReverse.TryGetValue(id, out string? value))
     {
@@ -216,53 +244,10 @@
 /// </summary>
 public override int GetVocabSize() => Vocab.Count;

-/// <summary>
-/// Gets a trainer object to use in training the model and generate the vocabulary and merges data.
-/// </summary>
-public override Trainer? GetTrainer() => new BpeTrainer();
-
-/// <summary>
-/// Save the model data into the vocabulary and merges files.
-/// </summary>
-/// <param name="path">The file system path to store the generated files at.</param>
-/// <param name="prefix">Optional prefix for the generated file names.</param>
-/// <returns>The list of all saved files.</returns>
-public override string[] Save(string path, string? prefix = null)
-{
-    // Write vocab.json
-    string vocabFileNname = prefix is null ? "vocab.json" : $"{prefix}-vocab.json";
-    string vocabPath = Path.Combine(path, vocabFileNname);
-    string serialized = JsonSerializer.Serialize(VocabReverse, new JsonSerializerOptions { Converters = { new DictReversingConverter() } });
-    File.WriteAllText(vocabPath, serialized, System.Text.Encoding.UTF8);
-
-    // Write merges.txt
-    string mergeFileName = prefix is null ? "merges.txt" : $"{prefix}-merges.txt";
-    string mergePath = Path.Combine(path, mergeFileName);
-    (Pair<int> pair, int rank)[] pairsArray = new (Pair<int>, int)[Merges.Count];
-    int i = 0;
-    foreach (var p in Merges)
-    {
-        pairsArray[i++] = (p.Key, p.Value.Item1 /* rank */);
-    }
-    Array.Sort(pairsArray, (x, y) => x.rank.CompareTo(y.rank));
-    using StreamWriter file = new(mergePath, append: false, System.Text.Encoding.UTF8);
-    file.WriteLine("#version: 0.2 - Trained by `huggingface/tokenizers`");
-    foreach (var p in pairsArray)
-    {
-        file.WriteLine($"{VocabReverse[p.pair.First]} {VocabReverse[p.pair.Second]}");
-    }
-
-    return new string[] { vocabPath, mergePath };
-}
-
 /// Read the given files to extract the vocab and merges
-internal static (Dictionary<string, int>?, Vec<(string, string)>) ReadFile(string vocab, string? merges)
+internal static (Dictionary<string, int>?, Vec<(string, string)>) ReadModelData(Stream vocab, Stream? merges)
 {
-    Dictionary<string, int>? dic;
-    using (Stream stream = File.OpenRead(vocab))
-    {
-        dic = JsonSerializer.Deserialize<Dictionary<string, int>>(stream) as Dictionary<string, int>;
-    }
+    Dictionary<string, int>? dic = JsonSerializer.Deserialize<Dictionary<string, int>>(vocab) as Dictionary<string, int>;

     return (dic, ConvertMergesToHashmap(merges));
 }
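
As context for ReadModelData: the vocabulary stream is deserialized directly into a Dictionary<string, int>, so vocab.json is a flat token-to-id map. A hypothetical minimal file (contents are an illustrative assumption, not from the PR):

{ "[Unk]": 0, "h": 1, "e": 2, "l": 3, "o": 4, "he": 5, "hel": 6, "hell": 7, "hello": 8 }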
@@ -287,23 +272,32 @@

 /// Converts the merges strings (for example from `merges.txt` file) with the format
 /// "{pair_a} {pair_b}" into the format expected by the BPE struct
-internal static Vec<(string, string)> ConvertMergesToHashmap(string? mergesFile)
+internal static Vec<(string, string)> ConvertMergesToHashmap(Stream? mergesStream)
 {
-    if (mergesFile is null)
+    if (mergesStream is null)
     {
         return new Vec<(string, string)>();
     }

+    using StreamReader reader = new StreamReader(mergesStream);
+
     Vec<(string, string)> merges = new(1000);

     int lineNumber = 0;
-    foreach (string line in System.IO.File.ReadLines(mergesFile))
+    while (true)
     {
+        string? line = reader.ReadLine();
+        if (line is null)
+        {
+            break;
+        }
+
         lineNumber++;
         if (line.StartsWith("#version", StringComparison.Ordinal) || line.Length == 0)
         {
             continue;
         }

         int index = line.IndexOf(' ');
         if (index < 0 || index == line.Length - 1 || line.IndexOf(' ', index + 1) >= 0)
         {
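
For reference, ConvertMergesToHashmap expects one merge pair per line in the form "{pair_a} {pair_b}" separated by exactly one space, skipping blank lines and a leading "#version" header. A hypothetical merges.txt this parser would accept (consistent with the hypothetical vocab.json sketched earlier, so every operand and merged result exists in the vocabulary):

#version: 0.2 - Trained by `huggingface/tokenizers`
h e
he l
hel l
hell o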