diff --git a/src/Microsoft.ML.Tokenizers/Model/BPE.cs b/src/Microsoft.ML.Tokenizers/Model/BPE.cs index 686669518b..cc4700491b 100644 --- a/src/Microsoft.ML.Tokenizers/Model/BPE.cs +++ b/src/Microsoft.ML.Tokenizers/Model/BPE.cs @@ -56,17 +56,17 @@ private set /// /// A prefix to be used for every subword that is not a beginning-of-word /// - public string? ContinuingSubwordPrefix { get; private set; } + public string? ContinuingSubwordPrefix { get; } /// /// An optional suffix to characterize and end-of-word sub-word /// - public string? EndOfWordSuffix { get; private set; } + public string? EndOfWordSuffix { get; } /// /// Gets or sets whether allowing multiple unknown tokens get fused /// - public bool FuseUnknownTokens { get; private set; } + public bool FuseUnknownTokens { get; } /// @@ -146,6 +146,11 @@ private Bpe(Stream vocabStream, Stream? mergesStream, string? unknownToken, stri throw new InvalidOperationException($"Trying to merge a token '{mergeValues.b}' which not exist in the vocabulary."); } + if (mergeValues.b.Length <= prefixLen) + { + throw new InvalidOperationException($"The merge value '{mergeValues.b}' is too short to be merged with a prefix of length {prefixLen}. This implies that the merge file is either damaged or missing the prefix in its entries."); + } + string newToken = $"{mergeValues.a}{mergeValues.b.Substring(prefixLen)}"; if (!_vocab.TryGetValue(newToken, out int newId)) { @@ -252,19 +257,19 @@ internal static (Dictionary?, Vec<(string, string)>) ReadModelData( private readonly Dictionary _vocab; /// Contains the mapping between Pairs and their (rank, newId). - internal Dictionary, (int, int)> Merges { get; set; } + internal Dictionary, (int, int)> Merges { get; } /// Contains the cache for optimizing the encoding step. - internal Cache? Cache { get; set; } + internal Cache? Cache { get; } internal static readonly int DefaultCacheCapacity = 10_000; /// Reversed vocabulary, to rebuild the text. - internal SortedDictionary VocabReverse { get; set; } + internal SortedDictionary VocabReverse { get; } /// Dropout probability for merges. 0 = no dropout is the default. At 1.0, tokenization will /// perform no merges, so the result will just be characters. - internal float? Dropout { get; set; } + internal float? Dropout { get; } /// Converts the merges strings (for example from `merges.txt` file) with the format /// "{pair_a} {pair_b}" into the format expected by the BPE struct