diff --git a/src/Microsoft.ML.Tokenizers/Model/BPE.cs b/src/Microsoft.ML.Tokenizers/Model/BPE.cs
index 686669518b..cc4700491b 100644
--- a/src/Microsoft.ML.Tokenizers/Model/BPE.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/BPE.cs
@@ -56,17 +56,17 @@ private set
///
/// A prefix to be used for every subword that is not a beginning-of-word
///
- public string? ContinuingSubwordPrefix { get; private set; }
+ public string? ContinuingSubwordPrefix { get; }
///
/// An optional suffix to characterize an end-of-word sub-word
///
- public string? EndOfWordSuffix { get; private set; }
+ public string? EndOfWordSuffix { get; }
///
/// Gets whether multiple unknown tokens are allowed to be fused
///
- public bool FuseUnknownTokens { get; private set; }
+ public bool FuseUnknownTokens { get; }
///
@@ -146,6 +146,11 @@ private Bpe(Stream vocabStream, Stream? mergesStream, string? unknownToken, stri
throw new InvalidOperationException($"Trying to merge a token '{mergeValues.b}' which not exist in the vocabulary.");
}
+ if (mergeValues.b.Length <= prefixLen)
+ {
+ throw new InvalidOperationException($"The merge value '{mergeValues.b}' is too short to be merged with a prefix of length {prefixLen}. This implies that the merge file is either damaged or missing the prefix in its entries.");
+ }
+
string newToken = $"{mergeValues.a}{mergeValues.b.Substring(prefixLen)}";
if (!_vocab.TryGetValue(newToken, out int newId))
{
@@ -252,19 +257,19 @@ internal static (Dictionary?, Vec<(string, string)>) ReadModelData(
private readonly Dictionary _vocab;
/// Contains the mapping between Pairs and their (rank, newId).
- internal Dictionary, (int, int)> Merges { get; set; }
+ internal Dictionary, (int, int)> Merges { get; }
/// Contains the cache for optimizing the encoding step.
- internal Cache? Cache { get; set; }
+ internal Cache? Cache { get; }
internal static readonly int DefaultCacheCapacity = 10_000;
/// Reversed vocabulary, to rebuild the text.
- internal SortedDictionary VocabReverse { get; set; }
+ internal SortedDictionary VocabReverse { get; }
/// Dropout probability for merges. 0 = no dropout is the default. At 1.0, tokenization will
/// perform no merges, so the result will just be characters.
- internal float? Dropout { get; set; }
+ internal float? Dropout { get; }
/// Converts the merges strings (for example from `merges.txt` file) with the format
/// "{pair_a} {pair_b}" into the format expected by the BPE struct