Skip to content

Commit

Permalink
Minor Bpe cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
tarekgh committed Feb 23, 2024
1 parent d48b32d commit 191ab03
Showing 1 changed file with 12 additions and 7 deletions.
19 changes: 12 additions & 7 deletions src/Microsoft.ML.Tokenizers/Model/BPE.cs
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,17 @@ private set
/// <summary>
/// A prefix to be used for every subword that is not a beginning-of-word
/// </summary>
public string? ContinuingSubwordPrefix { get; private set; }
public string? ContinuingSubwordPrefix { get; }

/// <summary>
/// An optional suffix to characterize an end-of-word sub-word
/// </summary>
public string? EndOfWordSuffix { get; private set; }
public string? EndOfWordSuffix { get; }

/// <summary>
/// Gets a value indicating whether multiple unknown tokens get fused
/// </summary>
public bool FuseUnknownTokens { get; private set; }
public bool FuseUnknownTokens { get; }


/// <summary>
Expand Down Expand Up @@ -146,6 +146,11 @@ private Bpe(Stream vocabStream, Stream? mergesStream, string? unknownToken, stri
throw new InvalidOperationException($"Trying to merge a token '{mergeValues.b}' which not exist in the vocabulary.");
}

if (mergeValues.b.Length <= prefixLen)
{
throw new InvalidOperationException($"The merge value '{mergeValues.b}' is too short to be merged with a prefix of length {prefixLen}. This implies that the merge file is either damaged or missing the prefix in its entries.");
}

string newToken = $"{mergeValues.a}{mergeValues.b.Substring(prefixLen)}";
if (!_vocab.TryGetValue(newToken, out int newId))
{
Expand Down Expand Up @@ -252,19 +257,19 @@ internal static (Dictionary<string, int>?, Vec<(string, string)>) ReadModelData(
private readonly Dictionary<string, int> _vocab;

/// Contains the mapping between Pairs and their (rank, newId).
internal Dictionary<Pair<int>, (int, int)> Merges { get; set; }
internal Dictionary<Pair<int>, (int, int)> Merges { get; }

/// Contains the cache for optimizing the encoding step.
internal Cache<string, Word>? Cache { get; set; }
internal Cache<string, Word>? Cache { get; }

internal static readonly int DefaultCacheCapacity = 10_000;

/// Reversed vocabulary, to rebuild the text.
internal SortedDictionary<int, string> VocabReverse { get; set; }
internal SortedDictionary<int, string> VocabReverse { get; }

/// Dropout probability for merges. 0 = no dropout is the default. At 1.0, tokenization will
/// perform no merges, so the result will just be characters.
internal float? Dropout { get; set; }
internal float? Dropout { get; }

/// Converts the merges strings (for example from `merges.txt` file) with the format
/// "{pair_a} {pair_b}" into the format expected by the BPE struct
Expand Down

0 comments on commit 191ab03

Please sign in to comment.