Skip to content

Commit

Permalink
added byte level bpe for gpt2 (initial version)
Browse files Browse the repository at this point in the history
  • Loading branch information
Sergei Alonichau committed Mar 4, 2021
1 parent 66adfa8 commit 9d0c9dd
Show file tree
Hide file tree
Showing 33 changed files with 50,703 additions and 100,613 deletions.
4 changes: 4 additions & 0 deletions blingfireclient.library/inc/FADictConfKeeper.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class FADictConfKeeper {
const int GetDirection () const;
const FAMultiMapCA * GetCharMap () const;
const int GetTokAlgo () const;
const int GetIdOffset () const;

private:
// input LDB
Expand All @@ -81,6 +82,9 @@ class FADictConfKeeper {
FAMultiMap_pack_fixed * m_pCharMap;
// indicates what runtime algo to use with these data
int m_TokAlgo;
// specifies a value to be added to all IDs from a tokenizer
int m_IdOffset;

};

}
Expand Down
4 changes: 4 additions & 0 deletions blingfireclient.library/inc/FAFsmConst.h
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ class FAFsmConst {
PARAM_MAX_LENGTH, // maximum length, e.g. maximum token length
PARAM_VERIFY_LDB_BIN, // if specified, requires a CRC32-like check for the LDB file to pass
PARAM_TOKENIZATION_TYPE, // specifies which tokenization runtime should be used
PARAM_ID_OFFSET, // specifies the integer value to be added to all output IDs (used in Bling Fire tokenizer)
PARAM_COUNT,
};

Expand Down Expand Up @@ -399,6 +400,9 @@ class FAFsmConst {
TOKENIZE_BPE = 3,
TOKENIZE_BPE_OPT = 4, // optimized version of the BPE, prefers a single token match over
// subtoken, assumes tokens are delimited with U+x2581
TOKENIZE_BPE_BYTE_OPT = 5,// optimized version of the BPE, prefers a single token match over
// a subtoken; assumes tokens are delimited with U+x2581 and uses
// UTF-8 bytes as input characters
TOKENIZE_COUNT,
};

Expand Down
26 changes: 26 additions & 0 deletions blingfireclient.library/inc/FAUtf8Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,32 @@ const int FAStrUtf8ToArray (
const int MaxSize
);


/// Converts UTF8 string of specified length to the array of ints
/// using each byte as a character.
/// Returns the number of used elements in the array.
/// Never returns -1: at the byte level every input sequence is valid.
const int FAStrUtf8AsBytesToArray (
const char * pStr,
const int Len,
__out_ecount(MaxSize) int * pArray,
const int MaxSize
);

/// Converts UTF8 string of specified length to the array of ints
/// using each byte as a character and, for each returned character,
/// returns its offset in pStr.
/// Returns the number of used elements in the array.
/// Never returns -1: at the byte level every input sequence is valid.
const int FAStrUtf8AsBytesToArray (
const char * pStr,
const int Len,
__out_ecount(MaxSize) int * pArray,
__out_ecount(MaxSize) int * pOffsets,
const int MaxSize
);


/// Converts int (UTF-32LE) into UTF-16LE encoded string
/// returns end pointer of the output sequence
/// return NULL if buffer length is insufficient
Expand Down
16 changes: 15 additions & 1 deletion blingfireclient.library/src/FADictConfKeeper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ FADictConfKeeper::FADictConfKeeper () :
m_NoTrUse (true),
m_Direction (FAFsmConst::DIR_L2R),
m_pCharMap (NULL),
m_TokAlgo (FAFsmConst::TOKENIZE_DEFAULT)
m_TokAlgo (FAFsmConst::TOKENIZE_DEFAULT),
m_IdOffset (0)
{}


Expand Down Expand Up @@ -94,6 +95,11 @@ void FADictConfKeeper::Init (const int * pValues, const int Size)

break;
}
case FAFsmConst::PARAM_ID_OFFSET:
{
m_IdOffset = pValues [++i];
break;
}
case FAFsmConst::PARAM_FSM_TYPE:
{
m_FsmType = pValues [++i];
Expand Down Expand Up @@ -250,6 +256,8 @@ void FADictConfKeeper::Clear ()
m_Direction = FAFsmConst::DIR_L2R;
m_pI2Info = NULL;
m_FsmType = FAFsmConst::TYPE_MEALY_DFA;
m_TokAlgo = FAFsmConst::TOKENIZE_DEFAULT;
m_IdOffset = 0;
}


Expand Down Expand Up @@ -317,4 +325,10 @@ const int FADictConfKeeper::GetTokAlgo () const
return m_TokAlgo;
}

// Returns the constant to be added to all output IDs produced by the
// tokenizer (configured via PARAM_ID_OFFSET, defaults to 0).
const int FADictConfKeeper::GetIdOffset () const
{
    return m_IdOffset;
}


}
69 changes: 69 additions & 0 deletions blingfireclient.library/src/FAUtf8Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,75 @@ const int FAStrUtf8ToArray (
return i;
}


// Copies every byte of the UTF-8 input (after an optional BOM) into pArray
// as one integer character. Returns the number of elements written; stops
// early if pArray has fewer than the needed MaxSize elements.
const int FAStrUtf8AsBytesToArray (
        const char * pStr,
        const int Len,
        __out_ecount(MaxSize) int * pArray,
        const int MaxSize
    )
{
    DebugLogAssert (0 == Len || pStr);
    DebugLogAssert (pArray);

    const char * pCurr = pStr;
    const char * pStrEnd = pStr + Len;

    // skip a leading Byte-Order-Mark, if present (UTF-8 encoded U+FEFF)
    if (3 <= Len &&
        0xEF == (unsigned char) pCurr [0] &&
        0xBB == (unsigned char) pCurr [1] &&
        0xBF == (unsigned char) pCurr [2]) {
        pCurr += 3;
    }

    // emit one output character per remaining input byte
    int Count = 0;
    for (; pCurr < pStrEnd && Count < MaxSize; ++Count) {
        pArray [Count] = (unsigned char) *pCurr++;
    }

    return Count;
}


// Same as the overload above, but additionally records, for every output
// character, the offset of its source byte from the start of pStr (offsets
// count from the original buffer start, so they include any skipped BOM).
const int FAStrUtf8AsBytesToArray (
        const char * pStr,
        const int Len,
        __out_ecount(MaxSize) int * pArray,
        __out_ecount(MaxSize) int * pOffsets,
        const int MaxSize
    )
{
    DebugLogAssert (0 == Len || pStr);
    DebugLogAssert (pArray && pOffsets);

    const char * pBegin = pStr;
    const char * pCurr = pStr;
    const char * pStrEnd = pStr + Len;

    // skip a leading Byte-Order-Mark, if present (UTF-8 encoded U+FEFF)
    if (3 <= Len &&
        0xEF == (unsigned char) pCurr [0] &&
        0xBB == (unsigned char) pCurr [1] &&
        0xBF == (unsigned char) pCurr [2]) {
        pCurr += 3;
    }

    // emit one output character per remaining input byte, recording
    // each byte's position relative to the original buffer start
    int Count = 0;
    for (; pCurr < pStrEnd && Count < MaxSize; ++Count) {
        pOffsets [Count] = (int) (pCurr - pBegin);
        pArray [Count] = (unsigned char) *pCurr++;
    }

    return Count;
}


wchar_t * FAIntToUtf16LE (
int Symbol,
__out_ecount(MaxSize) wchar_t * ptr,
Expand Down
43 changes: 28 additions & 15 deletions blingfiretools/blingfiretokdll/blingfiretokdll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,14 @@ struct FAModelData
// BPE runtime, it uses the m_DictConf for data
FATokenSegmentationTools_1best_bpe_t < int > m_SegEngineBpe;
bool m_isBpe;
// indicates whether characters are bytes of the UTF-8 input rather than the Unicode symbols
bool m_useRawBytes;

FAModelData ():
m_hasWbd (false),
m_hasSeg (false),
m_isBpe (false)
m_isBpe (false),
m_useRawBytes (false)
{}
};

Expand Down Expand Up @@ -826,11 +829,14 @@ void* SetModelData(FAModelData * pNewModelData, const unsigned char * pImgBytes)

// check if this is a Unigram LM or BPE model
pNewModelData->m_isBpe = FAFsmConst::TOKENIZE_BPE == pNewModelData->m_DictConf.GetTokAlgo()
|| FAFsmConst::TOKENIZE_BPE_OPT == pNewModelData->m_DictConf.GetTokAlgo();
|| FAFsmConst::TOKENIZE_BPE_OPT == pNewModelData->m_DictConf.GetTokAlgo()
|| FAFsmConst::TOKENIZE_BPE_BYTE_OPT == pNewModelData->m_DictConf.GetTokAlgo();

// see if we need to treat UTF-8 bytes as input
pNewModelData->m_useRawBytes = FAFsmConst::TOKENIZE_BPE_BYTE_OPT == pNewModelData->m_DictConf.GetTokAlgo();

// initialize the segmentation engine
if (pNewModelData->m_isBpe)
{
if (pNewModelData->m_isBpe) {
pNewModelData->m_SegEngineBpe.SetConf(&pNewModelData->m_DictConf);
} else {
pNewModelData->m_SegEngine.SetConf(&pNewModelData->m_DictConf);
Expand Down Expand Up @@ -1180,20 +1186,27 @@ const int TextToIdsWithOffsets_sp(
pOffsets[0] = 0; // added for prepended first character
}

// convert input to UTF-32 (write past the added first space)
int BuffSize = fNeedOffsets ?
::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + 1, pOffsets + 1, InUtf8StrByteCount) :
::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + 1, InUtf8StrByteCount);
if (BuffSize <= 0 || BuffSize > InUtf8StrByteCount) {
return 0;
}
BuffSize++; // to accomodate the first space

// get the model data
const FAModelData * pModelData = (const FAModelData *)ModelPtr;
const FADictConfKeeper * pConf = &(pModelData->m_DictConf);
const FAMultiMapCA * pCharMap = pConf->GetCharMap ();

// convert input to UTF-32 or bytes (write output past the added first space)
int BuffSize;
if(false == pModelData->m_useRawBytes) {
BuffSize = fNeedOffsets ?
::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + 1, pOffsets + 1, InUtf8StrByteCount) :
::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + 1, InUtf8StrByteCount);
} else {
BuffSize = fNeedOffsets ?
::FAStrUtf8AsBytesToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + 1, pOffsets + 1, InUtf8StrByteCount) :
::FAStrUtf8AsBytesToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + 1, InUtf8StrByteCount);
}
if (BuffSize <= 0 || BuffSize > InUtf8StrByteCount) {
return 0;
}
BuffSize++; // to accomodate the first space

// needed for normalization
std::vector< int > utf32input_norm;
int * pNormBuff = NULL;
Expand Down Expand Up @@ -1292,13 +1305,13 @@ const int TextToIdsWithOffsets_sp(
}

int OutSize = 0;

int IdOffset = pConf->GetIdOffset (); // see if we need to shift output IDs by a constant
// return the ids only
for (int i = 0; i < WbdOutSize && OutSize < MaxIdsArrLength; i += 3) {

// copy id
const int id = pWbdResults [i];
pIdsArr [OutSize] = id;
pIdsArr [OutSize] = id + IdOffset;

// copy offsets if needed
if (fNeedOffsets) {
Expand Down
3 changes: 3 additions & 0 deletions blingfiretools/fa_build_conf/fa_build_conf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ void SetupConfParams ()
g_parser.AddNumParam ("max-distance", FAFsmConst::PARAM_MAX_DISTANCE);
g_parser.AddNumParam ("max-ambiguous-distance", FAFsmConst::PARAM_MAX_DISTANCE);
g_parser.AddNumParam ("threshold", FAFsmConst::PARAM_THRESHOLD);
g_parser.AddNumParam ("id-offset", FAFsmConst::PARAM_ID_OFFSET);

// requires a CRC32-like check for the LDB file to pass
g_parser.AddParam ("verify-ldb-bin", FAFsmConst::PARAM_VERIFY_LDB_BIN);
Expand All @@ -262,6 +263,8 @@ void SetupConfParams ()
"bpe", FAFsmConst::TOKENIZE_BPE);
g_parser.AddStrParam ("tokalgo", FAFsmConst::PARAM_TOKENIZATION_TYPE,
"bpe-opt", FAFsmConst::TOKENIZE_BPE_OPT);
g_parser.AddStrParam ("tokalgo", FAFsmConst::PARAM_TOKENIZATION_TYPE,
"byte-bpe-opt", FAFsmConst::TOKENIZE_BPE_BYTE_OPT);

}

Expand Down
Binary file added dist-pypi/blingfire/gpt2.bin
Binary file not shown.
Binary file modified dist-pypi/blingfire/libblingfiretokdll.so
Binary file not shown.
Binary file added dist-pypi/blingfire/uri100k.bin
Binary file not shown.
Binary file added dist-pypi/blingfire/uri250k.bin
Binary file not shown.
Binary file added dist-pypi/blingfire/uri500k.bin
Binary file not shown.
4 changes: 2 additions & 2 deletions dist-pypi/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@

setup(
name="blingfire",
version="0.1.3",
version="0.1.5",
author="Bling",
author_email="bling@microsoft.com",
description="Python wrapper of lightening fast Finite State machine and REgular expression manipulation library.",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/microsoft/blingfire/",
packages=['blingfire'],
package_data={'blingfire':['bert_base_tok.bin','bert_base_cased_tok.bin','bert_chinese.bin','bert_multi_cased.bin','wbd_chuni.bin','xlnet.bin','xlnet_nonorm.bin','xlm_roberta_base.bin','laser100k.bin','laser250k.bin','laser500k.bin','libblingfiretokdll.so','blingfiretokdll.dll','libblingfiretokdll.dylib']},
package_data={'blingfire':['bert_base_tok.bin','bert_base_cased_tok.bin','bert_chinese.bin','bert_multi_cased.bin','wbd_chuni.bin','xlnet.bin','xlnet_nonorm.bin','xlm_roberta_base.bin','gpt2.bin','laser100k.bin','laser250k.bin','laser500k.bin','uri100k.bin','uri250k.bin','uri500k.bin','libblingfiretokdll.so','blingfiretokdll.dll','libblingfiretokdll.dylib']},
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
Expand Down
35 changes: 29 additions & 6 deletions ldbsrc/gpt2/README.TXT
Original file line number Diff line number Diff line change
@@ -1,10 +1,33 @@
ruby export_vocab.rb
# For details of the original implementation see this https://huggingface.co/transformers/_modules/transformers/models/gpt2/tokenization_gpt2.html#GPT2Tokenizer
# Note: we will not use UTF-8 as the input encoding for the dictionary; instead each symbol is represented as an integer.
# This is because GPT-2 uses a byte-level alphabet. We need to specify --input-enc=DEC when we build the dictionary
# (see fa_line2chain_unicode --help for details).

# produce pos.dict.utf8 file and tagset.txt:
cat spiece.model.exportvocab.txt | awk 'BEGIN {FS="\t"} NF == 2 { if (NR > 1) { print $1 "\tWORD_ID_" NR-1 "\t" ($2 == 0 ? "-0.00001" : $2); } print "WORD_ID_" NR " " NR > "tagset.txt"; }' > pos.dict.utf8
# run this python script to generate tagset.txt and pos.dict.utf8 from vocab.json
python export_vocab.py

# zip it:
# zip the dictionary file
zip pos.dict.utf8.zip pos.dict.utf8

# build as usual
make -f Makefile.gnu lang=gpt2 all
# make sure the tools are compiled and are in the path, see wiki for details

# build LDB as usual, you should get an output like below and no error messages (do "clean" target if encounter errors)
~/BlingFire/ldbsrc$ make -f Makefile.gnu lang=gpt2 all

fa_build_conf \
--in=gpt2/ldb.conf.small \
--out=gpt2/tmp/ldb.mmap.small.txt
fa_fsm2fsm_pack --type=mmap \
--in=gpt2/tmp/ldb.mmap.small.txt \
--out=gpt2/tmp/ldb.conf.small.dump \
--auto-test
unzip -p gpt2/pos.dict.utf8.zip | \
fa_build_dict --input-enc=DEC --type=mph --raw --tagset=gpt2/tagset.txt --float-nums \
--out-fsm=gpt2/tmp/pos.dict.fsm.txt \
--out-k2i=gpt2/tmp/pos.dict.k2i.txt \
--out-i2info=gpt2/tmp/pos.dict.i2t.txt
fa_fsm2fsm_pack --alg=triv --type=mealy-dfa --in=gpt2/tmp/pos.dict.fsm.txt --out=gpt2/tmp/pos.dict.fsm.small.dump --auto-test
fa_fsm2fsm_pack --alg=triv --type=arr --force-flat --in=gpt2/tmp/pos.dict.k2i.txt --out=gpt2/tmp/pos.dict.k2i.small.dump --auto-test
fa_fsm2fsm_pack --alg=fixed --type=mmap --in=gpt2/tmp/pos.dict.i2t.txt --out=gpt2/tmp/pos.dict.i2t.small.dump --auto-test
fa_merge_dumps --out=ldb/gpt2.bin gpt2/tmp/ldb.conf.small.dump gpt2/tmp/pos.dict.fsm.small.dump gpt2/tmp/pos.dict.k2i.small.dump gpt2/tmp/pos.dict.i2t.small.dump

Loading

0 comments on commit 9d0c9dd

Please sign in to comment.