Skip to content

Commit

Permalink
added byte level bpe for gpt2 (initial version)
Browse files Browse the repository at this point in the history
  • Loading branch information
Sergei Alonichau committed Mar 4, 2021
1 parent 66adfa8 commit 9d0c9dd
Show file tree
Hide file tree
Showing 33 changed files with 50,703 additions and 100,613 deletions.
4 changes: 4 additions & 0 deletions blingfireclient.library/inc/FADictConfKeeper.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class FADictConfKeeper {
const int GetDirection () const;
const FAMultiMapCA * GetCharMap () const;
const int GetTokAlgo () const;
const int GetIdOffset () const;

private:
// input LDB
Expand All @@ -81,6 +82,9 @@ class FADictConfKeeper {
FAMultiMap_pack_fixed * m_pCharMap;
// indicates what runtime algo to use with these data
int m_TokAlgo;
// specifies a value to be added to all IDs from a tokenizer
int m_IdOffset;

};

}
Expand Down
4 changes: 4 additions & 0 deletions blingfireclient.library/inc/FAFsmConst.h
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ class FAFsmConst {
PARAM_MAX_LENGTH, // maximum length, e.g. maximum token length
PARAM_VERIFY_LDB_BIN, // if specified, requires a CRC32-like check for the LDB file to pass
PARAM_TOKENIZATION_TYPE, // specifies which tokenization runtime should be used
PARAM_ID_OFFSET, // specifies the integer value to be added to all output IDs (used in Bling Fire tokenizer)
PARAM_COUNT,
};

Expand Down Expand Up @@ -399,6 +400,9 @@ class FAFsmConst {
TOKENIZE_BPE = 3,
TOKENIZE_BPE_OPT = 4, // optimized version of the BPE, prefers a single token match over
// subtoken, assumes tokens are delimited with U+x2581
TOKENIZE_BPE_BYTE_OPT = 5,// optimized version of the BPE, prefers a single token match over
// a subtoken; assumes tokens are delimited with U+x2581 and uses
// UTF-8 bytes as input characters
TOKENIZE_COUNT,
};

Expand Down
26 changes: 26 additions & 0 deletions blingfireclient.library/inc/FAUtf8Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,32 @@ const int FAStrUtf8ToArray (
const int MaxSize
);


/// Converts UTF8 string of specified length to the array of ints
/// using each byte as a character.
/// Returns the number of used elements in the array.
/// Never returns -1: at the byte level every input sequence is valid.
const int FAStrUtf8AsBytesToArray (
const char * pStr,
const int Len,
__out_ecount(MaxSize) int * pArray,
const int MaxSize
);

/// Converts UTF8 string of specified length to the array of ints
/// using each byte as a character and, for each returned character,
/// returns its offset in pStr.
/// Returns the number of used elements in the array.
/// Never returns -1: at the byte level every input sequence is valid.
const int FAStrUtf8AsBytesToArray (
const char * pStr,
const int Len,
__out_ecount(MaxSize) int * pArray,
__out_ecount(MaxSize) int * pOffsets,
const int MaxSize
);


/// Converts int (UTF-32LE) into UTF-16LE encoded string
/// returns end pointer of the output sequence
/// return NULL if buffer length is insufficient
Expand Down
16 changes: 15 additions & 1 deletion blingfireclient.library/src/FADictConfKeeper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ FADictConfKeeper::FADictConfKeeper () :
m_NoTrUse (true),
m_Direction (FAFsmConst::DIR_L2R),
m_pCharMap (NULL),
m_TokAlgo (FAFsmConst::TOKENIZE_DEFAULT)
m_TokAlgo (FAFsmConst::TOKENIZE_DEFAULT),
m_IdOffset (0)
{}


Expand Down Expand Up @@ -94,6 +95,11 @@ void FADictConfKeeper::Init (const int * pValues, const int Size)

break;
}
case FAFsmConst::PARAM_ID_OFFSET:
{
m_IdOffset = pValues [++i];
break;
}
case FAFsmConst::PARAM_FSM_TYPE:
{
m_FsmType = pValues [++i];
Expand Down Expand Up @@ -250,6 +256,8 @@ void FADictConfKeeper::Clear ()
m_Direction = FAFsmConst::DIR_L2R;
m_pI2Info = NULL;
m_FsmType = FAFsmConst::TYPE_MEALY_DFA;
m_TokAlgo = FAFsmConst::TOKENIZE_DEFAULT;
m_IdOffset = 0;
}


Expand Down Expand Up @@ -317,4 +325,10 @@ const int FADictConfKeeper::GetTokAlgo () const
return m_TokAlgo;
}

// Returns the constant to be added to all output IDs produced by the
// tokenizer (configured via PARAM_ID_OFFSET, defaults to 0).
const int FADictConfKeeper::GetIdOffset () const
{
    return m_IdOffset;
}


}
69 changes: 69 additions & 0 deletions blingfireclient.library/src/FAUtf8Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,75 @@ const int FAStrUtf8ToArray (
return i;
}


// Copies every byte of the UTF-8 input (after an optional BOM) into pArray
// as one integer character. Returns the number of elements written; stops
// early if pArray has fewer than the needed MaxSize elements.
const int FAStrUtf8AsBytesToArray (
        const char * pStr,
        const int Len,
        __out_ecount(MaxSize) int * pArray,
        const int MaxSize
    )
{
    DebugLogAssert (0 == Len || pStr);
    DebugLogAssert (pArray);

    const char * pCurr = pStr;
    const char * pStrEnd = pStr + Len;

    // skip a leading Byte-Order-Mark, if present (UTF-8 encoded U+FEFF)
    if (3 <= Len &&
        0xEF == (unsigned char) pCurr [0] &&
        0xBB == (unsigned char) pCurr [1] &&
        0xBF == (unsigned char) pCurr [2]) {
        pCurr += 3;
    }

    // emit one output character per remaining input byte
    int Count = 0;
    for (; pCurr < pStrEnd && Count < MaxSize; ++Count) {
        pArray [Count] = (unsigned char) *pCurr++;
    }

    return Count;
}


// Same as the overload above, but additionally records, for every output
// character, the offset of its source byte from the start of pStr (offsets
// count from the original buffer start, so they include any skipped BOM).
const int FAStrUtf8AsBytesToArray (
        const char * pStr,
        const int Len,
        __out_ecount(MaxSize) int * pArray,
        __out_ecount(MaxSize) int * pOffsets,
        const int MaxSize
    )
{
    DebugLogAssert (0 == Len || pStr);
    DebugLogAssert (pArray && pOffsets);

    const char * pBegin = pStr;
    const char * pCurr = pStr;
    const char * pStrEnd = pStr + Len;

    // skip a leading Byte-Order-Mark, if present (UTF-8 encoded U+FEFF)
    if (3 <= Len &&
        0xEF == (unsigned char) pCurr [0] &&
        0xBB == (unsigned char) pCurr [1] &&
        0xBF == (unsigned char) pCurr [2]) {
        pCurr += 3;
    }

    // emit one output character per remaining input byte, recording
    // each byte's position relative to the original buffer start
    int Count = 0;
    for (; pCurr < pStrEnd && Count < MaxSize; ++Count) {
        pOffsets [Count] = (int) (pCurr - pBegin);
        pArray [Count] = (unsigned char) *pCurr++;
    }

    return Count;
}


wchar_t * FAIntToUtf16LE (
int Symbol,
__out_ecount(MaxSize) wchar_t * ptr,
Expand Down
43 changes: 28 additions & 15 deletions blingfiretools/blingfiretokdll/blingfiretokdll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,14 @@ struct FAModelData
// BPE runtime, it uses the m_DictConf for data
FATokenSegmentationTools_1best_bpe_t < int > m_SegEngineBpe;
bool m_isBpe;
// indicates whether characters are bytes of the UTF-8 input rather than the Unicode symbols
bool m_useRawBytes;

FAModelData ():
m_hasWbd (false),
m_hasSeg (false),
m_isBpe (false)
m_isBpe (false),
m_useRawBytes (false)
{}
};

Expand Down Expand Up @@ -826,11 +829,14 @@ void* SetModelData(FAModelData * pNewModelData, const unsigned char * pImgBytes)

// check if this is a Unigram LM or BPE model
pNewModelData->m_isBpe = FAFsmConst::TOKENIZE_BPE == pNewModelData->m_DictConf.GetTokAlgo()
|| FAFsmConst::TOKENIZE_BPE_OPT == pNewModelData->m_DictConf.GetTokAlgo();
|| FAFsmConst::TOKENIZE_BPE_OPT == pNewModelData->m_DictConf.GetTokAlgo()
|| FAFsmConst::TOKENIZE_BPE_BYTE_OPT == pNewModelData->m_DictConf.GetTokAlgo();

// see if we need to treat UTF-8 bytes as input
pNewModelData->m_useRawBytes = FAFsmConst::TOKENIZE_BPE_BYTE_OPT == pNewModelData->m_DictConf.GetTokAlgo();

// initialize the segmentation engine
if (pNewModelData->m_isBpe)
{
if (pNewModelData->m_isBpe) {
pNewModelData->m_SegEngineBpe.SetConf(&pNewModelData->m_DictConf);
} else {
pNewModelData->m_SegEngine.SetConf(&pNewModelData->m_DictConf);
Expand Down Expand Up @@ -1180,20 +1186,27 @@ const int TextToIdsWithOffsets_sp(
pOffsets[0] = 0; // added for prepended first character
}

// convert input to UTF-32 (write past the added first space)
int BuffSize = fNeedOffsets ?
::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + 1, pOffsets + 1, InUtf8StrByteCount) :
::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + 1, InUtf8StrByteCount);
if (BuffSize <= 0 || BuffSize > InUtf8StrByteCount) {
return 0;
}
BuffSize++; // to accomodate the first space

// get the model data
const FAModelData * pModelData = (const FAModelData *)ModelPtr;
const FADictConfKeeper * pConf = &(pModelData->m_DictConf);
const FAMultiMapCA * pCharMap = pConf->GetCharMap ();

// convert input to UTF-32 or bytes (write output past the added first space)
int BuffSize;
if(false == pModelData->m_useRawBytes) {
BuffSize = fNeedOffsets ?
::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + 1, pOffsets + 1, InUtf8StrByteCount) :
::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + 1, InUtf8StrByteCount);
} else {
BuffSize = fNeedOffsets ?
::FAStrUtf8AsBytesToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + 1, pOffsets + 1, InUtf8StrByteCount) :
::FAStrUtf8AsBytesToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + 1, InUtf8StrByteCount);
}
if (BuffSize <= 0 || BuffSize > InUtf8StrByteCount) {
return 0;
}
BuffSize++; // to accomodate the first space

// needed for normalization
std::vector< int > utf32input_norm;
int * pNormBuff = NULL;
Expand Down Expand Up @@ -1292,13 +1305,13 @@ const int TextToIdsWithOffsets_sp(
}

int OutSize = 0;

int IdOffset = pConf->GetIdOffset (); // see if we need to shift output IDs by a constant
// return the ids only
for (int i = 0; i < WbdOutSize && OutSize < MaxIdsArrLength; i += 3) {

// copy id
const int id = pWbdResults [i];
pIdsArr [OutSize] = id;
pIdsArr [OutSize] = id + IdOffset;

// copy offsets if needed
if (fNeedOffsets) {
Expand Down
3 changes: 3 additions & 0 deletions blingfiretools/fa_build_conf/fa_build_conf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ void SetupConfParams ()
g_parser.AddNumParam ("max-distance", FAFsmConst::PARAM_MAX_DISTANCE);
g_parser.AddNumParam ("max-ambiguous-distance", FAFsmConst::PARAM_MAX_DISTANCE);
g_parser.AddNumParam ("threshold", FAFsmConst::PARAM_THRESHOLD);
g_parser.AddNumParam ("id-offset", FAFsmConst::PARAM_ID_OFFSET);

// requires a CRC32-like check for the LDB file to pass
g_parser.AddParam ("verify-ldb-bin", FAFsmConst::PARAM_VERIFY_LDB_BIN);
Expand All @@ -262,6 +263,8 @@ void SetupConfParams ()
"bpe", FAFsmConst::TOKENIZE_BPE);
g_parser.AddStrParam ("tokalgo", FAFsmConst::PARAM_TOKENIZATION_TYPE,
"bpe-opt", FAFsmConst::TOKENIZE_BPE_OPT);
g_parser.AddStrParam ("tokalgo", FAFsmConst::PARAM_TOKENIZATION_TYPE,
"byte-bpe-opt", FAFsmConst::TOKENIZE_BPE_BYTE_OPT);

}

Expand Down
Binary file added dist-pypi/blingfire/gpt2.bin
Binary file not shown.
Binary file modified dist-pypi/blingfire/libblingfiretokdll.so
Binary file not shown.
Binary file added dist-pypi/blingfire/uri100k.bin
Binary file not shown.
Binary file added dist-pypi/blingfire/uri250k.bin
Binary file not shown.
Binary file added dist-pypi/blingfire/uri500k.bin
Binary file not shown.
4 changes: 2 additions & 2 deletions dist-pypi/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@

setup(
name="blingfire",
version="0.1.3",
version="0.1.5",
author="Bling",
author_email="bling@microsoft.com",
description="Python wrapper of lightening fast Finite State machine and REgular expression manipulation library.",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/microsoft/blingfire/",
packages=['blingfire'],
package_data={'blingfire':['bert_base_tok.bin','bert_base_cased_tok.bin','bert_chinese.bin','bert_multi_cased.bin','wbd_chuni.bin','xlnet.bin','xlnet_nonorm.bin','xlm_roberta_base.bin','laser100k.bin','laser250k.bin','laser500k.bin','libblingfiretokdll.so','blingfiretokdll.dll','libblingfiretokdll.dylib']},
package_data={'blingfire':['bert_base_tok.bin','bert_base_cased_tok.bin','bert_chinese.bin','bert_multi_cased.bin','wbd_chuni.bin','xlnet.bin','xlnet_nonorm.bin','xlm_roberta_base.bin','gpt2.bin','laser100k.bin','laser250k.bin','laser500k.bin','uri100k.bin','uri250k.bin','uri500k.bin','libblingfiretokdll.so','blingfiretokdll.dll','libblingfiretokdll.dylib']},
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
Expand Down
35 changes: 29 additions & 6 deletions ldbsrc/gpt2/README.TXT
Original file line number Diff line number Diff line change
@@ -1,10 +1,33 @@
ruby export_vocab.rb
# For details of the original implementation see this https://huggingface.co/transformers/_modules/transformers/models/gpt2/tokenization_gpt2.html#GPT2Tokenizer
# Note: we will not use UTF-8 as the input encoding for the dictionary; instead each symbol is represented as an integer.
# This is because GPT-2 uses a byte-level alphabet. We need to specify --input-enc=DEC when we build the dictionary
# (see fa_line2chain_unicode --help for details).

# produce pos.dict.utf8 file and tagset.txt:
cat spiece.model.exportvocab.txt | awk 'BEGIN {FS="\t"} NF == 2 { if (NR > 1) { print $1 "\tWORD_ID_" NR-1 "\t" ($2 == 0 ? "-0.00001" : $2); } print "WORD_ID_" NR " " NR > "tagset.txt"; }' > pos.dict.utf8
# run this python script to generate tagset.txt and pos.dict.utf8 from vocab.json
python export_vocab.py

# zip it:
# zip the dictionary file
zip pos.dict.utf8.zip pos.dict.utf8

# build as usual
make -f Makefile.gnu lang=gpt2 all
# make sure the tools are compiled and are in the path, see wiki for details

# build LDB as usual, you should get an output like below and no error messages (do "clean" target if encounter errors)
~/BlingFire/ldbsrc$ make -f Makefile.gnu lang=gpt2 all

fa_build_conf \
--in=gpt2/ldb.conf.small \
--out=gpt2/tmp/ldb.mmap.small.txt
fa_fsm2fsm_pack --type=mmap \
--in=gpt2/tmp/ldb.mmap.small.txt \
--out=gpt2/tmp/ldb.conf.small.dump \
--auto-test
unzip -p gpt2/pos.dict.utf8.zip | \
fa_build_dict --input-enc=DEC --type=mph --raw --tagset=gpt2/tagset.txt --float-nums \
--out-fsm=gpt2/tmp/pos.dict.fsm.txt \
--out-k2i=gpt2/tmp/pos.dict.k2i.txt \
--out-i2info=gpt2/tmp/pos.dict.i2t.txt
fa_fsm2fsm_pack --alg=triv --type=mealy-dfa --in=gpt2/tmp/pos.dict.fsm.txt --out=gpt2/tmp/pos.dict.fsm.small.dump --auto-test
fa_fsm2fsm_pack --alg=triv --type=arr --force-flat --in=gpt2/tmp/pos.dict.k2i.txt --out=gpt2/tmp/pos.dict.k2i.small.dump --auto-test
fa_fsm2fsm_pack --alg=fixed --type=mmap --in=gpt2/tmp/pos.dict.i2t.txt --out=gpt2/tmp/pos.dict.i2t.small.dump --auto-test
fa_merge_dumps --out=ldb/gpt2.bin gpt2/tmp/ldb.conf.small.dump gpt2/tmp/pos.dict.fsm.small.dump gpt2/tmp/pos.dict.k2i.small.dump gpt2/tmp/pos.dict.i2t.small.dump

Loading

0 comments on commit 9d0c9dd

Please sign in to comment.