Skip to content

Commit

Permalink
added IgnoreSpecialTokens parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
Sergei Alonichau committed Sep 22, 2021
1 parent 39031c0 commit b04dfab
Show file tree
Hide file tree
Showing 51 changed files with 45 additions and 4 deletions.
2 changes: 2 additions & 0 deletions blingfireclient.library/inc/FAFsmConst.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,8 @@ class FAFsmConst {
PARAM_USE_BYTE_ENCODING, // specifies if input characters are UTF-8 bytes, not Unicode symbols
PARAM_NO_DUMMY_PREFIX, // for unigram-lm and BPE, if specified then no dummy whitespace is added in the beginning of text
PARAM_STRING_ARRAY,// string array dump index
PARAM_TOKENID_MIN, // if specified provides smallest regular (non special) token id value
PARAM_TOKENID_MAX, // if specified provides biggest regular (non special) token id value
PARAM_COUNT,
};

Expand Down
41 changes: 39 additions & 2 deletions blingfiretools/blingfiretokdll/blingfiretokdll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ struct FAModelData
// id2word lexicon data
bool m_hasI2w;
FAStringArray_pack m_i2w;
int m_min_token_id; // min regular token id, needed to separate special tokens
int m_max_token_id; // max regular token id, needed to separate special tokens


FAModelData ():
Expand All @@ -92,7 +94,9 @@ struct FAModelData
m_pAlgo (NULL),
m_useRawBytes (false),
m_hasHy (false),
m_hasI2w (false)
m_hasI2w (false),
m_min_token_id (0),
m_max_token_id (FALimits::MaxArrSize)
{}
};

Expand Down Expand Up @@ -1010,12 +1014,40 @@ void* SetModelData(FAModelData * pNewModelData, const unsigned char * pImgBytes)


// get the configuration parameters for the id2word lexicon [i2w]
pNewModelData->m_min_token_id = 0;
pNewModelData->m_max_token_id = FALimits::MaxArrSize;
pNewModelData->m_hasI2w = false;

pValues = NULL;
iSize = pNewModelData->m_Ldb.GetHeader ()->Get (FAFsmConst::FUNC_I2W, &pValues);

// see if the [i2w] section is present
if (-1 != iSize) {

for (int i = 0; i < iSize; ++i) {

if (pValues [i] == FAFsmConst::PARAM_STRING_ARRAY && i + 1 < iSize) {

const int DumpNum = pValues [++i];
const unsigned char * pDump = pNewModelData->m_Ldb.GetDump (DumpNum);

if (NULL == pDump) {
return NULL;
}

pNewModelData->m_i2w.SetImage(pDump);
pNewModelData->m_hasI2w = true;

} else if (pValues [i] == FAFsmConst::PARAM_TOKENID_MIN && i + 1 < iSize) {

pNewModelData->m_min_token_id = pValues [++i];

} else if (pValues [i] == FAFsmConst::PARAM_TOKENID_MAX && i + 1 < iSize) {

pNewModelData->m_max_token_id = pValues [++i];
}
}

if (1 < iSize && pValues [0] == FAFsmConst::PARAM_STRING_ARRAY) {

const int DumpNum = pValues [1];
Expand Down Expand Up @@ -1673,7 +1705,7 @@ int SetNoDummyPrefix(void* ModelPtr, bool fNoDummyPrefix)
// if the actual string length is more than MaxOutUtf8StrByteCount then pOutUtf8Str content is undefined
//
extern "C"
int IdsToText (void* ModelPtr, const int32_t * pIdsArr, const int IdsCount, char * pOutUtf8Str, const int MaxOutUtf8StrByteCount)
int IdsToText (void* ModelPtr, const int32_t * pIdsArr, const int IdsCount, char * pOutUtf8Str, const int MaxOutUtf8StrByteCount, bool SkipSpecialTokens)
{
if (NULL == ModelPtr) {
return 0;
Expand All @@ -1694,6 +1726,11 @@ int IdsToText (void* ModelPtr, const int32_t * pIdsArr, const int IdsCount, char
// get the next id
const int id = pIdsArr [i];

// skip special tokens, if needed
if (SkipSpecialTokens && (id < pModel->m_min_token_id || id > pModel->m_max_token_id)) {
continue;
}

// get token text
const unsigned char * pToken = NULL;
int TokenLength = pModel->m_i2w.GetAt (id, &pToken);
Expand Down
2 changes: 2 additions & 0 deletions blingfiretools/fa_build_conf/fa_build_conf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ void SetupConfParams ()
g_parser.AddNumParam ("max-length", FAFsmConst::PARAM_MAX_LENGTH);
g_parser.AddNumParam ("max-token-length", FAFsmConst::PARAM_MAX_LENGTH);
g_parser.AddNumParam ("string-array", FAFsmConst::PARAM_STRING_ARRAY);
g_parser.AddNumParam ("token-id-min", FAFsmConst::PARAM_TOKENID_MIN);
g_parser.AddNumParam ("token-id-max", FAFsmConst::PARAM_TOKENID_MAX);

// WRE-compiler related parameters (not used at runtime)
g_parser.AddNumParam ("fsm-count", FAFsmConst::PARAM_FSM_COUNT);
Expand Down
4 changes: 2 additions & 2 deletions dist-pypi/blingfire/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,15 +253,15 @@ def text_to_ids(h, s, max_len, unk = 0, no_padding = False):
return np.frombuffer(o_bytes, dtype=c_uint32, count = out_count)


def ids_to_text(h, ids, output_buffer_size = None):
def ids_to_text(h, ids, skip_special_tokens = True, output_buffer_size = None):
# pick a default output buffer size if the caller did not provide one
if output_buffer_size is None:
output_buffer_size = len(ids) * 50
# allocate the output buffer
o_bytes = create_string_buffer(output_buffer_size)
o_bytes_count = len(o_bytes)
# compute the text from ids
o_len = blingfire.IdsToText(c_void_p(h), c_void_p(ids.__array_interface__['data'][0]), len(ids), byref(o_bytes), c_int(o_bytes_count))
o_len = blingfire.IdsToText(c_void_p(h), c_void_p(ids.__array_interface__['data'][0]), len(ids), byref(o_bytes), c_int(o_bytes_count), c_bool(skip_special_tokens))
# check if no error has happened
if -1 == o_len or o_len > o_bytes_count:
return ''
Expand Down
Binary file modified dist-pypi/blingfire/bert_base_cased_tok.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/bert_base_tok.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/bert_chinese.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/bert_multi_cased.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/gpt2.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/laser100k.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/laser250k.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/laser500k.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/roberta.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/uri100k.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/uri250k.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/uri500k.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/xlm_roberta_base.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/xlnet.i2w
Binary file not shown.
Binary file modified dist-pypi/blingfire/xlnet_nonorm.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/bert_base_cased_tok.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/bert_base_tok.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/bert_chinese.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/bert_multi_cased.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/gpt2.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/laser100k.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/laser250k.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/laser500k.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/laser50k.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/roberta.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/uri100k.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/uri250k.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/uri500k.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/xlm_roberta_base.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/xlnet.i2w
Binary file not shown.
Binary file modified ldbsrc/ldb/xlnet_nonorm.i2w
Binary file not shown.
Binary file added nuget/lib/bert_base_cased_tok.i2w
Binary file not shown.
Binary file added nuget/lib/bert_base_tok.i2w
Binary file not shown.
Binary file added nuget/lib/bert_chinese.i2w
Binary file not shown.
Binary file added nuget/lib/bert_multi_cased.i2w
Binary file not shown.
Binary file added nuget/lib/gpt2.i2w
Binary file not shown.
Binary file added nuget/lib/laser100k.i2w
Binary file not shown.
Binary file added nuget/lib/laser250k.i2w
Binary file not shown.
Binary file added nuget/lib/laser500k.i2w
Binary file not shown.
Binary file modified nuget/lib/libblingfiretokdll.so
Binary file not shown.
Binary file added nuget/lib/roberta.i2w
Binary file not shown.
Binary file added nuget/lib/uri100k.i2w
Binary file not shown.
Binary file added nuget/lib/uri250k.i2w
Binary file not shown.
Binary file added nuget/lib/uri500k.i2w
Binary file not shown.
Binary file added nuget/lib/xlm_roberta_base.i2w
Binary file not shown.
Binary file added nuget/lib/xlnet.i2w
Binary file not shown.
Binary file added nuget/lib/xlnet_nonorm.i2w
Binary file not shown.

0 comments on commit b04dfab

Please sign in to comment.