Skip to content

Commit

Permalink
Added UTF-8 optional offset computation for TextToIds methods, a few …
Browse files Browse the repository at this point in the history
…bugs fixed.
  • Loading branch information
Sergei Alonichau committed Apr 17, 2020
1 parent 209d858 commit 3795fec
Show file tree
Hide file tree
Showing 20 changed files with 32,389 additions and 32,040 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ FATokenSegmentationTools_1best_t < Ty >::
m_pMealy (NULL),
m_pK2I (NULL),
m_pI2Info (NULL),
m_UnkScore (-100.0f) // this is guaranteed lower than any of the segment scores
m_UnkScore (-100000.0f) // this is guaranteed lower than any of the segment scores
{}


Expand Down
69 changes: 69 additions & 0 deletions blingfireclient.library/inc/FAUtils_cl.h
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ inline const int FANormalize (
{
DebugLogAssert (pIn != pOut);
DebugLogAssert (pMap);
DebugLogAssert (0 == MaxOutSize || NULL != pOut);

const int MaxNormCount = 10;
int Norm [MaxNormCount];
Expand Down Expand Up @@ -364,6 +365,74 @@ inline const int FANormalize (
return OutSize;
}

/// Normalizes the pIn string with respect to the given map and records,
/// for every output symbol, the index of the input symbol it came from.
///
/// pIn        - input symbols
/// InCount    - number of input symbols
/// pOut       - receives the normalized symbols, up to MaxOutSize of them
/// pOffsets   - parallel to pOut: pOffsets [k] is the index within pIn of
///              the symbol that produced pOut [k]
/// MaxOutSize - capacity of both pOut and pOffsets
/// pMap       - normalization map queried once per input symbol
///
/// Returns the size of the complete normalized string. The value may be
/// larger than MaxOutSize, in which case only the first MaxOutSize symbols
/// and offsets have been stored.
/// !!! normalization cannot be done in-place !!!
template < class Ty >
inline const int FANormalize (
        const Ty * pIn,
        const int InCount,
        __out_ecount(MaxOutSize) Ty * pOut,
        __out_ecount(MaxOutSize) int * pOffsets,
        const int MaxOutSize,
        const FAMultiMapCA * pMap
    )
{
    DebugLogAssert (pIn != pOut);
    DebugLogAssert (pMap);
    DebugLogAssert (0 == MaxOutSize || (NULL != pOut && NULL != pOffsets));

    const int MaxNormCount = 10;
    int Norm [MaxNormCount];

    int OutSize = 0;

    for (int InPos = 0; InPos < InCount; ++InPos) {

        const Ty Symbol = pIn [InPos];
        const int NormCount = pMap->Get (Symbol, Norm, MaxNormCount);

        // -1 from the map lookup: the symbol is passed through unchanged
        if (-1 == NormCount) {
            if (OutSize < MaxOutSize) {
                pOut [OutSize] = Symbol;
                pOffsets [OutSize] = InPos;
            }
            ++OutSize;
            continue;
        }

        // any other count outside of [1, MaxNormCount] produces no output
        if (NormCount < 1 || MaxNormCount < NormCount) {
            continue;
        }

        // room left in the output buffers, can be 0 or less
        const int Room = MaxOutSize - OutSize;
        // store MIN {Room, NormCount} symbols, nothing if there is no room
        const int CopyCount = (NormCount < Room) ? NormCount : Room;

        for (int j = 0; j < CopyCount; ++j) {
            const int OutPos = OutSize + j;
            pOut [OutPos] = (Ty) Norm [j];
            pOffsets [OutPos] = InPos;
        }

        // the full replacement length is counted even when it did not fit
        OutSize += NormCount;

    } // of for (int InPos = 0; ...

    return OutSize;
}


/// normalizes input word (IN-PLACE is allowed)
/// a new word length is returned
template < class Ty >
Expand Down
36 changes: 33 additions & 3 deletions blingfiretools/any_test/any_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ _TLoadModelPtr g_LoadModelPtr = NULL;
// Signature of the dynamically resolved TextToIds entry point. The test calls
// it as (model handle, input text, text length, output ids buffer, capacity of
// the ids buffer, 100) and uses the returned value as the number of ids to
// print. NOTE(review): the meaning of the last int argument is not visible
// here - confirm against the library header.
typedef const int (__cdecl* _TTextToIdsPtr)(void*, const char*, int, int32_t*,const int, const int);
_TTextToIdsPtr g_TextToIdsPtr = NULL;

// Signature of the "TextToIdsWithOffsets" entry point (resolved with dlsym in
// main). Same argument order as above with two extra int buffers after the
// ids buffer; the test passes its Starts and Ends arrays there and reads them
// back as inclusive [from, to] positions inside the input text, one pair per
// returned id. The test prints "UNK" for a pair that is negative or reversed.
typedef const int (__cdecl* _TTextToIdsWithOffsetsPtr)(void*, const char*, int, int32_t*, int*, int*, const int, const int);
_TTextToIdsWithOffsetsPtr g_TextToIdsWithOffsetsPtr = NULL;

// Signature of the "FreeModel" entry point (resolved with dlsym in main); the
// test passes it the handle returned by the model loading call.
typedef int (__cdecl* _TFreeModel)(void* ModelPtr);
_TFreeModel g_FreeModelPtr = NULL;

Expand Down Expand Up @@ -62,6 +65,12 @@ int __cdecl main (int argc, char ** argv)
std::cerr << "ERROR: Cannot get address of TextToIds function" << std::endl;
return false;
}
g_TextToIdsWithOffsetsPtr = (_TTextToIdsWithOffsetsPtr) dlsym(g_Module, "TextToIdsWithOffsets");
if (NULL == g_TextToIdsWithOffsetsPtr)
{
std::cerr << "ERROR: Cannot get address of TextToIdsWithOffsets function" << std::endl;
return false;
}
g_FreeModelPtr = (_TFreeModel) dlsym(g_Module, "FreeModel");
if (NULL == g_FreeModelPtr)
{
Expand All @@ -71,20 +80,41 @@ int __cdecl main (int argc, char ** argv)

// tests

void* hModel = (*g_LoadModelPtr)("testsp1.bin");
void* hModel = (*g_LoadModelPtr)("bert_base_tok.bin");

const int MaxIdCount = 128;
int Ids [MaxIdCount];
int Starts [MaxIdCount];
int Ends [MaxIdCount];

std::string in1 ("Sergei Alonichau I saw a girl with a telescope.");

int IdCount = (*g_TextToIdsPtr)(hModel, in1.c_str(), in1.length(), Ids, MaxIdCount, 100);
for(int i = 0; i < IdCount; ++i) {
std::cout << Ids[i] << ' ';
}
std::cout << std::endl;

for(int i = 0; i < IdCount; ++i)
{
IdCount = (*g_TextToIdsWithOffsetsPtr)(hModel, in1.c_str(), in1.length(), Ids, Starts, Ends, MaxIdCount, 100);
for(int i = 0; i < IdCount; ++i) {
std::cout << Ids[i] << ' ';
}
std::cout << std::endl;

for(int i = 0; i < IdCount; ++i) {

const int _from = Starts[i];
const int _to = Ends[i];

if (0 <= _from && 0 <= _to && _to >= _from) {
std::string s (in1.c_str() + _from, _to - _from + 1);
std::cout << s << ':' << Ids[i] << ' ';
} else {
std::cout << "UNK" << ':' << Ids[i] << ' ';
}
}
std::cout << std::endl;

(*g_FreeModelPtr)(hModel);

// unload the .so file
Expand Down
Loading

0 comments on commit 3795fec

Please sign in to comment.