Skip to content

Commit

Permalink
Fix to resolve run time failures for FastOpt enabled WASM builds (#779)
Browse files Browse the repository at this point in the history
* copy changes from commit 4df92f2
* add comments for better understanding
* restore the newline at the end of file and add this changes in changelog.md
  • Loading branch information
qianqianzhu authored Jan 7, 2021
1 parent c1c4af0 commit 737f430
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 22 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Added default "none" for option shuffle in BatchGenerator, so that it works in executables where shuffle is not an option.
- Added a few missing header files in shortlist.h and beam_search.h.
- Improved handling for receiving SIGTERM during training. By default, SIGTERM triggers 'save (now) and exit'. Prior to this fix, batch pre-fetching did not check for this sigal, potentially delaying exit considerably. It now pays attention to that. Also, the default behaviour of save-and-exit can now be disabled on the command line with --sigterm exit-immediately.
- Fix the runtime failures for FASTOPT on 32-bit builds (wasm just happens to be 32-bit) because it uses hashing with an inconsistent mix of uint64_t and size_t.

### Changed
- Updated SentencePiece repository to version 8336bbd0c1cfba02a879afe625bf1ddaf7cd93c5 from https://github.com/google/sentencepiece.
Expand Down
54 changes: 32 additions & 22 deletions src/common/fastopt.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,10 @@ inline constexpr uint64_t crc(const char* const str) noexcept {
/*****************************************************************************/

// PerfectHash constructs a perfect hash for a set K of n numeric keys. The size of
// the hash is m > n (not much larger) and n << max(K) (much smaller). If I am not wrong m
// is the next power of 2 larger than n? We then build an array of size m with n fields defined.
// m - n fields stay undefined (a bit of waste).
// the hash is m > n (not much larger) and n << max(K) (much smaller). The output array size is
// determined by PHF::init in "src/3rd_party/phf/phf.h". m - n fields stay undefined (a bit of waste).

// Wrapper class for the 3rd-party library in "src/3rd_party/phf"
class PerfectHash {
private:
phf phf_;
Expand All @@ -62,10 +63,12 @@ class PerfectHash {
PHF::destroy(&phf_);
}

// subscript operator [] overloading: if the key is uint64_t, return the hash code directly
uint32_t operator[](const uint64_t& key) const {
return PHF::hash<uint64_t>(const_cast<phf*>(&phf_), key);
}

// If the key is a string, return the hash code for the string's CRC code
uint32_t operator[](const char* const keyStr) const {
return (*this)[crc::crc(keyStr)];
}
Expand Down Expand Up @@ -109,6 +112,9 @@ class FastOpt {

public:
// Node types for FastOpt, seem to be enough to cover YAML:NodeType
// Multi-element types include "Sequence" and "Map"
// "Sequence" is implemented with STL vectors
// "Map" is implemented with a 3rd-party PHF library (see the PerfectHash class)
enum struct NodeType {
Null, Bool, Int64, Float64, String, Sequence, Map
};
Expand All @@ -126,6 +132,7 @@ class FastOpt {
size_t elements_{0}; // Number of elements if isMap or isSequence is true, 0 otherwise.

// Used to find elements if isSequence() is true.
// Retrieve the entry using array indexing.
inline const std::unique_ptr<const FastOpt>& arrayLookup(size_t keyId) const {
if(keyId < array_.size())
return array_[keyId];
Expand All @@ -134,13 +141,15 @@ class FastOpt {
}

// Used to find elements if isMap() is true.
inline const std::unique_ptr<const FastOpt>& phLookup(size_t keyId) const {
// Retrieve the entry from the hash table.
inline const std::unique_ptr<const FastOpt>& phLookup(uint64_t keyId) const {
if(ph_)
return array_[(*ph_)[keyId]];
else
return uniqueNullPtr;
}

// Builders for different types of nodes.
// Build Null node.
void makeNull() {
elements_ = 0;
Expand Down Expand Up @@ -221,7 +230,7 @@ class FastOpt {
type_ = NodeType::Map;
}

// Build a Map node, uses std::string as key, which gets hashed to size_t and used in the function above.
// Build a Map node, uses std::string as key, which gets hashed to uint64_t and used in the function above.
void makeMap(const std::map<std::string, YAML::Node>& m) {
std::map<uint64_t, YAML::Node> mi;
for(const auto& it : m) {
Expand Down Expand Up @@ -263,13 +272,14 @@ class FastOpt {

public:
// Constructor to recursively create a FastOpt object from a YAML::Node following the yaml structure.
FastOpt(const YAML::Node& node)
FastOpt(const YAML::Node& node)
{ construct(node); }

FastOpt(const YAML::Node& node, uint64_t fingerprint)
FastOpt(const YAML::Node& node, uint64_t fingerprint)
: fingerprint_{fingerprint}
{ construct(node); }

// Predicates for node types
bool isSequence() const {
return type_ == NodeType::Sequence;
}
Expand All @@ -279,20 +289,20 @@ class FastOpt {
}

bool isScalar() const {
return type_ == NodeType::Bool
|| type_ == NodeType::Float64
|| type_ == NodeType::Int64
return type_ == NodeType::Bool
|| type_ == NodeType::Float64
|| type_ == NodeType::Int64
|| type_ == NodeType::String;
}

bool isNull() const {
return type_ == NodeType::Null;
}
}

bool isInt() const {
return type_ == NodeType::Int64;
}
}

bool isBool() const {
return type_ == NodeType::Bool;
}
Expand All @@ -318,11 +328,11 @@ class FastOpt {
std::swap(array_, other.array_);
std::swap(type_, other.type_);
std::swap(elements_, other.elements_);
// leave fingerprint alone as it needed by parent node.
// leave fingerprint alone as it needed by parent node.
}

// Is the hashed key in a map?
bool has(size_t keyId) const {
// Is the hashed key in a map?
bool has(uint64_t keyId) const {
if(isMap() && elements_ > 0) {
const auto& ptr = phLookup(keyId);
return ptr ? ptr->fingerprint_ == keyId : false;
Expand All @@ -346,27 +356,27 @@ class FastOpt {
}

// access sequence or map element
const FastOpt& operator[](size_t keyId) const {
const FastOpt& operator[](uint64_t keyId) const {
if(isSequence()) {
const auto& ptr = arrayLookup(keyId);
const auto& ptr = arrayLookup((size_t)keyId);
ABORT_IF(!ptr, "Unseen key {}" , keyId);
return *ptr;
} else if(isMap()) {
const auto& ptr = phLookup(keyId);
ABORT_IF(!ptr || ptr->fingerprint_ != keyId, "Unseen key {}", keyId);
return *ptr;
return *ptr;
} else {
ABORT("Not a sequence or map node");
}
}

// operator [] overloading for non-uint64_t keys
const FastOpt& operator[](int key) const {
return operator[]((size_t)key);
return operator[]((uint64_t)key);
}

const FastOpt& operator[](const char* const key) const {
// MacOS requires explicit cast to size_t before we can use it.
return operator[]((size_t)crc::crc(key));
return operator[](crc::crc(key));
}

const FastOpt& operator[](const std::string& key) const {
Expand Down

0 comments on commit 737f430

Please sign in to comment.