Skip to content
This repository was archived by the owner on Jan 26, 2021. It is now read-only.

Support asymmetric Dirichlet prior #22

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions inference/infer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ namespace multiverso { namespace lightlda
LocalModel* model = new LocalModel(&meta); model->Init();
//init document stream
data_stream = CreateDataStream();
//init documents
InitDocument();
//init doc-topic
InitDocTopic();
//init alias table
AliasTable* alias_table = new AliasTable();
//init inferers
Expand Down Expand Up @@ -102,7 +102,7 @@ namespace multiverso { namespace lightlda
return nullptr;
}

static void InitDocument()
static void InitDocTopic()
{
xorshift_rng rng;
for (int32_t block = 0; block < Config::num_blocks; ++block)
Expand Down
3 changes: 2 additions & 1 deletion inference/inferer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ namespace multiverso { namespace lightlda
for (int32_t doc_id = id_; doc_id < data.Size(); doc_id += thread_num_)
{
Document* doc = data.GetOneDoc(doc_id);
sampler_->SampleOneDoc(doc, 0, lastword, model_, alias_);
//TODO: Asymmetric prior
sampler_->SampleOneDoc(doc, 0, lastword, model_, alias_, nullptr);
}
}

Expand Down
175 changes: 110 additions & 65 deletions src/alias_table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@
namespace multiverso { namespace lightlda
{
_THREAD_LOCAL std::vector<float>* AliasTable::q_w_proportion_;
_THREAD_LOCAL std::vector<int32_t>* AliasTable::q_w_proportion_int_;
_THREAD_LOCAL std::vector<std::pair<int32_t, int32_t>>* AliasTable::L_;
_THREAD_LOCAL std::vector<std::pair<int32_t, int32_t>>* AliasTable::H_;
_THREAD_LOCAL std::vector<int32_t>* AliasMultinomialRNGInt::q_proportion_int_;
_THREAD_LOCAL std::vector<std::pair<int32_t, int32_t>>* AliasMultinomialRNGInt::L_;
_THREAD_LOCAL std::vector<std::pair<int32_t, int32_t>>* AliasMultinomialRNGInt::H_;

// -- AliasTable implement area --------------------------------- //
AliasTable::AliasTable()
{
memory_size_ = Config::alias_capacity / sizeof(int32_t);
Expand All @@ -25,6 +26,8 @@ namespace multiverso { namespace lightlda
beta_ = Config::beta;
beta_sum_ = beta_ * num_vocabs_;
memory_block_ = new int32_t[memory_size_];

alias_rng_int_ = new AliasMultinomialRNGInt(num_topics_);

beta_kv_vector_ = new int32_t[2 * num_topics_];

Expand All @@ -34,6 +37,7 @@ namespace multiverso { namespace lightlda

AliasTable::~AliasTable()
{
    // Release everything the constructor allocated with new / new[].
    // The two int32_t arrays and the alias helper object do not refer to
    // one another, so the order of the releases is irrelevant.
    delete[] beta_kv_vector_;
    delete[] memory_block_;
    delete alias_rng_int_;
}
Expand All @@ -47,12 +51,6 @@ namespace multiverso { namespace lightlda
{
if (q_w_proportion_ == nullptr)
q_w_proportion_ = new std::vector<float>(num_topics_);
if (q_w_proportion_int_ == nullptr)
q_w_proportion_int_ = new std::vector<int32_t>(num_topics_);
if (L_ == nullptr)
L_ = new std::vector<std::pair<int32_t, int32_t>>(num_topics_);
if (H_ == nullptr)
H_ = new std::vector<std::pair<int32_t, int32_t>>(num_topics_);
// Compute the proportion
Row<int64_t>& summary_row = model->GetSummaryRow();
if (word == -1) // build alias row for beta
Expand All @@ -63,8 +61,7 @@ namespace multiverso { namespace lightlda
(*q_w_proportion_)[k] = beta_ / (summary_row.At(k) + beta_sum_);
beta_mass_ += (*q_w_proportion_)[k];
}
AliasMultinomialRNG(num_topics_, beta_mass_, beta_height_,
beta_kv_vector_);
alias_rng_int_->Build(*q_w_proportion_, num_topics_, beta_mass_, beta_height_, beta_kv_vector_);
}
else // build alias row for word
{
Expand Down Expand Up @@ -105,7 +102,7 @@ namespace multiverso { namespace lightlda
word_topic_row.NonzeroSize());
}
}
AliasMultinomialRNG(size, mass_[word], height_[word],
alias_rng_int_->Build(*q_w_proportion_, size, mass_[word], height_[word],
memory_block_ + word_entry.begin_offset);
}
return 0;
Expand All @@ -118,83 +115,73 @@ namespace multiverso { namespace lightlda
int32_t capacity = word_entry.capacity;
if (word_entry.is_dense)
{
auto sample = rng.rand();
int32_t idx = sample / height_[word];
if (capacity <= idx) idx = capacity - 1;

int32_t* p = kv_vector + 2 * idx;
int32_t k = *p++;
int32_t v = *p;
int32_t m = -(sample < v);
return (idx & m) | (k & ~m);
return alias_rng_int_->Propose(rng, height_[word], kv_vector);
}
else
{
auto sample = rng.rand_double() * (mass_[word] + beta_mass_);
if (sample < mass_[word])
{
int32_t* idx_vector = kv_vector + 2 * word_entry.capacity;
auto n_kw_sample = rng.rand();
int32_t idx = n_kw_sample / height_[word];
if (capacity <= idx) idx = capacity - 1;
int32_t* p = kv_vector + 2 * idx;
int32_t k = *p++;
int32_t v = *p;
int32_t id = idx_vector[idx];
int32_t m = -(n_kw_sample < v);
return (id & m) | (idx_vector[k] & ~m);
}
else
{
auto beta_sample = rng.rand();
int32_t idx = beta_sample / beta_height_;
if (num_topics_ <= idx) idx = num_topics_ - 1;
int32_t* p = beta_kv_vector_ + 2 * idx;
int32_t k = *p++;
int32_t v = *p;
int32_t m = -(beta_sample < v);
return (idx & m) | (k & ~m);
}
return alias_rng_int_->Propose(rng, height_[word], beta_height_,
mass_[word], beta_mass_,
kv_vector, capacity,
beta_kv_vector_);
}
}

void AliasTable::Clear()
{
delete q_w_proportion_;
q_w_proportion_ = nullptr;
delete q_w_proportion_int_;
q_w_proportion_int_ = nullptr;
delete L_;
L_ = nullptr;
delete H_;
H_ = nullptr;
delete q_w_proportion_;
q_w_proportion_ = nullptr;
}
// -- AliasTable implement area --------------------------------- //


void AliasTable::AliasMultinomialRNG(int32_t size, float mass, int32_t& height,
int32_t* kv_vector)
// -- AliasMultinomialRNGInt implement area --------------------------------- //
void AliasMultinomialRNGInt::Build(const std::vector<float>& q_proportion, int32_t size,
float mass, int32_t & height, int32_t* kv_vector)
{
if (q_proportion_int_ == nullptr)
{
q_proportion_int_ = new std::vector<int32_t>(size_);
}
else if(q_proportion_int_->size() != size_)
{
q_proportion_int_->resize(size_);
}
if (L_ == nullptr)
{
L_ = new std::vector<std::pair<int32_t, int32_t>>(size_);
}
else if(L_->size() != size_)
{
L_->resize(size_);
}
if (H_ == nullptr)
{
H_ = new std::vector<std::pair<int32_t, int32_t>>(size_);
}
else if(H_->size() != size_)
{
H_->resize(size_);
}

int32_t mass_int = 0x7fffffff;
int32_t a_int = mass_int / size;
mass_int = a_int * size;
height = a_int;
int64_t mass_sum = 0;
for (int32_t i = 0; i < size; ++i)
{
(*q_w_proportion_)[i] /= mass;
(*q_w_proportion_int_)[i] =
static_cast<int32_t>((*q_w_proportion_)[i] * mass_int);
mass_sum += (*q_w_proportion_int_)[i];
(*q_proportion_int_)[i] =
static_cast<int32_t>(q_proportion[i] / mass * mass_int);
mass_sum += (*q_proportion_int_)[i];
}
if (mass_sum > mass_int)
{
int32_t more = static_cast<int32_t>(mass_sum - mass_int);
int32_t id = 0;
for (int32_t i = 0; i < more;)
{
if ((*q_w_proportion_int_)[id] >= 1)
if ((*q_proportion_int_)[id] >= 1)
{
--(*q_w_proportion_int_)[id];
--(*q_proportion_int_)[id];
++i;
}
id = (id + 1) % size;
Expand All @@ -207,7 +194,7 @@ namespace multiverso { namespace lightlda
int32_t id = 0;
for (int32_t i = 0; i < more; ++i)
{
++(*q_w_proportion_int_)[id];
++(*q_proportion_int_)[id];
id = (id + 1) % size;
}
}
Expand All @@ -221,7 +208,7 @@ namespace multiverso { namespace lightlda
int32_t L_head = 0, L_tail = 0, H_head = 0, H_tail = 0;
for (int32_t k = 0; k < size; ++k)
{
int32_t val = (*q_w_proportion_int_)[k];
int32_t val = (*q_proportion_int_)[k];
if (val < height)
{
(*L_)[L_tail].first = k;
Expand Down Expand Up @@ -276,5 +263,63 @@ namespace multiverso { namespace lightlda
++H_head;
}
}

/*!
 * \brief Frees the _THREAD_LOCAL scratch vectors that Build() allocates
 *        on demand and resets their pointers to nullptr.
 */
void AliasMultinomialRNGInt::Clear()
{
    // The three buffers are independent; free them all, then null them so
    // the next Build() sees "not allocated" and re-creates them.
    delete H_;
    delete L_;
    delete q_proportion_int_;

    H_ = nullptr;
    L_ = nullptr;
    q_proportion_int_ = nullptr;
}

/*!
 * \brief Draws one index from an alias table laid out as (k, v) pairs.
 * \param rng random number generator; rand() is called exactly once
 * \param height bin height the table was built with
 * \param kv_vector table storage, two int32_t per bin: k then v
 * \return the selected bin index, or the k stored in that bin
 */
int32_t AliasMultinomialRNGInt::Propose(xorshift_rng& rng, int32_t height,
    int32_t* kv_vector)
{
    // A single draw selects the bin and also decides, by comparison with
    // the bin's v, whether the bin itself or its stored k is returned.
    auto draw = rng.rand();
    int32_t bin = draw / height;
    // Keep the bin inside [0, size_ - 1] when the division overshoots.
    if (bin >= size_)
    {
        bin = size_ - 1;
    }

    const int32_t stored_k = kv_vector[2 * bin];
    const int32_t stored_v = kv_vector[2 * bin + 1];

    // Branch-free select: the mask is all ones when draw < stored_v and
    // zero otherwise.
    const int32_t take_bin = -(draw < stored_v);
    return (bin & take_bin) | (stored_k & ~take_bin);
}

/*!
 * \brief Draws one index from a two-component mixture: a compact table of
 *        vsize bins with mass `mass`, and a size_-bin table with mass
 *        `mass_sum`.
 * \param rng random number generator; rand_double() is called once, then
 *        rand() once
 * \param height bin height of the compact table
 * \param height_sum bin height of the size_-bin table
 * \param mass weight of the compact table
 * \param mass_sum weight of the size_-bin table
 * \param kv_vector compact table: vsize (k, v) pairs followed by an index
 *        table that maps a bin position to the returned value
 * \param vsize number of bins in the compact table
 * \param kv_vector_sum (k, v) pairs of the size_-bin table
 * \return the value drawn from whichever component was chosen
 */
int32_t AliasMultinomialRNGInt::Propose(xorshift_rng& rng,
    int32_t height, int32_t height_sum,
    float mass, float mass_sum,
    int32_t* kv_vector, int32_t vsize,
    int32_t* kv_vector_sum)
{
    // Choose the component in proportion to the two masses.
    auto component_draw = rng.rand_double() * (mass + mass_sum);
    if (!(component_draw < mass))
    {
        // The size_-bin component is sampled exactly like the dense
        // overload, so delegate to it.
        return Propose(rng, height_sum, kv_vector_sum);
    }

    // Compact component: the index table starts right after the vsize
    // (k, v) pairs and translates bin positions into returned values.
    int32_t* id_table = kv_vector + 2 * vsize;
    auto draw = rng.rand();
    int32_t slot = draw / height;
    // Keep the slot inside [0, vsize - 1] when the division overshoots.
    if (slot >= vsize)
    {
        slot = vsize - 1;
    }

    const int32_t other_slot = kv_vector[2 * slot];
    const int32_t stored_v = kv_vector[2 * slot + 1];
    const int32_t own_id = id_table[slot];

    // Branch-free select between this slot's value and the other slot's.
    const int32_t take_own = -(draw < stored_v);
    return (own_id & take_own) | (id_table[other_slot] & ~take_own);
}
// -- AliasMultinomialRNGInt implement area --------------------------------- //
} // namespace lightlda
} // namespace multiverso
31 changes: 26 additions & 5 deletions src/alias_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,31 @@ namespace multiverso { namespace lightlda
class xorshift_rng;
class AliasTableIndex;

/*!
 * \brief Integer alias-table builder and sampler.
 *
 * Build() turns a float proportion vector into an alias table and the
 * Propose() overloads draw from such a table. The scratch vectors used by
 * Build() are static _THREAD_LOCAL members shared by all instances; release
 * them with Clear().
 */
class AliasMultinomialRNGInt
{
public:
    /*!
     * \brief Creates a sampler for tables of at most `size` bins.
     * \param size stored as size_; used to size the scratch vectors in
     *        Build() and to clamp the bin index in Propose()
     */
    explicit AliasMultinomialRNGInt(int32_t size): size_(size) {}

    /*!
     * \brief Builds an alias table from q_proportion.
     * \param q_proportion unnormalized proportions; the first `size`
     *        entries are read
     * \param size number of bins to build
     * \param mass normalizer the proportions are divided by
     * \param height output, receives the integer bin height
     * \param kv_vector output buffer for the table (presumably two int32_t
     *        per bin, matching what Propose() reads; confirm against the
     *        definition)
     */
    void Build(const std::vector<float>& q_proportion, int32_t size,
        float mass, int32_t & height, int32_t* kv_vector);

    /*! \brief Frees the thread-local scratch vectors used by Build(). */
    static void Clear();

    //for dense sampling
    int32_t Propose(xorshift_rng& rng, int32_t height, int32_t* kv_vector);
    //for sparse sampling
    int32_t Propose(xorshift_rng& rng,
        int32_t height, int32_t height_sum,
        float mass, float mass_sum,
        int32_t* kv_vector, int32_t vsize,
        int32_t* kv_vector_sum);

private:
    int32_t size_;
    // thread local storage used for building alias; element types are
    // spelled int32_t so the declarations match the definitions in the .cpp
    _THREAD_LOCAL static std::vector<int32_t>* q_proportion_int_;
    _THREAD_LOCAL static std::vector<std::pair<int32_t, int32_t>>* L_;
    _THREAD_LOCAL static std::vector<std::pair<int32_t, int32_t>>* H_;
};

/*!
* \brief AliasTable is the storage for alias tables used for fast sampling
* from lightlda word proposal distribution. It optimizes memory usage
Expand Down Expand Up @@ -56,8 +81,7 @@ namespace multiverso { namespace lightlda
/*! \brief Clear the alias table */
void Clear();
private:
void AliasMultinomialRNG(int32_t size, float mass, int32_t& height,
int32_t* kv_vector);
AliasMultinomialRNGInt * alias_rng_int_;
int* memory_block_;
int64_t memory_size_;
AliasTableIndex* table_index_;
Expand All @@ -71,9 +95,6 @@ namespace multiverso { namespace lightlda

// thread local storage used for building alias
_THREAD_LOCAL static std::vector<float>* q_w_proportion_;
_THREAD_LOCAL static std::vector<int>* q_w_proportion_int_;
_THREAD_LOCAL static std::vector<std::pair<int, int>>* L_;
_THREAD_LOCAL static std::vector<std::pair<int, int>>* H_;

int num_vocabs_;
int num_topics_;
Expand Down
Loading