Skip to content

Commit

Permalink
add storage support for tokenized dictionary encoding for strings
Browse files Browse the repository at this point in the history
  • Loading branch information
mapdwei committed Mar 18, 2015
1 parent 93f26ba commit 7b3da56
Show file tree
Hide file tree
Showing 15 changed files with 441 additions and 56 deletions.
31 changes: 18 additions & 13 deletions Catalog/Catalog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,9 +190,21 @@ Catalog::~Catalog() {
}

void Catalog::buildMaps() {
string dictQuery("SELECT dictid, name, nbits, is_shared from mapd_dictionaries");
sqliteConnector_.query(dictQuery);
size_t numRows = sqliteConnector_.getNumRows();
for (int r = 0; r < numRows; ++r) {
int dictId = sqliteConnector_.getData<int>(r,0);
std::string dictName = sqliteConnector_.getData<string>(r,1);
int dictNBits = sqliteConnector_.getData<int>(r,2);
bool is_shared = sqliteConnector_.getData<bool>(r, 3);
std::string fname = basePath_ + "/mapd_data/" + currentDB_.dbName + "_" + dictName;
DictDescriptor *dd = new DictDescriptor(dictId, dictName, dictNBits, is_shared, fname);
dictDescriptorMapById_[dd->dictId] = dd;
}
string tableQuery("SELECT tableid, name, ncolumns, isview, fragments, frag_type, max_frag_rows, frag_page_size, partitions from mapd_tables");
sqliteConnector_.query(tableQuery);
size_t numRows = sqliteConnector_.getNumRows();
numRows = sqliteConnector_.getNumRows();
for (int r = 0; r < numRows; ++r) {
TableDescriptor *td = new TableDescriptor();
td->tableId = sqliteConnector_.getData<int>(r,0);
Expand Down Expand Up @@ -231,6 +243,11 @@ void Catalog::buildMaps() {
cd->columnType.set_compression((EncodingType)sqliteConnector_.getData<int>(r,7));
cd->columnType.set_comp_param(sqliteConnector_.getData<int>(r,8));
cd->columnType.set_size(sqliteConnector_.getData<int>(r,9));
if (cd->columnType.is_string() && cd->columnType.get_compression() == kENCODING_TOKDICT) {

DictDescriptor *dd = dictDescriptorMapById_[cd->columnType.get_comp_param()];
cd->columnType.set_elem_size(dd->dictNBits/8);
}
cd->chunks = sqliteConnector_.getData<string>(r,10);
ColumnKey columnKey(cd->tableId, cd->columnName);
columnDescriptorMap_[columnKey] = cd;
Expand All @@ -250,18 +267,6 @@ void Catalog::buildMaps() {
td->isReady = !td->isMaterialized;
td->fragmenter = nullptr;
}
string dictQuery("SELECT dictid, name, nbits, is_shared from mapd_dictionaries");
sqliteConnector_.query(dictQuery);
numRows = sqliteConnector_.getNumRows();
for (int r = 0; r < numRows; ++r) {
int dictId = sqliteConnector_.getData<int>(r,0);
std::string dictName = sqliteConnector_.getData<string>(r,1);
int dictNBits = sqliteConnector_.getData<int>(r,2);
bool is_shared = sqliteConnector_.getData<bool>(r, 3);
std::string fname = basePath_ + "/mapd_data/" + currentDB_.dbName + "_" + dictName;
DictDescriptor *dd = new DictDescriptor(dictId, dictName, dictNBits, is_shared, fname);
dictDescriptorMapById_[dd->dictId] = dd;
}
}

void
Expand Down
77 changes: 70 additions & 7 deletions Chunk/Chunk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "Chunk.h"
#include "../DataMgr/StringNoneEncoder.h"
#include "../DataMgr/StringTokDictEncoder.h"

namespace Chunk_NS {
std::shared_ptr<Chunk>
Expand Down Expand Up @@ -39,8 +40,30 @@ namespace Chunk_NS {
subKey.pop_back();
subKey.push_back(2); // 2 for the index buffer
index_buf = data_mgr->getChunkBuffer(subKey, mem_level, device_id, (num_elems + 1) * sizeof(StringOffsetT)); // always record n+1 offsets so string length can be calculated
StringNoneEncoder *str_encoder = dynamic_cast<StringNoneEncoder*>(buffer->encoder);
str_encoder->set_index_buf(index_buf);
if (column_desc->columnType.get_compression() == kENCODING_NONE) {
StringNoneEncoder *str_encoder = dynamic_cast<StringNoneEncoder*>(buffer->encoder);
str_encoder->set_index_buf(index_buf);
} else if (column_desc->columnType.get_compression() == kENCODING_TOKDICT) {
switch (column_desc->columnType.get_elem_size()) {
case 1: {
StringTokDictEncoder<int8_t> *str_encoder = dynamic_cast<StringTokDictEncoder<int8_t>*>(buffer->encoder);
str_encoder->set_index_buf(index_buf);
break;
}
case 2: {
StringTokDictEncoder<int16_t> *str_encoder = dynamic_cast<StringTokDictEncoder<int16_t>*>(buffer->encoder);
str_encoder->set_index_buf(index_buf);
break;
}
case 4: {
StringTokDictEncoder<int32_t> *str_encoder = dynamic_cast<StringTokDictEncoder<int32_t>*>(buffer->encoder);
str_encoder->set_index_buf(index_buf);
break;
}
default:
assert(false);
}
}
} else
buffer = data_mgr->getChunkBuffer(key, mem_level, device_id, num_bytes);
}
Expand All @@ -63,9 +86,27 @@ namespace Chunk_NS {
Chunk::appendData(DataBlockPtr &src_data, const size_t num_elems, const size_t start_idx)
{
if (column_desc->columnType.is_varlen()) {
StringNoneEncoder *str_encoder = dynamic_cast<StringNoneEncoder*>(buffer->encoder);
return str_encoder->appendData(src_data.stringsPtr, start_idx, num_elems);

if (column_desc->columnType.get_compression() == kENCODING_NONE) {
StringNoneEncoder *str_encoder = dynamic_cast<StringNoneEncoder*>(buffer->encoder);
return str_encoder->appendData(src_data.stringsPtr, start_idx, num_elems);
} else if (column_desc->columnType.get_compression() == kENCODING_TOKDICT) {
switch (column_desc->columnType.get_elem_size()) {
case 1: {
StringTokDictEncoder<int8_t> *tokdict_encoder = dynamic_cast<StringTokDictEncoder<int8_t>*>(buffer->encoder);
return tokdict_encoder->appendData(src_data.tok8dictPtr, start_idx, num_elems);
}
case 2: {
StringTokDictEncoder<int16_t> *tokdict_encoder = dynamic_cast<StringTokDictEncoder<int16_t>*>(buffer->encoder);
return tokdict_encoder->appendData(src_data.tok16dictPtr, start_idx, num_elems);
}
case 4: {
StringTokDictEncoder<int32_t> *tokdict_encoder = dynamic_cast<StringTokDictEncoder<int32_t>*>(buffer->encoder);
return tokdict_encoder->appendData(src_data.tok32dictPtr, start_idx, num_elems);
}
default:
assert(false);
}
}
}
return buffer->encoder->appendData(src_data.numbersPtr, num_elems);
}
Expand All @@ -84,8 +125,30 @@ namespace Chunk_NS {
{
buffer->initEncoder(column_desc->columnType);
if (column_desc->columnType.is_varlen()) {
StringNoneEncoder *str_encoder = dynamic_cast<StringNoneEncoder*>(buffer->encoder);
str_encoder->set_index_buf(index_buf);
if (column_desc->columnType.get_compression() == kENCODING_NONE) {
StringNoneEncoder *str_encoder = dynamic_cast<StringNoneEncoder*>(buffer->encoder);
str_encoder->set_index_buf(index_buf);
} else if (column_desc->columnType.get_compression() == kENCODING_TOKDICT) {
switch (column_desc->columnType.get_elem_size()) {
case 1: {
StringTokDictEncoder<int8_t> *str_encoder = dynamic_cast<StringTokDictEncoder<int8_t>*>(buffer->encoder);
str_encoder->set_index_buf(index_buf);
break;
}
case 2: {
StringTokDictEncoder<int16_t> *str_encoder = dynamic_cast<StringTokDictEncoder<int16_t>*>(buffer->encoder);
str_encoder->set_index_buf(index_buf);
break;
}
case 4: {
StringTokDictEncoder<int32_t> *str_encoder = dynamic_cast<StringTokDictEncoder<int32_t>*>(buffer->encoder);
str_encoder->set_index_buf(index_buf);
break;
}
default:
assert(false);
}
}
}
}

Expand Down
16 changes: 15 additions & 1 deletion DataMgr/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "NoneEncoder.h"
#include "FixedLengthEncoder.h"
#include "StringNoneEncoder.h"
#include "StringTokDictEncoder.h"
#include <glog/logging.h>


Expand Down Expand Up @@ -117,10 +118,23 @@ Encoder * Encoder::Create(Data_Namespace::AbstractBuffer *buffer, const SQLTypeI
break;
} // Case: kENCODING_FIXED
case kENCODING_DICT: {
CHECK(IS_STRING(sqlType.get_type()));
CHECK(sqlType.is_string());
return new NoneEncoder <int32_t> (buffer);
break;
}
case kENCODING_TOKDICT:
CHECK(sqlType.is_string());
switch (sqlType.get_elem_size()) {
case 1:
return new StringTokDictEncoder<int8_t>(buffer);
case 2:
return new StringTokDictEncoder<int16_t>(buffer);
case 4:
return new StringTokDictEncoder<int32_t>(buffer);
default:
assert(false);
}
break;
default: {
return 0;
break;
Expand Down
5 changes: 5 additions & 0 deletions DataMgr/Encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,15 @@
#include <stdexcept>
#include <limits>

typedef int32_t StringOffsetT;

namespace Data_Namespace {
class AbstractBuffer;
}

// default max input buffer size to 1MB
#define MAX_INPUT_BUF_SIZE 1048576

class Encoder {
public:
static Encoder * Create(Data_Namespace::AbstractBuffer * buffer, const SQLTypeInfo sqlType);
Expand Down
39 changes: 23 additions & 16 deletions DataMgr/FileMgr/FileBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,16 +220,20 @@ namespace File_Namespace {
fseek(f, page.pageNum*METADATA_PAGE_SIZE + reservedHeaderSize_, SEEK_SET);
fread((int8_t *)&pageSize_,sizeof(size_t),1,f);
fread((int8_t *)&size_,sizeof(size_t),1,f);
vector <int> typeData (7); // assumes we will encode hasEncoder, bufferType, encodingType, encodingBits all as int
vector <int> typeData (NUM_METADATA); // assumes we will encode hasEncoder, bufferType, encodingType, encodingBits all as int
fread((int8_t *)&(typeData[0]),sizeof(int),typeData.size(),f);
hasEncoder = static_cast <bool> (typeData[0]);
int version = typeData[0];
assert(version == METADATA_VERSION); // add backward compatibility code here
hasEncoder = static_cast <bool> (typeData[1]);
if (hasEncoder) {
sqlType.set_type(static_cast<SQLTypes> (typeData[1]));
sqlType.set_dimension(typeData[2]);
sqlType.set_scale(typeData[3]);
sqlType.set_notnull(static_cast<bool>(typeData[4]));
sqlType.set_compression(static_cast<EncodingType> (typeData[5]));
sqlType.set_comp_param(typeData[6]);
sqlType.set_type(static_cast<SQLTypes> (typeData[2]));
sqlType.set_dimension(typeData[3]);
sqlType.set_scale(typeData[4]);
sqlType.set_notnull(static_cast<bool>(typeData[5]));
sqlType.set_compression(static_cast<EncodingType> (typeData[6]));
sqlType.set_comp_param(typeData[7]);
sqlType.set_size(typeData[8]);
sqlType.set_elem_size(typeData[9]);
initEncoder(sqlType);
encoder->readMetadata(f);
}
Expand All @@ -245,15 +249,18 @@ namespace File_Namespace {
fseek(f, page.pageNum*METADATA_PAGE_SIZE + reservedHeaderSize_, SEEK_SET);
size_t numBytesWritten = fwrite((int8_t *)&pageSize_,sizeof(size_t),1,f);
numBytesWritten = fwrite((int8_t *)&size_,sizeof(size_t),1,f);
vector <int> typeData (7); // assumes we will encode hasEncoder, bufferType, encodingType, encodingBits all as int
typeData[0] = static_cast<int>(hasEncoder);
vector <int> typeData (NUM_METADATA); // assumes we will encode hasEncoder, bufferType, encodingType, encodingBits all as int
typeData[0] = METADATA_VERSION;
typeData[1] = static_cast<int>(hasEncoder);
if (hasEncoder) {
typeData[1] = static_cast<int>(sqlType.get_type());
typeData[2] = sqlType.get_dimension();
typeData[3] = sqlType.get_scale();
typeData[4] = static_cast<int>(sqlType.get_notnull());
typeData[5] = static_cast<int>(sqlType.get_compression());
typeData[6] = sqlType.get_comp_param();
typeData[2] = static_cast<int>(sqlType.get_type());
typeData[3] = sqlType.get_dimension();
typeData[4] = sqlType.get_scale();
typeData[5] = static_cast<int>(sqlType.get_notnull());
typeData[6] = static_cast<int>(sqlType.get_compression());
typeData[7] = sqlType.get_comp_param();
typeData[8] = sqlType.get_size();
typeData[9] = sqlType.get_elem_size();
}
numBytesWritten = fwrite((int8_t *)&(typeData[0]),sizeof(int),typeData.size(),f);
if (hasEncoder) { // redundant
Expand Down
3 changes: 3 additions & 0 deletions DataMgr/FileMgr/FileBuffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@

using namespace Data_Namespace;

#define NUM_METADATA 10
#define METADATA_VERSION 0

namespace File_Namespace {

class FileMgr; // forward declaration
Expand Down
3 changes: 0 additions & 3 deletions DataMgr/StringNoneEncoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@

using Data_Namespace::AbstractBuffer;

// default max input buffer size to 1MB
#define MAX_INPUT_BUF_SIZE 1048576

ChunkMetadata
StringNoneEncoder::appendData(const std::vector<std::string> *srcData, const int start_idx, const size_t numAppendElems)
{
Expand Down
2 changes: 0 additions & 2 deletions DataMgr/StringNoneEncoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@

using Data_Namespace::AbstractBuffer;

typedef int32_t StringOffsetT;

class StringNoneEncoder : public Encoder {

public:
Expand Down
Loading

0 comments on commit 7b3da56

Please sign in to comment.