Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: FeatureHashing
Type: Package
Title: Creates a Model Matrix via Feature Hashing with a Formula Interface
Version: 0.9.1.3
Date: 2018-04-10
Version: 0.9.1.4
Date: 2019-11-24
Authors@R: c(
person("Wush", "Wu", email = "wush978@gmail.com", role = c("aut", "cre")),
person("Michael", "Benesty", email = "michael@benesty.fr", role = c("aut", "ctb")))
Expand All @@ -23,8 +23,8 @@ Imports:
Matrix,
digest(>= 0.6.8),
magrittr (>= 1.5)
LinkingTo: Rcpp, digest(>= 0.6.8), BH
Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown
LinkingTo: Rcpp, digest(>= 0.6.8), BH(>= 1.54.0-1)
Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown, pROC
SystemRequirements: C++11
BugReports: https://github.com/wush978/FeatureHashing/issues
URL: https://github.com/wush978/FeatureHashing
Expand Down
28 changes: 0 additions & 28 deletions src/bswap_32.cpp

This file was deleted.

37 changes: 0 additions & 37 deletions src/bswap_32.h

This file was deleted.

26 changes: 18 additions & 8 deletions src/hash_function.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,15 @@
#ifndef __HASH_FUNCTION_HPP__
#define __HASH_FUNCTION_HPP__

#include <boost/detail/endian.hpp>
#include <boost/predef/other/endian.h>
#include <boost/endian/conversion.hpp>
#include "digestlocal.h"
#include "bswap_32.h"

class HashFunction {

public:

virtual ~HashFunction() { }

virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false) = 0;

Expand All @@ -34,6 +36,8 @@ class HashFunction {
class NullHashFunction : public HashFunction {

public:

virtual ~NullHashFunction() { }

virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false) {
return 1;
Expand All @@ -48,6 +52,8 @@ class MurmurHash3HashFunction : public HashFunction {
public :

MurmurHash3HashFunction(uint32_t _seed) : seed(_seed) { }

virtual ~MurmurHash3HashFunction() { }

virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false) {
return ::PMurHash32(seed, buf, size);
Expand All @@ -66,22 +72,26 @@ class MurmurHash3LogHashFunction : public HashFunction {
: HashFunction(), seed(_seed), e(_e)
{ }

virtual ~MurmurHash3LogHashFunction() { }

virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false) {
uint32_t retval = PMurHash32(seed, buf, size);
if (is_interaction) {
const uint32_t* src = reinterpret_cast<const uint32_t*>(buf);
#ifdef BOOST_BIG_ENDIAN
if (inverse_mapping.find(bswap_32(src[0])) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!");
if (inverse_mapping.find(bswap_32(src[1])) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!");
std::string key(inverse_mapping[bswap_32(src[0])]);
#if BOOST_ENDIAN_BIG_BYTE && !BOOST_ENDIAN_LITTLE_BYTE
if (inverse_mapping.find(boost::endian::endian_reverse(src[0])) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!");
if (inverse_mapping.find(boost::endian::endian_reverse(src[1])) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!");
std::string key(inverse_mapping[boost::endian::endian_reverse(src[0])]);
key.append(":");
key.append(inverse_mapping[bswap_32(src[1])]);
#else
key.append(inverse_mapping[boost::endian::endian_reverse(src[1])]);
#elif !BOOST_ENDIAN_BIG_BYTE && BOOST_ENDIAN_LITTLE_BYTE
if (inverse_mapping.find(src[0]) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!");
if (inverse_mapping.find(src[1]) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!");
std::string key(inverse_mapping[src[0]]);
key.append(":");
key.append(inverse_mapping[src[1]]);
#else
#error Unknown endianness
#endif
e[key.c_str()] = Rcpp::wrap((int) retval);
inverse_mapping[retval] = key;
Expand Down
11 changes: 6 additions & 5 deletions src/hash_internal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

#include <cstring>
#include <deque>
#include <boost/detail/endian.hpp>
#include <boost/algorithm/string.hpp>
#include <Rcpp.h>
#include "hash_function.h"
Expand Down Expand Up @@ -52,12 +51,14 @@ IntegerVector h(CharacterVector src) {

MH_UINT32 interaction(MH_UINT32 a, MH_UINT32 b) {
MH_UINT32 buf[2];
#ifdef BOOST_BIG_ENDIAN
buf[0] = bswap_32(a);
buf[1] = bswap_32(b);
#else
#if BOOST_ENDIAN_BIG_BYTE && !BOOST_ENDIAN_LITTLE_BYTE
buf[0] = boost::endian::endian_reverse(a);
buf[1] = boost::endian::endian_reverse(b);
#elif !BOOST_ENDIAN_BIG_BYTE && BOOST_ENDIAN_LITTLE_BYTE
buf[0] = a;
buf[1] = b;
#else
#error Unknown endianness
#endif
return PMurHash32(MURMURHASH3_H_SEED, reinterpret_cast<char*>(buf), sizeof(MH_UINT32) * 2);
}
Expand Down
7 changes: 5 additions & 2 deletions src/hashed_model_matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ const ConvertersVec get_converters(
} else {
pVectorConverter q(*retval.rbegin());
Param param("", _h_main, _h_binary, hash_size);
#ifdef NOISY_DEBUG
Rprintf("Initialize InteractionConverter\n");
#endif
*retval.rbegin() = pInteractionConverter(new InteractionConverter(q, p, param));
}
}
Expand Down Expand Up @@ -222,7 +225,7 @@ SEXP hashed_model_matrix(RObject tf, DataFrameLike data, unsigned long hash_size
});
Rprintf("\n");
#endif
std::for_each(i_origin.begin(), i_origin.end(), [&ivec, &xvec, &hash_size](uint32_t hashed_value) {
std::for_each(i_origin.begin(), i_origin.end(), [&ivec](uint32_t hashed_value) {
ivec.push_back(hashed_value);
});
xvec.insert(xvec.end(), x_origin.begin(), x_origin.end());
Expand All @@ -248,7 +251,7 @@ SEXP hashed_model_matrix(RObject tf, DataFrameLike data, unsigned long hash_size
const std::vector<uint32_t>& i_origin(p->get_feature(i));
const std::vector<double>& x_origin(p->get_value(i));
auto x_value = x_origin.begin();
std::for_each(i_origin.begin(), i_origin.end(), [&cache, &hash_size, &x_value, &i](uint32_t hashed_value) {
std::for_each(i_origin.begin(), i_origin.end(), [&cache, &x_value, &i](uint32_t hashed_value) {
std::pair< std::vector<int>, std::vector<double> >& k(cache[hashed_value]);
k.first.push_back(i);
k.second.push_back(*(x_value++));
Expand Down
1 change: 0 additions & 1 deletion src/hashed_model_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

#include <cstring>
#include <memory>
#include <boost/detail/endian.hpp>
#include <Rcpp.h>
#include "hash_function.h"
#include "vector_converter.h"
Expand Down
23 changes: 9 additions & 14 deletions src/intToRaw.cpp
Original file line number Diff line number Diff line change
@@ -1,16 +1,9 @@
#include <Rcpp.h>
#include <boost/detail/endian.hpp>
#include <boost/predef/other/endian.h>
#include <boost/endian/conversion.hpp>

#ifdef linux
#include <byteswap.h>
#endif

#ifndef bswap_32
#ifdef __APPLE__
#include <libkern/OSByteOrder.h>
#else
uint32_t bswap_32(uint32_t x);
#endif
#ifndef BOOST_ENDIAN_BIG_BYTE
#error No BOOST_ENDIAN_BIG_BYTE
#endif

using namespace Rcpp;
Expand All @@ -23,10 +16,12 @@ using namespace Rcpp;
SEXP intToRaw(int src) {
RawVector retval(4);
uint32_t *p = (uint32_t*) &retval[0];
#ifdef BOOST_BIG_ENDIAN
p[0] = bswap_32(src);
#else
#if BOOST_ENDIAN_BIG_BYTE && !BOOST_ENDIAN_LITTLE_BYTE
p[0] = boost::endian::endian_reverse(src);
#elif !BOOST_ENDIAN_BIG_BYTE && BOOST_ENDIAN_LITTLE_BYTE
p[0] = src;
#else
#error Unknown endianness
#endif
return retval;
}
8 changes: 5 additions & 3 deletions vignettes/FeatureHashing.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,9 @@ library(glmnet)
cv.g.lr <- cv.glmnet(m.train, ipinyou.train$IsClick,
family = "binomial")#, type.measure = "auc")
p.lr <- predict(cv.g.lr, m.test, s="lambda.min")
auc(ipinyou.test$IsClick, p.lr)

library(pROC)
auc(ipinyou.test$IsClick, c(p.lr))
```

## Gradient Boosted Decision Tree with [`xgboost`](https://cran.r-project.org/package=xgboost)
Expand All @@ -103,9 +105,9 @@ Following the script above,
# GBDT with xgboost
if(require("xgboost")){
cv.g.gdbt <- xgboost(m.train, ipinyou.train$IsClick, max.depth=7, eta=0.1, subsample = 0.7, colsample_bytree = 0.7,
nround = 100, objective = "binary:logistic", verbose = ifelse(interactive(), 1, 0))
nround = 10, objective = "binary:logistic", verbose = ifelse(interactive(), 1, 0))
p.lm <- predict(cv.g.gdbt, m.test)
glmnet::auc(ipinyou.test$IsClick, p.lm)
auc(ipinyou.test$IsClick, p.lm)
}

```
Expand Down