SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
HashedDocDotFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
13 #include <shogun/lib/Hash.h>
15 
16 namespace shogun
17 {
19  CTokenizer* tzer, bool normalize, int32_t n_grams, int32_t skips, int32_t size) : CDotFeatures(size)
20 {
21  if (n_grams < 1)
22  n_grams = 1;
23 
24  if ( (n_grams==1 && skips!=0) || (skips<0))
25  skips = 0;
26 
27  init(hash_bits, docs, tzer, normalize, n_grams, skips);
28 }
29 
31 : CDotFeatures(orig)
32 {
33  init(orig.num_bits, orig.doc_collection, orig.tokenizer, orig.should_normalize,
34  orig.ngrams, orig.tokens_to_skip);
35 }
36 
38 {
	// NOTE(review): the destructor signature (embedded line 37) and body
	// (embedded line 39) are missing from this extracted listing; the body
	// presumably releases the references taken in init() on doc_collection
	// and/or tokenizer -- confirm against the original source file.
40 }
41 
42 void CHashedDocDotFeatures::init(int32_t hash_bits, CStringFeatures<char>* docs,
43  CTokenizer* tzer, bool normalize, int32_t n_grams, int32_t skips)
44 {
45  num_bits = hash_bits;
46  ngrams = n_grams;
47  tokens_to_skip = skips;
48  doc_collection = docs;
49  tokenizer = tzer;
50  should_normalize = normalize;
51 
52  if (!tokenizer)
53  {
55  ((CDelimiterTokenizer* )tokenizer)->init_for_whitespace();
56  }
57 
58  SG_ADD(&num_bits, "num_bits", "Number of bits of hash", MS_NOT_AVAILABLE);
59  SG_ADD(&ngrams, "ngrams", "Number of tokens to combine for quadratic feature support",
61  SG_ADD(&tokens_to_skip, "tokens_to_skip", "Number of tokens to skip when combining features",
63  SG_ADD((CSGObject**) &doc_collection, "doc_collection", "Document collection",
65  SG_ADD((CSGObject**) &tokenizer, "tokenizer", "Document tokenizer",
67  SG_ADD(&should_normalize, "should_normalize", "Normalize or not the dot products",
69 
72 }
73 
75 {
78 }
79 
81 {
82  return CMath::pow(2, num_bits);
83 }
84 
85 float64_t CHashedDocDotFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
86 {
87  ASSERT(df)
88  ASSERT(df->get_name() == get_name())
89 
91 
93  SGVector<char> sv2 = hddf->doc_collection->get_feature_vector(vec_idx2);
94 
97  SGSparseVector<float64_t> cv1 = converter->apply(sv1);
98  SGSparseVector<float64_t> cv2 = converter->apply(sv2);
100 
101  doc_collection->free_feature_vector(sv1, vec_idx1);
102  hddf->doc_collection->free_feature_vector(sv2, vec_idx2);
103  SG_UNREF(converter);
104 
105  return result;
106 }
107 
109 {
110  return dense_dot(vec_idx1, vec2.vector, vec2.vlen);
111 }
112 
113 float64_t CHashedDocDotFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
114 {
115  ASSERT(vec2_len == CMath::pow(2,num_bits))
116 
118 
122  index_t hashes_start = 0;
123  index_t hashes_end = 0;
124  int32_t len = hashes.vlen - 1;
125 
128  SGVector<index_t> hashed_indices((ngrams-1)*(tokens_to_skip+1) + 1);
129 
130  float64_t result = 0;
131  CTokenizer* local_tzer = tokenizer->get_copy();
132 
134  const int32_t seed = 0xdeadbeaf;
135  local_tzer->set_text(sv);
136  index_t start = 0;
137  while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
138  {
139  index_t end = local_tzer->next_token_idx(start);
140  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
141  hashes[hashes_end++] = token_hash;
142  }
143 
145  while (local_tzer->has_next())
146  {
147  index_t end = local_tzer->next_token_idx(start);
148  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
149  hashes[hashes_end] = token_hash;
150 
151  CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start, len, hashed_indices,
153 
154  for (index_t i=0; i<hashed_indices.vlen; i++)
155  result += vec2[hashed_indices[i]];
156 
157  hashes_start++;
158  hashes_end++;
159  if (hashes_end==hashes.vlen)
160  hashes_end = 0;
161  if (hashes_start==hashes.vlen)
162  hashes_start = 0;
163  }
164 
165  if (ngrams>1)
166  {
167  while (hashes_start!=hashes_end)
168  {
169  len--;
170  index_t max_idx = CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start,
171  len, hashed_indices, num_bits, ngrams, tokens_to_skip);
172 
173  for (index_t i=0; i<max_idx; i++)
174  result += vec2[hashed_indices[i]];
175 
176  hashes_start++;
177  if (hashes_start==hashes.vlen)
178  hashes_start = 0;
179  }
180  }
181  doc_collection->free_feature_vector(sv, vec_idx1);
182  SG_UNREF(local_tzer);
183  return should_normalize ? result / CMath::sqrt((float64_t) sv.size()) : result;
184 }
185 
187  float64_t* vec2, int32_t vec2_len, bool abs_val)
188 {
189  ASSERT(vec2_len == CMath::pow(2,num_bits))
190 
191  if (abs_val)
192  alpha = CMath::abs(alpha);
193 
195  const float64_t value = should_normalize ? alpha / CMath::sqrt((float64_t) sv.size()) : alpha;
196 
200  index_t hashes_start = 0;
201  index_t hashes_end = 0;
202  index_t len = hashes.vlen - 1;
203 
206  SGVector<index_t> hashed_indices((ngrams-1)*(tokens_to_skip+1) + 1);
207 
208  CTokenizer* local_tzer = tokenizer->get_copy();
209 
211  const int32_t seed = 0xdeadbeaf;
212  local_tzer->set_text(sv);
213  index_t start = 0;
214  while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
215  {
216  index_t end = local_tzer->next_token_idx(start);
217  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
218  hashes[hashes_end++] = token_hash;
219  }
220 
221  while (local_tzer->has_next())
222  {
223  index_t end = local_tzer->next_token_idx(start);
224  uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
225  hashes[hashes_end] = token_hash;
226 
227  CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start, len, hashed_indices,
229 
230  for (index_t i=0; i<hashed_indices.vlen; i++)
231  vec2[hashed_indices[i]] += value;
232 
233  hashes_start++;
234  hashes_end++;
235  if (hashes_end==hashes.vlen)
236  hashes_end = 0;
237  if (hashes_start==hashes.vlen)
238  hashes_start = 0;
239  }
240 
241  if (ngrams>1)
242  {
243  while (hashes_start!=hashes_end)
244  {
245  len--;
247  hashes_start, len, hashed_indices, num_bits, ngrams, tokens_to_skip);
248 
249  for (index_t i=0; i<max_idx; i++)
250  vec2[hashed_indices[i]] += value;
251 
252  hashes_start++;
253  if (hashes_start==hashes.vlen)
254  hashes_start = 0;
255  }
256  }
257 
258  doc_collection->free_feature_vector(sv, vec_idx1);
259  SG_UNREF(local_tzer);
260 }
261 
263  int32_t length, int32_t num_bits, uint32_t seed)
264 {
265  int32_t hash = CHash::MurmurHash3((uint8_t* ) token, length, seed);
266  return hash & ((1 << num_bits) - 1);
267 }
268 
270 {
272  doc_collection = docs;
273 }
274 
276 {
278  int32_t num_nnz_features = sv.size();
280  return num_nnz_features;
281 }
282 
284 {
286  return NULL;
287 }
288 
289 bool CHashedDocDotFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
290 {
292  return false;
293 }
294 
296 {
298 }
299 
301 {
302  return "HashedDocDotFeatures";
303 }
304 
306 {
307  return new CHashedDocDotFeatures(*this);
308 }
309 
311 {
312  return F_UINT;
313 }
314 
316 {
317  return C_SPARSE;
318 }
319 
321 {
	// NOTE(review): this member's signature (embedded line 320) and body
	// (embedded line 322) were lost during extraction and cannot be
	// identified from the visible cross-references; restore them from the
	// original HashedDocDotFeatures.cpp.
323 }
324 }
virtual const char * get_name() const =0
SGVector< ST > get_feature_vector(int32_t num)
virtual bool get_next_feature(int32_t &index, float64_t &value, void *iterator)
T sparse_dot(const SGSparseVector< T > &v)
int32_t index_t
Definition: common.h:62
virtual void free_feature_iterator(void *iterator)
virtual void set_text(SGVector< char > txt)
Definition: Tokenizer.cpp:17
virtual int32_t get_num_vectors() const
static index_t generate_ngram_hashes(SGVector< uint32_t > &hashes, index_t hashes_start, index_t len, SGVector< index_t > &ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
virtual EFeatureClass get_feature_class() const
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:139
virtual int32_t get_dim_feature_space() const
virtual float64_t dense_dot(int32_t vec_idx1, const float64_t *vec2, int32_t vec2_len)
virtual void * get_feature_iterator(int32_t vector_index)
virtual const char * get_name() const
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
Features that support dot products among other operations.
Definition: DotFeatures.h:44
#define SG_REF(x)
Definition: SGObject.h:51
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
static uint32_t MurmurHash3(uint8_t *data, int32_t len, uint32_t seed)
Definition: Hash.cpp:366
virtual float64_t dense_dot_sgvec(int32_t vec_idx1, const SGVector< float64_t > vec2)
int32_t size() const
Definition: SGVector.h:115
index_t vlen
Definition: SGVector.h:494
This class can be used to convert a document collection contained in a CStringFeatures&lt;char&gt; object into a hashed Bag-of-Words representation. (Brief truncated in extraction; see the CHashedDocConverter class documentation for the full description.)
virtual void add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t *vec2, int32_t vec2_len, bool abs_val=false)
#define ASSERT(x)
Definition: SGIO.h:201
CStringFeatures< char > * doc_collection
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
Definition: Tokenizer.h:29
double float64_t
Definition: common.h:50
A File access base class.
Definition: File.h:34
virtual CFeatures * duplicate() const
virtual float64_t dot(int32_t vec_idx1, CDotFeatures *df, int32_t vec_idx2)
virtual bool has_next()=0
static uint32_t calculate_token_hash(char *token, int32_t length, int32_t num_bits, uint32_t seed)
void set_doc_collection(CStringFeatures< char > *docs)
virtual int32_t get_num_vectors() const
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
#define SG_UNREF(x)
Definition: SGObject.h:52
CHashedDocDotFeatures(int32_t hash_bits=0, CStringFeatures< char > *docs=NULL, CTokenizer *tzer=NULL, bool normalize=true, int32_t n_grams=1, int32_t skips=0, int32_t size=0)
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
The class CDelimiterTokenizer is used to tokenize a SGVector into tokens using custom chars as ...
virtual index_t next_token_idx(index_t &start)=0
The class Features is the base class of all feature objects.
Definition: Features.h:68
virtual CTokenizer * get_copy()=0
virtual EFeatureType get_feature_type() const
#define SG_ADD(...)
Definition: SGObject.h:81
static float32_t sqrt(float32_t x)
Definition: Math.h:459
virtual CFeatures * apply(CFeatures *features)
This class can be used to provide on-the-fly vectorization of a document collection. Like in the standard Bag-of-Words representation, this class considers each document as a collection of tokens, which are then hashed into a new feature space of a specified dimension. This class is very flexible and allows the user to specify the tokenizer used to tokenize each document, specify whether the results should be normalized with regards to the sqrt of the document size, as well as to specify whether he wants to combine different tokens. The latter implements a k-skip n-grams approach, meaning that you can combine up to n tokens, while skipping up to k. Eg. for the tokens ["a", "b", "c", "d"], with n_grams = 2 and skips = 2, one would get the following combinations : ["a", "ab", "ac" (skipped 1), "ad" (skipped 2), "b", "bc", "bd" (skipped 1), "c", "cd", "d"].
virtual int32_t get_nnz_features_for_vector(int32_t num)
static int32_t pow(bool x, int32_t n)
Definition: Math.h:535
static T abs(T a)
Definition: Math.h:179

SHOGUN Machine Learning Toolbox - Documentation