en/latest/HashedDocConverter_8cpp_source.html

 /*

  * This program is free software; you can redistribute it and/or modify

  * it under the terms of the GNU General Public License as published by

  * the Free Software Foundation; either version 3 of the License, or

  * (at your option) any later version.

  *

  * Written (W) 2013 Evangelos Anagnostopoulos

  * Copyright (C) 2013 Evangelos Anagnostopoulos

  */


 #include <shogun/converter/HashedDocConverter.h>

 #include <shogun/lib/DelimiterTokenizer.h>

 #include <shogun/lib/Hash.h>

 #include <shogun/lib/DynamicArray.h>

 #include <shogun/features/StringFeatures.h>

 #include <shogun/features/hashed/HashedDocDotFeatures.h>

 #include <shogun/mathematics/Math.h>


 using namespace shogun;


 namespace shogun

 {

 /** Default constructor.
  *
  * Delegates to init() with no tokenizer (init() then builds its own
  * delimiter tokenizer), 16 hash bits, normalization disabled, 1-grams and
  * no skipped tokens.
  */
 CHashedDocConverter::CHashedDocConverter() : CConverter()
 {
     const int32_t default_hash_bits = 16;
     const bool default_normalize = false;
     const int32_t default_n_grams = 1;
     const int32_t default_skips = 0;

     init(NULL, default_hash_bits, default_normalize, default_n_grams, default_skips);
 }


 /** Constructor without an explicit tokenizer.
  *
  * @param hash_bits stored as the number of hash bits (target dimension is 2^hash_bits)
  * @param normalize whether the produced vectors are normalized
  * @param n_grams stored as the n-gram setting
  * @param skips stored as the number of tokens to skip
  *
  * Passes NULL as tokenizer, so init() creates its own delimiter tokenizer.
  */
 CHashedDocConverter::CHashedDocConverter(
     int32_t hash_bits,
     bool normalize,
     int32_t n_grams,
     int32_t skips)
 : CConverter()
 {
     init(NULL, hash_bits, normalize, n_grams, skips);
 }


 /** Constructor with a caller-supplied tokenizer.
  *
  * @param tzer tokenizer to use; when NULL, init() creates its own delimiter tokenizer
  * @param hash_bits stored as the number of hash bits (target dimension is 2^hash_bits)
  * @param normalize whether the produced vectors are normalized
  * @param n_grams stored as the n-gram setting
  * @param skips stored as the number of tokens to skip
  */
 CHashedDocConverter::CHashedDocConverter(
     CTokenizer* tzer,
     int32_t hash_bits,
     bool normalize,
     int32_t n_grams,
     int32_t skips)
 : CConverter()
 {
     init(tzer, hash_bits, normalize, n_grams, skips);
 }


 /** Destructor.
  *
  * Drops the reference on the tokenizer that init() took with SG_REF.
  */
 CHashedDocConverter::~CHashedDocConverter()
 {
     SG_UNREF(tokenizer);
 }


 /** Shared initialization used by every constructor.
  *
  * Stores the configuration, selects the tokenizer and registers all
  * members with the parameter framework.
  *
  * @param tzer tokenizer to use; when NULL a CDelimiterTokenizer splitting
  *        on space and tab is created
  * @param hash_bits value stored in num_bits
  * @param normalize value stored in should_normalize
  * @param n_grams value stored in ngrams
  * @param skips value stored in tokens_to_skip
  */
 void CHashedDocConverter::init(CTokenizer* tzer, int32_t hash_bits, bool normalize,
     int32_t n_grams, int32_t skips)
 {
     num_bits = hash_bits;
     should_normalize = normalize;
     ngrams = n_grams;
     tokens_to_skip = skips;

     if (tzer!=NULL)
     {
         tokenizer = tzer;
     }
     else
     {
         /* no tokenizer supplied: fall back to splitting on ' ' and '\t' */
         CDelimiterTokenizer* default_tokenizer = new CDelimiterTokenizer();
         default_tokenizer->delimiters[(uint8_t) ' '] = 1;
         default_tokenizer->delimiters[(uint8_t) '\t'] = 1;
         tokenizer = default_tokenizer;
     }

     /* one reference is held in both cases; the destructor releases it */
     SG_REF(tokenizer);

     SG_ADD(&num_bits, "num_bits", "Number of bits of the hash",
         MS_NOT_AVAILABLE);
     SG_ADD(&ngrams, "ngrams", "Number of consecutive tokens",
         MS_NOT_AVAILABLE);
     SG_ADD(&tokens_to_skip, "tokens_to_skip", "Number of tokens to skip",
         MS_NOT_AVAILABLE);
     SG_ADD(&should_normalize, "should_normalize", "Whether to normalize vectors or not",
         MS_NOT_AVAILABLE);
     m_parameters->add((CSGObject**) &tokenizer, "tokenizer",
         "Tokenizer");
 }


 /** @return the name of this object, "HashedDocConverter" */
 const char* CHashedDocConverter::get_name() const
 {
     const char* const name = "HashedDocConverter";
     return name;
 }


 /** Converts string features into hashed sparse features.
  *
  * Every feature vector of the input is passed through
  * apply(SGVector<char>) and stored as one sparse vector of the result.
  *
  * @param features input features; must be non-NULL and report the name
  *        "StringFeatures", otherwise an error is raised via SG_ERROR
  * @return newly allocated CSparseFeatures<float64_t> with 2^num_bits
  *         features and one vector per input vector
  */
 CFeatures* CHashedDocConverter::apply(CFeatures* features)
 {
     ASSERT(features);
     if (strcmp(features->get_name(), "StringFeatures")!=0)
         SG_ERROR("CHashedDocConverter::apply() : CFeatures object passed is not of type CStringFeatures.");

     CStringFeatures<char>* s_features = (CStringFeatures<char>*) features;

     /* the dimension is 2^num_bits; the shift matches the mask built in
      * generate_ngram_hashes() and is only defined for 0 <= num_bits < 31 */
     ASSERT(num_bits>=0 && num_bits<31);
     const int32_t dim = 1 << num_bits;

     const index_t num_vectors = s_features->get_num_vectors();
     SGSparseMatrix<float64_t> matrix(dim, num_vectors);
     for (index_t vec_idx=0; vec_idx<num_vectors; vec_idx++)
     {
         SGVector<char> doc = s_features->get_feature_vector(vec_idx);
         matrix[vec_idx] = apply(doc);
         s_features->free_feature_vector(doc, vec_idx);
     }

     return (CFeatures*) new CSparseFeatures<float64_t>(matrix);
 }


 /** Converts a single document into its hashed sparse representation.
  *
  * The document is tokenized, every token is hashed with MurmurHash3 and
  * the token hashes are combined into (skip) n-gram indices through
  * generate_ngram_hashes(). Recent token hashes are kept in a ring buffer
  * of ngrams+tokens_to_skip entries, so that all n-grams starting at the
  * oldest cached token can be produced once the buffer is full.
  *
  * @param document document to convert; must not be empty
  * @return sparse vector holding, for every distinct index, how often it
  *         was produced; each entry is divided by sqrt(document.size())
  *         when should_normalize is set
  */
 SGSparseVector<float64_t> CHashedDocConverter::apply(SGVector<char> document)
 {
     ASSERT(document.size()>0);
     const int32_t array_size = 1024*1024;
     CDynamicArray<uint32_t> hashed_indices(array_size);

     /* ring buffer with the hashes of the most recent tokens */
     SGVector<uint32_t> cached_hashes(ngrams+tokens_to_skip);
     index_t hashes_start = 0;
     index_t hashes_end = 0;
     /* number of tokens following the start token while the buffer is full */
     const int32_t full_len = cached_hashes.vlen - 1;

     SGVector<index_t> ngram_indices((ngrams-1)*(tokens_to_skip+1) + 1);

     /* MurmurHash3 takes an unsigned seed and 0xdeadbeaf does not fit into
      * int32_t; the value passed to the hash is unchanged */
     const uint32_t seed = 0xdeadbeaf;
     tokenizer->set_text(document);
     index_t token_start = 0;

     /* pre-fill the buffer with all but its last slot */
     while (hashes_end<ngrams-1+tokens_to_skip && tokenizer->has_next())
     {
         index_t end = tokenizer->next_token_idx(token_start);
         uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &document.vector[token_start],
                 end-token_start, seed);
         cached_hashes[hashes_end++] = token_hash;
     }

     /* steady state: each new token completes the window, then the window
      * slides forward by one token */
     while (tokenizer->has_next())
     {
         index_t end = tokenizer->next_token_idx(token_start);
         uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &document.vector[token_start],
                 end-token_start, seed);
         cached_hashes[hashes_end] = token_hash;

         CHashedDocConverter::generate_ngram_hashes(cached_hashes, hashes_start, full_len,
                 ngram_indices, num_bits, ngrams, tokens_to_skip);

         for (index_t i=0; i<ngram_indices.vlen; i++)
             hashed_indices.append_element(ngram_indices[i]);

         hashes_start++;
         hashes_end++;
         if (hashes_end==cached_hashes.vlen)
             hashes_end = 0;
         if (hashes_start==cached_hashes.vlen)
             hashes_start = 0;
     }

     /* NOTE(review): with ngrams==1 and tokens_to_skip>0 the tokens still
      * cached at this point are never emitted, because the flush below is
      * skipped - confirm whether that is intended */
     if (ngrams>1)
     {
         /* flush the tokens still cached, shrinking the window each time */
         while (hashes_start!=hashes_end)
         {
             /* Derive the window length from the number of tokens actually
              * cached. For a full buffer this yields vlen-2, vlen-3, ...
              * exactly as a plain countdown would. For documents with fewer
              * than ngrams+tokens_to_skip-1 tokens it prevents
              * generate_ngram_hashes() from reading slots never written. */
             index_t remaining = hashes_end - hashes_start;
             if (remaining<0)
                 remaining += cached_hashes.vlen;
             const index_t len = remaining - 1;

             index_t max_idx = CHashedDocConverter::generate_ngram_hashes(cached_hashes, hashes_start,
                     len, ngram_indices, num_bits, ngrams, tokens_to_skip);

             for (index_t i=0; i<max_idx; i++)
                 hashed_indices.append_element(ngram_indices[i]);

             hashes_start++;
             if (hashes_start==cached_hashes.vlen)
                 hashes_start = 0;
         }
     }

     SGSparseVector<float64_t> sparse_doc_rep = create_hashed_representation(hashed_indices);

     if (should_normalize)
     {
         float64_t norm_const = CMath::sqrt((float64_t) document.size());
         for (index_t i=0; i<sparse_doc_rep.num_feat_entries; i++)
             sparse_doc_rep.features[i].entry /= norm_const;
     }

     return sparse_doc_rep;
 }


 /** Builds a sparse vector of occurrence counts from a list of indices.
  *
  * count_distinct_indices() sorts the array in place and reports how many
  * distinct values it holds. Each run of equal values then becomes one
  * sparse entry whose value is the length of that run.
  *
  * @param hashed_indices indices to aggregate; sorted in place as a side effect
  * @return sparse vector with one entry per distinct index
  */
 SGSparseVector<float64_t> CHashedDocConverter::create_hashed_representation(CDynamicArray<uint32_t>& hashed_indices)
 {
     int32_t num_nnz_features = count_distinct_indices(hashed_indices);
     SGSparseVector<float64_t> sparse_doc_rep(num_nnz_features);

     const index_t total = hashed_indices.get_num_elements();
     index_t out_pos = 0;
     index_t in_pos = 0;
     while (in_pos<total)
     {
         const uint32_t current = hashed_indices[in_pos];
         sparse_doc_rep.features[out_pos].feat_index = current;
         sparse_doc_rep.features[out_pos].entry = 1;
         in_pos++;

         /* absorb the remaining duplicates of the current index */
         while (in_pos<total && hashed_indices[in_pos]==current)
         {
             sparse_doc_rep.features[out_pos].entry++;
             in_pos++;
         }
         out_pos++;
     }
     return sparse_doc_rep;
 }


 /** Generates the hashed indices of all (skip) n-grams starting at one token.
  *
  * The first output is the hash of the start token alone. For every
  * n in [1, ngrams) and every skip s in [0, tokens_to_skip], the hash of
  * the start token is XOR-ed with the hashes at offsets 1+s .. n+s from
  * hashes_start (indices wrap around modulo hashes.vlen). Combinations
  * reaching beyond len are left out. Every result is reduced to its
  * lowest num_bits bits.
  *
  * @param hashes ring buffer of token hashes
  * @param hashes_start position of the start token in hashes
  * @param len largest offset from hashes_start that may be used
  * @param ngram_hashes output buffer receiving the indices
  * @param num_bits number of low bits kept of each hash
  * @param ngrams upper bound (exclusive) on n
  * @param tokens_to_skip largest skip s that is tried
  * @return number of indices written to ngram_hashes
  */
 index_t CHashedDocConverter::generate_ngram_hashes(SGVector<uint32_t>& hashes, index_t hashes_start,
     index_t len, SGVector<index_t>& ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
 {
     const uint32_t mask = (1 << num_bits) - 1;

     index_t written = 0;
     ngram_hashes[written++] = hashes[hashes_start] & mask;

     for (index_t n=1; n<ngrams; n++)
     {
         /* stop trying larger skips once the window is exceeded */
         for (index_t s=0; s<=tokens_to_skip && n+s<=len; s++)
         {
             const index_t first = hashes_start+1+s;
             const index_t last = hashes_start+n+s;

             uint32_t combined = hashes[hashes_start];
             for (index_t i=first; i<=last; i++)
                 combined ^= hashes[i % hashes.vlen];

             ngram_hashes[written++] = combined & mask;
         }
     }
     return written;
 }


 /** Sorts the given indices in place and counts the distinct values.
  *
  * @param hashed_indices indices to inspect; sorted as a side effect
  * @return number of distinct values in hashed_indices
  */
 int32_t CHashedDocConverter::count_distinct_indices(CDynamicArray<uint32_t>& hashed_indices)
 {
     const index_t total = hashed_indices.get_num_elements();
     CMath::qsort(hashed_indices.get_array(), total);

     /* after sorting, a new distinct value starts wherever an element
      * differs from its predecessor */
     int32_t distinct = 0;
     for (index_t i=0; i<total; i++)
     {
         if (i==0 || hashed_indices[i]!=hashed_indices[i-1])
             distinct++;
     }
     return distinct;
 }


 void CHashedDocConverter::set_normalization(bool normalize)

 {

     should_normalize = normalize;

 }


 /** Sets the k-skip n-gram configuration.
  *
  * @param k new value of tokens_to_skip
  * @param n new value of ngrams
  */
 void CHashedDocConverter::set_k_skip_n_grams(int32_t k, int32_t n)
 {
     this->ngrams = n;
     this->tokens_to_skip = k;
 }

 }

shogun::CSGObject::get_name
virtual const char * get_name() const =0

shogun::CConverter
class Converter used to convert data
Definition: Converter.h:26

shogun::CStringFeatures::get_feature_vector
SGVector< ST > get_feature_vector(int32_t num)
Definition: StringFeatures.cpp:223

shogun::CHashedDocConverter::~CHashedDocConverter
virtual ~CHashedDocConverter()
Definition: HashedDocConverter.cpp:40

shogun::CHashedDocConverter::init
void init(CTokenizer *tzer, int32_t d, bool normalize, int32_t n_grams, int32_t skips)
Definition: HashedDocConverter.cpp:45

shogun::SGSparseMatrix< float64_t >

shogun::CHashedDocConverter::tokenizer
CTokenizer * tokenizer
Definition: HashedDocConverter.h:148

shogun::CStringFeatures< char >

Math.h

index_t
int32_t index_t
Definition: common.h:62

shogun::CTokenizer::set_text
virtual void set_text(SGVector< char > txt)
Definition: Tokenizer.cpp:17

shogun::CStringFeatures::get_num_vectors
virtual int32_t get_num_vectors() const
Definition: StringFeatures.cpp:420

shogun::CHashedDocConverter::generate_ngram_hashes
static index_t generate_ngram_hashes(SGVector< uint32_t > &hashes, index_t hashes_start, index_t len, SGVector< index_t > &ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
Definition: HashedDocConverter.cpp:205

shogun::CHashedDocConverter::ngrams
int32_t ngrams
Definition: HashedDocConverter.h:154

shogun::CHashedDocConverter::count_distinct_indices
int32_t count_distinct_indices(CDynamicArray< uint32_t > &hashed_indices)
Definition: HashedDocConverter.cpp:228

shogun::CSparseFeatures< float64_t >

shogun::CFeatures::get_num_vectors
virtual int32_t get_num_vectors() const =0

shogun::CDynamicArray::get_array
T * get_array() const
Definition: DynamicArray.h:408

StringFeatures.h

SG_ERROR
#define SG_ERROR(...)
Definition: SGIO.h:129

shogun::CSGObject::m_parameters
Parameter * m_parameters
Definition: SGObject.h:546

shogun::CStringFeatures::free_feature_vector
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
Definition: StringFeatures.cpp:357

shogun::CDynamicArray
Template Dynamic array class that creates an array that can be used like a list or an array...
Definition: DynArray.h:22

Hash.h

SG_REF
#define SG_REF(x)
Definition: SGObject.h:54

shogun::SGSparseVector::num_feat_entries
index_t num_feat_entries
Definition: SGSparseVector.h:212

shogun::CHashedDocConverter::tokens_to_skip
int32_t tokens_to_skip
Definition: HashedDocConverter.h:157

shogun::CHash::MurmurHash3
static uint32_t MurmurHash3(uint8_t *data, int32_t len, uint32_t seed)
Definition: Hash.cpp:366

shogun::MS_NOT_AVAILABLE
Definition: SGObject.h:92

shogun::CMath::qsort
static void qsort(T *output, int32_t size)
Definition: Math.h:1313

shogun::SGSparseVectorEntry::feat_index
index_t feat_index
Definition: SGSparseVector.h:30

HashedDocDotFeatures.h

shogun::SGVector::size
int32_t size() const
Definition: SGVector.h:113

shogun::Parameter::add
void add(bool *param, const char *name, const char *description="")
Definition: Parameter.cpp:37

shogun::CHashedDocConverter::get_name
virtual const char * get_name() const
Definition: HashedDocConverter.cpp:76

shogun::SGVector::vlen
index_t vlen
Definition: SGVector.h:494

shogun::SGVector::vector
T * vector
Definition: SGVector.h:492

ASSERT
#define ASSERT(x)
Definition: SGIO.h:201

DelimiterTokenizer.h

shogun::CSGObject
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:115

shogun::CHashedDocConverter::should_normalize
bool should_normalize
Definition: HashedDocConverter.h:151

shogun::CHashedDocConverter::create_hashed_representation
SGSparseVector< float64_t > create_hashed_representation(CDynamicArray< uint32_t > &hashed_indices)
Definition: HashedDocConverter.cpp:184

shogun::SGVector< char >

shogun::CTokenizer
The class CTokenizer acts as a base class in order to implement tokenizers. Sub-classes must implemen...
Definition: Tokenizer.h:29

float64_t
double float64_t
Definition: common.h:50

shogun::SGSparseVectorEntry::entry
T entry
Definition: SGSparseVector.h:32

shogun::CTokenizer::has_next
virtual bool has_next()=0

shogun::SGSparseVector::features
SGSparseVectorEntry< T > * features
Definition: SGSparseVector.h:215

shogun::CHashedDocConverter::CHashedDocConverter
CHashedDocConverter()
Definition: HashedDocConverter.cpp:23

SG_UNREF
#define SG_UNREF(x)
Definition: SGObject.h:55

shogun
all classes and functions are contained in the shogun namespace
Definition: class_list.h:18

shogun::CDelimiterTokenizer
The class CDelimiterTokenizer is used to tokenize a SGVector into tokens using custom chars as ...
Definition: DelimiterTokenizer.h:29

shogun::CDelimiterTokenizer::delimiters
SGVector< bool > delimiters
Definition: DelimiterTokenizer.h:104

shogun::CTokenizer::next_token_idx
virtual index_t next_token_idx(index_t &start)=0

HashedDocConverter.h

shogun::CFeatures
The class Features is the base class of all feature objects.
Definition: Features.h:68

DynamicArray.h

shogun::SGSparseVector< float64_t >

shogun::CHashedDocConverter::set_k_skip_n_grams
void set_k_skip_n_grams(int32_t k, int32_t n)
Definition: HashedDocConverter.cpp:251

shogun::CDynamicArray::get_num_elements
int32_t get_num_elements() const
Definition: DynamicArray.h:200

SG_ADD
#define SG_ADD(...)
Definition: SGObject.h:84

shogun::CMath::sqrt
static float32_t sqrt(float32_t x)
Definition: Math.h:459

shogun::CHashedDocConverter::apply
virtual CFeatures * apply(CFeatures *features)
Definition: HashedDocConverter.cpp:81

shogun::CHashedDocConverter::set_normalization
void set_normalization(bool normalize)
Definition: HashedDocConverter.cpp:246

shogun::CMath::pow
static int32_t pow(bool x, int32_t n)
Definition: Math.h:535

shogun::CHashedDocConverter::num_bits
int32_t num_bits
Definition: HashedDocConverter.h:145