SHOGUN  v3.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StreamingHashedDocDotFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2013 Evangelos Anagnostopoulos
8  * Copyright (C) 2013 Evangelos Anagnostopoulos
9  */
10 
14 
15 using namespace shogun;
16 
18  bool is_labelled, int32_t size, CTokenizer* tzer, int32_t bits)
20 {
    // Forward everything to init(); the three trailing arguments fill init()'s
    // normalize (true), n_grams (1) and skips (0) parameters.
21  init(file, is_labelled, size, tzer, bits, true, 1, 0);
22 }
23 
25 {
    // Empty-state setup: no file, unlabelled, size 0, no tokenizer, 0 hash bits,
    // normalize=false, n_grams=1, skips=0. With a NULL tokenizer and NULL file,
    // init() leaves both converter and working_file set to NULL.
26  init(NULL, false, 0, NULL, 0, false, 1, 0);
27 }
28 
30  CStringFeatures<char>* dot_features, CTokenizer* tzer, int32_t bits, float64_t* lab)
32 {
    // Wrap the in-memory string features (and optional labels) in a streaming
    // file object.
    // NOTE(review): the listing dropped original line 33; presumably it declares
    // `file` and receives this new-expression - confirm against the real source.
34  new CStreamingFileFromStringFeatures<char>(dot_features, lab);
    // The stream counts as labelled exactly when a label array was supplied.
35  bool is_labelled = (lab != NULL);
    // Fixed value handed to init() as its `size` argument.
36  int32_t size=1024;
37 
38  init(file, is_labelled, size, tzer, bits, true, 1, 0);
39 
40  parser.set_free_vectors_on_destruct(false);
    // init() sets seekable to false whenever it receives a non-NULL file, so the
    // flag is switched back on here, after init() has run.
41  seekable= true;
42 }
// Shared initialisation routine called by the constructors.
//
// file        - streaming source; may be NULL, in which case the parser is not
//               initialised and working_file is set to NULL.
// is_labelled - stored in has_labels and forwarded to parser.init().
// size        - forwarded to parser.init() (only used when file is non-NULL).
// tzer        - tokenizer; may be NULL, in which case no converter is created.
// bits        - stored in num_bits and forwarded to the CHashedDocConverter.
// normalize, n_grams, skips - forwarded unchanged to the CHashedDocConverter.
43 void CStreamingHashedDocDotFeatures::init(CStreamingFile* file, bool is_labelled,
44  int32_t size, CTokenizer* tzer, int32_t bits, bool normalize, int32_t n_grams, int32_t skips)
45 {
46  num_bits = bits;
47  tokenizer = tzer;
    // A converter is only built when a tokenizer is available; otherwise
    // converter stays NULL and must not be dereferenced by later calls.
48  if (tokenizer)
49  {
    // NOTE(review): original line 50 is missing from this listing.
51  converter = new CHashedDocConverter(tzer, bits, normalize, n_grams, skips);
52  }
53  else
54  converter=NULL;
55 
    // SG_ADD is invoked for num_bits, tokenizer and converter under the string
    // names given below.
56  SG_ADD(&num_bits, "num_bits", "Number of bits for hash", MS_NOT_AVAILABLE);
57  SG_ADD((CSGObject** ) &tokenizer, "tokenizer", "The tokenizer used on the documents",
    // NOTE(review): original line 58, which closes the SG_ADD call above, is
    // missing from this listing.
59  SG_ADD((CSGObject** ) &converter, "converter", "Converter", MS_NOT_AVAILABLE);
60 
61  has_labels = is_labelled;
62  if (file)
63  {
64  working_file = file;
    // NOTE(review): original line 65 is missing from this listing.
66  parser.init(file, is_labelled, size);
    // Any stream initialised from a file starts out non-seekable; callers that
    // want a seekable stream have to set the flag again after init() returns.
67  seekable = false;
68  }
69  else
70  working_file = NULL;
71 
    // NOTE(review): original line 72 is missing from this listing.
73  parser.set_free_vector_after_release(false);
74 }
75 
77 {
    // Shut the parser down only if it is still running.
78  if (parser.is_running())
79  parser.end_parser();
    // NOTE(review): original lines 80-82 are missing from this listing, so any
    // further cleanup performed here is not visible.
83 }
84 
86 {
    // The other operand must be non-NULL.
87  ASSERT(df)
    // NOTE(review): if get_name() returns const char*, this compares pointer
    // values rather than string contents - confirm this is intended.
88  ASSERT(df->get_name() == get_name())
89 
    // NOTE(review): the statement defining `cdf` (original line 90) is missing
    // from this listing; presumably it is `df` cast to this class - confirm.
    // Result is the sparse dot product of the two objects' current vectors.
91  float32_t result = current_vector.sparse_dot(cdf->current_vector);
92  return result;
93 }
94 
96 {
    // The dense vector length must equal 2^num_bits.
97  ASSERT(vec2_len == CMath::pow(2, num_bits))
98 
    // Accumulate one product per stored entry of current_vector, reading vec2 at
    // that entry's feat_index.
99  float32_t result = 0;
100  for (index_t i=0; i<current_vector.num_feat_entries; i++)
101  {
102  result += vec2[current_vector.features[i].feat_index] *
    // NOTE(review): original line 103 (the right-hand operand of the product) is
    // missing from this listing.
104  }
105  return result;
106 }
107 
109  int32_t vec2_len, bool abs_val)
110 {
    // Use |alpha| when abs_val is set, alpha itself otherwise.
111  float32_t value = abs_val ? CMath::abs(alpha) : alpha;
112 
    // Iterates over every stored entry of current_vector.
    // NOTE(review): the loop body (original line 114) is missing from this
    // listing, so how `value` is applied is not visible here.
113  for (index_t i=0; i<current_vector.num_feat_entries; i++)
115 }
116 
118 {
    // Returns 2 raised to num_bits, computed through CMath::pow.
119  return CMath::pow(2, num_bits);
120 }
121 
123 {
    // Fixed identifier string for this class.
124  return "StreamingHashedDocDotFeatures";
125 }
126 
128 {
    // Returns a new heap-allocated object copy-constructed from *this.
129  return new CStreamingHashedDocDotFeatures(*this);
130 }
131 
133 {
    // Always reports the F_UINT constant.
134  return F_UINT;
135 }
136 
138 {
    // Always reports the C_STREAMING_SPARSE constant.
139  return C_STREAMING_SPARSE;
140 }
141 
143 {
    // Start the parser unless it is already running, so repeated calls do not
    // start it twice.
144  if (!parser.is_running())
145  parser.start_parser();
146 }
147 
149 {
    // Stops the parser unconditionally.
    // NOTE(review): unlike the guarded shutdown earlier in this file, there is no
    // is_running() check here - confirm end_parser() is safe on a stopped parser.
150  parser.end_parser();
151 }
152 
154 {
    // Ask the parser for the next raw document; tmp.vector, tmp.vlen and
    // current_label are passed as the outputs to fill (presumably by reference -
    // confirm against the parser's signature).
155  SGVector<char> tmp;
156  if (parser.get_next_example(tmp.vector,
157  tmp.vlen, current_label))
158  {
    // A fetched example must carry a non-NULL, non-empty buffer.
159  ASSERT(tmp.vector)
160  ASSERT(tmp.vlen > 0)
    // NOTE(review): original line 161 is missing from this listing; presumably it
    // converts tmp into current_vector - confirm against the real source.
162  return true;
163  }
    // The parser produced no example.
164  return false;
165 }
166 
168 {
    // Delegates to the parser's finalize_example().
169  parser.finalize_example();
170 }
171 
173 {
    // Returns 2^num_bits, with the CMath::pow result cast to int32_t.
174  return (int32_t) CMath::pow(2, num_bits);
175 }
176 
178 {
    // Returns current_label, the value passed to parser.get_next_example() when
    // an example is fetched.
179  return current_label;
180 }
181 
183 {
    // Always returns the constant 1.
184  return 1;
185 }
186 
188 {
    // Registers CStreamingFile::get_string as the parser's vector reader.
189  parser.set_read_vector(&CStreamingFile::get_string);
190 }
191 
193 {
    // Registers CStreamingFile::get_string_and_label as the parser's
    // vector-and-label reader.
194  parser.set_read_vector_and_label(&CStreamingFile::get_string_and_label);
195 }
196 
198 {
    // Returns the current_vector member.
199  return current_vector;
200 }
201 
203 {
    // Forwards the flag to the converter.
    // NOTE(review): init() sets converter to NULL when no tokenizer was supplied,
    // and there is no NULL check here - confirm callers cannot reach this with a
    // NULL converter.
204  converter->set_normalization(normalize);
205 }
206 
208 {
    // NOTE(review): the body (original line 209) is missing from this listing, so
    // what this function does cannot be determined from here.
210 }

SHOGUN Machine Learning Toolbox - Documentation