SHOGUN  4.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
HashedMultilabelModel.cpp
Go to the documentation of this file.
1 /*
2  * This software is distributed under BSD 3-clause license (see LICENSE file).
3  *
4  * Copyright(C) 2014 Abinash Panda
5  * Written(W) 2014 Abinash Panda
6  */
7 
10 #include <shogun/lib/Hash.h>
15 
16 using namespace shogun;
17 
20 {
    // Delegates all member registration and default initialisation to init(),
    // requesting a joint feature space dimension of 0.
    // NOTE(review): the signature line (listing line 19) is absent from this
    // extract -- presumably the default constructor; confirm against the source.
21  init(0);
22 }
23 
25  CStructuredLabels * labels, int32_t dim) : CStructuredModel(features, labels)
26 {
    // Forwards features and labels to the CStructuredModel base constructor,
    // then lets init() register the members and store dim as the hashed joint
    // feature space dimension.
    // NOTE(review): the first signature line (listing line 24) is absent from
    // this extract.
27  init(dim);
28 }
29 
31 {
32 }
33 
35  int32_t num_examples)
36 {
    // Allocates a new CMultilabelSOLabels object for num_examples examples,
    // using the current value of m_num_classes as the number of classes.
    // NOTE(review): the first signature line (listing line 34) is absent from
    // this extract.
37  return new CMultilabelSOLabels(num_examples, m_num_classes);
38 }
39 
/**
 * Registers the model's members with SG_ADD and sets their defaults.
 *
 * Both misclassification costs start at 1, m_num_classes at 0 and m_dim at
 * the given dim. m_seeds is sized to the number of classes reported by
 * m_labels when labels are present, otherwise it is an empty vector.
 *
 * NOTE(review): listing lines 43, 45, 47, 50 and 61 are absent from this
 * extract (the SG_ADD continuation lines and one statement inside the
 * if-branch), so the code below is not complete as shown.
 *
 * @param dim dimension stored in m_dim
 */
40 void CHashedMultilabelModel::init(int32_t dim)
41 {
42  SG_ADD(&m_false_positive, "false_positive", "Misclassification cost for false positive",
44  SG_ADD(&m_false_negative, "false_negative", "Misclassification cost for false negative",
46  SG_ADD(&m_num_classes, "num_classes", "Number of (binary) class assignment per label",
48  SG_ADD(&m_dim, "dim", "New joint feature space dimension", MS_NOT_AVAILABLE);
49  SG_ADD(&m_seeds, "seeds", "Vector of seeds used for hashing",
51 
    // Default values: unit costs, no classes known yet, caller-supplied dim.
52  m_false_positive = 1;
53  m_false_negative = 1;
54  m_num_classes = 0;
55  m_dim = dim;
56 
    // One seed slot per class when labels are available.
    // NOTE(review): m_labels is converted with an unchecked C-style cast.
57  if (m_labels != NULL)
58  {
59  m_seeds = SGVector<uint32_t>(
60  ((CMultilabelSOLabels *)m_labels)->get_num_classes());
62  }
63  else
64  {
65  m_seeds = SGVector<uint32_t>(0);
66  }
67 }
68 
70 {
    // Returns m_dim, the value stored by init().
    // NOTE(review): the signature line (listing line 69) is absent from this
    // extract -- presumably get_dim(), which argmax() calls; confirm.
71  return m_dim;
72 }
73 
75  float64_t false_negative)
76 {
    // Stores the two misclassification costs that the scalar delta_loss()
    // returns for a false positive and a false negative respectively.
    // NOTE(review): the first signature line (listing line 74) is absent from
    // this extract.
77  m_false_positive = false_positive;
78  m_false_negative = false_negative;
79 }
80 
82 {
    // Replaces m_seeds after checking that exactly one seed per class of
    // m_labels was supplied (seeds.vlen must equal get_num_classes()).
    // NOTE(review): m_labels is dereferenced here without a NULL check, while
    // init() explicitly handles m_labels == NULL.
    // NOTE(review): the signature line (listing line 81) is absent from this
    // extract.
83  REQUIRE(((CMultilabelSOLabels *)m_labels)->get_num_classes() == seeds.vlen,
84  "Seeds for all the classes not provided. \n");
85  m_seeds = seeds;
86 }
87 
89  int32_t feat_idx, CStructuredData * y)
90 {
    // The dense joint feature vector is not implemented for this model:
    // the call reports an error through SG_ERROR naming the class.
    // The statement after SG_ERROR returns an empty vector.
    // NOTE(review): the first signature line (listing line 88) is absent from
    // this extract.
91  SG_ERROR("compute_joint_feature(int32_t, CStructuredData*) is not "
92  "implemented for %s!\n", get_name());
93 
94  return SGVector<float64_t>();
95 }
96 
98  int32_t feat_idx, CStructuredData * y)
99 {
    // Builds the sparse joint feature vector psi for example feat_idx and
    // label y: every entry of the example's sparse feature vector is hashed
    // once per label in y, using that label's seed from m_seeds.
    // NOTE(review): listing lines 97, 100 and 103 are absent from this extract
    // (first signature line and the declarations of `vec` and `slabel`).
101  get_sparse_feature_vector(feat_idx);
102 
104  ASSERT(slabel != NULL);
105  SGVector<int32_t> slabel_data = slabel->get_data();
106 
    // One output entry per (label, input feature entry) pair; k is the next
    // free position in psi.
107  SGSparseVector<float64_t> psi(vec.num_feat_entries * slabel_data.vlen);
108  index_t k = 0;
109 
110  for (int32_t i = 0; i < slabel_data.vlen; i++)
111  {
    // NOTE(review): label indexes m_seeds with no range check visible here.
112  int32_t label = slabel_data[i];
113  uint32_t seed = (uint32_t)m_seeds[label];
114 
115  for (int32_t j = 0; j < vec.num_feat_entries; j++)
116  {
    // Hash the bytes of the original feature index with the label's seed.
117  uint32_t hash = CHash::MurmurHash3(
118  (uint8_t *)&vec.features[j].feat_index,
119  sizeof(index_t), seed);
    // Upper bits select the hashed index, the lowest bit selects the sign.
    // NOTE(review): m_dim == 0 (as set by init(0)) would make this a modulo
    // by zero.
120  psi.features[k].feat_index = (hash >> 1) % m_dim;
121  psi.features[k++].entry =
122  (hash % 2 == 1 ? -1.0 : 1.0) * vec.features[j].entry;
123  }
124 
125  }
126 
127  psi.sort_features(true);
128  return psi;
129 }
130 
132  CStructuredData * y2)
133 {
    // Loss between two structured labels: both are checked to be non-NULL
    // and the result is delegated to another delta_loss overload.
    // NOTE(review): listing lines 131, 134-135, 140, 142 and 144 are absent
    // from this extract; presumably they obtain y1_slabel / y2_slabel and
    // multi_labels and convert both labels to dense vectors with values 1/0
    // of length get_num_classes() -- confirm against the source.
136 
137  ASSERT(y1_slabel != NULL);
138  ASSERT(y2_slabel != NULL);
139 
141  return delta_loss(
143  multi_labels->get_num_classes(), 1, 0),
145  multi_labels->get_num_classes(), 1, 0));
146 }
147 
150 {
    // Sums the element-wise delta_loss(y1[i], y2[i]) over two vectors, which
    // must have the same length (enforced by REQUIRE).
    // NOTE(review): the signature (listing lines 148-149) is absent from this
    // extract.
151  REQUIRE(y1.vlen == y2.vlen, "Size of both the vectors should be same\n");
152 
153  float64_t loss = 0;
154 
155  for (index_t i = 0; i < y1.vlen; i++)
156  {
157  loss += delta_loss(y1[i], y2[i]);
158  }
159 
160  return loss;
161 }
162 
164 {
    // Per-element loss: m_false_negative when y1 > y2, m_false_positive when
    // y1 < y2, and 0 when both values are equal.
    // NOTE(review): the signature line (listing line 163) is absent from this
    // extract.
165  return y1 > y2 ? m_false_negative : y1 < y2 ? m_false_positive : 0;
166 }
167 
169  float64_t regularization,
177 {
179 }
180 
/**
 * Hashes the sparse feature vector of one example with the given seed.
 *
 * Each entry's feature index is hashed with CHash::MurmurHash3. The hash
 * without its lowest bit, taken modulo m_dim, becomes the new feature index;
 * the lowest bit decides whether the entry keeps its sign (even hash) or is
 * negated (odd hash). The result is sorted with sort_features(true).
 *
 * NOTE(review): listing lines 184 and 187 are absent from this extract (the
 * declarations of `vec` and `h_vec`).
 * NOTE(review): m_dim == 0 (as set by init(0)) would make the index
 * computation a modulo by zero.
 *
 * @param feat_idx index of the example whose feature vector is hashed
 * @param seed seed passed to MurmurHash3
 * @return hashed sparse vector with one entry per entry of the input vector
 */
181 SGSparseVector<float64_t> CHashedMultilabelModel::get_hashed_feature_vector(
182  int32_t feat_idx, uint32_t seed)
183 {
185  get_sparse_feature_vector(feat_idx);
186 
188 
189  for (int32_t j = 0; j < vec.num_feat_entries; j++)
190  {
    // Hash the raw bytes of the original feature index.
191  uint32_t hash = CHash::MurmurHash3(
192  (uint8_t *)&vec.features[j].feat_index,
193  sizeof(index_t), seed);
194  h_vec.features[j].feat_index = (hash >> 1) % m_dim;
195  h_vec.features[j].entry =
196  (hash % 2 == 1 ? -1.0 : 1.0) * vec.features[j].entry;
197  }
198 
199  h_vec.sort_features(true);
200 
201  return h_vec;
202 }
203 
204 SGVector<int32_t> CHashedMultilabelModel::to_sparse(SGVector<float64_t> dense_vec,
205  float64_t d_true, float64_t d_false)
206 {
207  int32_t size = 0;
208 
209  for (index_t i = 0; i < dense_vec.vlen; i++)
210  {
211  REQUIRE(dense_vec[i] == d_true || dense_vec[i] == d_false,
212  "The values of dense vector should be either (%d) or (%d).\n",
213  d_true, d_false);
214 
215  if (dense_vec[i] == d_true)
216  {
217  size++;
218  }
219  }
220 
221  SGVector<int32_t> sparse_vec(size);
222  index_t j = 0;
223 
224  for (index_t i = 0; i < dense_vec.vlen; i++)
225  {
226  if (dense_vec[i] == d_true)
227  {
228  sparse_vec[j] = i;
229  j++;
230  }
231  }
232 
233  return sparse_vec;
234 }
235 
237  int32_t feat_idx, bool const training)
238 {
    // Predicts the label set for example feat_idx under weight vector w.
    // Each of the m_num_classes classes is scored by the dot product of w
    // with the example's feature vector hashed with that class's seed; every
    // class with a strictly positive score is predicted. The returned
    // CResultSet carries the sparse joint feature vector of the prediction,
    // the accumulated score and the predicted label; in training mode the
    // loss and the ground-truth joint feature vector are added as well.
    // NOTE(review): listing lines 236, 239, 256, 259 and 300 are absent from
    // this extract (first signature line, the declaration of multi_labs, the
    // start of the slabel statement, the statement ending on line 260 and
    // the start of the psi_truth_sparse assignment).
240 
    // Training takes the class count from the labels; prediction requires
    // that a previous training call has set it.
241  if (training)
242  {
243  m_num_classes = multi_labs->get_num_classes();
244  }
245  else
246  {
    // NOTE(review): the two adjacent literals concatenate without a space,
    // so the message reads "...before usingit for prediction."
247  REQUIRE(m_num_classes > 0, "The model needs to be trained before using"
248  "it for prediction.\n");
249  }
250 
    // w must have exactly the hashed joint feature space dimension.
251  int32_t dim = get_dim();
252  ASSERT(dim == w.vlen);
253 
254  float64_t score = 0, total_score = 0;
255 
    // NOTE(review): slabel_data and the value built on listing lines 259-260
    // are not used in any line visible in this extract -- confirm whether
    // they are needed.
257  multi_labs->get_label(feat_idx));
258  SGVector<int32_t> slabel_data = slabel->get_data();
260  slabel, m_num_classes, 1, 0);
261  SG_UNREF(slabel);
262 
    // Dense 0/1 indicator of the predicted classes; count tracks how many
    // entries were set to 1.
263  SGVector<float64_t> y_pred_dense(m_num_classes);
264  y_pred_dense.zero();
265  int32_t count = 0;
266 
267  for (int32_t c = 0; c < m_num_classes; c++)
268  {
269  SGSparseVector<float64_t> phi = get_hashed_feature_vector(feat_idx,
270  m_seeds[c]);
271  score = phi.dense_dot(1.0, w.vector, w.vlen, 0);
272 
    // Only strictly positive scores select the class and contribute to the
    // total score.
273  if (score > 0)
274  {
275  y_pred_dense[c] = 1;
276  total_score += score;
277  count++;
278  }
279 
280  }
281 
    // Convert the indicator to the list of predicted class indices.
282  SGVector<int32_t> y_pred_sparse = to_sparse(y_pred_dense, 1, 0);
283  ASSERT(count == y_pred_sparse.vlen);
284 
    // The result set uses the sparse psi representation only.
285  CResultSet * ret = new CResultSet();
286  SG_REF(ret);
287  ret->psi_computed_sparse = true;
288  ret->psi_computed = false;
289 
290  CSparseMultilabel * y_pred_label = new CSparseMultilabel(y_pred_sparse);
291  SG_REF(y_pred_label);
292 
293  ret->psi_pred_sparse = get_sparse_joint_feature_vector(feat_idx, y_pred_label);
294  ret->score = total_score;
295  ret->argmax = y_pred_label;
296 
    // Training: the score becomes total_score + delta - <w, psi_truth>.
297  if (training)
298  {
299  ret->delta = CStructuredModel::delta_loss(feat_idx, y_pred_label);
301  feat_idx, feat_idx);
302  ret->score += (ret->delta - ret->psi_truth_sparse.dense_dot(1, w.vector,
303  w.vlen, 0));
304  }
305 
306  return ret;
307 }
308 
virtual const char * get_name() const
void sort_features(bool stable_pointer=false)
Base class of the labels used in Structured Output (SO) problems.
virtual void set_seeds(SGVector< uint32_t > seeds)
int32_t index_t
Definition: common.h:62
Class CMultilabelSOLabels used in the application of Structured Output (SO) learning to Multilabel Cl...
virtual void init_primal_opt(float64_t regularization, SGMatrix< float64_t > &A, SGVector< float64_t > a, SGMatrix< float64_t > B, SGVector< float64_t > &b, SGVector< float64_t > &lb, SGVector< float64_t > &ub, SGMatrix< float64_t > &C)
virtual CStructuredLabels * structured_labels_factory(int32_t num_examples=0)
#define SG_ERROR(...)
Definition: SGIO.h:129
#define REQUIRE(x,...)
Definition: SGIO.h:206
virtual SGVector< float64_t > get_joint_feature_vector(int32_t feat_idx, CStructuredData *y)
#define SG_REF(x)
Definition: SGObject.h:51
static uint32_t MurmurHash3(uint8_t *data, int32_t len, uint32_t seed)
Definition: Hash.cpp:366
virtual void set_misclass_cost(float64_t false_positive, float64_t false_negative)
index_t vlen
Definition: SGVector.h:494
SGSparseVector< float64_t > get_sparse_joint_feature_vector(int32_t feat_idx, int32_t lab_idx)
#define ASSERT(x)
Definition: SGIO.h:201
double float64_t
Definition: common.h:50
static void range_fill_vector(T *vec, int32_t len, T start=0)
Definition: SGVector.cpp:230
virtual float64_t delta_loss(CStructuredData *y1, CStructuredData *y2)
float64_t delta_loss(int32_t ytrue_idx, CStructuredData *ypred)
Class CSparseMultilabel to be used in the application of Structured Output (SO) learning to Multilabe...
SGSparseVectorEntry< T > * features
virtual SGSparseVector< float64_t > get_sparse_joint_feature_vector(int32_t feat_idx, CStructuredData *y)
T dense_dot(T alpha, T *vec, int32_t dim, T b)
Class CStructuredModel that represents the application specific model and contains most of the applic...
#define SG_UNREF(x)
Definition: SGObject.h:52
CStructuredLabels * m_labels
All classes and functions are contained in the shogun namespace.
Definition: class_list.h:18
The class Features is the base class of all feature objects.
Definition: Features.h:68
static SGVector< float64_t > to_dense(CStructuredData *label, int32_t dense_dim, float64_t d_true, float64_t d_false)
CStructuredData * argmax
SGSparseVector< float64_t > psi_truth_sparse
virtual CStructuredData * get_label(int32_t j)
#define SG_ADD(...)
Definition: SGObject.h:81
virtual int32_t get_num_classes() const
SGSparseVector< float64_t > psi_pred_sparse
static SGMatrix< T > create_identity_matrix(index_t size, T scale)
Base class of the components of StructuredLabels.
static CSparseMultilabel * obtain_from_generic(CStructuredData *base_data)
SGVector< int32_t > get_data() const
virtual CResultSet * argmax(SGVector< float64_t > w, int32_t feat_idx, bool const training=true)

SHOGUN Machine Learning Toolbox - Documentation