SHOGUN  4.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
Hierarchical.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2007-2009 Soeren Sonnenburg
8  * Copyright (C) 2007-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
13 #include <shogun/labels/Labels.h>
16 #include <shogun/base/Parallel.h>
17 
18 #ifdef HAVE_PTHREAD
19 #include <pthread.h>
20 #endif
21 
22 using namespace shogun;
23 
24 #ifndef DOXYGEN_SHOULD_SKIP_THIS
25 struct pair
26 {
28  int32_t idx1;
30  int32_t idx2;
31 };
32 #endif // DOXYGEN_SHOULD_SKIP_THIS
33 
35 : CDistanceMachine(), merges(3), dimensions(0), assignment(NULL),
36  table_size(0), pairs(NULL), merge_distance(NULL)
37 {
38 }
39 
41 : CDistanceMachine(), merges(merges_), dimensions(0), assignment(NULL),
42  table_size(0), pairs(NULL), merge_distance(NULL)
43 {
44  set_distance(d);
45 }
46 
48 {
49  SG_FREE(merge_distance);
50  SG_FREE(assignment);
51  SG_FREE(pairs);
52 }
53 
55 {
56  return CT_HIERARCHICAL;
57 }
58 
60 {
62 
63  if (data)
64  distance->init(data, data);
65 
66  CFeatures* lhs=distance->get_lhs();
67  ASSERT(lhs)
68 
69  int32_t num=lhs->get_num_vectors();
70  ASSERT(num>0)
71 
72  const int32_t num_pairs=num*(num-1)/2;
73 
74  SG_FREE(merge_distance);
75  merge_distance=SG_MALLOC(float64_t, num);
77 
78  SG_FREE(assignment);
79  assignment=SG_MALLOC(int32_t, num);
81 
82  SG_FREE(pairs);
83  pairs=SG_MALLOC(int32_t, 2*num);
85 
86  pair* index=SG_MALLOC(pair, num_pairs);
87  float64_t* distances=SG_MALLOC(float64_t, num_pairs);
88 
89  int32_t offs=0;
90  for (int32_t i=0; i<num; i++)
91  {
92  for (int32_t j=i+1; j<num; j++)
93  {
94  distances[offs]=distance->distance(i,j);
95  index[offs].idx1=i;
96  index[offs].idx2=j;
97  offs++; //offs=i*(i+1)/2+j
98  }
99  SG_PROGRESS(i, 0, num-1)
100  }
101 
102  CMath::qsort_index<float64_t,pair>(distances, index, (num-1)*num/2);
103  //CMath::display_vector(distances, (num-1)*num/2, "dists");
104 
105  int32_t k=-1;
106  int32_t l=0;
107  for (; l<num && (num-l)>=merges && k<num_pairs-1; l++)
108  {
109  while (k<num_pairs-1)
110  {
111  k++;
112 
113  int32_t i=index[k].idx1;
114  int32_t j=index[k].idx2;
115  int32_t c1=assignment[i];
116  int32_t c2=assignment[j];
117 
118  if (c1==c2)
119  continue;
120 
121  SG_PROGRESS(k, 0, num_pairs-1)
122 
123  if (c1<c2)
124  {
125  pairs[2*l]=c1;
126  pairs[2*l+1]=c2;
127  }
128  else
129  {
130  pairs[2*l]=c2;
131  pairs[2*l+1]=c1;
132  }
133  merge_distance[l]=distances[k];
134 
135  int32_t c=num+l;
136  for (int32_t m=0; m<num; m++)
137  {
138  if (assignment[m] == c1 || assignment[m] == c2)
139  assignment[m] = c;
140  }
141 #ifdef DEBUG_HIERARCHICAL
142  SG_PRINT("l=%04i i=%04i j=%04i c1=%+04d c2=%+04d c=%+04d dist=%6.6f\n", l,i,j, c1,c2,c, merge_distance[l])
143 #endif
144  break;
145  }
146  }
147 
148  assignment_size=num;
149  table_size=l-1;
150  ASSERT(table_size>0)
151  SG_FREE(distances);
152  SG_FREE(index);
153  SG_UNREF(lhs)
154 
155  return true;
156 }
157 
158 bool CHierarchical::load(FILE* srcfile)
159 {
162  return false;
163 }
164 
165 bool CHierarchical::save(FILE* dstfile)
166 {
169  return false;
170 }
171 
172 
174 {
175  return merges;
176 }
177 
179 {
180  return SGVector<int32_t>(assignment,table_size, false);
181 }
182 
184 {
186 }
187 
189 {
190  return SGMatrix<int32_t>(pairs,2,merges, false);
191 }
192 
193 
195 {
196  /* TODO. Currently does nothing since apply methods are not implemented. */
197 }
EMachineType
Definition: Machine.h:33
virtual bool save(FILE *dstfile)
#define SG_RESET_LOCALE
Definition: SGIO.h:86
static void fill_vector(T *vec, int32_t len, T value)
Definition: SGVector.cpp:221
SGVector< float64_t > get_merge_distances()
Class Distance, a base class for all the distances used in the Shogun toolbox.
Definition: Distance.h:87
#define SG_PROGRESS(...)
Definition: SGIO.h:142
CFeatures * get_lhs()
Definition: Distance.h:224
virtual int32_t get_num_vectors() const =0
int32_t merges
the number of merges in hierarchical clustering
Definition: Hierarchical.h:129
#define SG_SET_LOCALE_C
Definition: SGIO.h:85
int32_t * assignment
cluster assignment for the num_points
Definition: Hierarchical.h:138
A generic DistanceMachine interface.
#define SG_PRINT(...)
Definition: SGIO.h:137
#define ASSERT(x)
Definition: SGIO.h:201
virtual bool train_machine(CFeatures *data=NULL)
SGMatrix< int32_t > get_cluster_pairs()
int32_t table_size
size of the below tables
Definition: Hierarchical.h:141
double float64_t
Definition: common.h:50
int32_t assignment_size
size of assignment table
Definition: Hierarchical.h:135
static void range_fill_vector(T *vec, int32_t len, T start=0)
Definition: SGVector.cpp:228
virtual float64_t distance(int32_t idx_a, int32_t idx_b)
Definition: Distance.cpp:206
#define SG_UNREF(x)
Definition: SGObject.h:55
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
SGVector< int32_t > get_assignment()
virtual void store_model_features()
virtual bool load(FILE *srcfile)
The class Features is the base class of all feature objects.
Definition: Features.h:68
void set_distance(CDistance *d)
float64_t * merge_distance
distance at which pair i/j was added
Definition: Hierarchical.h:147
virtual bool init(CFeatures *lhs, CFeatures *rhs)
Definition: Distance.cpp:78
virtual EMachineType get_classifier_type()
int32_t * pairs
tuples of i/j
Definition: Hierarchical.h:144

SHOGUN Machine Learning Toolbox - Documentation