SVMSGD.cpp

/*
   SVM with stochastic gradient
   Copyright (C) 2007- Leon Bottou

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
   $Id: svmsgd.cpp,v 1.13 2007/10/02 20:40:06 cvs Exp $

   Shogun adjustments (w) 2008-2009 Soeren Sonnenburg
*/

#include <shogun/classifier/svm/SVMSGD.h>
#include <shogun/base/Parameter.h>
#include <shogun/lib/Signal.h>
#include <shogun/loss/HingeLoss.h>

using namespace shogun;

CSVMSGD::CSVMSGD()
: CLinearMachine()
{
    init();
}

CSVMSGD::CSVMSGD(float64_t C)
: CLinearMachine()
{
    init();

    C1=C;
    C2=C;
}

CSVMSGD::CSVMSGD(float64_t C, CDotFeatures* traindat, CLabels* trainlab)
: CLinearMachine()
{
    init();
    C1=C;
    C2=C;

    set_features(traindat);
    set_labels(trainlab);
}

CSVMSGD::~CSVMSGD()
{
    SG_UNREF(loss);
}

void CSVMSGD::set_loss_function(CLossFunction* loss_func)
{
    if (loss)
        SG_UNREF(loss);
    loss=loss_func;
    SG_REF(loss);
}

bool CSVMSGD::train_machine(CFeatures* data)
{
    // allocate memory for w and initialize w and bias with 0
    ASSERT(labels);

    if (data)
    {
        if (!data->has_property(FP_DOT))
            SG_ERROR("Specified features are not of type CDotFeatures\n");
        set_features((CDotFeatures*) data);
    }

    ASSERT(features);
    ASSERT(labels->is_two_class_labeling());

    int32_t num_train_labels=labels->get_num_labels();
    w_dim=features->get_dim_feature_space();
    int32_t num_vec=features->get_num_vectors();

    ASSERT(num_vec==num_train_labels);
    ASSERT(num_vec>0);

    SG_FREE(w);
    w=SG_MALLOC(float64_t, w_dim);
    memset(w, 0, w_dim*sizeof(float64_t));
    bias=0;

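    // SGD minimizes lambda/2*|w|^2 + 1/n*sum_i loss(y_i*(w.x_i + b)), so the
    // SVM cost C corresponds to the regularization strength lambda=1/(C*n).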
    float64_t lambda= 1.0/(C1*num_vec);

    // Shift t in order to have a
    // reasonable initial learning rate.
    // This assumes |x| \approx 1.
    float64_t maxw = 1.0 / sqrt(lambda);
    float64_t typw = sqrt(maxw);
    float64_t eta0 = typw / CMath::max(1.0,-loss->first_derivative(-typw,1));
    t = 1 / (eta0 * lambda);
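    // The per-example step size below is eta = 1/(lambda*t); starting t at
    // 1/(eta0*lambda) makes the first updates use roughly eta0 instead of
    // the much larger 1/lambda.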

    SG_INFO("lambda=%f, epochs=%d, eta0=%f\n", lambda, epochs, eta0);


    //do the sgd
    calibrate();

    SG_INFO("Training on %d vectors\n", num_vec);
    CSignal::clear_cancel();

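    // Margin losses like hinge have zero gradient once z = y*f(x) >= 1, so
    // their update is skipped in that case; log-type losses have a nonzero
    // gradient everywhere and are updated for every example.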
    ELossType loss_type = loss->get_loss_type();
    bool is_log_loss = false;
    if ((loss_type == L_LOGLOSS) || (loss_type == L_LOGLOSSMARGIN))
        is_log_loss = true;

    for(int32_t e=0; e<epochs && (!CSignal::cancel_computations()); e++)
    {
        count = skip;
        for (int32_t i=0; i<num_vec; i++)
        {
            float64_t eta = 1.0 / (lambda * t);
            float64_t y = labels->get_label(i);
            float64_t z = y * (features->dense_dot(i, w, w_dim) + bias);

            if (z < 1 || is_log_loss)
            {
                float64_t etd = -eta * loss->first_derivative(z,1);
                features->add_to_dense_vec(etd * y / wscale, i, w, w_dim);

                if (use_bias)
                {
                    if (use_regularized_bias)
                        bias *= 1 - eta * lambda * bscale;
                    bias += etd * y * bscale;
                }
            }

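            // Amortized regularization: instead of shrinking w by
            // (1 - eta*lambda) after every example, the decay is applied once
            // every `skip` examples (see calibrate()), which is cheaper for
            // sparse inputs.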
            if (--count <= 0)
            {
                float64_t r = 1 - eta * lambda * skip;
                if (r < 0.8)
                    r = pow(1 - eta * lambda, skip);
                CMath::scale_vector(r, w, w_dim);
                count = skip;
            }
            t++;
        }
    }

    float64_t wnorm =  CMath::dot(w,w, w_dim);
    SG_INFO("Norm: %.6f, Bias: %.6f\n", wnorm, bias);

    return true;
}

void CSVMSGD::calibrate()
{
    ASSERT(features);
    int32_t num_vec=features->get_num_vectors();
    int32_t c_dim=features->get_dim_feature_space();

    ASSERT(num_vec>0);
    ASSERT(c_dim>0);

    float64_t* c=SG_MALLOC(float64_t, c_dim);
    memset(c, 0, c_dim*sizeof(float64_t));

    SG_INFO("Estimating sparsity and bscale num_vec=%d num_feat=%d.\n", num_vec, c_dim);

    // compute average gradient size
    int32_t n = 0;
    float64_t m = 0;
    float64_t r = 0;

    for (int32_t j=0; j<num_vec && m<=1000; j++, n++)
    {
        r += features->get_nnz_features_for_vector(j);
        features->add_to_dense_vec(1, j, c, c_dim, true);

        //waste cpu cycles for readability
        //(only changed dims need checking)
        m=CMath::max(c, c_dim);
    }

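    // c[k] now holds the accumulated absolute value of feature k over the n
    // examples seen, m is the largest such sum and r the total number of
    // nonzero entries.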
    // bias update scaling
    bscale = 0.5*m/n;

    // compute weight decay skip
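    // r/n is the average number of nonzeros per example, so the decay in
    // train_machine() runs roughly once per 16 dense-equivalent updates.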
    skip = (int32_t) ((16 * n * c_dim) / r);
    SG_INFO("using %d examples. skip=%d  bscale=%.6f\n", n, skip, bscale);

    SG_FREE(c);
}

void CSVMSGD::init()
{
    t=1;
    C1=1;
    C2=1;
    wscale=1;
    bscale=1;
    epochs=5;
    skip=1000;
    count=1000;
    use_bias=true;

    use_regularized_bias=false;

    loss=new CHingeLoss();
    SG_REF(loss);

    m_parameters->add(&C1, "C1",  "Cost constant 1.");
    m_parameters->add(&C2, "C2",  "Cost constant 2.");
    m_parameters->add(&wscale, "wscale",  "W scale");
    m_parameters->add(&bscale, "bscale",  "b scale");
    m_parameters->add(&epochs, "epochs",  "epochs");
    m_parameters->add(&skip, "skip",  "skip");
    m_parameters->add(&count, "count",  "count");
    m_parameters->add(&use_bias, "use_bias",  "Indicates if bias is used.");
    m_parameters->add(&use_regularized_bias, "use_regularized_bias",  "Indicates if bias is regularized.");
}
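
A minimal usage sketch (not part of this file): the CSVMSGD constructor and the SG_REF/SG_UNREF macros appear in the code above, while the set_epochs() setter, apply(), and the way the CDotFeatures/CLabels objects are constructed are assumptions about the same Shogun snapshot and may differ.

#include <shogun/classifier/svm/SVMSGD.h>

using namespace shogun;

void train_example(CDotFeatures* traindat, CLabels* trainlab)
{
    float64_t C = 1.0;                                   // cost constant, gives lambda = 1/(C*n)
    CSVMSGD* svm = new CSVMSGD(C, traindat, trainlab);   // constructor defined above
    SG_REF(svm);
    svm->set_epochs(10);                                 // assumed setter; default is 5 (see init())
    svm->train();                                        // runs train_machine() via the base class
    CLabels* out = svm->apply();                         // predictions on the training features
    SG_UNREF(out);
    SG_UNREF(svm);
}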