SHOGUN: LeastAngleRegression.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2012 Chiyuan Zhang
00008  * Copyright (C) 2012 Chiyuan Zhang
00009  */
00010 
00011 #include <shogun/lib/config.h>
00012 
00013 #ifdef HAVE_LAPACK
00014 
00015 #include <vector>
00016 #include <limits>
00017 #include <algorithm>
00018 
00019 #include <shogun/features/DenseFeatures.h>
00020 #include <shogun/mathematics/Math.h>
00021 #include <shogun/mathematics/lapack.h>
00022 #include <shogun/regression/LeastAngleRegression.h>
00023 #include <shogun/labels/RegressionLabels.h>
00024 
00025 using namespace shogun;
00026 using namespace std;
00027 
00028 static vector<float64_t> make_vector(int32_t size, float64_t val)
00029 {
00030     vector<float64_t> result(size);
00031     fill(result.begin(), result.end(), val);
00032     return result;
00033 }
00034 
00035 static void plane_rot(float64_t x0, float64_t x1,
00036     float64_t &y0, float64_t &y1, SGMatrix<float64_t> &G)
00037 {
00038     G.zero();
00039     if (x1 == 0)
00040     {
00041         G(0, 0) = G(1, 1) = 1;
00042         y0 = x0;
00043         y1 = x1;
00044     }
00045     else
00046     {
00047         float64_t r = CMath::sqrt(x0*x0+x1*x1);
00048         float64_t sx0 = x0 / r;
00049         float64_t sx1 = x1 / r;
00050 
00051         G(0,0) = sx0;
00052         G(1,0) = -sx1;
00053         G(0,1) = sx1;
00054         G(1,1) = sx0;
00055 
00056         y0 = r;
00057         y1 = 0;
00058     }
00059 }
00060 
00061 static void find_max_abs(const vector<float64_t> &vec, const vector<bool> &ignore,
00062     int32_t &imax, float64_t& vmax)
00063 {
00064     imax = -1;
00065     vmax = -1;
00066     for (uint32_t i=0; i < vec.size(); ++i)
00067     {
00068         if (ignore[i])
00069             continue;
00070         if (CMath::abs(vec[i]) > vmax)
00071         {
00072             vmax = CMath::abs(vec[i]);
00073             imax = i;
00074         }
00075     }
00076 }
00077 
00078 CLeastAngleRegression::CLeastAngleRegression(bool lasso) :
00079     CLinearMachine(), m_lasso(lasso),
00080     m_max_nonz(0), m_max_l1_norm(0)
00081 {
00082     SG_ADD(&m_max_nonz, "max_nonz", "Max number of non-zero variables", MS_AVAILABLE);
00083     SG_ADD(&m_max_l1_norm, "max_l1_norm", "Max l1-norm of estimator", MS_AVAILABLE);
00084 }
00085 
00086 CLeastAngleRegression::~CLeastAngleRegression()
00087 {
00088 
00089 }
00090 
00091 bool CLeastAngleRegression::train_machine(CFeatures* data)
00092 {
00093     if (!m_labels)
00094         SG_ERROR("No labels set\n");
00095     if (m_labels->get_label_type() != LT_REGRESSION)
00096         SG_ERROR("Expected RegressionLabels\n");
00097 
00098     if (!data)
00099         data=features;
00100 
00101     if (!data)
00102         SG_ERROR("No features set\n");
00103 
00104     if (m_labels->get_num_labels() != data->get_num_vectors())
00105         SG_ERROR("Number of training vectors does not match number of labels\n");
00106 
00107     if (data->get_feature_class() != C_DENSE)
00108         SG_ERROR("Expected Simple Features\n");
00109 
00110     if (data->get_feature_type() != F_DREAL)
00111         SG_ERROR("Expected Real Features\n");
00112 
00113     CDenseFeatures<float64_t>* feats=(CDenseFeatures<float64_t>*) data;
00114     int32_t n_fea = feats->get_num_features();
00115     int32_t n_vec = feats->get_num_vectors();
00116 
00117     bool lasso_cond = false;
00118     bool stop_cond = false;
00119 
00120     // init facilities
00121     m_beta_idx.clear();
00122     m_beta_path.clear();
00123     m_num_active = 0;
00124     m_active_set.clear();
00125     m_is_active.resize(n_fea);
00126     fill(m_is_active.begin(), m_is_active.end(), false);
00127 
00128     SGVector<float64_t> y = ((CRegressionLabels*) m_labels)->get_labels();
00129     SGMatrix<float64_t> Xorig = feats->get_feature_matrix();
00130 
00131     // transpose(X) is more convenient to work with since we care
00132     // about features here. After transpose, each row will be a data
00133     // point while each column corresponds to a feature
00134     SGMatrix<float64_t> X(n_vec, n_fea, true);
00135     for (int32_t i=0; i < n_vec; ++i)
00136     {
00137         for (int32_t j=0; j < n_fea; ++j)
00138             X(i,j) = Xorig(j,i);
00139     }
00140 
00141     // beta is the estimator
00142     vector<float64_t> beta = make_vector(n_fea, 0);
00143 
00144     vector<float64_t> Xy = make_vector(n_fea, 0);
00145     // Xy = X' * y
00146     cblas_dgemv(CblasColMajor, CblasTrans, n_vec, n_fea, 1, X.matrix, n_vec,
00147         y.vector, 1, 0, &Xy[0], 1);
00148 
00149     // mu is the prediction
00150     vector<float64_t> mu = make_vector(n_vec, 0);
00151 
00152     // correlation
00153     vector<float64_t> corr = make_vector(n_fea, 0);
00154     // sign of correlation
00155     vector<float64_t> corr_sign(n_fea);
00156 
00157     // Cholesky factorization R'R = X'X, R is upper triangular
00158     SGMatrix<float64_t> R;
00159 
00160     float64_t max_corr = 1;
00161     int32_t i_max_corr = 1;
00162 
00163     // first entry: all coefficients are zero
00164     m_beta_path.push_back(beta);
00165     m_beta_idx.push_back(0);
00166 
00167     //========================================
00168     // main loop
00169     //========================================
00170     int32_t nloop=0;
00171     while (m_num_active < n_fea && max_corr > CMath::MACHINE_EPSILON && !stop_cond)
00172     {
00173         // corr = X' * (y-mu) = - X'*mu + Xy
00174         copy(Xy.begin(), Xy.end(), corr.begin());
00175         cblas_dgemv(CblasColMajor, CblasTrans, n_vec, n_fea, -1,
00176             X.matrix, n_vec, &mu[0], 1, 1, &corr[0], 1);
00177 
00178         // corr_sign = sign(corr)
00179         for (uint32_t i=0; i < corr.size(); ++i)
00180             corr_sign[i] = CMath::sign(corr[i]);
00181 
00182         // find max absolute correlation in inactive set
00183         find_max_abs(corr, m_is_active, i_max_corr, max_corr);
00184 
00185         if (!lasso_cond)
00186         {
00187             // update Cholesky factorization matrix
00188             R=cholesky_insert(X, R, i_max_corr);
00189             activate_variable(i_max_corr);
00190         }
00191 
00192         // corr_sign_a = corr_sign[m_active_set]
00193         vector<float64_t> corr_sign_a(m_num_active);
00194         for (int32_t i=0; i < m_num_active; ++i)
00195             corr_sign_a[i] = corr_sign[m_active_set[i]];
00196 
00197         // GA1 = R\(R'\corr_sign_a)
00198         vector<float64_t> GA1(corr_sign_a);
00199         cblas_dtrsm(CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit,
00200             m_num_active, 1, 1, R.matrix, m_num_active, &GA1[0], m_num_active);
00201         cblas_dtrsm(CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit,
00202             m_num_active, 1, 1, R.matrix, m_num_active, &GA1[0], m_num_active);
00203 
00204         // AA = 1/sqrt(GA1' * corr_sign_a)
00205         float64_t AA = cblas_ddot(m_num_active, &GA1[0], 1, &corr_sign_a[0], 1);
00206         AA = 1/CMath::sqrt(AA);
00207 
00208         // wA = AA*GA1
00209         vector<float64_t> wA(GA1);
00210         for (int32_t i=0; i < m_num_active; ++i)
00211             wA[i] *= AA;
00212 
00213         // equiangular direction (unit vector)
00214         vector<float64_t> u = make_vector(n_vec, 0);
00215         // u = X[:,m_active_set] * wA
00216         for (int32_t i=0; i < m_num_active; ++i)
00217         {
00218             // u += wA[i] * X[:,m_active_set[i]]
00219             cblas_daxpy(n_vec, wA[i],
00220                 X.get_column_vector(m_active_set[i]), 1, &u[0], 1);
00221         }
00222 
00223         // step size
00224         float64_t gamma = max_corr / AA;
00225         if (m_num_active < n_fea)
00226         {
00227             for (int32_t i=0; i < n_fea; ++i)
00228             {
00229                 if (m_is_active[i])
00230                     continue;
00231 
00232                 // correlation between X[:,i] and u
00233                 float64_t dir_corr = cblas_ddot(n_vec,
00234                     X.get_column_vector(i), 1, &u[0], 1);
00235 
00236                 float64_t tmp1 = (max_corr-corr[i])/(AA-dir_corr);
00237                 float64_t tmp2 = (max_corr+corr[i])/(AA+dir_corr);
00238                 if (tmp1 > CMath::MACHINE_EPSILON && tmp1 < gamma)
00239                     gamma = tmp1;
00240                 if (tmp2 > CMath::MACHINE_EPSILON && tmp2 < gamma)
00241                     gamma = tmp2;
00242             }
00243         }
00244 
00245         int32_t i_kick=-1;
00246         int32_t i_change=i_max_corr;
00247         if (m_lasso)
00248         {
00249             // lasso modification to further refine gamma
00250             lasso_cond = false;
00251             float64_t lasso_bound = numeric_limits<float64_t>::max();
00252 
00253             for (int32_t i=0; i < m_num_active; ++i)
00254             {
00255                 float64_t tmp = -beta[m_active_set[i]] / wA[i];
00256                 if (tmp > CMath::MACHINE_EPSILON && tmp < lasso_bound)
00257                 {
00258                     lasso_bound = tmp;
00259                     i_kick = i;
00260                 }
00261             }
00262 
00263             if (lasso_bound < gamma)
00264             {
00265                 gamma = lasso_bound;
00266                 lasso_cond = true;
00267                 i_change = m_active_set[i_kick];
00268             }
00269         }
00270 
00271         // update prediction: mu = mu + gamma * u
00272         cblas_daxpy(n_vec, gamma, &u[0], 1, &mu[0], 1);
00273 
00274         // update estimator
00275         for (int32_t i=0; i < m_num_active; ++i)
00276             beta[m_active_set[i]] += gamma * wA[i];
00277 
00278         // early stopping on max l1-norm
00279         if (m_max_l1_norm > 0)
00280         {
00281             float64_t l1 = SGVector<float64_t>::onenorm(&beta[0], n_fea);
00282             if (l1 > m_max_l1_norm)
00283             {
00284                 // stopping with interpolated beta
00285                 stop_cond = true;
00286                 lasso_cond = false;
00287                 float64_t l1_prev = SGVector<float64_t>::onenorm(&m_beta_path[nloop][0], n_fea);
00288                 float64_t s = (m_max_l1_norm-l1_prev)/(l1-l1_prev);
00289 
00290                 // beta = beta_prev + s*(beta-beta_prev)
00291                 //      = (1-s)*beta_prev + s*beta
00292                 cblas_dscal(n_fea, s, &beta[0], 1);
00293                 cblas_daxpy(n_fea, 1-s, &m_beta_path[nloop][0], 1, &beta[0], 1);
00294             }
00295         }
00296 
00297         // if lasso cond, drop the variable from active set
00298         if (lasso_cond)
00299         {
00300             beta[i_change] = 0; // ensure it be zero
00301 
00302             // update Cholesky factorization
00303             R=cholesky_delete(R, i_kick);
00304             deactivate_variable(i_kick);
00305         }
00306 
00307         nloop++;
00308         m_beta_path.push_back(beta);
00309         if (size_t(m_num_active) >= m_beta_idx.size())
00310             m_beta_idx.push_back(nloop);
00311         else
00312             m_beta_idx[m_num_active] = nloop;
00313 
00314         // early stopping with max number of non-zero variables
00315         if (m_max_nonz > 0 && m_num_active >= m_max_nonz)
00316             stop_cond = true;
00317 
00318     } // main loop
00319 
00320     // assign default estimator
00321     w.vlen = n_fea;
00322     switch_w(m_beta_idx.size()-1);
00323 
00324     return true;
00325 }
00326 
00327 SGMatrix<float64_t> CLeastAngleRegression::cholesky_insert(
00328         SGMatrix<float64_t> X, SGMatrix<float64_t> R, int32_t i_max_corr)
00329 {
00330     // diag_k = X[:,i_max_corr]' * X[:,i_max_corr]
00331     float64_t diag_k = cblas_ddot(X.num_rows, X.get_column_vector(i_max_corr), 1,
00332         X.get_column_vector(i_max_corr), 1);
00333 
00334     if (m_num_active == 0)
00335     { // R isn't allocated yet
00336         SGMatrix<float64_t> nR(1,1);
00337         nR(0,0) = CMath::sqrt(diag_k);
00338         return nR;
00339     }
00340     else
00341     {
00342 
00343         // col_k is the k-th column of (X'X)
00344         vector<float64_t> col_k(m_num_active);
00345         for (int32_t i=0; i < m_num_active; ++i)
00346         {
00347             // col_k[i] = X[:,i_max_corr]' * X[:,m_active_set[i]]
00348             col_k[i] = cblas_ddot(X.num_rows, X.get_column_vector(i_max_corr), 1,
00349                 X.get_column_vector(m_active_set[i]), 1);
00350         }
00351 
00352         // R' * R_k = (X' * X)_k = col_k, solving to get R_k
00353         vector<float64_t> R_k(col_k);
00354         cblas_dtrsm(CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, m_num_active, 1,
00355             1, R.matrix, m_num_active, &R_k[0], m_num_active);
00356 
00357         float64_t R_kk = CMath::sqrt(diag_k -
00358             cblas_ddot(m_num_active, &R_k[0], 1, &R_k[0], 1));
00359 
00360         // new_R = [R R_k; zeros(...) R_kk]
00361         SGMatrix<float64_t> nR(m_num_active+1, m_num_active+1);
00362         for (int32_t i=0; i < m_num_active; ++i)
00363             for (int32_t j=0; j < m_num_active; ++j)
00364                 nR(i,j) = R(i,j);
00365         for (int32_t i=0; i < m_num_active; ++i)
00366             nR(i, m_num_active) = R_k[i];
00367         for (int32_t i=0; i < m_num_active; ++i)
00368             nR(m_num_active, i) = 0;
00369         nR(m_num_active, m_num_active) = R_kk;
00370 
00371         return nR;
00372     }
00373 
00374 }
00375 
00376 SGMatrix<float64_t> CLeastAngleRegression::cholesky_delete(SGMatrix<float64_t> R, int32_t i_kick)
00377 {
00378     if (i_kick != m_num_active-1)
00379     {
00380         // remove i_kick-th column
00381         for (int32_t j=i_kick; j < m_num_active-1; ++j)
00382             for (int32_t i=0; i < m_num_active; ++i)
00383                 R(i,j) = R(i,j+1);
00384 
00385         SGMatrix<float64_t> G(2,2);
00386         for (int32_t i=i_kick; i < m_num_active-1; ++i)
00387         {
00388             plane_rot(R(i,i),R(i+1,i), R(i,i), R(i+1,i), G);
00389             if (i < m_num_active-2)
00390             {
00391                 for (int32_t k=i+1; k < m_num_active-1; ++k)
00392                 {
00393                     // R[i:i+1, k] = G*R[i:i+1, k]
00394                     float64_t Rik = R(i,k), Ri1k = R(i+1,k);
00395                     R(i,k) = G(0,0)*Rik + G(0,1)*Ri1k;
00396                     R(i+1,k) = G(1,0)*Rik+G(1,1)*Ri1k;
00397                 }
00398             }
00399         }
00400     }
00401 
00402     SGMatrix<float64_t> nR(m_num_active-1, m_num_active-1);
00403     for (int32_t i=0; i < m_num_active-1; ++i)
00404         for (int32_t j=0; j < m_num_active-1; ++j)
00405             nR(i,j) = R(i,j);
00406 
00407     return nR;
00408 }
00409 
00410 #endif // HAVE_LAPACK