ssl.cpp

/*    Copyright 2006 Vikas Sindhwani (vikass@cs.uchicago.edu)
      SVM-lin: Fast SVM Solvers for Supervised and Semi-supervised Learning

      This file is part of SVM-lin.

      SVM-lin is free software; you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published by
      the Free Software Foundation; either version 2 of the License, or
      (at your option) any later version.

      SVM-lin is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      GNU General Public License for more details.

      You should have received a copy of the GNU General Public License
      along with SVM-lin (see gpl.txt); if not, write to the Free Software
      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
      */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <algorithm>

#include "lib/io.h"
#include "lib/Mathematics.h"
#include "features/SparseFeatures.h"
#include "classifier/svm/ssl.h"

namespace shogun
{
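/* ssl_train: entry point. Allocates the weight and output vectors,
 * builds an index set over all m examples, and dispatches on
 * Options->algo to the selected solver (CGLS for regularized least
 * squares, L2-SVM-MFN, transductive SVM, or DA-S3VM).
 *
 * Illustrative call sequence (a sketch only; how the fields are filled
 * in depends on the caller and on the definitions in ssl.h):
 *
 *   struct data d;            // features, Y, C, n, m, l, u set by caller
 *   struct options opts;      // algo, lambda, lambda_u, R, S, bias, ...
 *   struct vector_double weights, outputs;
 *   ssl_train(&d, &opts, &weights, &outputs);
 *   // weights.vec now holds the hyperplane; the bias is modelled as
 *   // the last dimension, scaled by opts.bias
 */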
void ssl_train(struct data *Data,
        struct options *Options,
        struct vector_double *Weights,
        struct vector_double *Outputs)
{
    // initialize
    initialize(Weights,Data->n,0.0);
    initialize(Outputs,Data->m,0.0);
    vector_int    *Subset  = new vector_int[1];
    initialize(Subset,Data->m);
    // call the right algorithm
    int32_t optimality = 0;
    switch(Options->algo)
    {
        case -1:
            SG_SINFO("Regularized Least Squares Regression (CGLS)\n");
            optimality=CGLS(Data,Options,Subset,Weights,Outputs);
            break;
        case RLS:
            SG_SINFO("Regularized Least Squares Classification (CGLS)\n");
            optimality=CGLS(Data,Options,Subset,Weights,Outputs);
            break;
        case SVM:
            SG_SINFO("Modified Finite Newton L2-SVM (L2-SVM-MFN)\n");
            optimality=L2_SVM_MFN(Data,Options,Weights,Outputs,0);
            break;
        case TSVM:
            SG_SINFO("Transductive L2-SVM (TSVM)\n");
            optimality=TSVM_MFN(Data,Options,Weights,Outputs);
            break;
        case DA_SVM:
            SG_SINFO("Deterministic Annealing Semi-supervised L2-SVM (DAS3VM)\n");
            optimality=DA_S3VM(Data,Options,Weights,Outputs);
            break;
        default:
            SG_SERROR("Algorithm unspecified");
    }

    delete[] Subset->vec;
    delete[] Subset;
    return;
}

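/* CGLS: conjugate gradient solver for the regularized least-squares
 * subproblem over the examples listed in Subset, i.e. it minimizes
 * 0.5*lambda*|beta|^2 + 0.5*sum_i C[i]*(Y[i]-o[i])^2 in beta.
 * Terminates when the squared residual norm drops below
 * epsilon^2*|z|^2 (returns 1) or after cgitermax iterations
 * (returns 0). */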
int32_t CGLS(
    const struct data *Data, const struct options *Options,
    const struct vector_int *Subset, struct vector_double *Weights,
    struct vector_double *Outputs)
{
    SG_SDEBUG("CGLS starting...");

    /* Disassemble the structures */
    int32_t active = Subset->d;
    int32_t *J = Subset->vec;
    CDotFeatures* features=Data->features;
    float64_t *Y = Data->Y;
    float64_t *C = Data->C;
    int32_t n  = Data->n;
    float64_t lambda = Options->lambda;
    int32_t cgitermax = Options->cgitermax;
    float64_t epsilon = Options->epsilon;
    float64_t *beta = Weights->vec;
    float64_t *o  = Outputs->vec;
    // initialize z
    float64_t *z = new float64_t[active];
    float64_t *q = new float64_t[active];
    int32_t ii=0;
    for (int32_t i = active ; i-- ;){
        ii=J[i];
        z[i]  = C[ii]*(Y[ii] - o[ii]);
    }
    float64_t *r = new float64_t[n];
    for (int32_t i = n ; i-- ;)
        r[i] = 0.0;
    for (int32_t j=0; j < active; j++)
    {
        features->add_to_dense_vec(z[j], J[j], r, n-1);
        r[n-1]+=Options->bias*z[j]; //bias (modelled as last dim)
    }
    float64_t *p = new float64_t[n];
    float64_t omega1 = 0.0;
    for (int32_t i = n ; i-- ;)
    {
        r[i] -= lambda*beta[i];
        p[i] = r[i];
        omega1 += r[i]*r[i];
    }
    float64_t omega_p = omega1;
    float64_t omega_q = 0.0;
    float64_t inv_omega2 = 1/omega1;
    float64_t scale = 0.0;
    float64_t omega_z=0.0;
    float64_t gamma = 0.0;
    int32_t cgiter = 0;
    int32_t optimality = 0;
    float64_t epsilon2 = epsilon*epsilon;
    // iterate
    while(cgiter < cgitermax)
    {
        cgiter++;
        omega_q=0.0;
        float64_t t=0.0;
        int32_t i,j;
        // #pragma omp parallel for private(i,j)
        for (i=0; i < active; i++)
        {
            ii=J[i];
            t=features->dense_dot(ii, p, n-1);
            t+=Options->bias*p[n-1]; //bias (modelled as last dim)
            q[i]=t;
            omega_q += C[ii]*t*t;
        }
        gamma = omega1/(lambda*omega_p + omega_q);
        inv_omega2 = 1/omega1;
        for (i = n ; i-- ;)
        {
            r[i] = 0.0;
            beta[i] += gamma*p[i];
        }
        omega_z=0.0;
        for (i = active ; i-- ;)
        {
            ii=J[i];
            o[ii] += gamma*q[i];
            z[i] -= gamma*C[ii]*q[i];
            omega_z+=z[i]*z[i];
        }
        for (j=0; j < active; j++)
        {
            t=z[j];

            features->add_to_dense_vec(t, J[j], r, n-1);
            r[n-1]+=Options->bias*t; //bias (modelled as last dim)
        }
        omega1 = 0.0;
        for (i = n ; i-- ;)
        {
            r[i] -= lambda*beta[i];
            omega1 += r[i]*r[i];
        }
        if(omega1 < epsilon2*omega_z)
        {
            optimality=1;
            break;
        }
        omega_p=0.0;
        scale=omega1*inv_omega2;
        for(i = n ; i-- ;)
        {
            p[i] = r[i] + p[i]*scale;
            omega_p += p[i]*p[i];
        }
    }
    SG_SDEBUG("...Done.");
    SG_SINFO("CGLS converged in %d iteration(s)", cgiter);

    delete[] z;
    delete[] q;
    delete[] r;
    delete[] p;
    return optimality;
}

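/* L2_SVM_MFN: modified finite Newton method for the primal L2-SVM.
 * Each outer iteration solves a regularized least-squares problem via
 * CGLS over the current active set (examples with Y[i]*o[i] < 1) and
 * then takes an exact line-search step towards the candidate solution.
 * Returns 1 on optimality, 2 when the relative decrease of the
 * objective falls below RELATIVE_STOP_EPS, 0 if MFNITERMAX is hit. */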
int32_t L2_SVM_MFN(
    const struct data *Data, struct options *Options,
    struct vector_double *Weights, struct vector_double *Outputs,
    int32_t ini)
{
    /* Disassemble the structures */
    CDotFeatures* features=Data->features;
    float64_t *Y = Data->Y;
    float64_t *C = Data->C;
    int32_t n  = Data->n;
    int32_t m  = Data->m;
    float64_t lambda = Options->lambda;
    float64_t epsilon;
    float64_t *w = Weights->vec;
    float64_t *o = Outputs->vec;
    float64_t F_old = 0.0;
    float64_t F = 0.0;
    float64_t diff=0.0;
    vector_int *ActiveSubset = new vector_int[1];
    ActiveSubset->vec = new int32_t[m];
    ActiveSubset->d = m;
    // initialize
    if(ini==0) {
        epsilon=BIG_EPSILON;
        Options->cgitermax=SMALL_CGITERMAX;
        Options->epsilon=BIG_EPSILON;
    }
    else {epsilon = Options->epsilon;}
    for (int32_t i=0;i<n;i++) F+=w[i]*w[i];
    F=0.5*lambda*F;
    int32_t active=0;
    int32_t inactive=m-1; // l-1
    for (int32_t i=0; i<m ; i++)
    {
        diff=1-Y[i]*o[i];
        if(diff>0)
        {
            ActiveSubset->vec[active]=i;
            active++;
            F+=0.5*C[i]*diff*diff;
        }
        else
        {
            ActiveSubset->vec[inactive]=i;
            inactive--;
        }
    }
    ActiveSubset->d=active;
    int32_t iter=0;
    int32_t opt=0;
    int32_t opt2=0;
    vector_double *Weights_bar = new vector_double[1];
    vector_double *Outputs_bar = new vector_double[1];
    float64_t *w_bar = new float64_t[n];
    float64_t *o_bar = new float64_t[m];
    Weights_bar->vec=w_bar;
    Outputs_bar->vec=o_bar;
    Weights_bar->d=n;
    Outputs_bar->d=m;
    float64_t delta=0.0;
    float64_t t=0.0;
    int32_t ii = 0;
    while(iter<MFNITERMAX)
    {
        iter++;
        SG_SDEBUG("L2_SVM_MFN Iteration# %d (%d active examples, objective_value = %f)\n", iter, active, F);
        for (int32_t i=n; i-- ;)
            w_bar[i]=w[i];
        for (int32_t i=m; i-- ;)
            o_bar[i]=o[i];

        opt=CGLS(Data,Options,ActiveSubset,Weights_bar,Outputs_bar);
        for(int32_t i=active; i < m; i++)
        {
            ii=ActiveSubset->vec[i];

            t=features->dense_dot(ii, w_bar, n-1);
            t+=Options->bias*w_bar[n-1]; //bias (modelled as last dim)

            o_bar[ii]=t;
        }
        if(ini==0) {Options->cgitermax=CGITERMAX; ini=1;}
        opt2=1;
        for (int32_t i=0;i<m;i++)
        {
            ii=ActiveSubset->vec[i];
            if(i<active)
                opt2=(opt2 && (Y[ii]*o_bar[ii]<=1+epsilon));
            else
                opt2=(opt2 && (Y[ii]*o_bar[ii]>=1-epsilon));
            if(opt2==0) break;
        }
        if(opt && opt2) // l
        {
            if(epsilon==BIG_EPSILON)
            {
                epsilon=EPSILON;
                Options->epsilon=EPSILON;
                SG_SDEBUG("epsilon = %f case converged (speedup heuristic 2). Continuing with epsilon=%f", BIG_EPSILON, EPSILON);
                continue;
            }
            else
            {
                for (int32_t i=n; i-- ;)
                    w[i]=w_bar[i];
                for (int32_t i=m; i-- ;)
                    o[i]=o_bar[i];
                delete[] ActiveSubset->vec;
                delete[] ActiveSubset;
                delete[] o_bar;
                delete[] w_bar;
                delete[] Weights_bar;
                delete[] Outputs_bar;
                SG_SINFO("L2_SVM_MFN converged (optimality) in %d iterations", iter);
                return 1;
            }
        }
        delta=line_search(w,w_bar,lambda,o,o_bar,Y,C,n,m);
        SG_SDEBUG("LINE_SEARCH delta = %f\n", delta);
        F_old=F;
        F=0.0;
        for (int32_t i=n; i-- ;) {
            w[i]+=delta*(w_bar[i]-w[i]);
            F+=w[i]*w[i];
        }
        F=0.5*lambda*F;
        active=0;
        inactive=m-1;
        for (int32_t i=0; i<m ; i++)
        {
            o[i]+=delta*(o_bar[i]-o[i]);
            diff=1-Y[i]*o[i];
            if(diff>0)
            {
                ActiveSubset->vec[active]=i;
                active++;
                F+=0.5*C[i]*diff*diff;
            }
            else
            {
                ActiveSubset->vec[inactive]=i;
                inactive--;
            }
        }
        ActiveSubset->d=active;
        if(CMath::abs(F-F_old)<RELATIVE_STOP_EPS*CMath::abs(F_old))
        {
            SG_SINFO("L2_SVM_MFN converged (rel. criterion) in %d iterations", iter);
            // free the work buffers before returning (the original code
            // leaked them on this early-exit path)
            delete[] ActiveSubset->vec;
            delete[] ActiveSubset;
            delete[] o_bar;
            delete[] w_bar;
            delete[] Weights_bar;
            delete[] Outputs_bar;
            return 2;
        }
00345     }
00346     delete[] ActiveSubset->vec;
00347     delete[] ActiveSubset;
00348     delete[] o_bar;
00349     delete[] w_bar;
00350     delete[] Weights_bar;
00351     delete[] Outputs_bar;
00352     SG_SINFO("L2_SVM_MFN converged (max iter exceeded) in %d iterations", iter);
00353     return 0;
00354 }
00355 
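/* line_search: exact minimization of the piecewise-quadratic L2-SVM
 * objective along the segment w + delta*(w_bar - w). Collects the
 * breakpoints where examples enter or leave the margin, sorts them,
 * and walks along them until the directional derivative
 * L + delta*(R-L) becomes non-negative; returns the minimizing delta. */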
float64_t line_search(float64_t *w,
        float64_t *w_bar,
        float64_t lambda,
        float64_t *o,
        float64_t *o_bar,
        float64_t *Y,
        float64_t *C,
        int32_t d, /* data dimensionality -- 'n' */
        int32_t l) /* number of examples */
{
    float64_t omegaL = 0.0;
    float64_t omegaR = 0.0;
    float64_t diff=0.0;
    for(int32_t i=d; i--; )
    {
        diff=w_bar[i]-w[i];
        omegaL+=w[i]*diff;
        omegaR+=w_bar[i]*diff;
    }
    omegaL=lambda*omegaL;
    omegaR=lambda*omegaR;
    float64_t L=0.0;
    float64_t R=0.0;
    int32_t ii=0;
    for (int32_t i=0;i<l;i++)
    {
        if(Y[i]*o[i]<1)
        {
            diff=C[i]*(o_bar[i]-o[i]);
            L+=(o[i]-Y[i])*diff;
            R+=(o_bar[i]-Y[i])*diff;
        }
    }
    L+=omegaL;
    R+=omegaR;
    Delta* deltas=new Delta[l];
    int32_t p=0;
    for(int32_t i=0;i<l;i++)
    {
        diff=Y[i]*(o_bar[i]-o[i]);

        if(Y[i]*o[i]<1)
        {
            if(diff>0)
            {
                deltas[p].delta=(1-Y[i]*o[i])/diff;
                deltas[p].index=i;
                deltas[p].s=-1;
                p++;
            }
        }
        else
        {
            if(diff<0)
            {
                deltas[p].delta=(1-Y[i]*o[i])/diff;
                deltas[p].index=i;
                deltas[p].s=1;
                p++;
            }
        }
    }
    std::sort(deltas,deltas+p);
    float64_t delta_prime=0.0;
    for (int32_t i=0;i<p;i++)
    {
        delta_prime = L + deltas[i].delta*(R-L);
        if(delta_prime>=0)
            break;
        ii=deltas[i].index;
        diff=(deltas[i].s)*C[ii]*(o_bar[ii]-o[ii]);
        L+=diff*(o[ii]-Y[ii]);
        R+=diff*(o_bar[ii]-Y[ii]);
    }
    delete [] deltas;
    return (-L/(R-L));
}

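/* TSVM_MFN: transductive L2-SVM. First trains on the labeled examples
 * only, then provisionally labels the top fraction R of unlabeled
 * outputs as positive and the rest as negative. It then alternates
 * label switching (switch_labels) with retraining while annealing the
 * unlabeled regularizer lambda_0 up to lambda_u. Returns the total
 * number of label switches. */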
int32_t TSVM_MFN(
    const struct data *Data, struct options *Options,
    struct vector_double *Weights, struct vector_double *Outputs)
{
    /* Setup labeled-only examples and train L2_SVM_MFN */
    struct data *Data_Labeled = new data[1];
    struct vector_double *Outputs_Labeled = new vector_double[1];
    initialize(Outputs_Labeled,Data->l,0.0);
    SG_SDEBUG("Initializing weights, unknown labels");
    GetLabeledData(Data_Labeled,Data); /* gets labeled data and sets C=1/l */
    L2_SVM_MFN(Data_Labeled, Options, Weights,Outputs_Labeled,0);
    /* Use this weight vector to classify R*u unlabeled examples as
       positive */
    int32_t p=0,q=0;
    float64_t t=0.0;
    int32_t *JU = new int32_t[Data->u];
    float64_t *ou = new float64_t[Data->u];
    float64_t lambda_0 = TSVM_LAMBDA_SMALL;
    for (int32_t i=0;i<Data->m;i++)
    {
        if(Data->Y[i]==0.0)
        {
            t=Data->features->dense_dot(i, Weights->vec, Data->n-1);
            t+=Options->bias*Weights->vec[Data->n-1]; //bias (modelled as last dim)

            Outputs->vec[i]=t;
            Data->C[i]=lambda_0*1.0/Data->u;
            JU[q]=i;
            ou[q]=t;
            q++;
        }
        else
        {
            Outputs->vec[i]=Outputs_Labeled->vec[p];
            Data->C[i]=1.0/Data->l;
            p++;
        }
    }
    std::nth_element(ou,ou+int32_t((1-Options->R)*Data->u-1),ou+Data->u);
    float64_t thresh=*(ou+int32_t((1-Options->R)*Data->u)-1);
    delete [] ou;
    for (int32_t i=0;i<Data->u;i++)
    {
        if(Outputs->vec[JU[i]]>thresh)
            Data->Y[JU[i]]=1.0;
        else
            Data->Y[JU[i]]=-1.0;
    }
    for (int32_t i=0;i<Data->n;i++)
        Weights->vec[i]=0.0;
    for (int32_t i=0;i<Data->m;i++)
        Outputs->vec[i]=0.0;
    L2_SVM_MFN(Data,Options,Weights,Outputs,0);
    int32_t num_switches=0;
    int32_t s=0;
    int32_t last_round=0;
    while(lambda_0 <= Options->lambda_u)
    {
        int32_t iter2=0;
        while(1){
            s=switch_labels(Data->Y,Outputs->vec,JU,Data->u,Options->S);
            if(s==0) break;
            iter2++;
            SG_SDEBUG("****** lambda_0 = %f iteration = %d ************************************\n", lambda_0, iter2);
            SG_SDEBUG("Optimizing unknown labels. switched %d labels.\n", s);
            num_switches+=s;
            SG_SDEBUG("Optimizing weights\n");
            L2_SVM_MFN(Data,Options,Weights,Outputs,1);
        }
        if(last_round==1) break;
        lambda_0=TSVM_ANNEALING_RATE*lambda_0;
        if(lambda_0 >= Options->lambda_u) {lambda_0 = Options->lambda_u; last_round=1;}
        for (int32_t i=0;i<Data->u;i++)
            Data->C[JU[i]]=lambda_0*1.0/Data->u;
        SG_SDEBUG("****** lambda0 increased to %f%% of lambda_u = %f ************************\n", lambda_0*100/Options->lambda_u, Options->lambda_u);
        SG_SDEBUG("Optimizing weights\n");
        L2_SVM_MFN(Data,Options,Weights,Outputs,1);
    }
    SG_SDEBUG("Total Number of Switches = %d\n", num_switches);
    /* reset labels */
    for (int32_t i=0;i<Data->u;i++) Data->Y[JU[i]] = 0.0;
    float64_t F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
    SG_SDEBUG("Objective Value = %f\n",F);
    delete [] JU;
    return num_switches;
}

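/* switch_labels: label switching heuristic of the transductive SVM.
 * Collects unlabeled examples whose current label violates the margin,
 * sorts positives by ascending output and negatives by ascending
 * negated output, and swaps up to S positive/negative pairs as long as
 * a swap decreases the objective. Returns the number of swaps. */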
int32_t switch_labels(float64_t* Y, float64_t* o, int32_t* JU, int32_t u, int32_t S)
{
    int32_t npos=0;
    int32_t nneg=0;
    for (int32_t i=0;i<u;i++)
    {
        if((Y[JU[i]]>0) && (o[JU[i]]<1.0)) npos++;
        if((Y[JU[i]]<0) && (-o[JU[i]]<1.0)) nneg++;
    }
    Delta* positive=new Delta[npos];
    Delta* negative=new Delta[nneg];
    int32_t p=0;
    int32_t n=0;
    int32_t ii=0;
    for (int32_t i=0;i<u;i++)
    {
        ii=JU[i];
        if((Y[ii]>0.0) && (o[ii]<1.0))
        {
            positive[p].delta=o[ii];
            positive[p].index=ii;
            positive[p].s=0;
            p++;
        }
        if((Y[ii]<0.0) && (-o[ii]<1.0))
        {
            negative[n].delta=-o[ii];
            negative[n].index=ii;
            negative[n].s=0;
            n++;
        }
    }
    std::sort(positive,positive+npos);
    std::sort(negative,negative+nneg);
    int32_t s=-1;
    while(1)
    {
        s++;
        // check the bounds on s before indexing into positive/negative
        if((s>=S) || (s>=npos) || (s>=nneg) || (positive[s].delta>=-negative[s].delta))
            break;
        Y[positive[s].index]=-1.0;
        Y[negative[s].index]= 1.0;
    }
    delete [] positive;
    delete [] negative;
    return s;
}

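/* DA_S3VM: deterministic annealing semi-supervised SVM. Maintains soft
 * labels p over the unlabeled examples; at each temperature T it
 * alternates optimize_p (update of p at fixed weights) with optimize_w
 * (weight update at fixed p) until the KL divergence between
 * successive p vectors is small, then divides T by DA_ANNEALING_RATE.
 * The iterate with the lowest transductive cost seen is returned in
 * Weights/Outputs. */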
int32_t DA_S3VM(
    struct data *Data, struct options *Options, struct vector_double *Weights,
    struct vector_double *Outputs)
{
    float64_t T = DA_INIT_TEMP*Options->lambda_u;
    int32_t iter1 = 0, iter2 =0;
    float64_t *p = new float64_t[Data->u];
    float64_t *q = new float64_t[Data->u];
    float64_t *g = new float64_t[Data->u];
    float64_t F,F_min;
    float64_t *w_min = new float64_t[Data->n];
    float64_t *o_min = new float64_t[Data->m];
    float64_t *w = Weights->vec;
    float64_t *o = Outputs->vec;
    float64_t kl_divergence = 1.0;
    /* initialize */
    SG_SDEBUG("Initializing weights, p");
    for (int32_t i=0;i<Data->u; i++)
        p[i] = Options->R;
    /* record which examples are unlabeled */
    int32_t *JU = new int32_t[Data->u];
    int32_t j=0;
    for(int32_t i=0;i<Data->m;i++)
    {
        if(Data->Y[i]==0.0)
        {JU[j]=i;j++;}
    }
    float64_t H = entropy(p,Data->u);
    optimize_w(Data,p,Options,Weights,Outputs,0);
    F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
    F_min = F;
    for (int32_t i=0;i<Weights->d;i++)
        w_min[i]=w[i];
    for (int32_t i=0;i<Outputs->d;i++)
        o_min[i]=o[i];
    while((iter1 < DA_OUTER_ITERMAX) && (H > Options->epsilon))
    {
        iter1++;
        iter2=0;
        kl_divergence=1.0;
        while((iter2 < DA_INNER_ITERMAX) && (kl_divergence > Options->epsilon))
        {
            iter2++;
            for (int32_t i=0;i<Data->u;i++)
            {
                q[i]=p[i];
                g[i] = Options->lambda_u*((o[JU[i]] > 1 ? 0 : (1 - o[JU[i]])*(1 - o[JU[i]])) - (o[JU[i]]< -1 ? 0 : (1 + o[JU[i]])*(1 + o[JU[i]])));
            }
            SG_SDEBUG("Optimizing p.\n");
            optimize_p(g,Data->u,T,Options->R,p);
            kl_divergence=KL(p,q,Data->u);
            SG_SDEBUG("Optimizing weights\n");
            optimize_w(Data,p,Options,Weights,Outputs,1);
            F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
            if(F < F_min)
            {
                F_min = F;
                for (int32_t i=0;i<Weights->d;i++)
                    w_min[i]=w[i];
                for (int32_t i=0;i<Outputs->d;i++)
                    o_min[i]=o[i];
            }
            SG_SDEBUG("***** outer_iter = %d  T = %g  inner_iter = %d  kl = %g  cost = %g *****\n",iter1,T,iter2,kl_divergence,F);
        }
        H = entropy(p,Data->u);
        SG_SDEBUG("***** Finished outer_iter = %d T = %g  Entropy = %g ***\n", iter1,T,H);
        T = T/DA_ANNEALING_RATE;
    }
    for (int32_t i=0;i<Weights->d;i++)
        w[i]=w_min[i];
    for (int32_t i=0;i<Outputs->d;i++)
        o[i]=o_min[i];
    /* may want to reset the original Y */
    delete [] p;
    delete [] q;
    delete [] g;
    delete [] JU;
    delete [] w_min;
    delete [] o_min;
    SG_SINFO("(min) Objective Value = %f", F_min);
    return 1;
}

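/* optimize_w: weighted L2-SVM-MFN step used inside DA_S3VM. Every
 * unlabeled example appears twice, once with label +1 and weight
 * proportional to p[j] and once with label -1 and weight proportional
 * to 1-p[j] (lambda_u/u overall); labeled examples keep weight 1/l.
 * Apart from this doubling, the active-set Newton scheme mirrors
 * L2_SVM_MFN above. */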
int32_t optimize_w(
    const struct data *Data, const float64_t *p, struct options *Options,
    struct vector_double *Weights, struct vector_double *Outputs, int32_t ini)
{
    int32_t i,j;
    CDotFeatures* features=Data->features;
    int32_t n  = Data->n;
    int32_t m  = Data->m;
    int32_t u  = Data->u;
    float64_t lambda = Options->lambda;
    float64_t epsilon;
    float64_t *w = Weights->vec;
    float64_t *o = new float64_t[m+u];
    float64_t *Y = new float64_t[m+u];
    float64_t *C = new float64_t[m+u];
    int32_t *labeled_indices = new int32_t[m];
    float64_t F_old = 0.0;
    float64_t F = 0.0;
    float64_t diff=0.0;
    float64_t lambda_u_by_u = Options->lambda_u/u;
    vector_int *ActiveSubset = new vector_int[1];
    ActiveSubset->vec = new int32_t[m];
    ActiveSubset->d = m;
    // initialize
    if(ini==0)
    {
        epsilon=BIG_EPSILON;
        Options->cgitermax=SMALL_CGITERMAX;
        Options->epsilon=BIG_EPSILON;
    }
    else {epsilon = Options->epsilon;}

    for(i=0;i<n;i++) F+=w[i]*w[i];
    F=lambda*F;
    int32_t active=0;
    int32_t inactive=m-1; // l-1
    float64_t temp1;
    float64_t temp2;

    j = 0;
    for(i=0; i<m ; i++)
    {
        o[i]=Outputs->vec[i];
        if(Data->Y[i]==0.0)
        {
            labeled_indices[i]=0;
            o[m+j]=o[i];
            Y[i]=1;
            Y[m+j]=-1;
            C[i]=lambda_u_by_u*p[j];
            C[m+j]=lambda_u_by_u*(1-p[j]);
            ActiveSubset->vec[active]=i;
            active++;
            diff = 1 - CMath::abs(o[i]);
            if(diff>0)
            {
                Data->Y[i] = 2*p[j]-1;
                Data->C[i] = lambda_u_by_u;
                temp1 = (1 - o[i]);
                temp2 = (1 + o[i]);
                F+=lambda_u_by_u*(p[j]*temp1*temp1 + (1-p[j])*temp2*temp2);
            }
            else
            {
                if(o[i]>0)
                {
                    Data->Y[i] = -1.0;
                    Data->C[i] = C[m+j];
                }
                else
                {
                    Data->Y[i] = 1.0;
                    Data->C[i] = C[i];
                }
                temp1 = (1-Data->Y[i]*o[i]);
                F+= Data->C[i]*temp1*temp1;
            }
            j++;
        }
        else
        {
            labeled_indices[i]=1;
            Y[i]=Data->Y[i];
            C[i]=1.0/Data->l;
            Data->C[i]=1.0/Data->l;
            diff=1-Data->Y[i]*o[i];
            if(diff>0)
            {
                ActiveSubset->vec[active]=i;
                active++;
                F+=Data->C[i]*diff*diff;
            }
            else
            {
                ActiveSubset->vec[inactive]=i;
                inactive--;
            }
        }
    }
    F=0.5*F;
    ActiveSubset->d=active;
    int32_t iter=0;
    int32_t opt=0;
    int32_t opt2=0;
    vector_double *Weights_bar = new vector_double[1];
    vector_double *Outputs_bar = new vector_double[1];
    float64_t *w_bar = new float64_t[n];
    float64_t *o_bar = new float64_t[m+u];
    Weights_bar->vec=w_bar;
    Outputs_bar->vec=o_bar;
    Weights_bar->d=n;
    Outputs_bar->d=m; /* read only the top m ; bottom u will be copies */
    float64_t delta=0.0;
    float64_t t=0.0;
    int32_t ii = 0;
    while(iter<MFNITERMAX)
    {
        iter++;
        SG_SDEBUG("L2_SVM_MFN Iteration# %d (%d active examples, objective_value = %f)", iter, active, F);
        for(i=n; i-- ;)
            w_bar[i]=w[i];

        for(i=m+u; i-- ;)
            o_bar[i]=o[i];
        opt=CGLS(Data,Options,ActiveSubset,Weights_bar,Outputs_bar);
        for(i=active; i < m; i++)
        {
            ii=ActiveSubset->vec[i];
            t=features->dense_dot(ii, w_bar, n-1);
            t+=Options->bias*w_bar[n-1]; //bias (modelled as last dim)

            o_bar[ii]=t;
        }
        // make o_bar consistent in the bottom half
        j=0;
        for(i=0; i<m;i++)
        {
            if(labeled_indices[i]==0)
            {o_bar[m+j]=o_bar[i]; j++;}
        }
        if(ini==0) {Options->cgitermax=CGITERMAX; ini=1;}
        opt2=1;
        for(i=0; i < m ;i++)
        {
            ii=ActiveSubset->vec[i];
            if(i<active)
            {
                if(labeled_indices[ii]==1)
                    opt2=(opt2 && (Data->Y[ii]*o_bar[ii]<=1+epsilon));
                else
                {
                    if(CMath::abs(o[ii])<1)
                        opt2=(opt2 && (CMath::abs(o_bar[ii])<=1+epsilon));
                    else
                        opt2=(opt2 && (CMath::abs(o_bar[ii])>=1-epsilon));
                }
            }
            else
                opt2=(opt2 && (Data->Y[ii]*o_bar[ii]>=1-epsilon));
            if(opt2==0) break;
        }
        if(opt && opt2) // l
        {
            if(epsilon==BIG_EPSILON)
            {
                epsilon=EPSILON;
                Options->epsilon=EPSILON;
                SG_SDEBUG("epsilon = %f case converged (speedup heuristic 2). Continuing with epsilon=%f\n", BIG_EPSILON, EPSILON);
                continue;
            }
            else
            {
                for(i=n; i-- ;)
                    w[i]=w_bar[i];
                for(i=m; i-- ;)
                    Outputs->vec[i]=o_bar[i];
                for(i=m; i-- ;)
                {
                    if(labeled_indices[i]==0)
                        Data->Y[i]=0.0;
                }
                delete[] ActiveSubset->vec;
                delete[] ActiveSubset;
                delete[] o_bar;
                delete[] w_bar;
                delete[] o;
                delete[] Weights_bar;
                delete[] Outputs_bar;
                delete[] Y;
                delete[] C;
                delete[] labeled_indices;
                SG_SINFO("L2_SVM_MFN converged in %d iteration(s)", iter);
                return 1;
            }
        }

        delta=line_search(w,w_bar,lambda,o,o_bar,Y,C,n,m+u);
        SG_SDEBUG("LINE_SEARCH delta = %f", delta);
        F_old=F;
        F=0.0;
        for(i=0;i<n;i++) {w[i]+=delta*(w_bar[i]-w[i]); F+=w[i]*w[i];}
        F=lambda*F;
        j=0;
        active=0;
        inactive=m-1;
        for(i=0; i<m ; i++)
        {
            o[i]+=delta*(o_bar[i]-o[i]);
            if(labeled_indices[i]==0)
            {
                o[m+j]=o[i];
                ActiveSubset->vec[active]=i;
                active++;
                diff = 1 - CMath::abs(o[i]);
                if(diff>0)
                {
                    Data->Y[i] = 2*p[j]-1;
                    Data->C[i] = lambda_u_by_u;
                    temp1 = (1 - o[i]);
                    temp2 = (1 + o[i]);
                    F+=lambda_u_by_u*(p[j]*temp1*temp1 + (1-p[j])*temp2*temp2);
                }
                else
                {
                    if(o[i]>0)
                    {
                        Data->Y[i] = -1;
                        Data->C[i] = C[m+j];
                    }
                    else
                    {
                        Data->Y[i] = +1;
                        Data->C[i] = C[i];
                    }
                    temp1=(1-Data->Y[i]*o[i]);
                    F+= Data->C[i]*temp1*temp1;
                }
                j++;
            }
            else
            {
                diff=1-Data->Y[i]*o[i];
                if(diff>0)
                {
                    ActiveSubset->vec[active]=i;
                    active++;
                    F+=Data->C[i]*diff*diff;
                }
                else
                {
                    ActiveSubset->vec[inactive]=i;
                    inactive--;
                }
            }
        }
        F=0.5*F;
        ActiveSubset->d=active;
        if(CMath::abs(F-F_old)<EPSILON)
            break;
    }
    for(i=m; i-- ;)
    {
        Outputs->vec[i]=o[i];
        if(labeled_indices[i]==0)
            Data->Y[i]=0.0;
    }
    delete[] ActiveSubset->vec;
    delete[] labeled_indices;
    delete[] ActiveSubset;
    delete[] o_bar;
    delete[] w_bar;
    delete[] o;
    delete[] Weights_bar;
    delete[] Outputs_bar;
    delete[] Y;
    delete[] C;
    SG_SINFO("L2_SVM_MFN converged in %d iterations", iter);
    return 0;
}

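/* optimize_p: given the per-example gradients g, temperature T and
 * target fraction r of positives, finds the multiplier nu such that
 * the mean of p[i] = 1/(1+exp((g[i]-nu)/T)) equals r, using a
 * safeguarded Newton/bisection root finder on the bracket
 * [nu_minus, nu_plus], and writes the resulting probabilities to p. */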
void optimize_p(
    const float64_t* g, int32_t u, float64_t T, float64_t r, float64_t* p)
{
    int32_t iter=0;
    float64_t epsilon=1e-10;
    int32_t maxiter=500;
    float64_t nu_minus=g[0];
    float64_t nu_plus=g[0];
    for (int32_t i=0;i<u;i++)
    {
        if(g[i]<nu_minus) nu_minus=g[i];
        if(g[i]>nu_plus) nu_plus=g[i];
    }

    float64_t b=T*log((1-r)/r);
    nu_minus-=b;
    nu_plus-=b;
    float64_t nu=(nu_plus+nu_minus)/2;
    float64_t Bnu=0.0;
    float64_t BnuPrime=0.0;
    float64_t s=0.0;
    float64_t tmp=0.0;
    for (int32_t i=0;i<u;i++)
    {
        s=exp((g[i]-nu)/T);
        if(!(CMath::is_infinity(s)))
        {
            tmp=1.0/(1.0+s);
            Bnu+=tmp;
            BnuPrime+=s*tmp*tmp;
        }
    }
    Bnu=Bnu/u;
    Bnu-=r;
    BnuPrime=BnuPrime/(T*u);
    float64_t nuHat=0.0;
    while((CMath::abs(Bnu)>epsilon) && (iter < maxiter))
    {
        iter++;
        if(CMath::abs(BnuPrime)>0.0)
            nuHat=nu-Bnu/BnuPrime;
        // fall back to bisection when the derivative vanishes or the
        // Newton step leaves the bracket (the original test `> 0.0`
        // made the Newton step unreachable)
        if((CMath::abs(BnuPrime) <= 0.0) || (nuHat>nu_plus) || (nuHat < nu_minus))
            nu=(nu_minus+nu_plus)/2.0;
        else
            nu=nuHat;
        Bnu=0.0;
        BnuPrime=0.0;
        for(int32_t i=0;i<u;i++)
        {
            s=exp((g[i]-nu)/T);
            if(!(CMath::is_infinity(s)))
            {
                tmp=1.0/(1.0+s);
                Bnu+=tmp;
                BnuPrime+=s*tmp*tmp;
            }
        }
        Bnu=Bnu/u;
        Bnu-=r;
        BnuPrime=BnuPrime/(T*u);
        if(Bnu<0)
            nu_minus=nu;
        else
            nu_plus=nu;
        if(CMath::abs(nu_minus-nu_plus)<epsilon)
            break;
    }
    if(CMath::abs(Bnu)>epsilon)
        SG_SWARNING("Warning (Root): root not found to required precision\n");

    for (int32_t i=0;i<u;i++)
    {
        s=exp((g[i]-nu)/T);
        if(CMath::is_infinity(s)) p[i]=0.0;
        else p[i]=1.0/(1.0+s);
    }
    SG_SINFO(" root (nu) = %f B(nu) = %f", nu, Bnu);
}

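/* transductive_cost: value of the transductive objective
 * 0.5*( lambda*|w|^2
 *       + (lambda_u/u) * sum over unlabeled of clipped quadratic loss
 *       + (1/l)        * sum over labeled   of clipped quadratic loss ). */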
float64_t transductive_cost(
    float64_t normWeights, float64_t *Y, float64_t *Outputs, int32_t m,
    float64_t lambda, float64_t lambda_u)
{
    float64_t F1=0.0,F2=0.0, o=0.0, y=0.0;
    int32_t u=0,l=0;
    for (int32_t i=0;i<m;i++)
    {
        o=Outputs[i];
        y=Y[i];
        if(y==0.0)
        {F1 += CMath::abs(o) > 1 ? 0 : (1 - CMath::abs(o))*(1 - CMath::abs(o)); u++;}
        else
        {F2 += y*o > 1 ? 0 : (1-y*o)*(1-y*o); l++;}
    }
    float64_t F;
    F = 0.5*(lambda*normWeights + lambda_u*F1/u + F2/l);
    return F;
}

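/* entropy: mean binary entropy (in bits) of the soft labels p. */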
float64_t entropy(const float64_t *p, int32_t u)
{
    float64_t h=0.0;
    float64_t q=0.0;
    for (int32_t i=0;i<u;i++)
    {
        q=p[i];
        if(q>0 && q<1)
            h+= -(q*CMath::log2(q) + (1-q)*CMath::log2(1-q));
    }
    return h/u;
}

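/* KL: mean binary KL divergence (in bits) between the soft label
 * vectors p and q. */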
float64_t KL(const float64_t *p, const float64_t *q, int32_t u)
{
    float64_t h=0.0;
    float64_t p1=0.0;
    float64_t q1=0.0;
    float64_t g=0.0;
    for (int32_t i=0;i<u;i++)
    {
        p1=p[i];
        q1=q[i];
        // clamp both probabilities into [1e-8, 1-1e-8] so the logs stay finite
        if(p1>1-1e-8) p1-=1e-8;
        if(p1<1e-8) p1+=1e-8;
        if(q1>1-1e-8) q1-=1e-8;
        if(q1<1e-8) q1+=1e-8;
        g= (p1*CMath::log2(p1/q1) + (1-p1)*CMath::log2((1-p1)/(1-q1)));
        if(CMath::abs(g)<1e-12 || CMath::is_nan(g)) g=0.0;
        h+=g;
    }
    return h/u;
}

/********************** UTILITIES ********************/
float64_t norm_square(const vector_double *A)
{
    float64_t x=0.0, t=0.0;
    for(int32_t i=0;i<A->d;i++)
    {
        t=A->vec[i];
        x+=t*t;
    }
    return x;
}

void initialize(struct vector_double *A, int32_t k, float64_t a)
{
    float64_t *vec = new float64_t[k];
    for (int32_t i=0;i<k;i++)
        vec[i]=a;
    A->vec = vec;
    A->d   = k;
    return;
}

void initialize(struct vector_int *A, int32_t k)
{
    int32_t *vec = new int32_t[k];
    for(int32_t i=0;i<k;i++)
        vec[i]=i;
    A->vec = vec;
    A->d   = k;
    return;
}

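/* GetLabeledData: intended to copy the labeled subset of Data into D
 * with C = 1/l. The body is disabled (FIXME below), apparently because
 * it still assumes the old raw CSR arrays (val/colind/rowptr) rather
 * than CDotFeatures; note that TSVM_MFN calls this function, so the
 * transductive path relies on it being reimplemented. */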
void GetLabeledData(struct data *D, const struct data *Data)
{
    /*FIXME
    int32_t *J = new int[Data->l];
    D->C   = new float64_t[Data->l];
    D->Y   = new float64_t[Data->l];
    int32_t nz=0;
    int32_t k=0;
    int32_t rowptrs_=Data->l;
    for(int32_t i=0;i<Data->m;i++)
    {
        if(Data->Y[i]!=0.0)
        {
            J[k]=i;
            D->Y[k]=Data->Y[i];
            D->C[k]=1.0/Data->l;
            nz+=(Data->rowptr[i+1] - Data->rowptr[i]);
            k++;
        }
    }
    D->val    = new float64_t[nz];
    D->colind = new int32_t[nz];
    D->rowptr = new int32_t[rowptrs_+1];
    nz=0;
    for(int32_t i=0;i<Data->l;i++)
    {
        D->rowptr[i]=nz;
        for(int32_t j=Data->rowptr[J[i]];j<Data->rowptr[J[i]+1];j++)
        {
            D->val[nz] = Data->val[j];
            D->colind[nz] = Data->colind[j];
            nz++;
        }
    }
    D->rowptr[rowptrs_]=nz;
    D->nz=nz;
    D->l=Data->l;
    D->m=Data->l;
    D->n=Data->n;
    D->u=0;
    delete [] J;*/
}
}