ssl.cpp

/*    Copyright 2006 Vikas Sindhwani (vikass@cs.uchicago.edu)
      SVM-lin: Fast SVM Solvers for Supervised and Semi-supervised Learning

      This file is part of SVM-lin.

      SVM-lin is free software; you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published by
      the Free Software Foundation; either version 2 of the License, or
      (at your option) any later version.

      SVM-lin is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      GNU General Public License for more details.

      You should have received a copy of the GNU General Public License
      along with SVM-lin (see gpl.txt); if not, write to the Free Software
      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
      */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <algorithm>

#include <shogun/io/SGIO.h>
#include <shogun/mathematics/Math.h>
#include <shogun/features/SparseFeatures.h>
#include <shogun/lib/external/ssl.h>
namespace shogun
{
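/* Entry point for the SVM-lin solvers: initializes the weight and output
 * vectors and dispatches on Options->algo to the requested algorithm. */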
void ssl_train(struct data *Data,
        struct options *Options,
        struct vector_double *Weights,
        struct vector_double *Outputs)
{
    // initialize
    initialize(Weights,Data->n,0.0);
    initialize(Outputs,Data->m,0.0);
    vector_int    *Subset  = SG_MALLOC(vector_int, 1);
    initialize(Subset,Data->m);
    // call the right algorithm
    int32_t optimality = 0;
    switch(Options->algo)
    {
        case -1:
            SG_SINFO("Regularized Least Squares Regression (CGLS)\n");
            optimality=CGLS(Data,Options,Subset,Weights,Outputs);
            break;
        case RLS:
            SG_SINFO("Regularized Least Squares Classification (CGLS)\n");
            optimality=CGLS(Data,Options,Subset,Weights,Outputs);
            break;
        case SVM:
            SG_SINFO("Modified Finite Newton L2-SVM (L2-SVM-MFN)\n");
            optimality=L2_SVM_MFN(Data,Options,Weights,Outputs,0);
            break;
        case TSVM:
            SG_SINFO("Transductive L2-SVM (TSVM)\n");
            optimality=TSVM_MFN(Data,Options,Weights,Outputs);
            break;
        case DA_SVM:
            SG_SINFO("Deterministic Annealing Semi-supervised L2-SVM (DAS3VM)\n");
            optimality=DA_S3VM(Data,Options,Weights,Outputs);
            break;
        default:
            SG_SERROR("Algorithm unspecified\n");
    }

    if (!optimality)
        SG_SWARNING("SSL-Algorithm terminated without reaching optimum.\n");

    SG_FREE(Subset->vec);
    SG_FREE(Subset);
    return;
}

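/* Conjugate-gradient solver for the regularized least-squares subproblem
 * restricted to the examples listed in Subset:
 *     min_w  lambda*||w||^2 + sum_{i in Subset} C[i]*(Y[i] - w.x_i)^2
 * The bias is modelled as an extra constant feature in the last dimension,
 * so only the first n-1 dimensions are passed to the dot-product routines.
 * Returns 1 if the residual tolerance was reached, 0 on cgitermax. */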
int32_t CGLS(
    const struct data *Data, const struct options *Options,
    const struct vector_int *Subset, struct vector_double *Weights,
    struct vector_double *Outputs)
{
    SG_SDEBUG("CGLS starting...");

    /* Disassemble the structures */
    int32_t active = Subset->d;
    int32_t *J = Subset->vec;
    CDotFeatures* features=Data->features;
    float64_t *Y = Data->Y;
    float64_t *C = Data->C;
    int32_t n  = Data->n;
    float64_t lambda = Options->lambda;
    int32_t cgitermax = Options->cgitermax;
    float64_t epsilon = Options->epsilon;
    float64_t *beta = Weights->vec;
    float64_t *o  = Outputs->vec;
    // initialize z
    float64_t *z = SG_MALLOC(float64_t, active);
    float64_t *q = SG_MALLOC(float64_t, active);
    int32_t ii=0;
    for (int32_t i = active ; i-- ;){
        ii=J[i];
        z[i]  = C[ii]*(Y[ii] - o[ii]);
    }
    float64_t *r = SG_MALLOC(float64_t, n);
    for (int32_t i = n ; i-- ;)
        r[i] = 0.0;
    for (int32_t j=0; j < active; j++)
    {
        features->add_to_dense_vec(z[j], J[j], r, n-1);
        r[n-1]+=Options->bias*z[j]; //bias (modelled as last dim)
    }
    float64_t *p = SG_MALLOC(float64_t, n);
    float64_t omega1 = 0.0;
    for (int32_t i = n ; i-- ;)
    {
        r[i] -= lambda*beta[i];
        p[i] = r[i];
        omega1 += r[i]*r[i];
    }
    float64_t omega_p = omega1;
    float64_t omega_q = 0.0;
    float64_t inv_omega2 = 1/omega1;
    float64_t scale = 0.0;
    float64_t omega_z=0.0;
    float64_t gamma = 0.0;
    int32_t cgiter = 0;
    int32_t optimality = 0;
    float64_t epsilon2 = epsilon*epsilon;
    // iterate
    while(cgiter < cgitermax)
    {
        cgiter++;
        omega_q=0.0;
        float64_t t=0.0;
        int32_t i,j;
        // #pragma omp parallel for private(i,j)
        for (i=0; i < active; i++)
        {
            ii=J[i];
            t=features->dense_dot(ii, p, n-1);
            t+=Options->bias*p[n-1]; //bias (modelled as last dim)
            q[i]=t;
            omega_q += C[ii]*t*t;
        }
        gamma = omega1/(lambda*omega_p + omega_q);
        inv_omega2 = 1/omega1;
        for (i = n ; i-- ;)
        {
            r[i] = 0.0;
            beta[i] += gamma*p[i];
        }
        omega_z=0.0;
        for (i = active ; i-- ;)
        {
            ii=J[i];
            o[ii] += gamma*q[i];
            z[i] -= gamma*C[ii]*q[i];
            omega_z+=z[i]*z[i];
        }
        for (j=0; j < active; j++)
        {
            t=z[j];

            features->add_to_dense_vec(t, J[j], r, n-1);
            r[n-1]+=Options->bias*t; //bias (modelled as last dim)
        }
        omega1 = 0.0;
        for (i = n ; i-- ;)
        {
            r[i] -= lambda*beta[i];
            omega1 += r[i]*r[i];
        }
        if(omega1 < epsilon2*omega_z)
        {
            optimality=1;
            break;
        }
        omega_p=0.0;
        scale=omega1*inv_omega2;
        for(i = n ; i-- ;)
        {
            p[i] = r[i] + p[i]*scale;
            omega_p += p[i]*p[i];
        }
    }
    SG_SDEBUG("...Done.");
    SG_SINFO("CGLS finished in %d iteration(s)", cgiter);

    SG_FREE(z);
    SG_FREE(q);
    SG_FREE(r);
    SG_FREE(p);
    return optimality;
}

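/* Modified Finite Newton method for the primal L2-SVM. Each outer iteration
 * (re)solves a regularized least-squares problem over the current active set
 * (examples with Y[i]*o[i] < 1) via CGLS, then takes an exact line-search
 * step towards that solution. ini==0 starts with loose tolerances
 * (BIG_EPSILON, SMALL_CGITERMAX) as a speedup heuristic; ini!=0 resumes
 * from a warm start with the caller's tolerances. */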
int32_t L2_SVM_MFN(
    const struct data *Data, struct options *Options,
    struct vector_double *Weights, struct vector_double *Outputs,
    int32_t ini)
{
    /* Disassemble the structures */
    CDotFeatures* features=Data->features;
    float64_t *Y = Data->Y;
    float64_t *C = Data->C;
    int32_t n  = Data->n;
    int32_t m  = Data->m;
    float64_t lambda = Options->lambda;
    float64_t epsilon;
    float64_t *w = Weights->vec;
    float64_t *o = Outputs->vec;
    float64_t F_old = 0.0;
    float64_t F = 0.0;
    float64_t diff=0.0;
    vector_int *ActiveSubset = SG_MALLOC(vector_int, 1);
    ActiveSubset->vec = SG_MALLOC(int32_t, m);
    ActiveSubset->d = m;
    // initialize
    if(ini==0) {
        epsilon=BIG_EPSILON;
        Options->cgitermax=SMALL_CGITERMAX;
        Options->epsilon=BIG_EPSILON;
    }
    else {epsilon = Options->epsilon;}
    for (int32_t i=0;i<n;i++) F+=w[i]*w[i];
    F=0.5*lambda*F;
    int32_t active=0;
    int32_t inactive=m-1; // l-1
    for (int32_t i=0; i<m ; i++)
    {
        diff=1-Y[i]*o[i];
        if(diff>0)
        {
            ActiveSubset->vec[active]=i;
            active++;
            F+=0.5*C[i]*diff*diff;
        }
        else
        {
            ActiveSubset->vec[inactive]=i;
            inactive--;
        }
    }
    ActiveSubset->d=active;
    int32_t iter=0;
    int32_t opt=0;
    int32_t opt2=0;
    vector_double *Weights_bar = SG_MALLOC(vector_double, 1);
    vector_double *Outputs_bar = SG_MALLOC(vector_double, 1);
    float64_t *w_bar = SG_MALLOC(float64_t, n);
    float64_t *o_bar = SG_MALLOC(float64_t, m);
    Weights_bar->vec=w_bar;
    Outputs_bar->vec=o_bar;
    Weights_bar->d=n;
    Outputs_bar->d=m;
    float64_t delta=0.0;
    float64_t t=0.0;
    int32_t ii = 0;
    while(iter<MFNITERMAX)
    {
        iter++;
        SG_SDEBUG("L2_SVM_MFN Iteration# %d (%d active examples, objective_value = %f)\n", iter, active, F);
        for (int32_t i=n; i-- ;)
            w_bar[i]=w[i];
        for (int32_t i=m; i-- ;)
            o_bar[i]=o[i];

        opt=CGLS(Data,Options,ActiveSubset,Weights_bar,Outputs_bar);
        for(int32_t i=active; i < m; i++)
        {
            ii=ActiveSubset->vec[i];

            t=features->dense_dot(ii, w_bar, n-1);
            t+=Options->bias*w_bar[n-1]; //bias (modelled as last dim)

            o_bar[ii]=t;
        }
        if(ini==0) {Options->cgitermax=CGITERMAX; ini=1;}
        opt2=1;
        for (int32_t i=0;i<m;i++)
        {
            ii=ActiveSubset->vec[i];
            if(i<active)
                opt2=(opt2 && (Y[ii]*o_bar[ii]<=1+epsilon));
            else
                opt2=(opt2 && (Y[ii]*o_bar[ii]>=1-epsilon));
            if(opt2==0) break;
        }
        if(opt && opt2) // both CGLS and the KKT-style checks passed
        {
            if(epsilon==BIG_EPSILON)
            {
                epsilon=EPSILON;
                Options->epsilon=EPSILON;
                SG_SDEBUG("epsilon = %f case converged (speedup heuristic 2). Continuing with epsilon=%f",  BIG_EPSILON , EPSILON);
                continue;
            }
            else
            {
                for (int32_t i=n; i-- ;)
                    w[i]=w_bar[i];
                for (int32_t i=m; i-- ;)
                    o[i]=o_bar[i];
                SG_FREE(ActiveSubset->vec);
                SG_FREE(ActiveSubset);
                SG_FREE(o_bar);
                SG_FREE(w_bar);
                SG_FREE(Weights_bar);
                SG_FREE(Outputs_bar);
                SG_SINFO("L2_SVM_MFN converged (optimality) in %d iterations", iter);
                return 1;
            }
        }
        delta=line_search(w,w_bar,lambda,o,o_bar,Y,C,n,m);
        SG_SDEBUG("LINE_SEARCH delta = %f\n", delta);
        F_old=F;
        F=0.0;
        for (int32_t i=n; i-- ;) {
            w[i]+=delta*(w_bar[i]-w[i]);
            F+=w[i]*w[i];
        }
        F=0.5*lambda*F;
        active=0;
        inactive=m-1;
        for (int32_t i=0; i<m ; i++)
        {
            o[i]+=delta*(o_bar[i]-o[i]);
            diff=1-Y[i]*o[i];
            if(diff>0)
            {
                ActiveSubset->vec[active]=i;
                active++;
                F+=0.5*C[i]*diff*diff;
            }
            else
            {
                ActiveSubset->vec[inactive]=i;
                inactive--;
            }
        }
        ActiveSubset->d=active;
        if(CMath::abs(F-F_old)<RELATIVE_STOP_EPS*CMath::abs(F_old))
        {
            // release the scratch buffers before the early return
            SG_FREE(ActiveSubset->vec);
            SG_FREE(ActiveSubset);
            SG_FREE(o_bar);
            SG_FREE(w_bar);
            SG_FREE(Weights_bar);
            SG_FREE(Outputs_bar);
            SG_SINFO("L2_SVM_MFN converged (rel. criterion) in %d iterations", iter);
            return 2;
        }
    }
    SG_FREE(ActiveSubset->vec);
    SG_FREE(ActiveSubset);
    SG_FREE(o_bar);
    SG_FREE(w_bar);
    SG_FREE(Weights_bar);
    SG_FREE(Outputs_bar);
    SG_SINFO("L2_SVM_MFN terminated (max iter exceeded) in %d iterations", iter);
    return 0;
}

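/* Exact line search along the Newton direction (w_bar - w). The objective
 * restricted to the line is piecewise quadratic; its derivative at step
 * delta is linear between the breakpoints where an example enters or leaves
 * the active set. The breakpoints are sorted and the derivative endpoints
 * (L, R) are updated incrementally until the derivative changes sign, which
 * pins down the exact minimizer -L/(R-L). */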
float64_t line_search(float64_t *w,
        float64_t *w_bar,
        float64_t lambda,
        float64_t *o,
        float64_t *o_bar,
        float64_t *Y,
        float64_t *C,
        int32_t d, /* data dimensionality -- 'n' */
        int32_t l) /* number of examples */
{
    float64_t omegaL = 0.0;
    float64_t omegaR = 0.0;
    float64_t diff=0.0;
    for(int32_t i=d; i--; )
    {
        diff=w_bar[i]-w[i];
        omegaL+=w[i]*diff;
        omegaR+=w_bar[i]*diff;
    }
    omegaL=lambda*omegaL;
    omegaR=lambda*omegaR;
    float64_t L=0.0;
    float64_t R=0.0;
    int32_t ii=0;
    for (int32_t i=0;i<l;i++)
    {
        if(Y[i]*o[i]<1)
        {
            diff=C[i]*(o_bar[i]-o[i]);
            L+=(o[i]-Y[i])*diff;
            R+=(o_bar[i]-Y[i])*diff;
        }
    }
    L+=omegaL;
    R+=omegaR;
    Delta* deltas=SG_MALLOC(Delta, l);
    int32_t p=0;
    for(int32_t i=0;i<l;i++)
    {
        diff=Y[i]*(o_bar[i]-o[i]);

        if(Y[i]*o[i]<1)
        {
            if(diff>0)
            {
                deltas[p].delta=(1-Y[i]*o[i])/diff;
                deltas[p].index=i;
                deltas[p].s=-1;
                p++;
            }
        }
        else
        {
            if(diff<0)
            {
                deltas[p].delta=(1-Y[i]*o[i])/diff;
                deltas[p].index=i;
                deltas[p].s=1;
                p++;
            }
        }
    }
    std::sort(deltas,deltas+p);
    float64_t delta_prime=0.0;
    for (int32_t i=0;i<p;i++)
    {
        delta_prime = L + deltas[i].delta*(R-L);
        if(delta_prime>=0)
            break;
        ii=deltas[i].index;
        diff=(deltas[i].s)*C[ii]*(o_bar[ii]-o[ii]);
        L+=diff*(o[ii]-Y[ii]);
        R+=diff*(o_bar[ii]-Y[ii]);
    }
    SG_FREE(deltas);
    return (-L/(R-L));
}

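/* Transductive L2-SVM. A supervised model trained on the labeled examples
 * assigns tentative labels to the unlabeled ones (the fraction Options->R
 * with the largest outputs becomes positive). The weight of the unlabeled
 * part, lambda_0, is then annealed up towards Options->lambda_u; at each
 * level, label pairs are switched via switch_labels while this lowers the
 * objective, retraining in between. Returns the total number of switches. */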
int32_t TSVM_MFN(
    const struct data *Data, struct options *Options,
    struct vector_double *Weights, struct vector_double *Outputs)
{
    /* Setup labeled-only examples and train L2_SVM_MFN */
    struct data *Data_Labeled = SG_MALLOC(data, 1);
    struct vector_double *Outputs_Labeled = SG_MALLOC(vector_double, 1);
    initialize(Outputs_Labeled,Data->l,0.0);
    SG_SDEBUG("Initializing weights, unknown labels");
    GetLabeledData(Data_Labeled,Data); /* gets labeled data and sets C=1/l */
    L2_SVM_MFN(Data_Labeled, Options, Weights, Outputs_Labeled, 0);
    /* Use this weight vector to classify R*u unlabeled examples as
       positive */
    int32_t p=0,q=0;
    float64_t t=0.0;
    int32_t *JU = SG_MALLOC(int32_t, Data->u);
    float64_t *ou = SG_MALLOC(float64_t, Data->u);
    float64_t lambda_0 = TSVM_LAMBDA_SMALL;
    for (int32_t i=0;i<Data->m;i++)
    {
        if(Data->Y[i]==0.0)
        {
            t=Data->features->dense_dot(i, Weights->vec, Data->n-1);
            t+=Options->bias*Weights->vec[Data->n-1]; //bias (modelled as last dim)

            Outputs->vec[i]=t;
            Data->C[i]=lambda_0*1.0/Data->u;
            JU[q]=i;
            ou[q]=t;
            q++;
        }
        else
        {
            Outputs->vec[i]=Outputs_Labeled->vec[p];
            Data->C[i]=1.0/Data->l;
            p++;
        }
    }
    // the labeled-only model is no longer needed
    SG_FREE(Outputs_Labeled->vec);
    SG_FREE(Outputs_Labeled);
    SG_FREE(Data_Labeled);
    /* threshold at the (1-R)-quantile of the outputs */
    int32_t thresh_idx = int32_t((1-Options->R)*Data->u - 1);
    std::nth_element(ou, ou+thresh_idx, ou+Data->u);
    float64_t thresh = ou[thresh_idx];
    SG_FREE(ou);
    for (int32_t i=0;i<Data->u;i++)
    {
        if(Outputs->vec[JU[i]]>thresh)
            Data->Y[JU[i]]=1.0;
        else
            Data->Y[JU[i]]=-1.0;
    }
    for (int32_t i=0;i<Data->n;i++)
        Weights->vec[i]=0.0;
    for (int32_t i=0;i<Data->m;i++)
        Outputs->vec[i]=0.0;
    L2_SVM_MFN(Data,Options,Weights,Outputs,0);
    int32_t num_switches=0;
    int32_t s=0;
    int32_t last_round=0;
    while(lambda_0 <= Options->lambda_u)
    {
        int32_t iter2=0;
        while(1){
            s=switch_labels(Data->Y,Outputs->vec,JU,Data->u,Options->S);
            if(s==0) break;
            iter2++;
            SG_SDEBUG("****** lambda_0 = %f iteration = %d ************************************\n", lambda_0, iter2);
            SG_SDEBUG("Optimizing unknown labels. switched %d labels.\n", s);
            num_switches+=s;
            SG_SDEBUG("Optimizing weights\n");
            L2_SVM_MFN(Data,Options,Weights,Outputs,1);
        }
        if(last_round==1) break;
        lambda_0=TSVM_ANNEALING_RATE*lambda_0;
        if(lambda_0 >= Options->lambda_u) {lambda_0 = Options->lambda_u; last_round=1;}
        for (int32_t i=0;i<Data->u;i++)
            Data->C[JU[i]]=lambda_0*1.0/Data->u;
        SG_SDEBUG("****** lambda_0 increased to %f%% of lambda_u = %f ************************\n", lambda_0*100/Options->lambda_u, Options->lambda_u);
        SG_SDEBUG("Optimizing weights\n");
        L2_SVM_MFN(Data,Options,Weights,Outputs,1);
    }
    SG_SDEBUG("Total Number of Switches = %d\n", num_switches);
    /* reset labels */
    for (int32_t i=0;i<Data->u;i++) Data->Y[JU[i]] = 0.0;
    float64_t F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
    SG_SDEBUG("Objective Value = %f\n",F);
    SG_FREE(JU); // allocated with SG_MALLOC, so freed with SG_FREE
    return num_switches;
}

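/* Switch up to S pairs of tentative labels among the u unlabeled examples:
 * currently-positive violators (output < 1) are sorted by ascending output,
 * currently-negative violators (output > -1) by descending output, and a
 * pair is swapped as long as the positive output is still below the
 * negative one. Returns the number of switched pairs. */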
int32_t switch_labels(float64_t* Y, float64_t* o, int32_t* JU, int32_t u, int32_t S)
{
    int32_t npos=0;
    int32_t nneg=0;
    for (int32_t i=0;i<u;i++)
    {
        if((Y[JU[i]]>0) && (o[JU[i]]<1.0)) npos++;
        if((Y[JU[i]]<0) && (-o[JU[i]]<1.0)) nneg++;
    }
    Delta* positive=SG_MALLOC(Delta, npos);
    Delta* negative=SG_MALLOC(Delta, nneg);
    int32_t p=0;
    int32_t n=0;
    int32_t ii=0;
    for (int32_t i=0;i<u;i++)
    {
        ii=JU[i];
        if((Y[ii]>0.0) && (o[ii]<1.0))
        {
            positive[p].delta=o[ii];
            positive[p].index=ii;
            positive[p].s=0;
            p++;
        }
        if((Y[ii]<0.0) && (-o[ii]<1.0))
        {
            negative[n].delta=-o[ii];
            negative[n].index=ii;
            negative[n].s=0;
            n++;
        }
    }
    std::sort(positive,positive+npos);
    std::sort(negative,negative+nneg);
    int32_t s=-1;
    while(1)
    {
        s++;
        // check the bounds before indexing into positive[s]/negative[s]
        if((s>=S) || (s>=npos) || (s>=nneg) || (positive[s].delta>=-negative[s].delta))
            break;
        Y[positive[s].index]=-1.0;
        Y[negative[s].index]= 1.0;
    }
    SG_FREE(positive);
    SG_FREE(negative);
    return s;
}

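/* Deterministic annealing S3VM. Each unlabeled example carries a soft label
 * probability p[i]; the algorithm alternates between optimizing the weights
 * for fixed p (optimize_w) and optimizing p for fixed weights (optimize_p),
 * lowering the temperature T by DA_ANNEALING_RATE each outer iteration
 * until the entropy of p drops below Options->epsilon. The best weights and
 * outputs seen along the way are returned. */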
int32_t DA_S3VM(
    struct data *Data, struct options *Options, struct vector_double *Weights,
    struct vector_double *Outputs)
{
    float64_t T = DA_INIT_TEMP*Options->lambda_u;
    int32_t iter1 = 0, iter2 = 0;
    float64_t *p = SG_MALLOC(float64_t, Data->u);
    float64_t *q = SG_MALLOC(float64_t, Data->u);
    float64_t *g = SG_MALLOC(float64_t, Data->u);
    float64_t F,F_min;
    float64_t *w_min = SG_MALLOC(float64_t, Data->n);
    float64_t *o_min = SG_MALLOC(float64_t, Data->m);
    float64_t *w = Weights->vec;
    float64_t *o = Outputs->vec;
    float64_t kl_divergence = 1.0;
    /* initialize */
    SG_SDEBUG("Initializing weights, p");
    for (int32_t i=0;i<Data->u; i++)
        p[i] = Options->R;
    /* record which examples are unlabeled */
    int32_t *JU = SG_MALLOC(int32_t, Data->u);
    int32_t j=0;
    for(int32_t i=0;i<Data->m;i++)
    {
        if(Data->Y[i]==0.0)
        {JU[j]=i;j++;}
    }
    float64_t H = entropy(p,Data->u);
    optimize_w(Data,p,Options,Weights,Outputs,0);
    F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
    F_min = F;
    for (int32_t i=0;i<Weights->d;i++)
        w_min[i]=w[i];
    for (int32_t i=0;i<Outputs->d;i++)
        o_min[i]=o[i];
    while((iter1 < DA_OUTER_ITERMAX) && (H > Options->epsilon))
    {
        iter1++;
        iter2=0;
        kl_divergence=1.0;
        while((iter2 < DA_INNER_ITERMAX) && (kl_divergence > Options->epsilon))
        {
            iter2++;
            for (int32_t i=0;i<Data->u;i++)
            {
                q[i]=p[i];
                g[i] = Options->lambda_u*((o[JU[i]] > 1 ? 0 : (1 - o[JU[i]])*(1 - o[JU[i]])) - (o[JU[i]]< -1 ? 0 : (1 + o[JU[i]])*(1 + o[JU[i]])));
            }
            SG_SDEBUG("Optimizing p.\n");
            optimize_p(g,Data->u,T,Options->R,p);
            kl_divergence=KL(p,q,Data->u);
            SG_SDEBUG("Optimizing weights\n");
            optimize_w(Data,p,Options,Weights,Outputs,1);
            F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
            if(F < F_min)
            {
                F_min = F;
                for (int32_t i=0;i<Weights->d;i++)
                    w_min[i]=w[i];
                for (int32_t i=0;i<Outputs->d;i++)
                    o_min[i]=o[i];
            }
            SG_SDEBUG("***** outer_iter = %d  T = %g  inner_iter = %d  kl = %g  cost = %g *****\n",iter1,T,iter2,kl_divergence,F);
        }
        H = entropy(p,Data->u);
        SG_SDEBUG("***** Finished outer_iter = %d T = %g  Entropy = %g ***\n", iter1,T,H);
        T = T/DA_ANNEALING_RATE;
    }
    for (int32_t i=0;i<Weights->d;i++)
        w[i]=w_min[i];
    for (int32_t i=0;i<Outputs->d;i++)
        o[i]=o_min[i];
    /* may want to reset the original Y */
    SG_FREE(p);
    SG_FREE(q);
    SG_FREE(g);
    SG_FREE(JU);
    SG_FREE(w_min);
    SG_FREE(o_min);
    SG_SINFO("(min) Objective Value = %f", F_min);
    return 1;
}

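/* Weighted L2-SVM-MFN used inside DA_S3VM. Every unlabeled example j appears
 * twice: once with label +1 and cost p[j]*lambda_u/u, and once (appended
 * after the m original rows) with label -1 and cost (1-p[j])*lambda_u/u, so
 * the local arrays o, Y and C have m+u entries. The structure otherwise
 * mirrors L2_SVM_MFN above. */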
int32_t optimize_w(
    const struct data *Data, const float64_t *p, struct options *Options,
    struct vector_double *Weights, struct vector_double *Outputs, int32_t ini)
{
    int32_t i,j;
    CDotFeatures* features=Data->features;
    int32_t n  = Data->n;
    int32_t m  = Data->m;
    int32_t u  = Data->u;
    float64_t lambda = Options->lambda;
    float64_t epsilon;
    float64_t *w = Weights->vec;
    float64_t *o = SG_MALLOC(float64_t, m+u);
    float64_t *Y = SG_MALLOC(float64_t, m+u);
    float64_t *C = SG_MALLOC(float64_t, m+u);
    int32_t *labeled_indices = SG_MALLOC(int32_t, m);
    float64_t F_old = 0.0;
    float64_t F = 0.0;
    float64_t diff=0.0;
    float64_t lambda_u_by_u = Options->lambda_u/u;
    vector_int *ActiveSubset = SG_MALLOC(vector_int, 1);
    ActiveSubset->vec = SG_MALLOC(int32_t, m);
    ActiveSubset->d = m;
    // initialize
    if(ini==0)
    {
        epsilon=BIG_EPSILON;
        Options->cgitermax=SMALL_CGITERMAX;
        Options->epsilon=BIG_EPSILON;
    }
    else {epsilon = Options->epsilon;}

    for(i=0;i<n;i++) F+=w[i]*w[i];
    F=lambda*F;
    int32_t active=0;
    int32_t inactive=m-1; // l-1
    float64_t temp1;
    float64_t temp2;

    j = 0;
    for(i=0; i<m ; i++)
    {
        o[i]=Outputs->vec[i];
        if(Data->Y[i]==0.0)
        {
            labeled_indices[i]=0;
            o[m+j]=o[i];
            Y[i]=1;
            Y[m+j]=-1;
            C[i]=lambda_u_by_u*p[j];
            C[m+j]=lambda_u_by_u*(1-p[j]);
            ActiveSubset->vec[active]=i;
            active++;
            diff = 1 - CMath::abs(o[i]);
            if(diff>0)
            {
                Data->Y[i] = 2*p[j]-1;
                Data->C[i] = lambda_u_by_u;
                temp1 = (1 - o[i]);
                temp2 = (1 + o[i]);
                F+=lambda_u_by_u*(p[j]*temp1*temp1 + (1-p[j])*temp2*temp2);
            }
            else
            {
                if(o[i]>0)
                {
                    Data->Y[i] = -1.0;
                    Data->C[i] = C[m+j];
                }
                else
                {
                    Data->Y[i] = 1.0;
                    Data->C[i] = C[i];
                }
                temp1 = (1-Data->Y[i]*o[i]);
                F+= Data->C[i]*temp1*temp1;
            }
            j++;
        }
        else
        {
            labeled_indices[i]=1;
            Y[i]=Data->Y[i];
            C[i]=1.0/Data->l;
            Data->C[i]=1.0/Data->l;
            diff=1-Data->Y[i]*o[i];
            if(diff>0)
            {
                ActiveSubset->vec[active]=i;
                active++;
                F+=Data->C[i]*diff*diff;
            }
            else
            {
                ActiveSubset->vec[inactive]=i;
                inactive--;
            }
        }
    }
    F=0.5*F;
    ActiveSubset->d=active;
    int32_t iter=0;
    int32_t opt=0;
    int32_t opt2=0;
    vector_double *Weights_bar = SG_MALLOC(vector_double, 1);
    vector_double *Outputs_bar = SG_MALLOC(vector_double, 1);
    float64_t *w_bar = SG_MALLOC(float64_t, n);
    float64_t *o_bar = SG_MALLOC(float64_t, m+u);
    Weights_bar->vec=w_bar;
    Outputs_bar->vec=o_bar;
    Weights_bar->d=n;
    Outputs_bar->d=m; /* read only the top m ; bottom u will be copies */
    float64_t delta=0.0;
    float64_t t=0.0;
    int32_t ii = 0;
    while(iter<MFNITERMAX)
    {
        iter++;
        SG_SDEBUG("L2_SVM_MFN Iteration# %d (%d active examples,  objective_value = %f)", iter, active, F);
        for(i=n; i-- ;)
            w_bar[i]=w[i];

        for(i=m+u; i-- ;)
            o_bar[i]=o[i];
        opt=CGLS(Data,Options,ActiveSubset,Weights_bar,Outputs_bar);
        for(i=active; i < m; i++)
        {
            ii=ActiveSubset->vec[i];
            t=features->dense_dot(ii, w_bar, n-1);
            t+=Options->bias*w_bar[n-1]; //bias (modelled as last dim)

            o_bar[ii]=t;
        }
        // make o_bar consistent in the bottom half
        j=0;
        for(i=0; i<m;i++)
        {
            if(labeled_indices[i]==0)
            {o_bar[m+j]=o_bar[i]; j++;}
        }
        if(ini==0) {Options->cgitermax=CGITERMAX; ini=1;}
        opt2=1;
        for(i=0; i < m ;i++)
        {
            ii=ActiveSubset->vec[i];
            if(i<active)
            {
                if(labeled_indices[ii]==1)
                    opt2=(opt2 && (Data->Y[ii]*o_bar[ii]<=1+epsilon));
                else
                {
                    if(CMath::abs(o[ii])<1)
                        opt2=(opt2 && (CMath::abs(o_bar[ii])<=1+epsilon));
                    else
                        opt2=(opt2 && (CMath::abs(o_bar[ii])>=1-epsilon));
                }
            }
            else
                opt2=(opt2 && (Data->Y[ii]*o_bar[ii]>=1-epsilon));
            if(opt2==0) break;
        }
        if(opt && opt2) // both CGLS and the KKT-style checks passed
        {
            if(epsilon==BIG_EPSILON)
            {
                epsilon=EPSILON;
                Options->epsilon=EPSILON;
                SG_SDEBUG( "epsilon = %f case converged (speedup heuristic 2). Continuing with epsilon=%f\n", BIG_EPSILON, EPSILON);
                continue;
            }
            else
            {
                for(i=n; i-- ;)
                    w[i]=w_bar[i];
                for(i=m; i-- ;)
                    Outputs->vec[i]=o_bar[i];
                for(i=m; i-- ;)
                {
                    if(labeled_indices[i]==0)
                        Data->Y[i]=0.0;
                }
                SG_FREE(ActiveSubset->vec);
                SG_FREE(ActiveSubset);
                SG_FREE(o_bar);
                SG_FREE(w_bar);
                SG_FREE(o);
                SG_FREE(Weights_bar);
                SG_FREE(Outputs_bar);
                SG_FREE(Y);
                SG_FREE(C);
                SG_FREE(labeled_indices);
                SG_SINFO("L2_SVM_MFN converged in %d iteration(s)", iter);
                return 1;
            }
        }

        delta=line_search(w,w_bar,lambda,o,o_bar,Y,C,n,m+u);
        SG_SDEBUG("LINE_SEARCH delta = %f", delta);
        F_old=F;
        F=0.0;
        for(i=0;i<n;i++) {w[i]+=delta*(w_bar[i]-w[i]);  F+=w[i]*w[i];}
        F=lambda*F;
        j=0;
        active=0;
        inactive=m-1;
        for(i=0; i<m ; i++)
        {
            o[i]+=delta*(o_bar[i]-o[i]);
            if(labeled_indices[i]==0)
            {
                o[m+j]=o[i];
                ActiveSubset->vec[active]=i;
                active++;
                diff = 1 - CMath::abs(o[i]);
                if(diff>0)
                {
                    Data->Y[i] = 2*p[j]-1;
                    Data->C[i] = lambda_u_by_u;
                    temp1 = (1 - o[i]);
                    temp2 = (1 + o[i]);
                    F+=lambda_u_by_u*(p[j]*temp1*temp1 + (1-p[j])*temp2*temp2);
                }
                else
                {
                    if(o[i]>0)
                    {
                        Data->Y[i] = -1;
                        Data->C[i] = C[m+j];
                    }
                    else
                    {
                        Data->Y[i] = +1;
                        Data->C[i] = C[i];
                    }
                    temp1=(1-Data->Y[i]*o[i]);
                    F+= Data->C[i]*temp1*temp1;
                }
                j++;
            }
            else
            {
                diff=1-Data->Y[i]*o[i];
                if(diff>0)
                {
                    ActiveSubset->vec[active]=i;
                    active++;
                    F+=Data->C[i]*diff*diff;
                }
                else
                {
                    ActiveSubset->vec[inactive]=i;
                    inactive--;
                }
            }
        }
        F=0.5*F;
        ActiveSubset->d=active;
        if(CMath::abs(F-F_old)<EPSILON)
            break;
    }
    for(i=m; i-- ;)
    {
        Outputs->vec[i]=o[i];
        if(labeled_indices[i]==0)
            Data->Y[i]=0.0;
    }
    SG_FREE(ActiveSubset->vec);
    SG_FREE(labeled_indices);
    SG_FREE(ActiveSubset);
    SG_FREE(o_bar);
    SG_FREE(w_bar);
    SG_FREE(o);
    SG_FREE(Weights_bar);
    SG_FREE(Outputs_bar);
    SG_FREE(Y);
    SG_FREE(C);
    SG_SINFO("L2_SVM_MFN finished in %d iterations", iter);
    return 0;
}

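/* Given the current gradient values g, solve for the soft labels p that
 * minimize the free energy at temperature T subject to the balance
 * constraint (1/u)*sum_i p[i] = r. The Lagrange multiplier nu is the root
 * of B(nu) = (1/u)*sum_i 1/(1+exp((g[i]-nu)/T)) - r, found by a Newton step
 * safeguarded by bisection on the bracket [nu_minus, nu_plus]; the solution
 * is p[i] = 1/(1+exp((g[i]-nu)/T)). */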
void optimize_p(
    const float64_t* g, int32_t u, float64_t T, float64_t r, float64_t* p)
{
    int32_t iter=0;
    float64_t epsilon=1e-10;
    int32_t maxiter=500;
    float64_t nu_minus=g[0];
    float64_t nu_plus=g[0];
    for (int32_t i=0;i<u;i++)
    {
        if(g[i]<nu_minus) nu_minus=g[i];
        if(g[i]>nu_plus) nu_plus=g[i];
    }

    float64_t b=T*log((1-r)/r);
    nu_minus-=b;
    nu_plus-=b;
    float64_t nu=(nu_plus+nu_minus)/2;
    float64_t Bnu=0.0;
    float64_t BnuPrime=0.0;
    float64_t s=0.0;
    float64_t tmp=0.0;
    for (int32_t i=0;i<u;i++)
    {
        s=exp((g[i]-nu)/T);
        if(!(CMath::is_infinity(s)))
        {
            tmp=1.0/(1.0+s);
            Bnu+=tmp;
            BnuPrime+=s*tmp*tmp;
        }
    }
    Bnu=Bnu/u;
    Bnu-=r;
    BnuPrime=BnuPrime/(T*u);
    float64_t nuHat=0.0;
    while((CMath::abs(Bnu)>epsilon) && (iter < maxiter))
    {
        iter++;
        if(CMath::abs(BnuPrime)>0.0)
            nuHat=nu-Bnu/BnuPrime;
        // fall back to bisection when the Newton step is unavailable or
        // leaves the bracket
        if((CMath::abs(BnuPrime) <= 0.0) || (nuHat>nu_plus) || (nuHat < nu_minus))
            nu=(nu_minus+nu_plus)/2.0;
        else
            nu=nuHat;
        Bnu=0.0;
        BnuPrime=0.0;
        for(int32_t i=0;i<u;i++)
        {
            s=exp((g[i]-nu)/T);
            if(!(CMath::is_infinity(s)))
            {
                tmp=1.0/(1.0+s);
                Bnu+=tmp;
                BnuPrime+=s*tmp*tmp;
            }
        }
        Bnu=Bnu/u;
        Bnu-=r;
        BnuPrime=BnuPrime/(T*u);
        if(Bnu<0)
            nu_minus=nu;
        else
            nu_plus=nu;
        if(CMath::abs(nu_minus-nu_plus)<epsilon)
            break;
    }
    if(CMath::abs(Bnu)>epsilon)
        SG_SWARNING("Warning (Root): root not found to required precision\n");

    for (int32_t i=0;i<u;i++)
    {
        s=exp((g[i]-nu)/T);
        if(CMath::is_infinity(s)) p[i]=0.0;
        else p[i]=1.0/(1.0+s);
    }
    SG_SINFO(" root (nu) = %f B(nu) = %f", nu, Bnu);
}

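/* Transductive objective: 0.5*(lambda*||w||^2 + lambda_u * mean symmetric
 * squared hinge loss over the u unlabeled examples + mean squared hinge
 * loss over the l labeled ones). Unlabeled examples are identified by
 * Y[i]==0. */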
float64_t transductive_cost(
    float64_t normWeights, float64_t *Y, float64_t *Outputs, int32_t m,
    float64_t lambda, float64_t lambda_u)
{
    float64_t F1=0.0, F2=0.0, o=0.0, y=0.0;
    int32_t u=0,l=0;
    for (int32_t i=0;i<m;i++)
    {
        o=Outputs[i];
        y=Y[i];
        if(y==0.0)
        {F1 += CMath::abs(o) > 1 ? 0 : (1 - CMath::abs(o))*(1 - CMath::abs(o)); u++;}
        else
        {F2 += y*o > 1 ? 0 : (1-y*o)*(1-y*o); l++;}
    }
    float64_t F;
    F = 0.5*(lambda*normWeights + lambda_u*F1/u + F2/l);
    return F;
}

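/* Mean binary entropy (in bits) of the soft labels p; used as the stopping
 * criterion of the annealing loop in DA_S3VM. */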
float64_t entropy(const float64_t *p, int32_t u)
{
    float64_t h=0.0;
    float64_t q=0.0;
    for (int32_t i=0;i<u;i++)
    {
        q=p[i];
        if(q>0 && q<1)
            h+= -(q*CMath::log2(q) + (1-q)*CMath::log2(1-q));
    }
    return h/u;
}

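/* Mean binary KL divergence (in bits) between soft label vectors p and q;
 * used to detect convergence of the inner DA loop. */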
float64_t KL(const float64_t *p, const float64_t *q, int32_t u)
{
    float64_t h=0.0;
    float64_t p1=0.0;
    float64_t q1=0.0;
    float64_t g=0.0;
    for (int32_t i=0;i<u;i++)
    {
        p1=p[i];
        q1=q[i];
        // clamp the probabilities away from 0 and 1 before taking logs
        if(p1>1-1e-8) p1-=1e-8;
        if(p1<1e-8) p1+=1e-8;
        if(q1>1-1e-8) q1-=1e-8;
        if(q1<1e-8) q1+=1e-8;
        g= (p1*CMath::log2(p1/q1) + (1-p1)*CMath::log2((1-p1)/(1-q1)));
        if(CMath::abs(g)<1e-12 || CMath::is_nan(g)) g=0.0;
        h+=g;
    }
    return h/u;
}

/********************** UTILITIES ********************/
float64_t norm_square(const vector_double *A)
{
    float64_t x=0.0, t=0.0;
    for(int32_t i=0;i<A->d;i++)
    {
        t=A->vec[i];
        x+=t*t;
    }
    return x;
}

void initialize(struct vector_double *A, int32_t k, float64_t a)
{
    float64_t *vec = SG_MALLOC(float64_t, k);
    for (int32_t i=0;i<k;i++)
        vec[i]=a;
    A->vec = vec;
    A->d   = k;
    return;
}

void initialize(struct vector_int *A, int32_t k)
{
    int32_t *vec = SG_MALLOC(int32_t, k);
    for(int32_t i=0;i<k;i++)
        vec[i]=i;
    A->vec = vec;
    A->d   = k;
    return;
}

void GetLabeledData(struct data *D, const struct data *Data)
{
    /*FIXME
    int32_t *J = SG_MALLOC(int32_t, Data->l);
    D->C   = SG_MALLOC(float64_t, Data->l);
    D->Y   = SG_MALLOC(float64_t, Data->l);
    int32_t nz=0;
    int32_t k=0;
    int32_t rowptrs_=Data->l;
    for(int32_t i=0;i<Data->m;i++)
    {
        if(Data->Y[i]!=0.0)
        {
            J[k]=i;
            D->Y[k]=Data->Y[i];
            D->C[k]=1.0/Data->l;
            nz+=(Data->rowptr[i+1] - Data->rowptr[i]);
            k++;
        }
    }
    D->val    = SG_MALLOC(float64_t, nz);
    D->colind = SG_MALLOC(int32_t, nz);
    D->rowptr = SG_MALLOC(int32_t, rowptrs_+1);
    nz=0;
    for(int32_t i=0;i<Data->l;i++)
    {
        D->rowptr[i]=nz;
        for(int32_t j=Data->rowptr[J[i]];j<Data->rowptr[J[i]+1];j++)
        {
            D->val[nz] = Data->val[j];
            D->colind[nz] = Data->colind[j];
            nz++;
        }
    }
    D->rowptr[rowptrs_]=nz;
    D->nz=nz;
    D->l=Data->l;
    D->m=Data->l;
    D->n=Data->n;
    D->u=0;
    SG_FREE(J);*/
}
} // namespace shogun